In [None]:
# Optional one-time environment setup: uncomment the lines below to install the
# packages this notebook relies on.
# NOTE(review): inside Jupyter, `%pip install ...` targets the running kernel's
# environment, whereas `!pip` may hit a different interpreter; consider switching
# and pinning versions.
# NOTE(review): seaborn is not imported in the cells visible here - TODO confirm
# it is still needed.
#!pip install numpy
#!pip install pandas
#!pip install scikit-learn
#!pip install seaborn

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Read the cleaned dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
df.head(n=5)

# Linear Regression

## Numeric Only

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION'

In [None]:
# Baseline linear model: numeric predictors only, selling price as the response.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION']

X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so `model` is the fitted regressor;
# the bare name on the last line shows it as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predict selling prices for the held-out rows.
y_pred = model.predict(X_test)

# Each metric compares the held-out targets with the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'MARKET VALUE'

In [None]:
# Linear model on the numeric predictors plus the market value column.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'MARKET VALUE']

X, y = df[features], df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself; the bare name displays it as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Ground truth and predictions for the held-out rows.
y_true = y_test
y_pred = model.predict(X_test)

# y_true is the same object as y_test, so the metrics are computed on the test split.
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)

## With Categorical (One Hot Encoded)

In [None]:
# One-hot encoder configuration:
#   handle_unknown='ignore' -> categories unseen at fit time do not raise on transform
#   sparse_output=False     -> dense output rather than a sparse matrix
#   transform='pandas'      -> transform results are returned as DataFrames
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder = encoder.set_output(transform='pandas')  # set_output() returns the estimator itself

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'BRAND', 'MODEL'

In [None]:
# Expand BRAND and MODEL into indicator columns.
# NOTE(review): the encoder is fit on every row of df, i.e. before the
# train/test split performed in the following cell.
categorical_frame = df[['BRAND', 'MODEL']]
encoded_data = encoder.fit_transform(categorical_frame)

In [None]:
# Linear model on the numeric predictors plus the one-hot BRAND / MODEL columns.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION']

# Place the numeric columns and the indicator columns side by side (aligned on the index).
X = pd.concat(objs=[df[features], encoded_data], axis='columns')
y = df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself; the bare name displays it as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predict selling prices for the held-out rows.
y_pred = model.predict(X_test)

# Each metric compares the held-out targets with the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)

## With Categorical (Label Encoded)

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'BRAND', 'MODEL'

In [None]:
# Integer-code each categorical column into a new '<column>_encoded' column on df.
#
# A separate LabelEncoder is created per column and kept in `label_encoders`:
# fit_transform() replaces the classes an encoder has learned, so one shared
# instance would only remember the mapping of the last column and the codes
# of the earlier columns could no longer be inverted.
#
# After the loop `encoder` is bound to the 'MODEL VERSION' encoder.
#
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder, and
# the integer codes imply an ordering between categories that a linear model
# will treat as meaningful - confirm this is intended for the comparison.
label_encoders = {}
for column in ['BRAND', 'MODEL', 'VEHICLE TYPE', 'MODEL VERSION']:
    encoder = LabelEncoder()
    df[column + '_encoded'] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

In [None]:
# Linear model on the numeric predictors plus the integer-coded BRAND / MODEL columns.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'BRAND_encoded', 'MODEL_encoded']

X, y = df[features], df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself; the bare name displays it as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Predict selling prices for the held-out rows.
y_pred = model.predict(X_test)

# Each metric compares the held-out targets with the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)

# Regression Tree

## Numeric Only

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION'

In [None]:
# Baseline regression tree: numeric predictors only, selling price as the response.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION']

X, y = df[features], df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The tree is seeded as well so repeated runs build the same model.
# fit() returns the estimator itself; the bare name displays it as the cell output.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
model

In [None]:
# Predict selling prices for the held-out rows.
y_pred = model.predict(X_test)

# Each metric compares the held-out targets with the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'MARKET VALUE'

In [None]:
# Regression tree on the numeric predictors plus the market value column.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'MARKET VALUE']

X, y = df[features], df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The tree is seeded as well so repeated runs build the same model.
# fit() returns the estimator itself; the bare name displays it as the cell output.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
model

In [None]:
# Ground truth and predictions for the held-out rows.
y_true = y_test
y_pred = model.predict(X_test)

# y_true is the same object as y_test, so the metrics are computed on the test split.
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)

## With Categorical (One Hot Encoded)

In [None]:
# Rebind `encoder` to a one-hot encoder for the regression-tree experiments:
#   handle_unknown='ignore' -> categories unseen at fit time do not raise on transform
#   sparse_output=False     -> dense output rather than a sparse matrix
#   transform='pandas'      -> transform results are returned as DataFrames
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder = encoder.set_output(transform='pandas')  # set_output() returns the estimator itself

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'BRAND', 'MODEL'

In [None]:
# Expand BRAND and MODEL into indicator columns.
# NOTE(review): the encoder is fit on every row of df, i.e. before the
# train/test split performed in the following cell.
categorical_frame = df[['BRAND', 'MODEL']]
encoded_data = encoder.fit_transform(categorical_frame)

In [None]:
# Regression tree on the numeric predictors plus the one-hot BRAND / MODEL columns.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION']

# Place the numeric columns and the indicator columns side by side (aligned on the index).
X = pd.concat(objs=[df[features], encoded_data], axis='columns')
y = df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The tree is seeded as well so repeated runs build the same model.
# fit() returns the estimator itself; the bare name displays it as the cell output.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
model

In [None]:
# Predict selling prices for the held-out rows.
y_pred = model.predict(X_test)

# Each metric compares the held-out targets with the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)

## With Categorical (Label Encoded)

#### 'MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'BRAND', 'MODEL'

In [None]:
# Integer-code each categorical column into a new '<column>_encoded' column on df
# (overwriting the column if it already exists).
#
# A separate LabelEncoder is created per column and kept in `label_encoders`:
# fit_transform() replaces the classes an encoder has learned, so one shared
# instance would only remember the mapping of the last column and the codes
# of the earlier columns could no longer be inverted.
#
# After the loop `encoder` is bound to the 'MODEL VERSION' encoder.
#
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# the integer codes carry no real ordering between categories.
label_encoders = {}
for column in ['BRAND', 'MODEL', 'VEHICLE TYPE', 'MODEL VERSION']:
    encoder = LabelEncoder()
    df[column + '_encoded'] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

In [None]:
# Regression tree on the numeric predictors plus the integer-coded BRAND / MODEL columns.
target = 'SELLING PRICE'
features = ['MILEAGE', 'MANUFACTURING YEAR', 'CONDITION', 'BRAND_encoded', 'MODEL_encoded']

X, y = df[features], df[target]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The tree is seeded as well so repeated runs build the same model.
# fit() returns the estimator itself; the bare name displays it as the cell output.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
model

In [None]:
# Predict selling prices for the held-out rows.
y_pred = model.predict(X_test)

# Each metric compares the held-out targets with the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target

# Report order: MSE, RMSE, R-squared, MAE.
for label, value in [
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R-squared:", r2),
    ("Mean Absolute Error:", mae),
]:
    print(label, value)