In [34]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data/processed_cars_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   car name         4979 non-null   object 
 1   engine_capacity  4979 non-null   float64
 2   cylinder         4979 non-null   int64  
 3   horse_power      4979 non-null   int64  
 4   top_speed        4979 non-null   int64  
 5   brand            4979 non-null   object 
 6   country          4979 non-null   object 
 7   num_seats        4979 non-null   int64  
 8   Price_USD        4979 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 350.2+ KB


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming df is your DataFrame
X = df.drop(columns=["Price_USD", "car name",'brand', 'country'])  # Drop 'Price_USD' (target) and 'car name' (irrelevant feature)
y = df["Price_USD"]

# # One-hot encode categorical features (brand, country)
# X = pd.get_dummies(X, columns=[], drop_first=True)
# X = X.astype(np.float64)

# Splitting the dataset into training (60%), validation (20%), and test (20%) sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Add bias column to scaled datasets
X_train_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_val_b = np.c_[np.ones((X_val_scaled.shape[0], 1)), X_val_scaled]
X_test_b = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

X.info()
model_results = []


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   engine_capacity  4979 non-null   float64
 1   cylinder         4979 non-null   int64  
 2   horse_power      4979 non-null   int64  
 3   top_speed        4979 non-null   int64  
 4   num_seats        4979 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 194.6 KB


In [36]:
# Closed-form solution & Gradient Descent With comparison between the two methods 

try:
    ##############################
    #### Run gradient descent ####
    ##############################

    theta_best_descent = np.zeros(X_train_b.shape[1])  # Includes bias term  
    # Gradient Descent Implementation
    learning_rate = 0.01
    epochs = 1000
    m = X_train_b.shape[0]  # Number of samples in training set
    
    for epoch in range(epochs):
        gradients = (1 / m) * X_train_b.T @ (X_train_b @ theta_best_descent - y_train)  # Compute gradients
        theta_best_descent -= learning_rate * gradients  # Update theta
    
    # Predictions on validation set
    y_val_pred_gd = X_val_b @ theta_best_descent
    
    # Metrics for Gradient Descent
    mse_gd = np.mean((y_val - y_val_pred_gd) ** 2)
    mae_gd = np.mean(np.abs(y_val - y_val_pred_gd))
    ss_total_gd = np.sum((y_val - np.mean(y_val)) ** 2)
    ss_residual_gd = np.sum((y_val - y_val_pred_gd) ** 2)
    r2_gd = 1 - (ss_residual_gd / ss_total_gd)
    
    print(f"Gradient Descent - MSE: {mse_gd:.3f}, MAE: {mae_gd:.3f}, R²: {r2_gd:.3f}")
    print("Coefficients (Gradient Descent):", theta_best_descent)
    
    ##############################
    #### Closed-form solution ####
    ##############################
    theta_best_closed = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train
    y_val_pred_closed = X_val_b @ theta_best_closed
    
    # Metrics for Closed-form Solution
    mse_closed = np.mean((y_val - y_val_pred_closed) ** 2)
    mae_closed = np.mean(np.abs(y_val - y_val_pred_closed))
    ss_total_closed = np.sum((y_val - np.mean(y_val)) ** 2)
    ss_residual_closed = np.sum((y_val - y_val_pred_closed) ** 2)
    r2_closed = 1 - (ss_residual_closed / ss_total_closed)
    
    print(f"Closed-form - MSE: {mse_closed:.3f}, MAE: {mae_closed:.3f}, R²: {r2_closed:.3f}")
    print("Coefficients (Closed-form):", theta_best_closed)
except np.linalg.LinAlgError:
    print("Error: Singular matrix. Unable to compute the closed-form solution.")


Gradient Descent - MSE: 6736827576.766, MAE: 27708.579, R²: 0.353
Coefficients (Gradient Descent): [ 7.12590700e+04 -5.89216503e+01  1.16963734e+04  4.77025659e+04
  1.55275470e+04 -3.05710801e+03]
Closed-form - MSE: 6886305765.176, MAE: 27971.329, R²: 0.338
Coefficients (Closed-form): [71262.14649146   -72.98585132  9613.45568639 50250.67310241
 14856.60315043 -2976.84142149]


In [37]:
# LASSO and Ridge Regression with hyperparameter tuning and evaluation

from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Define hyperparameter grid
alpha_values = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

############################################ 
###   LASSO Regression with Grid Search  ###
############################################ 
lasso = Lasso(max_iter=10000)
lasso_grid = GridSearchCV(lasso, alpha_values, cv=5, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train_scaled, y_train)
best_lasso = lasso_grid.best_estimator_

############################################ 
####  Ridge Regression with Grid Search  ###
############################################ 
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, alpha_values, cv=5, scoring='neg_mean_squared_error')
ridge_grid.fit(X_train_scaled, y_train)
best_ridge = ridge_grid.best_estimator_

################################# 
###   Validation Set Metrics  ###
################################# 
# LASSO Predictions
lasso_val_pred = best_lasso.predict(X_val_scaled)
lasso_mse = mean_squared_error(y_val, lasso_val_pred)
lasso_mae = mean_absolute_error(y_val, lasso_val_pred)
lasso_r2 = r2_score(y_val, lasso_val_pred)

# Ridge Predictions
ridge_val_pred = best_ridge.predict(X_val_scaled)
ridge_mse = mean_squared_error(y_val, ridge_val_pred)
ridge_mae = mean_absolute_error(y_val, ridge_val_pred)
ridge_r2 = r2_score(y_val, ridge_val_pred)

# Print Validation Metrics
print("Validation Metrics:")
print(f"LASSO - MSE: {lasso_mse:.3f}, MAE: {lasso_mae:.3f}, R²: {lasso_r2:.3f}")
print(f"Ridge - MSE: {ridge_mse:.3f}, MAE: {ridge_mae:.3f}, R²: {ridge_r2:.3f}")
print("LASSO Coefficients:", best_lasso.coef_)
print("Ridge Coefficients:", best_ridge.coef_)

############################# 
###  Test Set Evaluation  ###
############################# 
# LASSO Predictions
lasso_test_pred = best_lasso.predict(X_test_scaled)
lasso_test_mse = mean_squared_error(y_test, lasso_test_pred)
lasso_test_mae = mean_absolute_error(y_test, lasso_test_pred)
lasso_test_r2 = r2_score(y_test, lasso_test_pred)

# Ridge Predictions
ridge_test_pred = best_ridge.predict(X_test_scaled)
ridge_test_mse = mean_squared_error(y_test, ridge_test_pred)
ridge_test_mae = mean_absolute_error(y_test, ridge_test_pred)
ridge_test_r2 = r2_score(y_test, ridge_test_pred)

# Print Test Metrics
print("\nTest Metrics:")
print(f"LASSO - MSE: {lasso_test_mse:.3f}, MAE: {lasso_test_mae:.3f}, R²: {lasso_test_r2:.3f}")
print(f"Ridge - MSE: {ridge_test_mse:.3f}, MAE: {ridge_test_mae:.3f}, R²: {ridge_test_r2:.3f}")


Validation Metrics:
LASSO - MSE: 6884149445.228, MAE: 27933.368, R²: 0.339
Ridge - MSE: 6676014578.626, MAE: 27253.356, R²: 0.359
LASSO Coefficients: [   -0.          9554.38939557 50231.92653826 14823.27287075
 -2900.2790271 ]
Ridge Coefficients: [  -59.7163599  11570.24263223 46837.48222636 15380.19623224
 -3190.66651936]

Test Metrics:
LASSO - MSE: 16768004847.356, MAE: 31647.551, R²: 0.298
Ridge - MSE: 16642234451.202, MAE: 30964.481, R²: 0.303


In [38]:
# Polynomial Regression and Radial Basis Function (RBF) Kernel Regression

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR


#######################################
###  Polynomial Regression Results  ###
#######################################
print("Polynomial Regression Results:")
for degree in range(2, 11):  # Degrees from 2 to 10 inclusive
    # Transform input features to polynomial features
    poly = PolynomialFeatures(degree)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_val_poly = poly.transform(X_val_scaled)
    
    # Train Linear Regression model
    lin_reg = LinearRegression()
    lin_reg.fit(X_train_poly, y_train)
    
    # Make predictions on validation set
    y_val_pred_poly = lin_reg.predict(X_val_poly)
    
    # Calculate evaluation metrics
    mse_poly = np.sqrt(mean_squared_error(y_val, y_val_pred_poly))
    mae_poly = mean_absolute_error(y_val, y_val_pred_poly)
    r2_poly = r2_score(y_val, y_val_pred_poly)
    
    print(f"Degree {degree}: MSE = {mse_poly:.3f}, MAE = {mae_poly:.3f}, R² = {r2_poly:.3f}")

#######################################################
###  Radial Basis Function (RBF) Kernel Regression  ###
#######################################################
print("\nRBF Kernel Regression Results:")

# Define hyperparameter grid for SVR
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
rbf_svr = SVR(kernel='rbf')

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(rbf_svr, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Best RBF model from grid search
best_rbf_model = grid_search.best_estimator_

# Predictions on validation set
y_val_pred_rbf = best_rbf_model.predict(X_val_scaled)

# Calculate evaluation metrics for RBF
mse_rbf = np.sqrt(mean_squared_error(y_val, y_val_pred_rbf))
mae_rbf = mean_absolute_error(y_val, y_val_pred_rbf)
r2_rbf = r2_score(y_val, y_val_pred_rbf)

print(f"Best RBF MSE on Validation Set: {mse_rbf:.3f}, MAE = {mae_rbf:.3f}, R² = {r2_rbf:.3f}")
print("Best RBF Model Hyperparameters:", grid_search.best_params_)



Polynomial Regression Results:
Degree 2: MSE = 369793.868, MAE = 30538.249, R² = -12.138
Degree 3: MSE = 2817036.256, MAE = 106412.882, R² = -761.415
Degree 4: MSE = 262713562.080, MAE = 8340790.995, R² = -6630880.729
Degree 5: MSE = 16013607532.955, MAE = 507426844.230, R² = -24636820351.798
Degree 6: MSE = 200846154113.995, MAE = 6364173838.010, R² = -3875549926168.537
Degree 7: MSE = 29182600557197.824, MAE = 924687828154.035, R² = -81819020718037872.000
Degree 8: MSE = 839234041879579.625, MAE = 26592151967679.633, R² = -67666307731815833600.000
Degree 9: MSE = 5810729733003874.000, MAE = 184120017476478.969, R² = -3243901421774754021376.000
Degree 10: MSE = 5479992590121407.000, MAE = 173640210073227.250, R² = -2885135657705454698496.000

RBF Kernel Regression Results:
Best RBF MSE on Validation Set: 94252.581, MAE = 29924.732, R² = 0.147
Best RBF Model Hyperparameters: {'C': 100, 'gamma': 0.1}
