In [19]:
import pandas as pd

# Read the processed cars dataset and print a structural summary
# (columns, non-null counts, dtypes, memory footprint).
csv_path = 'data/processed_cars_data.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   car name         4979 non-null   object 
 1   engine_capacity  4979 non-null   float64
 2   cylinder         4979 non-null   int64  
 3   horse_power      4979 non-null   int64  
 4   top_speed        4979 non-null   int64  
 5   brand            4979 non-null   object 
 6   country          4979 non-null   object 
 7   num_seats        4979 non-null   int64  
 8   Price_USD        4979 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 350.2+ KB


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np


def _prepend_bias(features):
    """Return `features` with a leading column of ones (the intercept term)."""
    ones = np.ones((features.shape[0], 1))
    return np.hstack((ones, features))


# The target is the price. The text columns (car name, brand, country) are
# dropped rather than encoded, so only the numeric columns remain as features.
# One-hot encoding of brand/country is not applied here.
target_column = "Price_USD"
dropped_columns = [target_column, "car name", "brand", "country"]
X = df.drop(columns=dropped_columns)
y = df[target_column]

# 60% / 20% / 20% train / validation / test split, done in two steps:
# hold out 20% for testing, then carve 25% of the remaining 80% off for
# validation (0.25 * 0.8 = 0.2 of the full dataset).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

# Standardize features: statistics are learned from the training split only
# and then re-applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Design matrices with an explicit bias column, for the hand-written solvers.
X_train_b = _prepend_bias(X_train_scaled)
X_val_b = _prepend_bias(X_val_scaled)
X_test_b = _prepend_bias(X_test_scaled)

X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   engine_capacity  4979 non-null   float64
 1   cylinder         4979 non-null   int64  
 2   horse_power      4979 non-null   int64  
 3   top_speed        4979 non-null   int64  
 4   num_seats        4979 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 194.6 KB


In [21]:
# Closed-form solution & Gradient Descent With comparison between the two methods
#
# Both methods fit the same linear model on X_train_b (scaled features plus a
# leading bias column) and are compared by RMSE on the validation split.

# Convert the targets to plain float arrays once, so the linear algebra below
# operates on ndarrays only instead of mixing ndarrays with pandas Series.
y_train_arr = np.asarray(y_train, dtype=float)
y_val_arr = np.asarray(y_val, dtype=float)

##############################
#### Run gradient descent ####
##############################

learning_rate = 0.01
epochs = 1000
m = X_train_b.shape[0]  # Number of samples in training set

theta_best_descent = np.zeros(X_train_b.shape[1])  # Includes bias term
for epoch in range(epochs):
    # Gradient of the half mean squared error: X^T (X theta - y) / m.
    # Dividing the small result vector by m avoids rescaling the whole
    # design matrix on every iteration.
    residuals = X_train_b @ theta_best_descent - y_train_arr
    gradients = (X_train_b.T @ residuals) / m
    theta_best_descent -= learning_rate * gradients

# Predictions on validation set
y_val_pred_gd = X_val_b @ theta_best_descent
rmse_gd = np.sqrt(np.mean((y_val_arr - y_val_pred_gd) ** 2))

print("Gradient Descent - Validation RMSE:", rmse_gd)
print("Coefficients (Gradient Descent):", theta_best_descent)

##############################
#### Closed-form solution ####
##############################

# Solve the least-squares problem directly instead of forming and inverting
# X^T X: explicit inversion squares the condition number and breaks down on
# (near-)singular designs, whereas lstsq returns the minimum-norm solution.
# Only this step can raise LinAlgError, so only this step is guarded.
try:
    theta_best_closed = np.linalg.lstsq(X_train_b, y_train_arr, rcond=None)[0]
except np.linalg.LinAlgError:
    print("Error: Least-squares solver did not converge. Unable to compute the closed-form solution.")
else:
    y_val_pred_closed = X_val_b @ theta_best_closed
    rmse_closed = np.sqrt(np.mean((y_val_arr - y_val_pred_closed) ** 2))
    print("Closed-form solution - Validation RMSE:", rmse_closed)
    print("Coefficients (Closed-form):", theta_best_closed)


Gradient Descent - Validation RMSE: 82078.1796628439
Coefficients (Gradient Descent): [ 7.12590700e+04 -5.89216503e+01  1.16963734e+04  4.77025659e+04
  1.55275470e+04 -3.05710801e+03]
Closed-form solution - Validation RMSE: 82983.76808253591
Coefficients (Closed-form): [71262.14649146   -72.98585132  9613.45568639 50250.67310241
 14856.60315043 -2976.84142149]


In [22]:
# LASSO and Ridge Regression with hyperparameter tuning and evaluation

from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score


def _root_mse(y_true, y_pred):
    """Root mean squared error between observed and predicted targets."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Regularisation strengths searched for both models, and the shared
# cross-validation settings (5 folds, scored by negative MSE).
alpha_values = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
search_options = dict(cv=5, scoring='neg_mean_squared_error')

# LASSO Regression: tune alpha on the training split
lasso = Lasso(max_iter=10000)
lasso_grid = GridSearchCV(lasso, alpha_values, **search_options)
lasso_grid.fit(X_train_scaled, y_train)
best_lasso = lasso_grid.best_estimator_

# Ridge Regression: same search, same grid
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, alpha_values, **search_options)
ridge_grid.fit(X_train_scaled, y_train)
best_ridge = ridge_grid.best_estimator_

# Validation metrics for the best estimator of each family
lasso_val_pred = best_lasso.predict(X_val_scaled)
ridge_val_pred = best_ridge.predict(X_val_scaled)

lasso_rmse = _root_mse(y_val, lasso_val_pred)
ridge_rmse = _root_mse(y_val, ridge_val_pred)

lasso_r2 = r2_score(y_val, lasso_val_pred)
ridge_r2 = r2_score(y_val, ridge_val_pred)

print(f"Best LASSO RMSE: {lasso_rmse:.3f}, R²: {lasso_r2:.3f}")
print(f"Best Ridge RMSE: {ridge_rmse:.3f}, R²: {ridge_r2:.3f}")

print("LASSO Coefficients:", best_lasso.coef_)
print("Ridge Coefficients:", best_ridge.coef_)

# Test set evaluation of the same two tuned models
lasso_test_pred = best_lasso.predict(X_test_scaled)
ridge_test_pred = best_ridge.predict(X_test_scaled)

lasso_test_rmse = _root_mse(y_test, lasso_test_pred)
ridge_test_rmse = _root_mse(y_test, ridge_test_pred)

print(f"LASSO Test RMSE: {lasso_test_rmse:.3f}")
print(f"Ridge Test RMSE: {ridge_test_rmse:.3f}")

Best LASSO RMSE: 82970.775, R²: 0.339
Best Ridge RMSE: 81706.882, R²: 0.359
LASSO Coefficients: [   -0.          9554.38939557 50231.92653826 14823.27287075
 -2900.2790271 ]
Ridge Coefficients: [  -59.7163599  11570.24263223 46837.48222636 15380.19623224
 -3190.66651936]
LASSO Test RMSE: 129491.331
Ridge Test RMSE: 129004.785


In [23]:
# Polynomial Regression and Radial Basis Function (RBF) Kernel Regression

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV


# Polynomial regression: for each degree, expand the scaled features, fit an
# ordinary least-squares model on the training split and report the RMSE on
# the validation split.
print("Polynomial Regression Results:")
for degree in range(2, 11):  # degrees 2 through 10
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_val_poly = poly.transform(X_val_scaled)

    lin_reg = LinearRegression().fit(X_train_poly, y_train)

    y_val_pred = lin_reg.predict(X_val_poly)
    rmse_poly = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"Degree {degree}: RMSE = {rmse_poly:.3f}")

# RBF-kernel support vector regression, tuned over C and gamma with 5-fold
# cross-validation on the training split.
# NOTE(review): y_train is passed unscaled; SVR's C and epsilon are sensitive
# to the target's scale, so this grid may be too small for these targets -- confirm.
print("\nRBF Kernel Regression Results:")
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
rbf_svr = SVR(kernel='rbf')

grid_search = GridSearchCV(
    estimator=rbf_svr,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)

best_rbf_model = grid_search.best_estimator_

# Score the tuned SVR on the validation split
y_val_pred_rbf = best_rbf_model.predict(X_val_scaled)
rmse_rbf = np.sqrt(mean_squared_error(y_val, y_val_pred_rbf))
print(f"Best RBF RMSE on Validation Set: {rmse_rbf:.3f}")


Polynomial Regression Results:
Degree 2: RMSE = 369793.868
Degree 3: RMSE = 2817036.256
Degree 4: RMSE = 262713562.080
Degree 5: RMSE = 16013607532.955
Degree 6: RMSE = 200846154113.995
Degree 7: RMSE = 29182600557197.824
Degree 8: RMSE = 839234041879579.625
Degree 9: RMSE = 5810729733003874.000
Degree 10: RMSE = 5479992590121407.000

RBF Kernel Regression Results:
Best RBF RMSE on Validation Set: 94252.581
