In [74]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data/processed_cars_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   car name         4979 non-null   object 
 1   engine_capacity  4979 non-null   float64
 2   cylinder         4979 non-null   int64  
 3   horse_power      4979 non-null   int64  
 4   top_speed        4979 non-null   int64  
 5   brand            4979 non-null   object 
 6   country          4979 non-null   object 
 7   num_seats        4979 non-null   int64  
 8   Price_USD        4979 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 350.2+ KB


In [75]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Assuming df is your DataFrame
X = df.drop(columns=["Price_USD", "car name",'brand', 'country'])  # Drop 'Price_USD' (target) and 'car name' (irrelevant feature)
y = df["Price_USD"]

# One-hot encode categorical features (brand, country)
# X = pd.get_dummies(X, columns=[], drop_first=True)
# X = X.astype(np.float64)

# Splitting the dataset into training (60%), validation (20%), and test (20%) sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Add bias column to scaled datasets
X_train_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_val_b = np.c_[np.ones((X_val_scaled.shape[0], 1)), X_val_scaled]
X_test_b = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   engine_capacity  4979 non-null   float64
 1   cylinder         4979 non-null   int64  
 2   horse_power      4979 non-null   int64  
 3   top_speed        4979 non-null   int64  
 4   num_seats        4979 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 194.6 KB


In [76]:
# Closed-form solution & Gradient Descent With comparison between the two methods 

try:
    ##############################
    #### Run gradient descent ####
    ##############################

    theta_best_descent = np.zeros(X_train_b.shape[1])  # Includes bias term  
    # Gradient Descent Implementation
    learning_rate = 0.01
    epochs = 1000
    m = X_train_b.shape[0]  # Number of samples in training set
    
    for epoch in range(epochs):
        gradients = (1 / m) * X_train_b.T @ (X_train_b @ theta_best_descent - y_train)  # Compute gradients
        theta_best_descent -= learning_rate * gradients  # Update theta
    
    # Predictions on validation set
    y_val_pred_gd = X_val_b @ theta_best_descent
    rmse_gd = np.sqrt(np.mean((y_val - y_val_pred_gd) ** 2))
    
    print("Gradient Descent - Validation RMSE:", rmse_gd)
    print("Coefficients (Gradient Descent):", theta_best_descent)
    
    ##############################
    #### Closed-form solution ####
    ##############################
    theta_best_closed = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train
    y_val_pred_closed = X_val_b @ theta_best_closed
    rmse_closed = np.sqrt(np.mean((y_val - y_val_pred_closed) ** 2))
    print("Closed-form solution - Validation RMSE:", rmse_closed)
    print("Coefficients (Closed-form):", theta_best_closed)
except np.linalg.LinAlgError:
    print("Error: Singular matrix. Unable to compute the closed-form solution.")


Gradient Descent - Validation RMSE: 82078.1796628439
Coefficients (Gradient Descent): [ 7.12590700e+04 -5.89216503e+01  1.16963734e+04  4.77025659e+04
  1.55275470e+04 -3.05710801e+03]
Closed-form solution - Validation RMSE: 82983.76808253591
Coefficients (Closed-form): [71262.14649146   -72.98585132  9613.45568639 50250.67310241
 14856.60315043 -2976.84142149]


In [77]:
# LASSO and Ridge Regression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define hyperparameter grids
alpha_values = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# LASSO Regression
lasso = Lasso(max_iter=10000)
lasso_grid = GridSearchCV(lasso, alpha_values, cv=5, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)
best_lasso = lasso_grid.best_estimator_

# Ridge Regression
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, alpha_values, cv=5, scoring='neg_mean_squared_error')
ridge_grid.fit(X_train, y_train)
best_ridge = ridge_grid.best_estimator_

# Evaluate
lasso_rmse = np.sqrt(mean_squared_error(y_val, best_lasso.predict(X_val)))
ridge_rmse = np.sqrt(mean_squared_error(y_val, best_ridge.predict(X_val)))

print("Best LASSO RMSE:", lasso_rmse)
print("Best Ridge RMSE:", ridge_rmse)


Best LASSO RMSE: 83021.41042819904
Best Ridge RMSE: 83043.13777287723


In [78]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

for degree in range(2, 11):
    poly = PolynomialFeatures(degree)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    
    lin_reg = LinearRegression()
    lin_reg.fit(X_train_poly, y_train)
    
    y_val_pred = lin_reg.predict(X_val_poly)
    rmse_poly = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"Degree {degree}: RMSE = {rmse_poly}")

Degree 2: RMSE = 369793.86826556915
Degree 3: RMSE = 2817036.2836314747
Degree 4: RMSE = 263663956.23629493
Degree 5: RMSE = 15183008311.954758
Degree 6: RMSE = 345008597172.33124
Degree 7: RMSE = 5052708079739.828
Degree 8: RMSE = 88082607577091.47
Degree 9: RMSE = 2180551429222975.8
Degree 10: RMSE = 1.0985579416446034e+16


In [79]:
from sklearn.svm import SVR

# Define RBF SVR model and parameter grid
rbf_svr = SVR(kernel='rbf')
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.01, 0.1, 1]}

# Grid search for hyperparameter tuning
rbf_grid = GridSearchCV(rbf_svr, param_grid, cv=5, scoring='neg_mean_squared_error')
rbf_grid.fit(X_train, y_train.to_numpy())  # Use .to_numpy() instead of .ravel()

# Get the best model
best_rbf = rbf_grid.best_estimator_

# Make predictions on the validation set
y_val_pred_rbf = best_rbf.predict(X_val)

# Calculate RMSE
rmse_rbf = np.sqrt(mean_squared_error(y_val, y_val_pred_rbf))
print("Best RBF RMSE:", rmse_rbf)


Best RBF RMSE: 93022.55338893594
