In [46]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data/processed_cars_data.csv')
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   car name         4979 non-null   object 
 1   engine_capacity  4979 non-null   float64
 2   cylinder         4979 non-null   int64  
 3   horse_power      4979 non-null   int64  
 4   top_speed        4979 non-null   int64  
 5   brand            4979 non-null   object 
 6   country          4979 non-null   object 
 7   num_seats        4979 non-null   int64  
 8   Price_USD        4979 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 350.2+ KB


Unnamed: 0,engine_capacity,cylinder,horse_power,top_speed,num_seats,Price_USD
count,4979.0,4979.0,4979.0,4979.0,4979.0,4979.0
mean,3.616951,5.165696,285.849367,222.112874,4.921068,72155.46
std,28.331114,1.842988,183.686725,46.665063,1.40137,107289.0
min,0.0,3.0,25.0,120.0,2.0,4608.0
25%,2.0,4.0,164.0,188.0,5.0,25650.0
50%,2.4,4.0,250.0,220.0,5.0,47250.0
75%,3.5,6.0,362.0,250.0,5.0,83437.65
max,1000.0,16.0,5050.0,966.0,18.0,3594996.0


In [47]:
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame
X = df.drop(columns=["Price_USD", "car name",'brand', 'country'])  # Drop 'Price_USD' (target) and 'car name' (irrelevant feature)
y = df["Price_USD"]

# # One-hot encode categorical features (brand, country)
# X = pd.get_dummies(X, columns=[], drop_first=True)
# 
# # Convert all data to float64
# X = X.astype(np.float64)

# Splitting the dataset into training (60%), validation (20%), and test (20%) sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

# Convert to NumPy arrays
# X_train, X_val, y_train, y_val = X_train.to_numpy(), X_val.to_numpy(), y_train.to_numpy(), y_val.to_numpy()

X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   engine_capacity  4979 non-null   float64
 1   cylinder         4979 non-null   int64  
 2   horse_power      4979 non-null   int64  
 3   top_speed        4979 non-null   int64  
 4   num_seats        4979 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 194.6 KB


In [48]:
import numpy as np

# Add a bias term to the feature matrix
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]  # Add bias term
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]

# Closed-form solution
# theta_best = (X^T . X)^-1 . X^T . y

try:
    theta_best = np.linalg.inv(X_train_b.T @ X_train_b) @ X_train_b.T @ y_train
    # Predictions on the validation set
    y_val_pred = X_val_b @ theta_best
    # Evaluate the performance (e.g., using RMSE)
    print("Validation RMSE (Closed-form):", np.sqrt(np.mean((y_val - y_val_pred)**2)))
except np.linalg.LinAlgError as e:
    print("Error: Singular matrix. Unable to compute the closed-form solution.")

Validation RMSE (Closed-form): 82983.76808253599


In [49]:
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define hyperparameter grids
alpha_values = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# LASSO Regression
lasso = Lasso(max_iter=10000)
lasso_grid = GridSearchCV(lasso, alpha_values, cv=5, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)
best_lasso = lasso_grid.best_estimator_

# Ridge Regression
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, alpha_values, cv=5, scoring='neg_mean_squared_error')
ridge_grid.fit(X_train, y_train)
best_ridge = ridge_grid.best_estimator_

# Evaluate
lasso_rmse = np.sqrt(mean_squared_error(y_val, best_lasso.predict(X_val)))
ridge_rmse = np.sqrt(mean_squared_error(y_val, best_ridge.predict(X_val)))

print("Best LASSO RMSE:", lasso_rmse)
print("Best Ridge RMSE:", ridge_rmse)


Best LASSO RMSE: 83021.41042819904
Best Ridge RMSE: 83043.13777287723


In [50]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

for degree in range(2, 11):
    poly = PolynomialFeatures(degree)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    
    lin_reg = LinearRegression()
    lin_reg.fit(X_train_poly, y_train)
    
    y_val_pred = lin_reg.predict(X_val_poly)
    rmse_poly = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"Degree {degree}: RMSE = {rmse_poly}")

Degree 2: RMSE = 369793.86826556915
Degree 3: RMSE = 2817036.2836314747
Degree 4: RMSE = 263663956.23629493
Degree 5: RMSE = 15183008311.954758
Degree 6: RMSE = 345008597172.33124
Degree 7: RMSE = 5052708079739.828
Degree 8: RMSE = 88082607577091.47
Degree 9: RMSE = 2180551429222975.8
Degree 10: RMSE = 1.0985579416446034e+16


In [51]:
from sklearn.svm import SVR

# Define RBF SVR model and parameter grid
rbf_svr = SVR(kernel='rbf')
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.01, 0.1, 1]}

# Grid search for hyperparameter tuning
rbf_grid = GridSearchCV(rbf_svr, param_grid, cv=5, scoring='neg_mean_squared_error')
rbf_grid.fit(X_train, y_train.to_numpy())  # Use .to_numpy() instead of .ravel()

# Get the best model
best_rbf = rbf_grid.best_estimator_

# Make predictions on the validation set
y_val_pred_rbf = best_rbf.predict(X_val)

# Calculate RMSE
rmse_rbf = np.sqrt(mean_squared_error(y_val, y_val_pred_rbf))
print("Best RBF RMSE:", rmse_rbf)


Best RBF RMSE: 93022.55338893594
