In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer


def handle_bad_lines(line):
    #print(f"Skipping bad line: {line}")
    return None


car_prices= pd.read_csv('car_prices.csv', engine='python', on_bad_lines=handle_bad_lines)
car_prices.drop(columns=['vin','saledate','seller','mmr','trim','model','interior','state'], inplace=True)
print(car_prices.head())

   year   make   body transmission  condition  odometer  color  sellingprice
0  2015    Kia    SUV    automatic        5.0   16639.0  white         21500
1  2015    Kia    SUV    automatic        5.0    9393.0  white         21500
2  2014    BMW  Sedan    automatic        4.5    1331.0   gray         30000
3  2015  Volvo  Sedan    automatic        4.1   14282.0  white         27750
4  2014    BMW  Sedan    automatic        4.3    2641.0   gray         67000


In [4]:
car_prices.dropna(subset=['make','body','transmission','color'], inplace=True)
car_prices['odometer'].fillna(car_prices['odometer'].median(), inplace=True)
car_prices['condition'].fillna(car_prices['condition'].median(), inplace=True)
print(car_prices.shape[0])

481840


In [5]:
car_prices = car_prices[car_prices['sellingprice']<125000]
print(car_prices.shape[0])

481779


In [7]:
car_prices = pd.get_dummies(car_prices, columns=['make','body','transmission','color'],)
print(car_prices.columns)

Index(['year', 'condition', 'odometer', 'sellingprice', 'make_Acura',
       'make_Aston Martin', 'make_Audi', 'make_BMW', 'make_Bentley',
       'make_Buick',
       ...
       'color_off-white', 'color_orange', 'color_pink', 'color_purple',
       'color_red', 'color_silver', 'color_turquoise', 'color_white',
       'color_yellow', 'color_—'],
      dtype='object', length=163)


In [8]:
X = car_prices.drop(columns=['sellingprice'])
y = car_prices['sellingprice']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
print(X_train, X_test, y_train, y_test)

        year  condition  odometer  make_Acura  make_Aston Martin  make_Audi   
422459  2011        1.9   32362.0       False              False      False  \
63800   2011        4.4   33251.0       False              False      False   
141966  2013        4.5  100517.0       False              False      False   
157168  2013        3.6   54354.0       False              False      False   
90567   2009        3.6   42094.0       False              False       True   
...      ...        ...       ...         ...                ...        ...   
299052  2014        4.8    2566.0       False              False      False   
422056  2014        4.3   31018.0       False              False      False   
153911  2010        3.5   49467.0       False              False      False   
170958  2014        4.6   43276.0       False              False      False   
141782  2001        2.0  189679.0       False              False      False   

        make_BMW  make_Bentley  make_Buick  make_Ca

In [9]:
scaler = StandardScaler()
X_train[['year','condition','odometer']]=scaler.fit_transform(X_train[['year','condition','odometer']])
X_test[['year','condition','odometer']]=scaler.transform(X_test[['year','condition','odometer']])
imputer = SimpleImputer(strategy='most_frequent')

In [10]:
X_train = imputer.fit_transform(X_train)

In [11]:
print(car_prices.shape[0])
new_cars = car_prices.dropna(inplace=True)
print(new_cars)

481779
None


In [12]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

In [13]:
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)
mse_train = mean_squared_error(y_train,y_pred_train)
mse_test = mean_squared_error(y_test,y_pred_test)



In [14]:
r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

In [15]:
print("Training set MSE: ", mse_train)
print("Testing set MSE: ", mse_test)
print("Training set R-squared score:", r2_train)
print("Testing set R-squared score: ", r2_test)

Training set MSE:  2337355.674909992
Testing set MSE:  16307744.385474788
Training set R-squared score: 0.9742849251802634
Testing set R-squared score:  0.8181051089182547


In [26]:
test_data = car_prices.sample(n=10, random_state=2)
X_test = test_data.drop('sellingprice',axis=1)
y_pred = rf.predict(X_test)
y_true=test_data['sellingprice']



In [27]:
for i in range(len(y_true)):
    print(f"Predicted selling price: {y_pred[i]:.2f}, Actual selling price: {y_true.iloc[i]}")

Predicted selling price: 17199.00, Actual selling price: 29200
Predicted selling price: 10480.10, Actual selling price: 17800
Predicted selling price: 9809.50, Actual selling price: 2600
Predicted selling price: 9843.00, Actual selling price: 26900
Predicted selling price: 10195.00, Actual selling price: 300
Predicted selling price: 13864.50, Actual selling price: 17600
Predicted selling price: 10814.50, Actual selling price: 1400
Predicted selling price: 11140.50, Actual selling price: 19600
Predicted selling price: 11552.00, Actual selling price: 7000
Predicted selling price: 9977.00, Actual selling price: 14000


In [28]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters found using GridSearchCV: ", best_params)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time= 2.9min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time= 2.9min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time= 2.9min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.5min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.5min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.5min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time= 2.6min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time= 2.7min
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=150; total time= 8.0min
[CV] END max_depth=5, min_samples_leaf=1, 

In [None]:
rf_tuned = RandomForestRegressor(random_state=42, **best_params)
rf_tuned.fit(X_train, y_train)

In [None]:
y_pred_train_tuned = rf_tuned.predict(X_train)
y_pred_test_tuned = rf_tuned.predict(X_test)
mse_train_tuned = mean_squared_error(y_train, y_pred_train_tuned)
mse_test_tuned = mean_squared_error(y_test, y_pred_test_tuned)
r2_train_tuned = r2_score(y_train, y_pred_train_tuned)
r2_test_tuned = r2_score(y_test, y_pred_test_tuned)

In [None]:
print("Training set Mean Squared Error (tuned model): ", mse_train_tuned)
print("Testing set Mean Squared Error (tuned model): ", mse_test_tuned)
print("Training set R-squared score (tuned model): ", r2_train_tuned)
print("Testing set R-squared score (tuned model): ", r2_test_tuned)