In [29]:
print("Hello")

Hello


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer


def handle_bad_lines(line):
    #print(f"Skipping bad line: {line}")
    return None


car_prices= pd.read_csv('car_prices.csv', engine='python', on_bad_lines=handle_bad_lines)
car_prices.drop(columns=['vin','saledate','seller'], inplace=True)
print(car_prices.head())

   year   make                model        trim   body transmission state   
0  2015    Kia              Sorento          LX    SUV    automatic    ca  \
1  2015    Kia              Sorento          LX    SUV    automatic    ca   
2  2014    BMW             3 Series  328i SULEV  Sedan    automatic    ca   
3  2015  Volvo                  S60          T5  Sedan    automatic    ca   
4  2014    BMW  6 Series Gran Coupe        650i  Sedan    automatic    ca   

   condition  odometer  color interior    mmr  sellingprice  
0        5.0   16639.0  white    black  20500         21500  
1        5.0    9393.0  white    beige  20800         21500  
2        4.5    1331.0   gray    black  31900         30000  
3        4.1   14282.0  white    black  27500         27750  
4        4.3    2641.0   gray    black  66000         67000  


In [3]:
car_prices.dropna(subset=['make','model','trim','body','transmission','color','interior','state'], inplace=True)
car_prices['odometer'].fillna(car_prices['odometer'].median(), inplace=True)
car_prices['condition'].fillna(car_prices['condition'].median(), inplace=True)
print(car_prices.shape[0])

481752


In [4]:
car_prices = car_prices[car_prices['sellingprice']<125000]
print(car_prices.shape[0])

481691


In [5]:
car_prices = pd.get_dummies(car_prices, columns=['make','model','trim','body','transmission','color','interior','state'],)
#print(car_prices.columns)

In [6]:
X = car_prices.drop(columns=['sellingprice'])
y = car_prices['sellingprice']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
print(X_train, X_test, y_train, y_test)


        year  condition  odometer    mmr  make_Acura  make_Aston Martin   
95134   1999        1.9  224009.0   1275        True              False  \
455846  2014        4.2   30862.0  17250       False              False   
204992  2004        2.4  113658.0   2925       False              False   
471304  2003        4.7   85229.0   2625       False              False   
341201  2014        3.8   35187.0  13250       False              False   
...      ...        ...       ...    ...         ...                ...   
299103  2014        3.6   17230.0  15450       False              False   
422139  2014        3.8   33402.0  15150       False              False   
153943  2010        3.2  111959.0   9225       False              False   
170984  2008        2.5  108426.0   7000       False              False   
141814  1998        2.0  148384.0    750       False              False   

        make_Audi  make_BMW  make_Bentley  make_Buick  ...  state_or   
95134       False     False

In [7]:
scaler = StandardScaler()
X_train[['year','condition','odometer']]=scaler.fit_transform(X_train[['year','condition','odometer']])
X_test[['year','condition','odometer']]=scaler.transform(X_test[['year','condition','odometer']])
imputer = SimpleImputer(strategy='most_frequent')

In [8]:
X_train = imputer.fit_transform(X_train)

In [9]:
print(car_prices.shape[0])
new_cars = car_prices.dropna(inplace=True)
print(new_cars)

481691
None


In [10]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)


In [11]:
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)
mse_train = mean_squared_error(y_train,y_pred_train)
mse_test = mean_squared_error(y_test,y_pred_test)



In [12]:
r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

In [13]:
print("Training set MSE: ", mse_train)
print("Testing set MSE: ", mse_test)
print("Training set R-squared score:", r2_train)
print("Testing set R-squared score: ", r2_test)

Training set MSE:  309816.5853491317
Testing set MSE:  2168123.4376830743
Training set R-squared score: 0.996555837030447
Testing set R-squared score:  0.9761132406049599


In [28]:
test_data = car_prices.sample(n=10, random_state=2)
X_test = test_data.drop('sellingprice',axis=1)
y_pred = rf.predict(X_test)
y_true=test_data['sellingprice']



In [29]:
print(test_data)

        year  condition  odometer    mmr  sellingprice  make_Acura   
62937   2012        4.6   42753.0  15650         16000       False  \
417722  2013        2.5   16161.0  22500         19600       False   
314054  2013        4.0   17988.0  16250         13200       False   
279187  2010        4.0   79104.0   7450          8200       False   
237139  2015        3.8   21121.0  17100         17400       False   
60996   2013        4.3   72227.0  14650         14500       False   
290764  2014        3.9    8288.0  12600         13000       False   
356039  2003        2.9  182696.0   1750          3700       False   
262007  2014        4.1    9101.0  27800         28500       False   
355578  2013        4.9   11347.0  30900         33600       False   

        make_Aston Martin  make_Audi  make_BMW  make_Bentley  ...  state_or   
62937               False      False     False         False  ...     False  \
417722              False       True     False         False  ...     F

In [30]:
for i in range(len(y_true)):
    print(f"Predicted selling price: {y_pred[i]:.2f}, Actual selling price: {y_true.iloc[i]}")

Predicted selling price: 15664.01, Actual selling price: 16000
Predicted selling price: 21304.80, Actual selling price: 19600
Predicted selling price: 15047.00, Actual selling price: 13200
Predicted selling price: 7812.00, Actual selling price: 8200
Predicted selling price: 16044.00, Actual selling price: 17400
Predicted selling price: 14051.50, Actual selling price: 14500
Predicted selling price: 11625.00, Actual selling price: 13000
Predicted selling price: 2647.50, Actual selling price: 3700
Predicted selling price: 27500.50, Actual selling price: 28500
Predicted selling price: 30209.00, Actual selling price: 33600
