# Car Price Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.dummy import DummyRegressor
import scipy.stats as stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.preprocessing import MinMaxScaler
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use the new name when it exists
# and fall back to the legacy one on older installations.
_deep_style = 'seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep'
plt.style.use(_deep_style)
plt.rcParams.update({'font.size': 15})

In [None]:
# Load the vehicle listings CSV from the notebook's input directory
df = pd.read_csv('../input/craigslist-carstrucks-data/vehicles.csv')

In [None]:
df.shape

In [None]:
len(df[df.price == 0])

## Data Cleaning

In [None]:
# Remove rows that are not usable for modelling:
#   * price equal to 0
#   * missing model year, or model year 2021
#   * odometer reading above 500,000
unusable_rows = (
    (df['price'] == 0)
    | df['year'].isna()
    | (df['year'] == 2021)
    | (df['odometer'] > 500000)
)
df.drop(df[unusable_rows].index, inplace = True)
# Row label 496 is removed by hand (the reason is not recorded in this notebook).
# errors='ignore' keeps the cell from raising KeyError when that row has already
# been removed by the filters above.
df.drop([496], inplace = True, errors = 'ignore')

In [None]:
# Changing type of year to int.
# Rows with a missing year were dropped earlier, so the cast does not hit NaN.
df['year'] = df['year'].astype(int)
# Display the resulting dtype as a sanity check
df['year'].dtype

In [None]:
# Median odometer reading for each model year, as a {year: median} mapping
median_odometer_by_year = df.groupby('year')['odometer'].median()
year_med = median_odometer_by_year.to_dict()

In [None]:
# Fill each missing odometer value with the median odometer of cars from the
# same model year; years without a usable median stay missing.
median_for_row = df['year'].map(year_med)
df['odometer'] = df['odometer'].fillna(median_for_row)

In [None]:
# Odometer values still missing after imputation belong to years with no
# median available, so those rows are removed.
df.dropna(subset = ['odometer'], inplace = True)

In [None]:
df.shape

In [None]:
# Drop listings whose price falls outside the range kept for modelling:
#   * below 1000 for a car newer than 2010 with fewer than 60,000 miles
#   * below 200 for any car
#   * above 50000 for any car
cheap_recent_low_mileage = (df.price < 1000) & (df.odometer < 60000) & (df.year > 2010)
price_out_of_range = (df.price < 200) | (df.price > 50000)
df.drop(df[cheap_recent_low_mileage | price_out_of_range].index, inplace = True)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Drop rows where cylinders, transmission and drive are all missing at once
# (rows missing only one or two of them are kept).
all_three_missing = df[['cylinders', 'transmission', 'drive']].isna().all(axis = 1)
df.drop(df[all_three_missing].index, inplace = True)

In [None]:
def _first_token(value):
    """Return the first whitespace-separated token of `value`, or NaN if it is missing."""
    if pd.isnull(value):
        return np.nan
    return value.split()[0]

# Keep only the leading token of each cylinders entry
df['cylinders'] = df['cylinders'].map(_first_token)

In [None]:
# Set cylinders to 0 for Tesla listings, for missing values and for the
# 'other' category.
df.loc[df['manufacturer'] == 'tesla', 'cylinders'] = 0
# Assign the result back to the column: calling fillna(inplace=True) on
# `df.cylinders` acts on a temporary Series and leaves `df` unchanged when
# pandas Copy-on-Write is active.
df['cylinders'] = df['cylinders'].fillna(0).replace('other', 0)

In [None]:
df.columns

In [None]:
# Convert the cylinder counts (string tokens and the 0 placeholders set earlier) to integers
df['cylinders'] = df['cylinders'].astype(int)


In [None]:
# Every 2019 Honda Accord listing is recorded with 4 cylinders
is_2019_accord = (df.manufacturer == 'honda') & (df.model == 'accord') & (df.year == 2019)
df.loc[is_2019_accord, 'cylinders'] = 4

In [None]:
# Every 2020 Honda Accord listing is recorded with 4 cylinders
is_2020_accord = (df.manufacturer == 'honda') & (df.model == 'accord') & (df.year == 2020)
df.loc[is_2020_accord, 'cylinders'] = 4

In [None]:
# Identifier, URL and location columns are not used as model inputs
unwanted_columns = [
    'id', 'url', 'region_url', 'VIN',
    'lat', 'long', 'image_url', 'region',
]
df.drop(columns = unwanted_columns, inplace = True)

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
# filling missing condition values with fair considering their condition would be fair
#df.condition.fillna('fair', inplace = True)

In [None]:
# Merge the 'like new' and 'new' conditions into 'excellent'.
# The result is assigned back to the column: replace(inplace=True) on
# `df.condition` acts on a temporary Series and leaves `df` unchanged when
# pandas Copy-on-Write is active.
df['condition'] = df['condition'].replace({'like new': 'excellent', 'new': 'excellent'})

In [None]:
df.condition.value_counts()

In [None]:
# Treat a missing fuel type as 'gas'.
# Assigned back to the column because fillna(inplace=True) on `df.fuel` does
# not update `df` when pandas Copy-on-Write is active.
df['fuel'] = df['fuel'].fillna('gas')

In [None]:
df.fuel.value_counts()

In [None]:
df.transmission.value_counts()

In [None]:
# Lower-case every description (values are passed through str() first, so a
# missing description becomes the text 'nan').
df["description"] = df["description"].map(lambda text: str(text).lower())

# Where transmission is 'other' or missing, infer it from the description.
# 'automatic transmission' takes precedence when both phrases are present.
says_automatic = df['description'].str.contains('automatic transmission')
says_manual = df['description'].str.contains('manual transmission')
undetermined = (df['transmission'] == 'other') | df['transmission'].isna()
df.loc[undetermined & says_automatic, "transmission"] = "automatic"
df.loc[undetermined & says_manual & ~says_automatic, "transmission"] = "manual"

In [None]:
# Assuming most cars have automatic transmission, use it for the remaining
# missing values. Assigned back to the column because fillna(inplace=True) on
# `df.transmission` does not update `df` when pandas Copy-on-Write is active.
df['transmission'] = df['transmission'].fillna('automatic')

In [None]:
df.isna().mean()

In [None]:
# Age of each vehicle in years, measured against 2021.
# The 'year' column is kept alongside the new 'age' column.
REFERENCE_YEAR = 2021
df['age'] = REFERENCE_YEAR - df['year']

In [None]:
# Keep only listings whose odometer reading is between 10 and 300,000 inclusive
odometer_out_of_range = (df.odometer < 10) | (df.odometer > 300000)
df.drop(df[odometer_out_of_range].index, inplace = True)

In [None]:
df.shape

In [None]:
# NOTE: this binds a second name to the SAME DataFrame object (no copy is made).
# In-place changes made through `df` afterwards (new columns, in-place drops)
# are therefore visible through `dff`; `dff` only diverges once `df` is
# reassigned to a new object.
dff = df

In [None]:
df.columns

In [None]:
# Price column, used as the regression target for the baseline model
target = df['price']

In [None]:
df.shape

In [None]:
# Flag listings whose description mentions new tires (1 = mentioned, 0 = not)
mentions_new_tires = df['description'].str.contains('new tire')
df['new_tires'] = np.where(mentions_new_tires, 1, 0)

In [None]:
df['new_tires'].sum()

In [None]:
# Flag listings whose description mentions heated seats (1 = mentioned, 0 = not).
# A single case-insensitive match replaces the hand-enumerated case variants,
# which could not cover every capitalisation (e.g. 'Heated seat').
mentions_heated_seats = df['description'].str.contains('heated seat', case = False)
df['heated_seats'] = np.where(mentions_heated_seats, 1, 0)

In [None]:
df['heated_seats'].sum()

In [None]:
# Flag listings whose description mentions automatic headlights
# (1 = mentioned, 0 = not). The pattern matches both 'auto headlight' and
# 'automatic headlight'; case=False replaces the hand-enumerated case
# variants, which could not cover every capitalisation.
mentions_auto_headlight = df['description'].str.contains(r'auto(?:matic)? headlight', case = False)
df['auto_headlight'] = np.where(mentions_auto_headlight, 1, 0)

In [None]:
df['auto_headlight'].sum()

In [None]:
# Flag listings whose description mentions leather (1 = mentioned, 0 = not).
# One case-insensitive match replaces the three explicit case variants.
mentions_leather = df['description'].str.contains('leather', case = False)
df['leather'] = np.where(mentions_leather, 1, 0)

In [None]:
df['leather'].sum()

In [None]:
# Remove the description, model and state columns now that the text-derived
# flags have been created.
no_longer_needed = ['description', 'model', 'state']
df.drop(columns = no_longer_needed, inplace = True)


In [None]:
df.columns

## Baseline Model

In [None]:
# Numeric and 0/1 flag columns used by the baseline model
base_features = [
    'odometer',
    'age',
    'new_tires',
    'heated_seats',
    'auto_headlight',
    'leather',
    'cylinders',
]

In [None]:
# Baseline: ordinary least-squares regression of price on the base features.
# The StandardScaler that used to be fitted here was never applied to the
# data, so it has been removed; the fitted model and printed metrics are
# unaffected.
lm = linear_model.LinearRegression()

# Fit the linear regression to the data
lm = lm.fit(dff[base_features], target)

print(lm.intercept_)
print(lm.coef_)
# Both metrics below are computed on the same rows the model was fitted on,
# so they describe in-sample fit only.
print ("R^2 Score:", lm.score(dff[base_features], target))
pred = lm.predict(dff[base_features])
rmse = np.sqrt(metrics.mean_squared_error(target, pred))
print('Root Mean Squared Error:' , rmse)

## Creating Dummy Variables

In [None]:
df.columns

In [None]:
# Capture the target, then one-hot encode the categorical columns in a single
# call. drop_first=True omits the first level of each column.
# cylinders and manufacturer are not one-hot encoded here.
target = df['price']
categorical_columns = [
    'condition', 'fuel', 'title_status', 'transmission',
    'drive', 'size', 'type', 'paint_color',
]
df = pd.get_dummies(df, columns = categorical_columns, drop_first = True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
def train_regression_model(df,kfold):
  """Fit a linear regression on all folds except `kfold` and score it on `kfold`.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``kfold`` column of fold labels, a ``price`` column and
      every column named in ``features`` below.
  kfold : int
      Label of the fold held out for validation.

  Returns
  -------
  tuple
      ``(train_rmse, test_rmse)``: root mean squared error on the training
      folds and on the held-out fold.
  """
  features = ['age', 'odometer', 'new_tires', 'heated_seats',
       'auto_headlight', 'leather', 'cylinders',
       'condition_fair', 'condition_good', 'condition_salvage',
       'fuel_electric', 'fuel_gas', 'fuel_hybrid', 'fuel_other',
       'title_status_lien', 'title_status_missing', 'title_status_parts only',
       'title_status_rebuilt', 'title_status_salvage', 'transmission_manual',
       'transmission_other', 'drive_fwd', 'drive_rwd', 'size_full-size',
       'size_mid-size', 'size_sub-compact', 'type_bus', 'type_convertible',
       'type_coupe', 'type_hatchback', 'type_mini-van', 'type_offroad',
       'type_other', 'type_pickup', 'type_sedan', 'type_truck', 'type_van',
       'type_wagon', 'paint_color_blue', 'paint_color_brown',
       'paint_color_custom', 'paint_color_green', 'paint_color_grey',
       'paint_color_orange', 'paint_color_purple', 'paint_color_red',
       'paint_color_silver', 'paint_color_white', 'paint_color_yellow']

  # Split on the `kfold` argument. This used to read a notebook-level variable
  # named `fold`, so every call validated on whichever fold that variable last
  # held instead of the fold requested by the caller.
  train_data=df[df.kfold != kfold].reset_index(drop=True)
  valid_data=df[df.kfold == kfold].reset_index(drop=True)
  X_train=train_data[features]
  y_train=train_data["price"]
  X_test=valid_data[features]
  y_test=valid_data["price"]

  # The scalers previously created here were never applied to the data, so
  # the model is fitted on the unscaled features exactly as before.
  lm = linear_model.LinearRegression()
  lm = lm.fit(X_train, y_train)

  # Error on the training folds
  y_train_pred = lm.predict(X_train)
  train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
  print('Root Mean Squared Error:' , train_rmse)

  # Error on the held-out fold
  y_pred = lm.predict(X_test)
  test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
  print('Root Mean Squared Error:' + str(test_rmse))
  print('Fold: ',int(kfold),'Training: ', int(train_rmse), "vs. Testing: ", int(test_rmse))
  return train_rmse,test_rmse

Removing Paint Color

In [None]:
def train_regression_model_v2(df,kfold):
  """Fit a linear regression without the paint-color dummies and score it on `kfold`.

  Uses every fold except `kfold` for training.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``kfold`` column of fold labels, a ``price`` column and
      every column named in ``features`` below.
  kfold : int
      Label of the fold held out for validation.

  Returns
  -------
  tuple
      ``(train_rmse, test_rmse)``: root mean squared error on the training
      folds and on the held-out fold.
  """
  features = ['age', 'odometer', 'new_tires', 'heated_seats',
       'auto_headlight', 'leather', 'cylinders',
       'condition_fair', 'condition_good', 'condition_salvage',
       'fuel_electric', 'fuel_gas', 'fuel_hybrid', 'fuel_other',
       'title_status_lien', 'title_status_missing', 'title_status_parts only',
       'title_status_rebuilt', 'title_status_salvage', 'transmission_manual',
       'transmission_other', 'drive_fwd', 'drive_rwd', 'size_full-size',
       'size_mid-size', 'size_sub-compact', 'type_bus', 'type_convertible',
       'type_coupe', 'type_hatchback', 'type_mini-van', 'type_offroad',
       'type_other', 'type_pickup', 'type_sedan', 'type_truck', 'type_van',
       'type_wagon', ]

  # Split on the `kfold` argument. This used to read a notebook-level variable
  # named `fold`, so every call validated on whichever fold that variable last
  # held instead of the fold requested by the caller.
  train_data=df[df.kfold != kfold].reset_index(drop=True)
  valid_data=df[df.kfold == kfold].reset_index(drop=True)
  X_train=train_data[features]
  y_train=train_data["price"]
  X_test=valid_data[features]
  y_test=valid_data["price"]

  # The scalers previously created here were never applied to the data, so
  # the model is fitted on the unscaled features exactly as before.
  lm = linear_model.LinearRegression()
  lm = lm.fit(X_train, y_train)

  # Error on the training folds
  y_train_pred = lm.predict(X_train)
  train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
  print('Root Mean Squared Error:' , train_rmse)

  # Error on the held-out fold
  y_pred = lm.predict(X_test)
  test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
  print('Root Mean Squared Error:' + str(test_rmse))
  print('Fold: ',int(kfold),'Training: ', int(train_rmse), "vs. Testing: ", int(test_rmse))
  return train_rmse,test_rmse

In [None]:
from sklearn import model_selection

# Label every row with one of 5 cross-validation folds in a 'kfold' column.
df["kfold"]=-1
# Shuffle with a fixed seed so that fold membership, and therefore every RMSE
# reported from these folds, is reproducible across kernel restarts.
df=df.sample(frac=1, random_state=42).reset_index(drop=True)
kf=model_selection.KFold(n_splits=5)
# Only the validation indices of each split are needed to label the rows.
for fold,(_,val_idx) in enumerate(kf.split(df)):
  df.loc[val_idx,"kfold"]=fold

In [None]:
# Score the full-feature model on each of the 5 folds, then report the mean RMSE.
separator = "="*50
fold_scores = []
for i in range(5):
  print(separator)
  train_rmse,test_rmse = train_regression_model(df,i)
  fold_scores.append((train_rmse,test_rmse))
  print(separator)
# Running totals over the folds; divided by the fold count when printed
avg_train_rmse = sum(train_score for train_score,_ in fold_scores)
avg_test_rmse = sum(test_score for _,test_score in fold_scores)
print("Average Train RMSE : ",avg_train_rmse/5)
print("Average Test RMSE : ",avg_test_rmse/5)

After removing the paint color features from the data, the performance is still the same.

In [None]:
# Score the model without paint-color features on each of the 5 folds, then
# report the mean RMSE.
separator = "="*50
fold_scores = []
for i in range(5):
  print(separator)
  train_rmse,test_rmse = train_regression_model_v2(df,i)
  fold_scores.append((train_rmse,test_rmse))
  print(separator)
# Running totals over the folds; divided by the fold count when printed
avg_train_rmse = sum(train_score for train_score,_ in fold_scores)
avg_test_rmse = sum(test_score for _,test_score in fold_scores)
print("Average Train RMSE : ",avg_train_rmse/5)
print("Average Test RMSE : ",avg_test_rmse/5)

In [None]:
# testing the model on training data and getting the rmse
df

In [None]:
print('Training: ', int(train_rmse), "vs. Testing: ", int(test_rmse))

### Using k best

In [None]:
# Build the train/validation matrices explicitly. X_train, y_train, X_test and
# y_test are otherwise only local variables inside the training functions, so
# this section raised NameError when the notebook was run top to bottom.
numeric_features = ['age', 'odometer', 'new_tires', 'heated_seats',
                    'auto_headlight', 'leather', 'cylinders']
# Prefixes of the dummy columns produced by the one-hot encoding step
dummy_prefixes = ('condition_', 'fuel_', 'title_status_', 'transmission_',
                  'drive_', 'size_', 'type_', 'paint_color_')
kbest_features = numeric_features + [col for col in df.columns if col.startswith(dummy_prefixes)]

# Hold out the highest-numbered fold; it is the last fold scored by the
# cross-validation loops, whose final `test_rmse` is printed for comparison
# in the k-best evaluation.
holdout_fold = df['kfold'].max()
train_rows = df[df['kfold'] != holdout_fold]
valid_rows = df[df['kfold'] == holdout_fold]
X_train, y_train = train_rows[kbest_features], train_rows['price']
X_test, y_test = valid_rows[kbest_features], valid_rows['price']

# Keep the 20 features with the highest univariate F-scores against price
selector = SelectKBest(f_regression, k = 20)
selector.fit(X_train, y_train)

selected_columns = X_train.columns[selector.get_support()]
removed_columns = X_train.columns[~selector.get_support()]

In [None]:
list(removed_columns)

In [None]:
list(selected_columns)

In [None]:
# Refit the linear model using only the columns SelectKBest retained
X_train_kbest = X_train[selected_columns]
X_test_kbest = X_test[selected_columns]
lm_kbest = LinearRegression().fit(X_train_kbest, y_train)

# Error on the rows the model was fitted on
y_train_kbest = lm_kbest.predict(X_train_kbest)
trainK_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_kbest))
print('Training Root Mean Squared Error:' , trainK_rmse)

# Error on the held-out rows
y_kbest = lm_kbest.predict(X_test_kbest)
testK_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_kbest))
print('Testing Root Mean Squared Error:' , testK_rmse)

# Compare with the most recent test RMSE from the cross-validation runs
print('Original: ', test_rmse, "vs. KBest: ", testK_rmse)
# R^2 of the k-best model on its training rows (shown as the cell output)
lm_kbest.score(X_train_kbest, y_train)