In [187]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt

In [188]:
corollas = pd.read_csv('ToyotaCorolla(2).csv')
corollas.head()

Unnamed: 0,Id,Price,Age_08_22,KM,Fuel_Type,HP,Color,Automatic,CC,Doors,...,Airbag_1,Airbag_2,Airco,CD_Player,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Metallic_Rim
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,13500.0,23.0,46986.0,Diesel,90.0,Blue,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,,,,,,,,,,,...,,,,,,,,,,
3,2.0,13750.0,23.0,72937.0,Diesel,90.0,Silver,0.0,2000.0,3.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,,,,,,,,,,,...,,,,,,,,,,


## Preprocessing

In [189]:
# Every alternate row from index 0 is empty. remove all such rows
corollas = corollas.drop(corollas.index[::2],axis=0)
corollas

Unnamed: 0,Id,Price,Age_08_22,KM,Fuel_Type,HP,Color,Automatic,CC,Doors,...,Airbag_1,Airbag_2,Airco,CD_Player,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Metallic_Rim
1,1.0,13500.0,23.0,46986.0,Diesel,90.0,Blue,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,2.0,13750.0,23.0,72937.0,Diesel,90.0,Silver,0.0,2000.0,3.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
5,3.0,13950.0,24.0,41711.0,Diesel,90.0,Blue,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,4.0,14950.0,26.0,48000.0,Diesel,90.0,Black,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,5.0,13750.0,30.0,38500.0,Diesel,90.0,Black,0.0,2000.0,3.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2863,1438.0,7500.0,69.0,20544.0,Petrol,86.0,Blue,0.0,1300.0,3.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,,1.0,0.0
2865,1439.0,10845.0,72.0,19000.0,Petrol,86.0,Grey,0.0,1300.0,3.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,,1.0,0.0
2867,1440.0,8500.0,71.0,17016.0,Petrol,86.0,Blue,0.0,1300.0,3.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,,0.0,0.0
2869,1441.0,7250.0,70.0,16916.0,Petrol,86.0,Grey,0.0,1300.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0


### Dealing with null values

In [190]:
corollas.dtypes
corollas.isna().sum()

Id                    0
Price                 1
Age_08_22             0
KM                    0
Fuel_Type             0
HP                    0
Color                 9
Automatic             0
CC                    5
Doors                 0
Cylinders             0
Gears                 0
Mfr_Guarantee         1
ABS                   0
Airbag_1              0
Airbag_2              0
Airco                 1
CD_Player             0
Powered_Windows       0
Power_Steering        0
Radio                 0
Mistlamps          1035
Sport_Model           0
Metallic_Rim          0
dtype: int64

In [191]:
#Mistlamps contains 518 Null values. Replace Null values with 0 (majority value count for Mistlamps)
corollas['Mistlamps'] = corollas['Mistlamps'].fillna(0)
corollas.isna().sum()

Id                 0
Price              1
Age_08_22          0
KM                 0
Fuel_Type          0
HP                 0
Color              9
Automatic          0
CC                 5
Doors              0
Cylinders          0
Gears              0
Mfr_Guarantee      1
ABS                0
Airbag_1           0
Airbag_2           0
Airco              1
CD_Player          0
Powered_Windows    0
Power_Steering     0
Radio              0
Mistlamps          0
Sport_Model        0
Metallic_Rim       0
dtype: int64

In [192]:
#as number of NaN values is maximum 9, we can simply drop the rows with Na values in the result dataframe
corollas.dropna(subset=['Price','Color','CC','Airco','Mfr_Guarantee'],inplace=True)
corollas.isna().sum()

Id                 0
Price              0
Age_08_22          0
KM                 0
Fuel_Type          0
HP                 0
Color              0
Automatic          0
CC                 0
Doors              0
Cylinders          0
Gears              0
Mfr_Guarantee      0
ABS                0
Airbag_1           0
Airbag_2           0
Airco              0
CD_Player          0
Powered_Windows    0
Power_Steering     0
Radio              0
Mistlamps          0
Sport_Model        0
Metallic_Rim       0
dtype: int64

In [193]:
corollas.shape

(1419, 24)

### Setting predictors and response variable

In [194]:
#remove Id and Price from predictors
corollas_preds = corollas.loc[:,'Age_08_22':]
response_var = corollas.loc[:,'Price']
print(corollas_preds.shape)
print(response_var.shape)

(1419, 22)
(1419,)


#### Correlation Analysis

After performing a correlation analysis on the predictors dataframe, we found that no columns have a correlation coefficient greater than 0.9. The highest correlation coefficient observed was less than 0.7, which suggests that there is no issue of multicollinearity in the predictors.

This result indicates that we can proceed with multiple linear regression without worrying about multicollinearity. 

In [195]:
corr_matrix = corollas_preds.corr()
max = 0.0
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > max:
            max = abs(corr_matrix.iloc[i, j])
        if abs(corr_matrix.iloc[i, j]) > 0.9:
            print(corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
print("Maximum value of correlation coefficient:",max)

Maximum value of correlation coefficient: 0.6631723703902866


### dealing with categorical variables and scaling

In [196]:
corollas_preds.columns = ['carAge', 'kmDriven', 'fuelType', 'horsepower', 'color', 'automatics', 'cc', 'doors', 'cylinders', 'gears', 'manufacturerGuarantee', 'antilockBrakes', 'driverAirbags', 'passengerAirbags', 'airConditioner', 'cdPlayer', 'powerWindows', 'powerSteering', 'radio', 'mistLamps', 'sportsModel', 'metallicRim']
print(corollas_preds.head())

   carAge  kmDriven fuelType  horsepower   color  automatics      cc  doors  \
1    23.0   46986.0   Diesel        90.0    Blue         0.0  2000.0    3.0   
3    23.0   72937.0   Diesel        90.0  Silver         0.0  2000.0    3.0   
5    24.0   41711.0   Diesel        90.0    Blue         0.0  2000.0    3.0   
7    26.0   48000.0   Diesel        90.0   Black         0.0  2000.0    3.0   
9    30.0   38500.0   Diesel        90.0   Black         0.0  2000.0    3.0   

   cylinders  gears  ...  driverAirbags  passengerAirbags  airConditioner  \
1        4.0    5.0  ...            1.0               1.0             0.0   
3        4.0    5.0  ...            1.0               1.0             1.0   
5        4.0    5.0  ...            1.0               1.0             0.0   
7        4.0    5.0  ...            1.0               1.0             0.0   
9        4.0    5.0  ...            1.0               1.0             1.0   

   cdPlayer  powerWindows  powerSteering  radio  mistLamps  sp

In [197]:
#deal with categorical variables using label encoding
import sklearn
import pickle
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le2 =LabelEncoder()
corollas_preds['color'] = le.fit_transform(corollas_preds['color'])
corollas_preds['fuelType'] = le2.fit_transform(corollas_preds['fuelType'])

#save encoders in pickle file for use in production
output = open('color_encoder.pkl','wb')
pickle.dump(le, output)
output.close()
output = open('fuel_type_encoder.pkl','wb')
pickle.dump(le2, output)
output.close()
#column names list for use

#normalize predictos using standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled = scaler.fit_transform(corollas_preds)
scaled_preds = pd.DataFrame(scaled, columns=corollas_preds.columns)
print(scaled_preds.head())
#save scaler for production use
output_2 = open('scaler.pkl','wb')
pickle.dump(scaler, output_2)
output_2.close()

#verification
print(scaled_preds.shape)
col_means = np.mean(scaled_preds, axis=0)
col_stdevs = np.std(scaled_preds, axis=0)

# Print the mean and standard deviation of each column
for col, mean, stdev in zip(scaled_preds.columns, col_means, col_stdevs):
    print(f"Column '{col}': Mean={mean:.2f}, Standard Deviation={stdev:.2f}")

     carAge  kmDriven  fuelType  horsepower     color  automatics        cc  \
0 -1.819580 -0.581559 -2.340269   -0.770452 -0.887070   -0.239535  0.993976   
1 -1.819580  0.109448 -2.340269   -0.770452  1.460254   -0.239535  0.993976   
2 -1.765016 -0.722019 -2.340269   -0.770452 -0.887070   -0.239535  0.993976   
3 -1.655887 -0.554559 -2.340269   -0.770452 -1.473902   -0.239535  0.993976   
4 -1.437630 -0.807519 -2.340269   -0.770452 -1.473902   -0.239535  0.993976   

     doors  cylinders     gears  ...  driverAirbags  passengerAirbags  \
0 -1.07477        0.0 -0.141281  ...       0.174646          0.622171   
1 -1.07477        0.0 -0.141281  ...       0.174646          0.622171   
2 -1.07477        0.0 -0.141281  ...       0.174646          0.622171   
3 -1.07477        0.0 -0.141281  ...       0.174646          0.622171   
4 -1.07477        0.0 -0.141281  ...       0.174646          0.622171   

   airConditioner  cdPlayer  powerWindows  powerSteering     radio  mistLamps  \
0    

### Test and train split

In [198]:
#split test and training data
from sklearn.model_selection import train_test_split
results = []
X = scaled_preds
y = response_var
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

## Model 1: Multiple Linear Regression - Linear and Ridge

In [199]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

reg = LinearRegression(copy_X=True)
reg.fit(X_train, y_train)

LinearRegression()

In [200]:
#training and metric
y_pred_train1 = reg.predict(X_train)
rmse_train_lr = mean_squared_error(y_train, y_pred_train1, squared=False)
r2_train_lr = r2_score(y_train, y_pred_train1)

#testing and metric
y_pred_test1 = reg.predict(X_test)
rmse_test_lr = mean_squared_error(y_test, y_pred_test1, squared=False)
r2_test_lr = r2_score(y_test, y_pred_test1)

print('RMSE for training data: ', rmse_train_lr)
print('R2 for training data: ', r2_train_lr)
print('RMSE for testing data: ', rmse_test_lr)
print('R2 for testing data: ', r2_test_lr)


RMSE for training data:  1337.5678775871756
R2 for training data:  0.8548148223738165
RMSE for testing data:  1501.025585462913
R2 for testing data:  0.8287856942619309


In [201]:
#as there is no hyper parameter tuning feasible in sklearn LR, we will try ridge regression
from sklearn.linear_model import Ridge
alpha = [0.01,0.1,0.5,1,5,10]

for a in alpha:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train,y_train)
    y_pred = ridge.predict(X_test)
    print("Ridge alpha = ",a)
    print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred)))
    print("R2 = ",r2_score(y_test,y_pred))
    print("--------------------------------------------")

Ridge alpha =  0.01
RMSE =  1501.0276843222455
R2 =  0.8287852154493136
--------------------------------------------
Ridge alpha =  0.1
RMSE =  1501.0466188837088
R2 =  0.8287808958790052
--------------------------------------------
Ridge alpha =  0.5
RMSE =  1501.131745119699
R2 =  0.8287614752281471
--------------------------------------------
Ridge alpha =  1
RMSE =  1501.240369663754
R2 =  0.8287366920873889
--------------------------------------------
Ridge alpha =  5
RMSE =  1502.1948969655302
R2 =  0.8285188356035537
--------------------------------------------
Ridge alpha =  10
RMSE =  1503.5875237832433
R2 =  0.8282007411083929
--------------------------------------------


In [202]:
#alpha=0.01 had the minimal RMSE and maximum R2 for test data. Rerunning ridge with chosen hyperparameter value for reference
ridge = Ridge(alpha=0.01)
ridge.fit(X_train, y_train)
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)


print("train RMSE = ",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("train R2 = ",r2_score(y_train,y_train_pred))
print("test RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred)))
print("test R2 = ",r2_score(y_test,y_pred))

train RMSE =  1337.5678780555968
train R2 =  0.8548148222721277
test RMSE =  1503.5875237832433
test R2 =  0.8282007411083929


## Model 2: Decision Tree Regressor 

In [203]:
#next sklearn Decision Tree Regressor is tested where Grid Search isused to tune max_depth parameter
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

dtree_reg = DecisionTreeRegressor()
param_grid = {'max_depth':[2,4,6,8,10,12,14,16,18,20]}
grid_search = GridSearchCV(dtree_reg,param_grid,cv=5, scoring='r2')
grid_search.fit(X_train,y_train)
best_params  = grid_search.best_params_
print("Best Parameters are:",best_params)

#Now we will use the best parameters to train the model
dtree_reg_opt = DecisionTreeRegressor(max_depth=best_params['max_depth'])
dtree_reg_opt.fit(X_train,y_train)
y_pred_train3 = dtree_reg_opt.predict(X_train)
y_pred_test3 = dtree_reg_opt.predict(X_test)

#Now we will evaluate the model
rmse_train = np.sqrt(mean_squared_error(y_train,y_pred_train3))
rmse_test = np.sqrt(mean_squared_error(y_test,y_pred_test3))
r2_train = r2_score(y_train,y_pred_train3)
r2_test = r2_score(y_test,y_pred_test3)
print("RMSE train:",rmse_train)
print("r2 score train:",r2_train)
print("-----------------------------------------------------")
print("RMSE test:",rmse_test)
print("r2 score test:",r2_test) 

Best Parameters are: {'max_depth': 6}
RMSE train: 947.3523097419901
r2 score train: 0.9271694388650669
-----------------------------------------------------
RMSE test: 1191.368844706657
r2 score test: 0.8921410131177783


## Model 3: Random Forest Regressor

In [204]:
# Third and last model tested is sklearn Random Forest Regressor where number of tree hyperparameter is tuned using GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf_reg = RandomForestRegressor()
param_grid = {'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}

grid_search = GridSearchCV(rf_reg, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

print("Best parameters are: ",grid_search.best_params_)

rf_reg_opt = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'], random_state=42)
rf_reg_opt.fit(X_train, y_train)

y_pred_train4 = rf_reg_opt.predict(X_train)
y_pred_test4 = rf_reg_opt.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train,y_pred_train4))
rmse_test = np.sqrt(mean_squared_error(y_test,y_pred_test4))
r2_train = r2_score(y_train,y_pred_train4)
r2_test = r2_score(y_test,y_pred_test4)
print("RMSE train:",rmse_train)
print("r2 score train:",r2_train)
print("-----------------------------------------------------")
print("RMSE test:",rmse_test)
print("r2 score test:",r2_test) 


Best parameters are:  {'n_estimators': 50}
RMSE train: 447.50820106107375
r2 score train: 0.9837485312641524
-----------------------------------------------------
RMSE test: 1084.5505170311285
r2 score test: 0.9106152498350619


In [205]:
import pickle

filename = 'rforest_regressor_price.pkl'

In [206]:
pickle.dump(rf_reg_opt, open(filename, 'wb'))

In [207]:
form_data = {'carAge': '23', 
             'kmDriven': '46986', 
             'fuelType': 'diesel',
               'horsepower': '90',
                 'color': 'blue',
                   'automatics': '0', 'cc': '2000', 'doors': '3', 'cylinders': '3', 'gears': '5', 'manufacturerGuarantee': '0', 'antilockBrakes': '1', 'driverAirbags': '1', 'passengerAirbags': '1', 'airConditioner': '0', 'cdPlayer': '0', 'powerWindows': '1', 'powerSteering': '1', 'radio': '0', 'mistLamps': '0', 'sportsModel': '0', 'metallicRim': '0'}
form_data['color'] = form_data['color'].capitalize()
form_data['fuelType'] = form_data['fuelType'].capitalize()
query_df = pd.DataFrame([form_data.values()], columns=form_data.keys())
#print(query_df)
pkl_file = open('color_encoder.pkl','rb')
le_categorical = pickle.load(pkl_file)
pkl_file.close()
pkl_file = open('fuel_type_encoder.pkl','rb')
le_fuel = pickle.load(pkl_file)
pkl_file.close()
query_df['color'] = le_categorical.transform(query_df['color'])
query_df['fuelType'] = le_fuel.transform(query_df['fuelType'])
pkl_file = open('scaler.pkl','rb')
scaler = pickle.load(pkl_file)
pkl_file.close()
query_df_scale = scaler.transform(query_df)
print(query_df)
print(query_df_scale)
form_data.keys()
predict1 = rf_reg_opt.predict(query_df)
print(predict1)

  carAge kmDriven  fuelType horsepower  color automatics    cc doors  \
0     23    46986         1         90      2          0  2000     3   

  cylinders gears  ... driverAirbags passengerAirbags airConditioner cdPlayer  \
0         3     5  ...             1                1              0        0   

  powerWindows powerSteering radio mistLamps sportsModel metallicRim  
0            1             1     0         0           0           0  

[1 rows x 22 columns]
[[-1.81958011 -0.58155935 -2.34026944 -0.77045157 -0.88707038 -0.23953507
   0.99397612 -1.07477033 -1.         -0.14128054 -0.83387079  0.48253602
   0.17464561  0.62217102 -1.00636274 -0.5221549   0.89102847  0.15189265
  -0.41443803 -0.29696832 -0.64730209 -0.5057195 ]]
[6187.]
