In [1]:
%config IPCompleter.use_jedi = False 
%config Completer.evaluation = 'limited'
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

<h3> Importing sklearn stuffs

In [15]:
from sklearn.model_selection import train_test_split

# import the LinearRegression Model class 
from sklearn.linear_model import LinearRegression
# As we have string type column so we need to encode this
from sklearn.preprocessing import OneHotEncoder
# import matrices to evaluate the model's performance
from sklearn.metrics import r2_score
# as we need to apply tranformer on multiple cols and also need to make a pipeline so we will import 2 extra thins
from sklearn.compose import ColumnTransformer , make_column_transformer
from sklearn.pipeline import make_pipeline
# To fill the missing values we need SimpleImputer
from sklearn.impute import SimpleImputer

In [None]:
cars = pd.read_csv('../Data/cleaned_car_data.csv')

In [4]:
# Extract the input and target features
X = cars.drop(columns = ['Price'])
y = cars['Price']

In [5]:
X 

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,36000,Diesel
4,Ford Figo,Ford,2012,41000,Diesel
...,...,...,...,...,...
719,Maruti Suzuki Ritz,Maruti,2011,50000,Petrol
720,Tata Indica V2,Tata,2009,30000,Diesel
721,Toyota Corolla Altis,Toyota,2009,132000,Petrol
722,Tata Zest XM,Tata,2018,27000,Diesel


In [45]:
# split the train and test set 
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 2)

In [46]:
X_train.shape

(579, 5)

<h2> Model Building version-01

In [47]:
# Pipepline for categorical col 
cat_pipeline = make_pipeline(
    SimpleImputer(strategy = 'most_frequent'), # Handle missing categorical data
    OneHotEncoder(handle_unknown = 'ignore') # One-hot encode the categorical features 
)

In [48]:
# # Pipepline for neumerical col(year and kms_driven)
num_pipeline = make_pipeline(
    SimpleImputer(strategy = 'mean') # handle the missing numeric data
)

In [49]:
# Define the column transformer
column_transformer = make_column_transformer(
    (cat_pipeline , ['name' , 'company' , 'fuel_type']), 
    (num_pipeline , ['year' , 'kms_driven']), 
    remainder = 'passthrough'
)

In [50]:
# Final Pipeline: Transformer + Linear Model 
pipeline = make_pipeline(
    column_transformer , 
    LinearRegression()
)

In [51]:
# train the model and predict
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [52]:
r2_score(y_test , y_pred)

0.150584586377993

In [57]:
r2Scores = []
for i in range(1000):
    X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = i) 
    lr = LinearRegression()
    pipeline = make_pipeline(column_transformer , lr)
    pipeline.fit(X_train , y_train) 
    y_pred = pipeline.predict(X_test)
    r2Scores.append(r2_score(y_test , y_pred))

In [58]:
np.argmax(r2Scores)

np.int64(326)

In [59]:
r2Scores[np.argmax(r2Scores)]

0.7815851810398547

<h2> Model Building version-02

In [60]:
ohe = OneHotEncoder()
ohe.fit(X[['name' , 'company' , 'fuel_type']])

In [61]:
column_trans = make_column_transformer(
    (OneHotEncoder(categories = ohe.categories , handle_unknown = 'ignore') , ['name' , 'company' , 'fuel_type']), 
    remainder = 'passthrough'
)

In [62]:
lr = LinearRegression()

In [63]:
pipe = make_pipeline(column_trans , lr)

In [64]:
pipe.fit(X_train , y_train)

In [65]:
y_pred = pipe.predict(X_test)

In [69]:
r2Scores1 = []
for i in range(1000):
    X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = i) 
    lr1 = LinearRegression()
    pipeline1 = make_pipeline(column_trans , lr1)
    pipeline1.fit(X_train , y_train) 
    y_pred1 = pipeline.predict(X_test)
    r2Scores1.append(r2_score(y_test , y_pred1))

In [70]:
r2Scores1[np.argmax(r2Scores1)]

0.8708313346639505

<h2> As version 2 is giving more accurate result around 87% so we will use version-2 model as final.

In [71]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = np.argmax(r2Scores1)) 

lr_final = LinearRegression()

pipeline_final = make_pipeline(column_trans , lr_final)
pipeline_final.fit(X_train , y_train) 


In [73]:
y_pred1 = pipeline.predict(X_test)
r2_score(y_test , y_pred1)

0.8708313346639505

In [74]:
import pickle

In [76]:
# save the model
model_path = 'Model/LinearRegressionModel.pkl'

pickle.dump(pipeline_final , open(model_path , 'wb'))

In [78]:
# Let's predict a single sample

pipeline_final.predict(pd.DataFrame([['Maruti Suzuki Swift' , 'Maruti' , 2019 , 100 , 'Petrol']] , 
                                    columns = ['name' , 'company' , 'year' , 'kms_driven' , 'fuel_type']))

array([434837.64557176])