In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.metrics import r2_score
import pickle

In [None]:
# Load the raw quikr_car.csv dataset directly from its GitHub URL.
QUIKR_CSV_URL = 'https://raw.githubusercontent.com/rajtilakls2510/car_price_predictor/master/quikr_car.csv'
car = pd.read_csv(QUIKR_CSV_URL)
# Preview the first five rows.
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [None]:
# Count the missing values in each column.
car.isna().sum()

name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64

In [None]:
#car['year'].unique()

Issues:
1. name - keep only the first 3 words of each name.
2. year - keep only rows with numeric values, then convert from object to int.
3. Price - remove non-numeric string values (e.g. 'Ask For Price'), then convert from object to int.
4. kms_driven - remove NaN and non-numeric string values, then convert from object to int.
5. fuel_type - remove NaN values.

Data Cleaning

In [None]:
# Snapshot the raw frame before the cleaning steps below modify `car`.
backup = car.copy(deep=True)

In [None]:
# Keep only the rows whose 'year' string consists solely of numeric characters.
# .copy() makes `car` an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
car=car[car['year'].str.isnumeric()].copy()

In [None]:
# Convert the year strings to integers.
year_as_int = car['year'].astype(int)
car['year'] = year_as_int

In [None]:
# Drop the listings whose Price is the placeholder text 'Ask For Price'.
# .copy() makes `car` an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
car=car[car['Price']!='Ask For Price'].copy()

In [None]:
# Strip the comma separators from Price, then convert it to integers.
price_digits = car['Price'].str.replace(',','')
car['Price'] = price_digits.astype(int)

In [None]:
# Reduce values such as "45,000 kms" to "45000": keep the text before the
# first space, then remove the commas.
kms_first_token = car['kms_driven'].str.split(' ').str.get(0)
car['kms_driven'] = kms_first_token.str.replace(',','')

In [None]:
# Keep only the rows whose cleaned kms_driven value is purely numeric.
# str.isnumeric() yields NaN for missing values; .eq(True) maps those to False
# so missing entries are dropped instead of breaking the boolean mask.
# .copy() makes `car` an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
car=car[car['kms_driven'].str.isnumeric().eq(True)].copy()

In [None]:
# Convert the kms_driven strings to integers.
kms_as_int = car['kms_driven'].astype(int)
car['kms_driven'] = kms_as_int

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car['kms_driven']=car['kms_driven'].astype(int)


In [None]:
# Drop the rows that have no fuel_type.
# .copy() makes `car` an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
car=car[car['fuel_type'].notna()].copy()

In [None]:
# Shorten each name to its first three space-separated words.
name_words = car['name'].str.split(' ')
car['name'] = name_words.str.slice(0,3).str.join(' ')

In [None]:
# Show the column dtypes and non-null counts after cleaning.
car.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 816 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int64 
 3   Price       816 non-null    int64 
 4   kms_driven  816 non-null    int64 
 5   fuel_type   816 non-null    object
dtypes: int64(3), object(3)
memory usage: 44.6+ KB


In [None]:
#car

Reset the index, which has gaps after the rows dropped during cleaning

In [None]:
# Renumber the rows with a fresh default index; drop=True discards the old
# index instead of keeping it as a column.
car=car.reset_index(drop=True)

In [None]:
# Summary statistics for the numeric columns (year, Price, kms_driven).
car.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


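Remove the Price outlier: the maximum Price (8,500,003) is far above the rest of the data, so keep only rows with Price below 6,000,000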

In [None]:
# Keep only the listings priced below 6e6, then renumber the rows.
below_price_cap = car['Price'] < 6e6
car = car[below_price_cap].reset_index(drop=True)

Save the cleaned data to a CSV file


In [None]:
# Write the cleaned frame to 'Cleaned Car.csv' in the working directory.
# NOTE(review): with to_csv's default index=True the row index is written as
# well, as an unnamed first column - confirm that consumers expect it.
car.to_csv('Cleaned Car.csv')

Build a model
Linear Regression

In [None]:
# Target: the Price column. Features: every other column.
y = car['Price']
X = car.drop(columns=['Price'])

In [None]:
#X

In [None]:
#y

In [None]:
# Preview the first five rows of the cleaned data.
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [None]:
#X_train.shape

In [None]:
# Linear regression estimator, used below as the final pipeline step.
lr=LinearRegression()

In [None]:
# Fit a one-hot encoder on the categorical columns of the full feature set
# (not only the training split) so that ohe.categories_ covers every value in X.
categorical_columns = ['name','company','fuel_type']
ohe = OneHotEncoder()
ohe.fit(X[categorical_columns])

In [None]:
# One-hot encode the categorical columns with the categories collected by `ohe`;
# all remaining columns are passed through unchanged.
ohe_step = ('ohe', OneHotEncoder(categories=ohe.categories_), ['name','company','fuel_type'])
transformer = ColumnTransformer(transformers=[ohe_step], remainder='passthrough')

In [None]:
# Chain the column transformer and the regressor into a single estimator.
pipeline_steps = [('trnf1', transformer), ('trnf2', lr)]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the transformer and the regressor on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Ask scikit-learn to display estimators as diagrams in the notebook.
set_config(display='diagram')

In [None]:
# Predict prices for the held-out test rows.
y_pred=pipe.predict(X_test)
#y_pred

In [None]:
# R2 (coefficient of determination) of the predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('R2 Score:',r2)

R2 Score: 0.659028389313812


In [None]:
# Repeat the split / fit / predict cycle for 1000 different random splits and
# record the test-set R2 of each one.
score=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=i)
    lr=LinearRegression()
    pipe=Pipeline([('trnf1',transformer),('trnf2',lr)])
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    # Score THIS iteration's predictions. Appending the pre-existing `r2`
    # variable would store the same stale value on every pass.
    score.append(r2_score(y_test,y_pred))
# NOTE(review): once the loop ends, `pipe`, `lr`, `y_pred` and the split
# variables belong to the last iteration (random_state=999), which is not
# necessarily the best-scoring split.

In [None]:
# for i in range(10):
#   X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=i)
#   lr=LinearRegression()
#   pipe=Pipeline([('trnf1',transformer),('trnf2',lr)])
#   pipe.fit(X_train,y_train)
#   y_pred=pipe.predict(X_test)
#   print(r2,i)

In [None]:
# Highest R2 recorded across the random splits.
best_run = np.argmax(score)
score[best_run]

0.659028389313812

Save the trained pipeline with pickle

In [None]:
# Serialise the fitted pipeline to 'LinearRegression.pkl'. The context manager
# closes the file handle (flushing the data to disk) even if dump raises.
with open('LinearRegression.pkl','wb') as model_file:
    pickle.dump(pipe,model_file)

In [None]:
# Predict the price of one hand-written listing. Despite its name,
# test_input_1 holds the prediction returned by pipe.predict.
sample_row_1 = pd.DataFrame(
    [['Ford Figo','Ford',2012,41000,'Diesel']],
    columns=['name','company','year','kms_driven','fuel_type'],
)
test_input_1 = pipe.predict(sample_row_1)

In [None]:
# Display the value stored in test_input_1.
test_input_1

array([277861.21560938])

In [None]:
# Predict the price of a second hand-written listing. Despite its name,
# test_input_2 holds the prediction returned by pipe.predict.
sample_row_2 = pd.DataFrame(
    [['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']],
    columns=['name','company','year','kms_driven','fuel_type'],
)
test_input_2 = pipe.predict(sample_row_2)

In [None]:
# Display the value stored in test_input_2.
test_input_2

array([431098.74055388])