In [2]:
#Import libraries
import numpy as np
import pandas as pd

In [3]:
# Read the cleaned car listings, then keep only the columns whose name
# does not start with "Unnamed".
ds = (
    pd.read_csv("new_cleaned_data.csv")
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
)
ds

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Santro Xing XO eRLX Euro III,Hyundai,2007,80000.0,45000,Petrol
1,Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000.0,28000,Petrol
2,EcoSport Titanium 1.5L TDCi,Ford,2014,575000.0,36000,Diesel
3,Figo,Ford,2012,175000.0,41000,Diesel
4,Eon,Hyundai,2013,190000.0,25000,Petrol
...,...,...,...,...,...,...
646,Suzuki Ritz VXI ABS,Maruti,2011,270000.0,50000,Petrol
647,Indica V2 DLE BS III,Tata,2009,110000.0,30000,Diesel
648,Corolla Altis,Toyota,2009,300000.0,132000,Petrol
649,Zest XM Diesel,Tata,2018,260000.0,27000,Diesel


In [4]:
# Summarise the frame: number of rows, column dtypes and non-null counts
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651 entries, 0 to 650
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        651 non-null    object 
 1   company     651 non-null    object 
 2   year        651 non-null    int64  
 3   Price       651 non-null    float64
 4   kms_driven  651 non-null    int64  
 5   fuel_type   651 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 30.6+ KB


In [5]:
# Separate the predictors (x) from the target (y).
# Both selections use a list of labels, so y stays a one-column DataFrame
# rather than becoming a Series.
x = ds.loc[:, ['company', 'name', 'year', 'kms_driven', 'fuel_type']]
y = ds.loc[:, ['Price']]

In [6]:
# Display the feature frame (company, name, year, kms_driven, fuel_type)
x

Unnamed: 0,company,name,year,kms_driven,fuel_type
0,Hyundai,Santro Xing XO eRLX Euro III,2007,45000,Petrol
1,Hyundai,Grand i10 Magna 1.2 Kappa VTVT,2014,28000,Petrol
2,Ford,EcoSport Titanium 1.5L TDCi,2014,36000,Diesel
3,Ford,Figo,2012,41000,Diesel
4,Hyundai,Eon,2013,25000,Petrol
...,...,...,...,...,...
646,Maruti,Suzuki Ritz VXI ABS,2011,50000,Petrol
647,Tata,Indica V2 DLE BS III,2009,30000,Diesel
648,Toyota,Corolla Altis,2009,132000,Petrol
649,Tata,Zest XM Diesel,2018,27000,Diesel


In [7]:
# Display the target frame (single Price column)
y

Unnamed: 0,Price
0,80000.0
1,325000.0
2,575000.0
3,175000.0
4,190000.0
...,...
646,270000.0
647,110000.0
648,300000.0
649,260000.0


In [8]:
# Print the number of distinct values in each listed column, one per line.
# dropna=False counts a missing value as its own entry, like len(unique()).
for column in ('company', 'kms_driven', 'fuel_type'):
    print(ds[column].nunique(dropna=False))

24
195
3


In [9]:
# Fit a stand-alone OneHotEncoder on the three text columns of the full
# feature frame so that it learns every category present in the data.
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit(x.loc[:, ["company", "name", "fuel_type"]])

In [10]:
# Categories learned by the fitted encoder: one array per encoded column,
# in the order the columns were passed to fit (company, name, fuel_type)
ohe.categories_

[array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',
        'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Land', 'Mahindra',
        'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan', 'Renault',
        'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'], dtype=object),
 array(['3 Series 320d Sedan', '3 Series 320i', '5 Series 520d Sedan',
        '5 Series 530i', '7 Series 740Li Sedan', 'A3 Cabriolet 40 TFSI',
        'A4 1.8 TFSI Multitronic Premium Plus',
        'A4 2.0 TDI 177bhp Premium', 'A6 2.0 TDI Premium', 'Accent',
        'Accent Executive Edition', 'Accent GLE', 'Accent GLX', 'Accord',
        'Amaze', 'Amaze 1.2 S i VTEC', 'Amaze 1.5 E i DTEC',
        'Amaze 1.5 S i DTEC', 'Amaze 1.5 SX i DTEC', 'Aria Pleasure 4X2',
        'Beat', 'Beat Diesel', 'Beat LS Diesel', 'Beat LS Petrol',
        'Beat LT Diesel', 'Beat LT Opt Diesel', 'Beat LT Petrol',
        'Beat PS Diesel', 'Benz A Class A 180 Sport Petrol',
        'Benz B Class B180 Sports', 'Benz C Cla

In [11]:
# Column transformer:
#   * one-hot encodes the three text columns, reusing the categories learned
#     by `ohe`; a value outside those categories is ignored instead of raising
#   * passes every other column through unchanged
#   * sparse_threshold=0 makes the transformer always return a dense array
ct = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_, handle_unknown='ignore'),
        ["company", "name", "fuel_type"],
    ),
    remainder='passthrough',
    sparse_threshold=0,
    force_int_remainder_cols=False,
)
ct

In [12]:
# Create the estimator used as the final pipeline step:
# a linear regression with default settings
from sklearn.linear_model import LinearRegression
reg=LinearRegression()

In [13]:
# Chain the column transformer and the regressor into one estimator, so a
# single fit/predict call runs the encoding and the regression together
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(ct,reg)
# Display the assembled pipeline
pipe

In [14]:
# Try 101 different 90/10 train/test splits (random_state 0..100), refit the
# pipeline on each training part and record the R² obtained on the test part.
# scores[k] therefore belongs to random_state=k.
# NOTE(review): the split is later chosen by its test-set score, so the best
# value is an optimistic estimate of how the model generalises.
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

scores = []
for seed in range(101):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.10, random_state=seed
    )
    result = pipe.fit(x_train, y_train).predict(x_test)
    scores.append(r2_score(y_test, result))

In [15]:
# Position of the highest R² in `scores`; because the seeds started at 0,
# this position is also the random_state that produced it.
bestindex = np.asarray(scores).argmax()
scores[bestindex]

0.8233392844313367

In [16]:
# Recreate the best-scoring split and leave `pipe` fitted on its training part.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.10,
    random_state=bestindex,
)
pipe.fit(x_train, y_train)

In [None]:
#Predict for user_input
# Text answers are stripped: the encoder in the pipeline ignores unknown
# categories, so a stray leading/trailing space would silently turn a valid
# company/name/fuel type into an all-zero encoding and distort the prediction.
company = input("Enter company name:").strip()
name = input("Enter car name:").strip()
year = int(input("Enter year:"))              # raises ValueError on non-numeric input
kms_driven = int(input("Enter kms_driven"))   # raises ValueError on non-numeric input
fuel_type = input("Enter fuel type:").strip()

# Single-row frame with the same column names the pipeline was fitted on.
columns = ["company", "name", "year", "kms_driven", "fuel_type"]
myinput = pd.DataFrame(
    columns=columns,
    data=[[company, name, year, kms_driven, fuel_type]],
)
result = pipe.predict(myinput)

# np.ravel lets the lookup work whether predict() returns shape (1, 1)
# (target fitted as a one-column DataFrame) or shape (1,).
predicted_price = np.ravel(result)[0]

# NOTE(review): abs() displays a negative prediction as a positive price;
# kept as in the original, but consider reporting such predictions explicitly.
print("You should buy it for ~ price : ", abs(round(predicted_price)))

In [None]:
import pickle as pkl

In [None]:
# Persist the pipeline to disk. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("CarProject.pkl", "wb") as model_file:
    pkl.dump(pipe, model_file)