## Load Packages

In [454]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression

## Load Dataset

In [455]:
df1=pd.read_csv('Dataset/Raw Dataset Part 1.csv')
df1.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,Transmission,Owner,Engine,Power,Seats,Price
0,2017,87150,Petrol,Corporate,Manual,First,1198 cc,87 bhp @ 6000 rpm,5.0,505000
1,2014,75000,Diesel,Individual,Manual,Second,1248 cc,74 bhp @ 4000 rpm,5.0,450000
2,2011,67000,Petrol,Individual,Manual,First,1197 cc,79 bhp @ 6000 rpm,5.0,220000
3,2019,37500,Petrol,Individual,Manual,First,1197 cc,82 bhp @ 6000 rpm,5.0,799000
4,2018,69000,Diesel,Individual,Manual,First,2393 cc,148 bhp @ 3400 rpm,7.0,1950000


In [456]:
df2=pd.read_csv('Dataset/Raw Dataset Part 2.csv')
df2.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,Transmission,Owner,Engine,Power,Seats,Price
0,2014,145500,Diesel,Individual,Manual,First Owner,1248 CC,74 bhp,5.0,450000
1,2014,120000,Diesel,Individual,Manual,Second Owner,1498 CC,103.52 bhp,5.0,370000
2,2006,140000,Petrol,Individual,Manual,Third Owner,1497 CC,78 bhp,5.0,158000
3,2010,127000,Diesel,Individual,Manual,First Owner,1396 CC,90 bhp,5.0,225000
4,2007,120000,Petrol,Individual,Manual,First Owner,1298 CC,88.2 bhp,5.0,130000


## In DF2 Clean Owner

In [457]:
df2['Owner']=df2['Owner'].str.split(' ').str[0]

In [458]:
df2.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,Transmission,Owner,Engine,Power,Seats,Price
0,2014,145500,Diesel,Individual,Manual,First,1248 CC,74 bhp,5.0,450000
1,2014,120000,Diesel,Individual,Manual,Second,1498 CC,103.52 bhp,5.0,370000
2,2006,140000,Petrol,Individual,Manual,Third,1497 CC,78 bhp,5.0,158000
3,2010,127000,Diesel,Individual,Manual,First,1396 CC,90 bhp,5.0,225000
4,2007,120000,Petrol,Individual,Manual,First,1298 CC,88.2 bhp,5.0,130000


## Combine Dataset

In [459]:
df=df1.append(df2)

df.shape

  df=df1.append(df2)


(10187, 10)

## Remove Null

In [460]:
df.isnull().sum()

Year              0
Distance          0
Fuel              0
Seller_Type       0
Transmission      0
Owner             0
Engine          301
Power           295
Seats           285
Price             0
dtype: int64

In [461]:
df.shape

(10187, 10)

In [462]:
df.dropna(inplace=True)

In [463]:
df.shape

(9886, 10)

## Preview Original Dataset

In [464]:
df.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,Transmission,Owner,Engine,Power,Seats,Price
0,2017,87150,Petrol,Corporate,Manual,First,1198 cc,87 bhp @ 6000 rpm,5.0,505000
1,2014,75000,Diesel,Individual,Manual,Second,1248 cc,74 bhp @ 4000 rpm,5.0,450000
2,2011,67000,Petrol,Individual,Manual,First,1197 cc,79 bhp @ 6000 rpm,5.0,220000
3,2019,37500,Petrol,Individual,Manual,First,1197 cc,82 bhp @ 6000 rpm,5.0,799000
4,2018,69000,Diesel,Individual,Manual,First,2393 cc,148 bhp @ 3400 rpm,7.0,1950000


## Clean the Data

In [465]:
df['Power']=df['Power'].str.split(' ').str[0]
df["Power"]=df["Power"].str.split('@').str[0]
df["Power"]=df["Power"].str.split('bhp').str[0]
df["Power"]=df["Power"].str.split('.').str[0]
df["Power"]=df["Power"].replace('',np.nan)

In [466]:
df['Engine']=df['Engine'].str.split(' ').str[0]

In [467]:
df['Engine']=df['Engine'].astype('float64')
df['Power']=df['Power'].astype('float64')

In [468]:
df.dropna(inplace=True)

In [469]:
df.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,Transmission,Owner,Engine,Power,Seats,Price
0,2017,87150,Petrol,Corporate,Manual,First,1198.0,87.0,5.0,505000
1,2014,75000,Diesel,Individual,Manual,Second,1248.0,74.0,5.0,450000
2,2011,67000,Petrol,Individual,Manual,First,1197.0,79.0,5.0,220000
3,2019,37500,Petrol,Individual,Manual,First,1197.0,82.0,5.0,799000
4,2018,69000,Diesel,Individual,Manual,First,2393.0,148.0,7.0,1950000


## Transform Owner 

In [470]:
df.Owner.unique()

array(['First', 'Second', 'Third', 'Fourth', 'UnRegistered Car',
       '4 or More', 'Test'], dtype=object)

In [471]:
df.loc[df['Owner']=='4 or More','Owner']

1037    4 or More
Name: Owner, dtype: object

In [472]:
df.loc[df['Owner']=='Test','Owner']

4383    Test
4950    Test
4951    Test
4952    Test
6220    Test
Name: Owner, dtype: object

In [473]:
df.drop(df.loc[df['Owner']=='4 or More'].index,inplace=True)

In [474]:
df.drop(df.loc[df['Owner']=='Test'].index,inplace=True)

In [475]:
df['Owner']=df['Owner'].replace({'First':1,'Second':2,'Third':3,'Fourth':4,'UnRegistered Car':0})

In [476]:
df.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,Transmission,Owner,Engine,Power,Seats,Price
0,2017,87150,Petrol,Corporate,Manual,1,1198.0,87.0,5.0,505000
1,2014,75000,Diesel,Individual,Manual,2,1248.0,74.0,5.0,450000
2,2011,67000,Petrol,Individual,Manual,1,1197.0,79.0,5.0,220000
3,2019,37500,Petrol,Individual,Manual,1,1197.0,82.0,5.0,799000
4,2018,69000,Diesel,Individual,Manual,1,2393.0,148.0,7.0,1950000


## Transform Transmission

In [477]:
df=df.rename(columns={'Transmission':'isManual'})

In [478]:
df['isManual']=df['isManual'].replace({'Manual':1,'Automatic':0})

In [479]:
df.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,isManual,Owner,Engine,Power,Seats,Price
0,2017,87150,Petrol,Corporate,1,1,1198.0,87.0,5.0,505000
1,2014,75000,Diesel,Individual,1,2,1248.0,74.0,5.0,450000
2,2011,67000,Petrol,Individual,1,1,1197.0,79.0,5.0,220000
3,2019,37500,Petrol,Individual,1,1,1197.0,82.0,5.0,799000
4,2018,69000,Diesel,Individual,1,1,2393.0,148.0,7.0,1950000


## Transform Seller_Type

In [480]:
seller_type_coder=LabelEncoder()

In [481]:
df.Seller_Type.unique()

array(['Corporate', 'Individual', 'Commercial Registration', 'Dealer',
       'Trustmark Dealer'], dtype=object)

In [482]:
df['Seller_Type']=seller_type_coder.fit_transform(df['Seller_Type'])

In [483]:
with open('seller_type.pkl', 'wb') as file1:
    pickle.dump(seller_type_coder, file1)

In [484]:
df.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,isManual,Owner,Engine,Power,Seats,Price
0,2017,87150,Petrol,1,1,1,1198.0,87.0,5.0,505000
1,2014,75000,Diesel,3,1,2,1248.0,74.0,5.0,450000
2,2011,67000,Petrol,3,1,1,1197.0,79.0,5.0,220000
3,2019,37500,Petrol,3,1,1,1197.0,82.0,5.0,799000
4,2018,69000,Diesel,3,1,1,2393.0,148.0,7.0,1950000


## Transform Fuel Type

In [485]:
fuel_coder=LabelEncoder()

In [486]:
df['Fuel'].unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG', 'CNG + CNG', 'Hybrid',
       'Petrol + CNG'], dtype=object)

In [487]:
df['Fuel'].value_counts()

Diesel          5313
Petrol          4418
CNG              102
LPG               40
Hybrid             3
CNG + CNG          1
Petrol + CNG       1
Name: Fuel, dtype: int64

In [488]:
df['Fuel']=df['Fuel'].replace({'CNG + CNG':'CNG','Petrol + CNG':'CNG','Petrol + LPG':'LPG'})


In [489]:
df['Fuel'].value_counts()

Diesel    5313
Petrol    4418
CNG        104
LPG         40
Hybrid       3
Name: Fuel, dtype: int64

In [490]:
df['Fuel']=fuel_coder.fit_transform(df['Fuel'])

In [511]:
fuel_coder.transform(["Diesel"])

1

In [491]:
with open('fuel_coder.pkl', 'wb') as file1:
    pickle.dump(fuel_coder, file1)

In [492]:
df.head()

Unnamed: 0,Year,Distance,Fuel,Seller_Type,isManual,Owner,Engine,Power,Seats,Price
0,2017,87150,4,1,1,1,1198.0,87.0,5.0,505000
1,2014,75000,1,3,1,2,1248.0,74.0,5.0,450000
2,2011,67000,4,3,1,1,1197.0,79.0,5.0,220000
3,2019,37500,4,3,1,1,1197.0,82.0,5.0,799000
4,2018,69000,1,3,1,1,2393.0,148.0,7.0,1950000


## Final Dataset

In [493]:
df.to_csv('Dataset/Preprocessed Dataset.csv',index=False)

## Data Scaling

In [494]:
sc_X=StandardScaler()

In [495]:
X=df.drop(['Price'],axis=1)
y=df['Price']

In [496]:
scaled_X=sc_X.fit_transform(X)
scaled_y=y

In [497]:
with open('scaled_X.pkl', 'wb') as file1:
    pickle.dump(sc_X, file1)

## Test Train Split

In [498]:
train_x,test_x,train_y,test_y=train_test_split(scaled_X,scaled_y,test_size=0.2,random_state=42)

## Finding Best Polynomial degree

In [499]:
Rsquared=[]
order=[1,2,3,4,5]

for n in order:
    pr=PolynomialFeatures(degree=n)
    train_x_poly=pr.fit_transform(train_x)
    test_x_poly=pr.fit_transform(test_x)
    lr=LinearRegression()
    lr.fit(train_x_poly,train_y)
    Rsquared.append(lr.score(test_x_poly,test_y))
    

In [500]:
best_index=np.argmax(Rsquared)
best_index

2

## Training

In [501]:
best_order=order[best_index]
best_order

3

In [502]:
pr=PolynomialFeatures(degree=best_order)
train_x_poly=pr.fit_transform(train_x)
test_x_poly=pr.fit_transform(test_x)

In [503]:
lr=LinearRegression()
lr.fit(train_x_poly,train_y)

In [504]:
lr.score(test_x_poly,test_y)

0.8306270580811067

In [505]:
with open('model.pkl', 'wb') as file2:
    pickle.dump(lr, file2)

## Predict

In [506]:
lr.predict(pr.transform(sc.transform([[2017	,87150,	4,	1	,1,	1	,1198.0	,87.0	,5.0]])))[0]



492537.515625