# Here we will try to build a model to predict the selling price of used cars
## We are using the Vehicle dataset from Kaggle

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
# loading the dataset

In [3]:
# Read the car listings from the CSV file in the working directory,
# then preview the first five rows.
data = pd.read_csv(filepath_or_buffer="car data.csv")
data.head(n=5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(301, 9)

In [5]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [6]:
# Count the missing entries in every column.
data.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [7]:
# we can see that the dataset is free of missing or null values

In [8]:
# Look at a larger slice (first 50 rows) to get a feel for the categorical columns.
data.head(50)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0
5,vitara brezza,2018,9.25,9.83,2071,Diesel,Dealer,Manual,0
6,ciaz,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
7,s cross,2015,6.5,8.61,33429,Diesel,Dealer,Manual,0
8,ciaz,2016,8.75,8.89,20273,Diesel,Dealer,Manual,0
9,ciaz,2015,7.45,8.92,42367,Diesel,Dealer,Manual,0


In [9]:
# Label encoding: the categorical (text) columns have to be converted into numerical values.
# We convert each such column to the pandas "category" dtype and then use its category codes as the label encoding.

In [10]:
# Switch Fuel_Type to the pandas categorical dtype (categories inferred from
# the data, unordered) so integer codes can be read from it afterwards.
data["Fuel_Type"] = data["Fuel_Type"].astype(pd.CategoricalDtype())

In [11]:
# Confirm the cast: Fuel_Type should now be reported as `category`.
data.dtypes

Car_Name           object
Year                int64
Selling_Price     float64
Present_Price     float64
Kms_Driven          int64
Fuel_Type        category
Seller_Type        object
Transmission       object
Owner               int64
dtype: object

In [12]:
#Then we can assign the encoded variable to a new column using the cat.codes accessor:

In [13]:
# Store the integer code of each Fuel_Type category in a new column, leaving
# the original labels in place (in the preview, Diesel -> 1 and Petrol -> 2).
data["Fuel_Type_cat"] = data["Fuel_Type"].cat.codes
# Preview the frame with the new column appended.
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Fuel_Type_cat
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,1
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,1


In [14]:
# Cast the remaining text columns to the categorical dtype, one at a time,
# so their integer codes become available through the `.cat` accessor.
for col_name in ("Seller_Type", "Car_Name", "Transmission"):
    data[col_name] = data[col_name].astype("category")

In [15]:
# Label-encode the remaining categorical columns via their category codes.
# Car_Name and Seller_Type get new *_cat columns and keep their labels.
data["Car_Name_cat"] = data["Car_Name"].cat.codes
data["Seller_Type_cat"] = data["Seller_Type"].cat.codes
# NOTE(review): unlike the columns above, Transmission is overwritten in place
# with its codes. After this line the column is no longer categorical, so
# re-running this cell on its own fails on `.cat`; re-run the casting cell first.
data["Transmission"] = data["Transmission"].cat.codes
# Preview the frame with the encoded columns.
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Fuel_Type_cat,Car_Name_cat,Seller_Type_cat
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,1,0,2,90,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,1,0,1,93,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,1,0,2,68,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,1,0,2,96,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,1,0,1,92,0


In [16]:
# We can now move on to model creation

In [17]:
# Predictor columns for the regression. Transmission and the *_cat columns
# hold the integer category codes produced in the encoding cells.
feature_columns = [
    "Year",
    "Present_Price",
    "Kms_Driven",
    "Transmission",
    "Owner",
    "Fuel_Type_cat",
    "Car_Name_cat",
    "Seller_Type_cat",
]
x = data[feature_columns]

In [18]:
# Display the feature matrix (pandas truncates the middle rows in the output).
x

Unnamed: 0,Year,Present_Price,Kms_Driven,Transmission,Owner,Fuel_Type_cat,Car_Name_cat,Seller_Type_cat
0,2014,5.59,27000,1,0,2,90,0
1,2013,9.54,43000,1,0,1,93,0
2,2017,9.85,6900,1,0,2,68,0
3,2011,4.15,5200,1,0,2,96,0
4,2014,6.87,42450,1,0,1,92,0
...,...,...,...,...,...,...,...,...
296,2016,11.60,33988,1,0,1,69,0
297,2015,5.90,60000,1,0,2,66,0
298,2009,11.00,87934,1,0,2,69,0
299,2017,12.50,9000,1,0,1,69,0


In [19]:
# Target variable, kept as a single-column DataFrame (not a Series) by
# selecting with a one-element list.
y = data[["Selling_Price"]]

In [20]:
# Display the target column (pandas truncates the middle rows in the output).
y

Unnamed: 0,Selling_Price
0,3.35
1,4.75
2,7.25
3,2.85
4,4.60
...,...
296,9.50
297,4.00
298,3.35
299,11.50


In [21]:
# The target is two-dimensional: one row per car, a single column.
y.shape

(301, 1)

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

In [24]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [25]:
# Fit the regression on the training split only.
model.fit(x_train,y_train)

LinearRegression()

In [26]:
# Predict selling prices for the held-out test rows.
y_prediction = model.predict(x_test)

In [27]:
from sklearn.metrics import r2_score

In [28]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=y_prediction)

0.8415951780397837

## Saving the trained model

In [29]:
import pickle

In [32]:
# Serialize the fitted regression model to model.pkl.
# Opening and dumping happen in one cell inside a context manager: the handle
# is flushed and closed when the block exits (even if pickling raises), so the
# file on disk is complete once this cell finishes, and the result no longer
# depends on the order in which separate open/dump cells were executed.
with open("model.pkl", "wb") as file:
    pickle.dump(model, file)