In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics  import accuracy_score, scorer,f1_score,recall_score,precision_score,confusion_matrix




In [5]:
data = pd.read_csv("car_data.csv")
data.tail(10)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
291,brio,2015,5.4,6.1,31427,Petrol,Dealer,Manual,0
292,jazz,2016,6.4,8.4,12000,Petrol,Dealer,Manual,0
293,city,2010,3.25,9.9,38000,Petrol,Dealer,Manual,0
294,amaze,2014,3.75,6.8,33019,Petrol,Dealer,Manual,0
295,city,2015,8.55,13.09,60076,Diesel,Dealer,Manual,0
296,city,2016,9.5,11.6,33988,Diesel,Dealer,Manual,0
297,brio,2015,4.0,5.9,60000,Petrol,Dealer,Manual,0
298,city,2009,3.35,11.0,87934,Petrol,Dealer,Manual,0
299,city,2017,11.5,12.5,9000,Diesel,Dealer,Manual,0
300,brio,2016,5.3,5.9,5464,Petrol,Dealer,Manual,0


In [6]:
data.shape

(301, 9)

In [7]:
data.columns

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
Car_Name         301 non-null object
Year             301 non-null int64
Selling_Price    301 non-null float64
Present_Price    301 non-null float64
Kms_Driven       301 non-null int64
Fuel_Type        301 non-null object
Seller_Type      301 non-null object
Transmission     301 non-null object
Owner            301 non-null int64
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [9]:
print(data["Fuel_Type"].unique())
print(data["Seller_Type"].unique())
print(data["Transmission"].unique())
print(data["Owner"].unique())

['Petrol' 'Diesel' 'CNG']
['Dealer' 'Individual']
['Manual' 'Automatic']
[0 1 3]


In [10]:
#checking null values in dataset
data.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [11]:
data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [12]:
dataset = data[[ 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']]

In [13]:
dataset["car_current_year"]=2020

In [14]:
dataset

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,car_current_year
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2020
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2020
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2020
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2020
4,2014,4.60,6.87,42450,Diesel,Dealer,Manual,0,2020
...,...,...,...,...,...,...,...,...,...
296,2016,9.50,11.60,33988,Diesel,Dealer,Manual,0,2020
297,2015,4.00,5.90,60000,Petrol,Dealer,Manual,0,2020
298,2009,3.35,11.00,87934,Petrol,Dealer,Manual,0,2020
299,2017,11.50,12.50,9000,Diesel,Dealer,Manual,0,2020


In [15]:
# creating new feature of how much old the car is (year)
dataset['car_old_year']= dataset["car_current_year"]-data["Year"]

In [16]:
#Here we will drop the dataset cloumn["car_current_year"] because now we dont want that
dataset.drop(["car_current_year"], axis=1, inplace=True)
dataset.drop(["Year"], axis=1, inplace=True)

In [18]:
dataset.shape

(301, 8)

In [19]:
dataset.keys()

Index(['Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner', 'car_old_year'],
      dtype='object')

In [20]:
# converting categorical value into onehot encoding converting into dummie variable
dataset = pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,car_old_year,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,3.35,5.59,27000,0,6,0,1,0,1
1,4.75,9.54,43000,0,7,1,0,0,1
2,7.25,9.85,6900,0,3,0,1,0,1
3,2.85,4.15,5200,0,9,0,1,0,1
4,4.60,6.87,42450,0,6,1,0,0,1
...,...,...,...,...,...,...,...,...,...
296,9.50,11.60,33988,0,4,1,0,0,1
297,4.00,5.90,60000,0,5,0,1,0,1
298,3.35,11.00,87934,0,11,0,1,0,1
299,11.50,12.50,9000,0,3,1,0,0,1


In [21]:
print(data["Fuel_Type"].unique())
print(data["Seller_Type"].unique())
print(data["Transmission"].unique())
print(data["Owner"].unique())

['Petrol' 'Diesel' 'CNG']
['Dealer' 'Individual']
['Manual' 'Automatic']
[0 1 3]


In [22]:
dataset.corr()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,car_old_year,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
Selling_Price,1.0,0.878983,0.029187,-0.088344,-0.236141,0.552339,-0.540571,-0.550724,-0.367128
Present_Price,0.878983,1.0,0.203647,0.008057,0.047584,0.473306,-0.465244,-0.51203,-0.348715
Kms_Driven,0.029187,0.203647,1.0,0.089216,0.524342,0.172515,-0.172874,-0.101419,-0.16251
Owner,-0.088344,0.008057,0.089216,1.0,0.182104,-0.053469,0.055687,0.124269,-0.050316
car_old_year,-0.236141,0.047584,0.524342,0.182104,1.0,-0.064315,0.059959,0.039896,-0.000394
Fuel_Type_Diesel,0.552339,0.473306,0.172515,-0.053469,-0.064315,1.0,-0.979648,-0.350467,-0.098643
Fuel_Type_Petrol,-0.540571,-0.465244,-0.172874,0.055687,0.059959,-0.979648,1.0,0.358321,0.091013
Seller_Type_Individual,-0.550724,-0.51203,-0.101419,0.124269,0.039896,-0.350467,0.358321,1.0,0.06324
Transmission_Manual,-0.367128,-0.348715,-0.16251,-0.050316,-0.000394,-0.098643,0.091013,0.06324,1.0


In [23]:

X=dataset.iloc[:,1:]
y=dataset.iloc[:,0]

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=30, random_state=22)

In [24]:
from sklearn.ensemble import ExtraTreesRegressor
Feature_imp = ExtraTreesRegressor()
Feature_imp.fit(X,y)

ExtraTreesRegressor()

In [25]:
print(Feature_imp.feature_importances_)

[0.39477114 0.04178032 0.000927   0.07562942 0.21653636 0.01723205
 0.12178742 0.1313363 ]


In [26]:
X=dataset.iloc[:,1:]
y=dataset.iloc[:,0]

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=30, random_state=22)

# Decision Tree Regressor 

In [28]:
from sklearn.tree import DecisionTreeRegressor

In [29]:
DT_Regressor = DecisionTreeRegressor(criterion='mse', max_depth=5)
DT_Regressor.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=5)

In [30]:
DT_Regressor.score(X_train,y_train)

0.9783538864719713

In [31]:
DT_Regressor.score(X_test,y_test)

0.9152546657791584

In [32]:
DT_Regressor_predict =DT_Regressor.predict(X_test)
DT_Regressor_predict

array([ 1.1052381 ,  5.13703704,  5.13703704,  6.87727273,  3.05357143,
        5.13703704,  4.03333333,  5.13703704,  7.45227273,  0.31083333,
        0.31083333,  4.        ,  4.        , 10.12647059,  1.1052381 ,
        4.6565625 ,  7.45227273,  0.31083333,  0.31083333,  6.87727273,
        7.45227273,  4.6565625 ,  5.13703704, 10.12647059,  2.69375   ,
        4.6565625 ,  5.13703704,  0.59441176,  0.31083333,  6.87727273])

In [36]:
# DT_Regressor_accuracy = accuracy_score(y_test,DT_Regressor_predict)
# DT_Regressor_accuracy

In [39]:
result_chart = pd.DataFrame({"Predicted_value":DT_Regressor_predict,"Actuall_value":y_test})
# result_chart.shape
result_chart
 

Unnamed: 0,Predicted_value,Actuall_value
111,1.105238,1.15
56,5.137037,4.5
300,5.137037,5.3
254,6.877273,5.25
41,3.053571,2.55
280,5.137037,5.25
90,4.033333,3.8
243,5.137037,6.25
267,7.452273,8.35
151,0.310833,0.5


# Random Forest Regressor

In [40]:
from sklearn.ensemble import RandomForestRegressor
RF_Regressor = RandomForestRegressor(n_estimators=50,max_depth=6)
RF_Regressor.fit(X_train,y_train)

RandomForestRegressor(max_depth=6, n_estimators=50)

In [41]:
RF_Regressor.score(X_train,y_train)

0.9794270419200217

In [42]:
RF_Regressor.score(X_test,y_test)

0.5256866718322349

# Linear regression

In [43]:
from sklearn.linear_model import LinearRegression

In [44]:
lr = LinearRegression()
lr.fit(X_train,y_train)

LinearRegression()

In [46]:
lr.score(X_train,y_train)

0.8908762598508815

In [47]:
lr.score(X_test,y_test)

0.5261395554503856

In [49]:
import pickle
#open a file , where you want to store the data
file = open('decision_tree_regressor.pkl','wb')

# dump information to that file 
pickle.dump(DT_Regressor,file)