In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('./data/car data.csv')

In [3]:
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [5]:
data.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [6]:
#The column car name doesn't seem to add much value to our analysis and hence dropping the column

data= data.drop('Car_Name',axis=1)

In [7]:
# It's important to know how many years old the car is.

data['Car_age']= 2020-data['Year']
data.drop('Year', axis=1, inplace=True)

In [8]:
fuel = pd.get_dummies(data['Fuel_Type'])

transmission = pd.get_dummies(data['Transmission'])

seller = pd.get_dummies(data['Seller_Type'])

In [9]:
data.drop(['Fuel_Type','Transmission','Seller_Type'], axis=1, inplace=True)

In [10]:
data_final = pd.concat([data, fuel, transmission, seller], axis=1)

In [11]:
data_final.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,Car_age,CNG,Diesel,Petrol,Automatic,Manual,Dealer,Individual
0,3.35,5.59,27000,0,6,0,0,1,0,1,1,0
1,4.75,9.54,43000,0,7,0,1,0,0,1,1,0
2,7.25,9.85,6900,0,3,0,0,1,0,1,1,0
3,2.85,4.15,5200,0,9,0,0,1,0,1,1,0
4,4.6,6.87,42450,0,6,0,1,0,0,1,1,0


In [12]:
X = data_final.iloc[:, 1:]
y = data_final.iloc[:, 0]

In [13]:
from sklearn.model_selection import train_test_split

np.random.seed(0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [18]:
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeCV, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from xgboost import XGBRegressor

regressors = {'LinearRegression':LinearRegression(),
               'RidgeCV' :RidgeCV(),
               'Lasso': Lasso(),
               'XGBRegressor': XGBRegressor(),
                'DecisionTreeReg':DecisionTreeRegressor(random_state=0),
              'RandomForestReg':RandomForestRegressor(n_estimators=300, random_state=0),
              'LinearSVR':LinearSVR()}

def Test_models(X_train, Y_train, X_test, Y_test, regressors, score):
    models=[]
    scores=[]
    for key in regressors:
        clf = regressors[key]
        clf.fit(X_train, Y_train)
        pred = clf.predict(X_test)
        scr = score(Y_test, pred)
        models.append(key)
        scores.append(scr)
        result = pd.DataFrame({'models':models,'scores':scores})
        
    return result

In [19]:
from sklearn.metrics import r2_score

Test_models(x_train, y_train, x_test, y_test, regressors, r2_score)





Unnamed: 0,models,scores
0,LinearRegression,0.831698
1,RidgeCV,0.8319
2,Lasso,0.80165
3,XGBRegressor,0.879221
4,DecisionTreeReg,0.826624
5,RandomForestReg,0.850749
6,LinearSVR,-152.019847
