In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the raw car-listings CSV (a Colab upload path); change it here when running elsewhere.
csv_path = "/content/car_data.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [4]:
# (rows, columns) of the raw data.
df.shape

(301, 9)

In [5]:
# Number of listings per model name, most frequent first.
df['Car_Name'].value_counts()

Unnamed: 0_level_0,count
Car_Name,Unnamed: 1_level_1
city,26
corolla altis,16
verna,14
fortuner,11
brio,10
ciaz,9
innova,9
i20,9
grand i10,8
jazz,7


In [6]:
# List every distinct value in the column, in order of first appearance.
# The method form df['Car_Name'].unique() is equivalent.

pd.unique(df['Car_Name'])

array(['ritz', 'sx4', 'ciaz', 'wagon r', 'swift', 'vitara brezza',
       's cross', 'alto 800', 'ertiga', 'dzire', 'alto k10', 'ignis',
       '800', 'baleno', 'omni', 'fortuner', 'innova', 'corolla altis',
       'etios cross', 'etios g', 'etios liva', 'corolla', 'etios gd',
       'camry', 'land cruiser', 'Royal Enfield Thunder 500',
       'UM Renegade Mojave', 'KTM RC200', 'Bajaj Dominar 400',
       'Royal Enfield Classic 350', 'KTM RC390', 'Hyosung GT250R',
       'Royal Enfield Thunder 350', 'KTM 390 Duke ',
       'Mahindra Mojo XT300', 'Bajaj Pulsar RS200',
       'Royal Enfield Bullet 350', 'Royal Enfield Classic 500',
       'Bajaj Avenger 220', 'Bajaj Avenger 150', 'Honda CB Hornet 160R',
       'Yamaha FZ S V 2.0', 'Yamaha FZ 16', 'TVS Apache RTR 160',
       'Bajaj Pulsar 150', 'Honda CBR 150', 'Hero Extreme',
       'Bajaj Avenger 220 dtsi', 'Bajaj Avenger 150 street',
       'Yamaha FZ  v 2.0', 'Bajaj Pulsar  NS 200', 'Bajaj Pulsar 220 F',
       'TVS Apache RTR 180', 

In [7]:
# Count of distinct model names; dropna=False counts a missing name as its own value,
# matching what len(...unique()) would report.
df['Car_Name'].nunique(dropna=False)

98

In [8]:
## Keep only the models that appear at least 3 times (i.e. drop every model with 2 or fewer rows).
## .copy() makes new_df an independent frame: a later cell modifies new_df, and doing that on a
## filtered selection of df can raise SettingWithCopyWarning.

new_df = df.groupby('Car_Name').filter(lambda group: len(group) >= 3).copy()

In [9]:
# Listings per model after filtering; every remaining model should have a count of 3 or more.
new_df['Car_Name'].value_counts()

Unnamed: 0_level_0,count
Car_Name,Unnamed: 1_level_1
city,26
corolla altis,16
verna,14
fortuner,11
brio,10
innova,9
i20,9
ciaz,9
grand i10,8
amaze,7


In [10]:
# Number of distinct model names left after filtering (this is how many one-hot columns
# the encoders below will create). dropna=False mirrors len(...unique()).
new_df['Car_Name'].nunique(dropna=False)

34

Encoding the data

In [11]:
# Label-encode the three low-cardinality categorical columns as integers.
# The result is assigned back instead of using inplace=True: mutating a filtered frame in place
# can raise SettingWithCopyWarning, and plain reassignment keeps the cell safe to re-run.
# NOTE(review): Fuel_Type is nominal; coding it 0/1/2 imposes an order that a linear model
# will treat as meaningful — consider one-hot encoding it as well.
category_codes = {
    'Fuel_Type': {'Petrol': 0, 'Diesel': 1, 'CNG': 2},
    'Seller_Type': {'Dealer': 0, 'Individual': 1},
    'Transmission': {'Manual': 0, 'Automatic': 1},
}
new_df = new_df.replace(category_codes)
# Pin an integer dtype explicitly rather than relying on replace() to downcast the former
# string columns (that implicit downcast is deprecated in recent pandas).
# Raises ValueError here if a column still holds a label that category_codes does not cover.
new_df = new_df.astype({column: int for column in category_codes})

**One Hot Encoding**

There are two ways to perform one-hot encoding:
1. Using pandas get_dummies
2. Using sklearn OneHotEncoder

# 1. Using .get_dummies()

In [12]:
# One indicator column per remaining model name; with no dtype given the indicators are True/False.
pd.get_dummies(new_df['Car_Name'])

Unnamed: 0,Bajaj Avenger 220,Bajaj Pulsar 150,Bajaj Pulsar NS 200,Honda CB Hornet 160R,Royal Enfield Classic 350,Royal Enfield Thunder 350,Royal Enfield Thunder 500,TVS Apache RTR 160,Yamaha FZ S V 2.0,alto k10,...,i10,i20,innova,jazz,ritz,swift,sx4,verna,wagon r,xcent
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
297,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
298,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
299,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Same encoding, but dtype=int yields 0/1 integers instead of True/False.
pd.get_dummies(new_df['Car_Name'], dtype=int)

Unnamed: 0,Bajaj Avenger 220,Bajaj Pulsar 150,Bajaj Pulsar NS 200,Honda CB Hornet 160R,Royal Enfield Classic 350,Royal Enfield Thunder 350,Royal Enfield Thunder 500,TVS Apache RTR 160,Yamaha FZ S V 2.0,alto k10,...,i10,i20,innova,jazz,ritz,swift,sx4,verna,wagon r,xcent
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
298,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Keep the 0/1 indicator frame so it can be joined back onto new_df.
dummies = pd.get_dummies(new_df['Car_Name'], dtype=int)

In [15]:
# Append the indicator columns to the encoded frame. dummies was built from new_df,
# so both share the same index and the rows line up.
df1 = pd.concat(objs=[new_df, dummies], axis='columns')

In [16]:
# Check that the original columns are followed by the new indicator columns.
df1.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Bajaj Avenger 220,...,i10,i20,innova,jazz,ritz,swift,sx4,verna,wagon r,xcent
0,ritz,2014,3.35,5.59,27000,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,sx4,2013,4.75,9.54,43000,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,ciaz,2017,7.25,9.85,6900,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,wagon r,2011,2.85,4.15,5200,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,swift,2014,4.6,6.87,42450,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [17]:
# Column count should be the original columns plus one indicator per model name.
df1.shape

(220, 43)

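We can drop one column of the one-hot encoding without losing any information: if every remaining one-hot column is zero in a row, the model can still infer that the row belongs to the dropped category.

Keeping all of the columns makes them perfectly collinear (in every row they sum to 1), which is known as the **dummy variable trap**; dropping one column avoids it.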

In [18]:
# 'Car_Name' is now represented by its indicator columns, and 'xcent' is removed so that it
# becomes the reference category (all remaining indicators zero).
# Note: the cell overwrites df1, so re-running it on its own raises KeyError (columns already gone).
df1 = df1.drop(['Car_Name', 'xcent'], axis='columns')

In [19]:
# Two columns fewer than before the drop.
df1.shape

(220, 41)

**Model Training and Evaluation**

In [20]:
# The target is the selling price; every other column of df1 is a feature.
y = df1['Selling_Price']
X = df1.drop('Selling_Price', axis='columns')

In [21]:
# Hold out 25% of the rows for testing (scikit-learn's default, written out here);
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [22]:
# L1-regularised linear regression with the default penalty strength written out.
# NOTE(review): the features are on very different scales (Kms_Driven in the tens of thousands,
# indicators in 0/1) and are not standardised; the L1 penalty is scale-sensitive, so with
# alpha=1.0 the indicator coefficients may be shrunk to zero — inspect lasso.coef_ after fitting.
lasso = Lasso(alpha=1.0)

In [23]:
# Train on the training split only; the test split stays unseen until evaluation.
lasso.fit(X_train, y_train)

In [43]:
# Predicted selling prices for the held-out rows.
y_pred = lasso.predict(X_test)

In [25]:
# r2_score is the coefficient of determination: 1.0 is a perfect fit and higher is better.
# Despite the variable name, this is a goodness-of-fit score, not an error.
err_score = metrics.r2_score(y_test, y_pred)

In [26]:
# Report the test-set R² (a score where higher is better, even though the label says "error").
r2_label = "R square error using Lasso Regression Model :"
print(r2_label, err_score)

R square error using Lasso Regression Model : 0.91111825960642


# 2. Using OneHotEncoder()

In [31]:
# Dense (non-sparse) one-hot encoder.
ohe = OneHotEncoder(sparse_output=False)
# Have transform()/fit_transform() return a pandas DataFrame (with column names and the
# input's index) instead of a NumPy array. set_output configures the encoder in place.
ohe.set_output(transform='pandas')

In [36]:
# Double brackets pass a one-column DataFrame: the encoder expects 2-D input, not a Series.
ohetransform = ohe.fit_transform(new_df[['Car_Name']])

In [37]:
# Indicator columns are named '<feature>_<category>' and hold floats (0.0 / 1.0).
ohetransform

Unnamed: 0,Car_Name_Bajaj Avenger 220,Car_Name_Bajaj Pulsar 150,Car_Name_Bajaj Pulsar NS 200,Car_Name_Honda CB Hornet 160R,Car_Name_Royal Enfield Classic 350,Car_Name_Royal Enfield Thunder 350,Car_Name_Royal Enfield Thunder 500,Car_Name_TVS Apache RTR 160,Car_Name_Yamaha FZ S V 2.0,Car_Name_alto k10,...,Car_Name_i10,Car_Name_i20,Car_Name_innova,Car_Name_jazz,Car_Name_ritz,Car_Name_swift,Car_Name_sx4,Car_Name_verna,Car_Name_wagon r,Car_Name_xcent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Append the encoder's indicator columns to the encoded frame, aligning rows on the index.
df2 = pd.concat(objs=[new_df, ohetransform], axis='columns')

In [39]:
# Check that the original columns are followed by the 'Car_Name_*' indicator columns.
df2.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Car_Name_Bajaj Avenger 220,...,Car_Name_i10,Car_Name_i20,Car_Name_innova,Car_Name_jazz,Car_Name_ritz,Car_Name_swift,Car_Name_sx4,Car_Name_verna,Car_Name_wagon r,Car_Name_xcent
0,ritz,2014,3.35,5.59,27000,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,sx4,2013,4.75,9.54,43000,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,ciaz,2017,7.25,9.85,6900,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,wagon r,2011,2.85,4.15,5200,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,swift,2014,4.6,6.87,42450,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [40]:
# Should match the shape obtained with get_dummies before dropping columns.
df2.shape

(220, 43)

In [41]:
# Remove the raw name column and one indicator so that it becomes the reference category.
# Here the reference is 'Bajaj Avenger 220', whereas the get_dummies path dropped 'xcent'.
# Note: the cell overwrites df2, so re-running it on its own raises KeyError (columns already gone).
df2 = df2.drop(['Car_Name', 'Car_Name_Bajaj Avenger 220'], axis='columns')

In [42]:
# Two columns fewer than before the drop.
df2.shape

(220, 41)

**Model Training and Evaluation**

In [45]:
# Repeat the train/predict steps on the OneHotEncoder-based frame.
# The target is the selling price; every other column of df2 is a feature.
y = df2['Selling_Price']
X = df2.drop('Selling_Price', axis='columns')

# Same seed and 25% hold-out (scikit-learn's default, written out) as the get_dummies run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# fit() returns the estimator itself, so construction and training chain into one statement.
# NOTE(review): features are not standardised and alpha is left at its default of 1.0; the
# recorded R² is identical to the get_dummies run even though a different reference column
# was dropped, which suggests the indicator coefficients may all be shrunk to zero —
# check lasso.coef_ to confirm.
lasso = Lasso(alpha=1.0).fit(X_train, y_train)

y_pred = lasso.predict(X_test)

In [46]:
# r2_score is the coefficient of determination: 1.0 is a perfect fit and higher is better.
# Despite the variable name, this is a goodness-of-fit score, not an error.
err_score = metrics.r2_score(y_test, y_pred)

In [47]:
# Report the test-set R² (a score where higher is better, even though the label says "error").
r2_label = "R square error using Lasso Regression Model :"
print(r2_label, err_score)

R square error using Lasso Regression Model : 0.91111825960642
