In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("Automobile_data.csv")

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
width                205 non-null float64
height               205 non-null float64
engine-type          205 non-null object
engine-size          205 non-null int64
horsepower           205 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null int64
dtypes: float64(2), int64(5), object(8)
memory usage: 24.1+ KB


In [5]:
df["normalized-losses"] = df["normalized-losses"].replace("?",np.nan)

In [6]:
df["horsepower"] = df["horsepower"].replace("?",np.nan)

In [7]:
df["normalized-losses"] = df["normalized-losses"].astype('float')

In [8]:
df["horsepower"] = df["horsepower"].astype('float')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
symboling            205 non-null int64
normalized-losses    164 non-null float64
make                 205 non-null object
fuel-type            205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
width                205 non-null float64
height               205 non-null float64
engine-type          205 non-null object
engine-size          205 non-null int64
horsepower           203 non-null float64
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null int64
dtypes: float64(4), int64(5), object(6)
memory usage: 24.1+ KB


In [10]:
horsepower_mean = df['horsepower'].mean()

In [11]:
horsepower_mean

104.25615763546799

In [12]:
df['horsepower'].fillna(horsepower_mean,inplace=True)

In [13]:
df['horsepower'].unique()

array([111.        , 154.        , 102.        , 115.        ,
       110.        , 140.        , 160.        , 101.        ,
       121.        , 182.        ,  48.        ,  70.        ,
        68.        ,  88.        , 145.        ,  58.        ,
        76.        ,  60.        ,  86.        , 100.        ,
        78.        ,  90.        , 176.        , 262.        ,
       135.        ,  84.        ,  64.        , 120.        ,
        72.        , 123.        , 155.        , 184.        ,
       175.        , 116.        ,  69.        ,  55.        ,
        97.        , 152.        , 200.        ,  95.        ,
       142.        , 143.        , 207.        , 288.        ,
       104.25615764,  73.        ,  82.        ,  94.        ,
        62.        ,  56.        , 112.        ,  92.        ,
       161.        , 156.        ,  52.        ,  85.        ,
       114.        , 162.        , 134.        , 106.        ])

In [14]:
normalized_mean = df['normalized-losses'].mean()

In [15]:
df['normalized-losses'].fillna(normalized_mean,inplace=True)

In [16]:
df['normalized-losses'].unique()

array([122., 164., 158., 192., 188., 121.,  98.,  81., 118., 148., 110.,
       145., 137., 101.,  78., 106.,  85., 107., 104., 113., 150., 129.,
       115.,  93., 142., 161., 153., 125., 128., 103., 168., 108., 194.,
       231., 119., 154.,  74., 186.,  83., 102.,  89.,  87.,  77.,  91.,
       134.,  65., 197.,  90.,  94., 256.,  95.])

In [17]:
# Selecting only numerical columns
df_num = df.select_dtypes(["int64","float64"])

In [18]:
df_num.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,64.1,48.8,130,111.0,21,27,13495
1,3,122.0,64.1,48.8,130,111.0,21,27,16500
2,1,122.0,65.5,52.4,152,154.0,19,26,16500
3,2,164.0,66.2,54.3,109,102.0,24,30,13950
4,2,164.0,66.4,54.3,136,115.0,18,22,17450


In [19]:
# Selecting only object columns
df_cat = df.select_dtypes(["object"])

In [20]:
df_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc


#Label encoding

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
for col in df_cat:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
df_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,0,1,0,2,0,0
1,0,1,0,2,0,0
2,0,1,2,2,0,5
3,1,1,3,1,0,3
4,1,1,3,0,0,3


In [24]:
Final_df = pd.concat([df_num,df_cat],axis=1)

In [25]:
# Final data frame is ready for cleaning process
Final_df.head() 

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


In [26]:
X = Final_df.drop("price",axis=1)
y = df['price']

In [27]:
# Split data for train and test
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Evaluating model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [29]:
lr = LinearRegression()

In [30]:
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [31]:
lr.score(X_test,y_test)

0.7965566780397381

In [32]:
r2_score(y_test,y_pred)

0.7965566780397381

#Now we will apply Redge and Lasso module to make model from overfit to goodfit

In [33]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [34]:
# Now we are checking l2 score with Ridge model
for i in range(50):
    l2 = Ridge(alpha=i)
    l2.fit(X_train,y_train)
    print(i,":",l2.score(X_test,y_test))

0 : 0.7965566780397377
1 : 0.8074518758147271
2 : 0.8110292248150515
3 : 0.8126933383890033
4 : 0.81361486450293
5 : 0.8141745853539418
6 : 0.8145301242133357
7 : 0.8147582608502814
8 : 0.8149010602831952
9 : 0.8149836949253051
10 : 0.8150222867376528
11 : 0.815027724543179
12 : 0.8150076788279418
13 : 0.8149677381788262
14 : 0.8149120868051174
15 : 0.8148439278252518
16 : 0.8147657584947389
17 : 0.8146795554128122
18 : 0.8145869029046835
19 : 0.8144890843369587
20 : 0.8143871485445551
21 : 0.8142819591129827
22 : 0.8141742315788492
23 : 0.8140645619421539
24 : 0.813953448816606
25 : 0.8138413108452036
26 : 0.8137285005403773
27 : 0.8136153153884842
28 : 0.813502006836264
29 : 0.8133887876197213
30 : 0.813275837783145
31 : 0.8131633096537951
32 : 0.8130513319772293
33 : 0.8129400133729974
34 : 0.8128294452363019
35 : 0.8127197041851856
36 : 0.8126108541327799
37 : 0.812502948048576
38 : 0.8123960294605085
39 : 0.8122901337400302
40 : 0.8121852892047273
41 : 0.8120815180669062
42 : 0.81

In [35]:
# Now we are checking l1 score with Lasso model
for i in range(200,500):
    l1 = Lasso(alpha=i)
    l1.fit(X_train,y_train)
    print(i,":",l1.score(X_test,y_test))

200 : 0.8139201358023779
201 : 0.8139280662762406
202 : 0.8139351596604211
203 : 0.8139414452413288
204 : 0.8139468631678922
205 : 0.8139514440169051
206 : 0.813955187854152
207 : 0.8139580947656483
208 : 0.8139601648600466
209 : 0.8139614191109495
210 : 0.8139618129219517
211 : 0.813961388660969
212 : 0.8139601251073796
213 : 0.813958006711075
214 : 0.8139550662926404
215 : 0.8139512871144874
216 : 0.8139466685438558
217 : 0.8139412205900795
218 : 0.8139349321776367
219 : 0.8139278044131173
220 : 0.8139198412531333
221 : 0.8139110395684608
222 : 0.813901396083556
223 : 0.8138907327491185
224 : 0.8138793495175788
225 : 0.813867941317315
226 : 0.8138517841273681
227 : 0.8138300965530064
228 : 0.813808862026844
229 : 0.8137720953012534
230 : 0.8137112454115847
231 : 0.8136506073167213
232 : 0.8135895879419388
233 : 0.8135281292925489
234 : 0.813466234547569
235 : 0.8134039469376061
236 : 0.8133412664626607
237 : 0.8132781990167929
238 : 0.8132147054024552
239 : 0.8131507954227812
240 : 0

In [44]:
l2 = Ridge(alpha=11)
l2.fit(X_train,y_train)
print(l2.score(X_test,y_test))

0.815027724543179


In [45]:
l1 = Lasso(alpha=212)
l1.fit(X_train,y_train)
print(l1.score(X_test,y_test))

0.8139601251073796


In [46]:
#Coefficient with Lasso
l1.coef_

array([   0.        ,    2.06688612,  293.00804229,  475.74946113,
        116.54461228,   15.88311143,   18.71477938, -106.73349254,
       -165.63385126,   -0.        , -417.94284571, 1064.54140452,
       1219.50325351,  307.62453398])

In [47]:
#Coefficient with Ridge

l2.coef_

array([ 2.08075572e+02, -4.19381533e-01,  3.60059061e+02,  5.73951520e+02,
        1.04837625e+02,  2.25081577e+01,  2.09621781e+02, -2.70364868e+02,
       -1.85682528e+02, -8.73561956e+02, -6.31723522e+02,  1.53165287e+03,
        2.37667318e+03,  5.13039213e+02])

In [48]:
#If we have values in both l1 & l2 then we can add them both and divide by 2

Elastic_Net = (l1.coef_+l2.coef_)/2

In [49]:
Elastic_Net

array([ 1.04037786e+02,  8.23752296e-01,  3.26533552e+02,  5.24850491e+02,
        1.10691119e+02,  1.91956346e+01,  1.14168280e+02, -1.88549180e+02,
       -1.75658190e+02, -4.36780978e+02, -5.24833184e+02,  1.29809714e+03,
        1.79808822e+03,  4.10331873e+02])

In [50]:
from sklearn.model_selection import cross_val_score

In [51]:
l1_corssScore = cross_val_score(l1,X,y,cv=4)

In [52]:
l1_corssScore

array([0.76829534, 0.81734961, 0.43670651, 0.44902514])

In [53]:
l2_corssScore = cross_val_score(l2,X,y,cv=4)
l2_corssScore

array([0.74102278, 0.859919  , 0.40754955, 0.45287323])

In [54]:
np.mean(l1_corssScore)

0.6178441498868903

In [55]:
np.mean(l2_corssScore)

0.6153411408406563