In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [None]:
df=pd.read_csv('cars.csv')
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [None]:
'''
1. read data--- basics analysis
2. Missing values and encoding
3. build a baseline model --- without removing outliers scaling skewness
4. perform skewness ,outliers ,scaling
5. next model'''

'\n1. read data--- basics analysis\n2. Missing values and encoding\n3. build a baseline model --- without removing outliers scaling skewness\n4. perform skewness ,outliers ,scaling\n5. next model'

In [None]:
#step 1 --replace '?' with nan
df['normalized-losses'].replace('?',np.nan,inplace=True)
df['horsepower'].replace('?',np.nan,inplace=True)

In [None]:
#step2 --changing data type cat col into float
df['normalized-losses']=df['normalized-losses'].astype('float64')
df['horsepower']=df['horsepower'].astype('float64')


In [None]:
from sklearn.impute import SimpleImputer
si=SimpleImputer(missing_values=np.nan,strategy='mean')

In [None]:
#seprate feture and target
X=df.iloc[:,:-1]#all col except last
Y=df.iloc[:,-1]#only last col

In [None]:
#fit nan with mean
X[['normalized-losses','horsepower']]=si.fit_transform(X[['normalized-losses','horsepower']])

In [None]:
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22


In [None]:

#seprating categorical col
cat_col=X.select_dtypes(object).columns
cat_col

Index(['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location',
       'engine-type'],
      dtype='object')

In [None]:
#spliting training and test data
xtrain ,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.3,random_state=1)

In [None]:
#Encoding
from sklearn.preprocessing import LabelEncoder
for col in cat_col:
  le=LabelEncoder()
  xtrain[col]=le.fit_transform(xtrain[col])
  xtest[col]=le.fit_transform(xtest[col])


In [None]:
xtest

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
78,2,161.0,9,1,2,1,0,64.4,50.8,2,92,68.0,31,38
97,1,103.0,10,1,4,1,0,63.8,53.5,2,97,69.0,31,37
151,1,87.0,16,1,2,1,0,63.6,54.5,2,92,62.0,31,38
44,1,122.0,5,1,3,1,0,63.6,52.0,2,90,70.0,38,43
40,0,85.0,4,1,3,1,0,62.5,54.1,2,110,86.0,27,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,0,85.0,4,1,3,1,0,65.2,54.1,2,110,86.0,27,33
110,0,122.0,11,0,4,2,0,68.4,58.7,1,152,95.0,25,25
164,1,168.0,16,1,2,2,0,64.0,52.6,2,98,70.0,29,34
56,3,150.0,7,1,2,2,0,65.7,49.6,5,70,101.0,17,23


In [None]:
#linear regression
lr=LinearRegression()
lr.fit(xtrain,ytrain)

In [None]:
#testing model on training data
from sklearn.metrics import  r2_score
y_pred=lr.predict(xtrain)
r2_score(ytrain,y_pred)

0.8504573774895473

In [None]:
#testing model on testing data
from sklearn.metrics import  r2_score
y_pred=lr.predict(xtest)
r2_score(ytest,y_pred)

0.7986748934705591

In [None]:
lr.score(xtrain,ytrain)

0.8504573774895473

In [None]:
lr.score(xtest,ytest)

0.7986748934705591

In [None]:
# as the r2 score for train dat> r2 score test data , this is an overmodel

In [1]:
#Regularization
from sklearn.linear_model import Lasso,Ridge

In [None]:
#Ridge regularization
l2=Ridge(0.1)
l2.fit(xtrain,ytrain)

In [None]:
l2.score(xtest,ytest)

0.7988681197300086

In [None]:
l2.coef_

array([ 5.80531057e+01,  1.25218729e+00, -1.99664337e+02, -6.93146496e+02,
       -2.02148035e+02,  1.87554249e+03,  1.55353516e+04,  7.59674152e+02,
        3.76888292e+02,  3.02876163e+02,  9.84783068e+01, -8.63170960e+00,
        3.01108871e+02, -4.07873646e+02])

In [None]:
#taking diff values of error to l2 model
for alpha in range(1,10):
  l2=Ridge(alpha)
  l2.fit(xtrain,ytrain)
  test_score=l2.score(xtest,ytest)
  print('Alpha:',alpha)
  print('Test score:',test_score)
  print('---------------')

Alpha: 1
Test score: 0.8089947857221718
---------------
Alpha: 2
Test score: 0.8124501770702479
---------------
Alpha: 3
Test score: 0.8140694573519199
---------------
Alpha: 4
Test score: 0.8149628463558188
---------------
Alpha: 5
Test score: 0.815497812207385
---------------
Alpha: 6
Test score: 0.8158287849888739
---------------
Alpha: 7
Test score: 0.8160319295488672
---------------
Alpha: 8
Test score: 0.816149376781685
---------------
Alpha: 9
Test score: 0.8162065136494411
---------------


In [None]:
'''after an alpha value 2, we cam observe that there is small
 change in the r2 score therefore the
 ridge with alpha =2 is a good model with an r2 score'''

'after an alpha value 2, we cam observe that there is small change in the r2 score therefore the ridge with alpha =2 is a good model with an r2 score'

In [None]:
#taking diff values of error to l1 model in Lasso
for alpha in range(100,151,10):
  l1=Lasso(alpha)
  l1.fit(xtrain,ytrain)
  test_score=l1.score(xtest,ytest)
  print('Alpha:',alpha)
  print('Test score:',test_score)
  print('---------------')

Alpha: 100
Test score: 0.8099442605980115
---------------
Alpha: 110
Test score: 0.8107170532455643
---------------
Alpha: 120
Test score: 0.8114096239609679
---------------
Alpha: 130
Test score: 0.8120220046932067
---------------
Alpha: 140
Test score: 0.8125541671598384
---------------
Alpha: 150
Test score: 0.8130061225505073
---------------


In [None]:
l2=Ridge(50)
l2.fit(xtest,ytest)
l2.coef_

array([ -93.25439735,    2.02222668, -180.85300041, -162.6448192 ,
       -121.5412447 ,  554.35432891,    0.        ,  535.01296513,
        208.16135657,   32.6618789 ,   81.41947667,   84.53613089,
        -12.10247632,   58.85375523])

#helps to improve your r2 score Ridge and lasso

In [None]:
l2=Ridge(2)
l2.fit(xtest,ytest)
l2.coef_

array([ -95.64621625,   -4.79921701, -230.02678027, -788.65942285,
       -247.97913258, 2370.07465113,    0.        ,  627.60370346,
        218.22830205,   62.34625242,   70.12547165,   86.62693138,
       -147.37906244,  215.9348402 ])

In [None]:
l2=Lasso(130)
l2.fit(xtest,ytest)
l2.coef_

array([  -0.        ,   -4.30711592, -210.30609322,   -0.        ,
         -0.        , 2189.03612552,    0.        ,  684.8902633 ,
        162.46429076,    0.        ,   71.56019596,   84.94087357,
         -0.        ,   87.3735309 ])

In [None]:
#cross validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
catcol=X.select_dtypes(object).columns


In [None]:
for col in catcol:
  le=LabelEncoder()
  X[col]=le.fit_transform(X[col])


In [None]:
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg
0,3,122.0,0,1,0,2,0,64.1,48.8,0,130,111.0,21,27
1,3,122.0,0,1,0,2,0,64.1,48.8,0,130,111.0,21,27
2,1,122.0,0,1,2,2,0,65.5,52.4,5,152,154.0,19,26
3,2,164.0,1,1,3,1,0,66.2,54.3,3,109,102.0,24,30
4,2,164.0,1,1,3,0,0,66.4,54.3,3,136,115.0,18,22


In [None]:
cross_val_score(l2,X,Y,cv=4) # returns r2 score for all 4 parts of data training

array([0.74048997, 0.8346221 , 0.41264006, 0.47040374])

In [None]:
cross_val_score(l1,X,Y,cv=4) # returns r2 score for all 4 parts of data testing

array([0.74881656, 0.8283863 , 0.41889792, 0.46178525])