![Ravedata logo](https://ravedata.in/wp-content/uploads/2020/07/RaveData_Full.png)

# Model Selection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
## Let's import the visualization libraries.
import matplotlib.pyplot as plt
import seaborn as sns ## Seaborn is based on matplotlib and offers more control
## This is written so that we can see our results within the jupyter notebook
%matplotlib inline

In [2]:
## Customizing rcParams: one accent colour shared by text, axis labels and ticks.
## Every key is listed exactly once -- a key repeated inside a dict literal is
## silently overwritten by its last occurrence, so only "medium" ever applied
## to "axes.labelsize".
rcparam_dict = {
    "figure.figsize": "16,10",
    "axes.titlesize": "large",
    "axes.labelsize": "medium",
    "font.family": "Comic Sans MS",
    "text.color": "#A53010",
    "axes.labelcolor": "#A53010",  # written with the leading '#' like the other colour entries
    "xtick.color": "#A53010",
    "ytick.color": "#A53010",
}
context = "poster"
style = "darkgrid"
font_family = "Comic Sans MS"
## Setting context, style and rcParams in a single call
sns.set(context=context, style=style, font=font_family, rc=rcparam_dict)

## AUTO_MPG DATASET SOURCE - http://archive.ics.uci.edu/ml/datasets/Auto+MPG

In [3]:
## Load the Auto MPG data from auto.csv (path is relative to the working directory)
auto_data = pd.read_csv(filepath_or_buffer="auto.csv")

- check head of the dataset
- check info method
- check describe method

In [4]:
## Preview the first rows of the dataset
auto_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [5]:
## Column dtypes and non-null counts; horsepower shows up as object, not numeric
auto_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null int64
acceleration    398 non-null float64
model year      398 non-null int64
origin          398 non-null int64
car name        398 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [6]:
## Summary statistics of the numeric columns (horsepower is absent because it is still object-typed)
auto_data.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [7]:
## List the column labels
auto_data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [8]:
## Inspect the distinct horsepower entries to find what keeps the column non-numeric
auto_data['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

Remove all the rows that have a missing value for horsepower (recorded as `'?'`).

In [9]:
## Keep only the rows whose horsepower is not the '?' placeholder, then renumber
## the index from 0. reset_index is chained (no inplace=True) so auto_data is
## rebound to a new frame instead of mutating the filtered one in place.
auto_data = auto_data[auto_data.horsepower != '?'].reset_index(drop=True)

In [10]:
## Convert horsepower from strings to floats, then list every column's dtype
auto_data['horsepower'] = auto_data['horsepower'].astype(float)
auto_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [11]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [12]:
## Feature column list, the max_degree setting, and four (distinct) empty lists
features = ['horsepower']
max_degree = 10
mse_test_list, mse_train_list = [], []
test_list_data, train_list_data = [], []

In [13]:
## Predictor frame (the columns named in `features`) and the mpg target
X = auto_data.loc[:, features]
y = auto_data.mpg

In [14]:
## Display the predictor frame
X

Unnamed: 0,horsepower
0,130.0
1,165.0
2,150.0
3,150.0
4,140.0
...,...
387,86.0
388,52.0
389,84.0
390,79.0


In [15]:
from sklearn.model_selection import cross_val_score

# Performance Estimation using 10-Fold Cross-Validation

In this example, we will evaluate the following regression models using 10-fold cross-validation:

- linear regression
- polynomial regression with degree 2
- polynomial regression with degree 3

In [16]:
# scikit-learn k-fold cross-validation
from numpy import array
from sklearn.model_selection import KFold

In [17]:
## Selecting with a one-element column list keeps X two-dimensional, shape (n_samples, 1)
X = auto_data[['horsepower']].values
y = auto_data.mpg

In [18]:
## Linear regression model (a regressor, not a classifier: the target mpg is continuous)
model_linear = LinearRegression()

## various predefined scoring parameters can be found [here](https://scikit-learn.org/stable/modules/model_evaluation.html)

In [19]:
## 10-fold cross-validated scores for the linear model
score_mse_linear = cross_val_score(
    model_linear, X, y, scoring='neg_mean_squared_error', cv=10
)
score_mse_linear = abs(score_mse_linear)  # the scorer reports negated MSE; make it positive
score_mse_linear

array([28.34783584, 17.22640854, 26.92535793, 23.36016122, 15.55763304,
       17.89383456, 17.04476867, 22.83657872, 65.93489567, 39.27186233])

In [20]:
## Average MSE of the linear model across the 10 folds
score_mse_linear.mean()

27.439933652339867

In [21]:
## degree = the highest exponent that can appear in the generated polynomial terms
model_polynomial_2 = LinearRegression()
polynomial_features = PolynomialFeatures(degree=2)
x_poly = polynomial_features.fit_transform(X)

In [22]:
## 10-fold cross-validated scores for the degree-2 polynomial model
score_mse_polynomial_2 = cross_val_score(
    model_polynomial_2, x_poly, y, scoring='neg_mean_squared_error', cv=10
)
score_mse_polynomial_2 = abs(score_mse_polynomial_2)  # the scorer reports negated MSE; make it positive
score_mse_polynomial_2

array([12.76634828, 16.55513797, 18.8823729 , 21.59619594, 13.81072657,
       10.53307937, 12.02264689, 20.63685547, 50.17510286, 35.3799343 ])

In [23]:
## Average MSE of the degree-2 polynomial model across the 10 folds
score_mse_polynomial_2.mean()

21.23584005580211

In [24]:
## Degree-3 polynomial model: degree is the highest exponent in the generated terms
model_polynomial_3 = LinearRegression()
polynomial_features = PolynomialFeatures(degree=3)
x_poly = polynomial_features.fit_transform(X)
score_mse_polynomial_3 = cross_val_score(
    model_polynomial_3, x_poly, y, scoring='neg_mean_squared_error', cv=10
)
score_mse_polynomial_3 = abs(score_mse_polynomial_3)  # the scorer reports negated MSE; make it positive
score_mse_polynomial_3

array([12.56821522, 16.72742348, 19.97659557, 21.51654569, 13.88689296,
       10.32004012, 11.81509234, 20.58935444, 49.97092377, 35.99497823])

In [25]:
## Average MSE of the degree-3 polynomial model across the 10 folds
score_mse_polynomial_3.mean()

21.336606183328694