In [1]:
#Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Load Dataset
df1 = pd.read_csv('./mtcars.csv')
df1.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [3]:
#Dataset Attributes
df1 = df1.drop('model',axis=1)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 11 columns):
mpg     32 non-null float64
cyl     32 non-null int64
disp    32 non-null float64
hp      32 non-null int64
drat    32 non-null float64
wt      32 non-null float64
qsec    32 non-null float64
vs      32 non-null int64
am      32 non-null int64
gear    32 non-null int64
carb    32 non-null int64
dtypes: float64(5), int64(6)
memory usage: 2.8 KB


In [4]:
# importing package
import pandas_profiling as pp
from IPython.display import IFrame

# Profile Report
carslmRe = pp.ProfileReport(df1)
carslmRe.to_file('carslmRe.html')
display(IFrame('carslmRe.html', width=900, height=350))

In [5]:
#Define x and y variable
x = df1.drop('mpg',axis=1).values
y = df1['mpg'].values

#Train dataset using 20% test set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=100)

#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train2 = sc.fit_transform(x_train)
x_test2 = sc.fit_transform(x_test)

x_2 = sc.fit_transform(x)

In [6]:
#Load and Create Linear Regression library
from sklearn.linear_model import LinearRegression
model = [LinearRegression()]

In [7]:
#Create Model Comparison
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate

#Scoring Parameters
scoring = {'MAE':'neg_mean_absolute_error',
           'MSE':'neg_mean_squared_error'}

#Model Creation
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)

row_index = 0
for alg in model:
    
    #Linear Model
    predicted = alg.fit(x_train2, y_train).predict(x_test2)
    rmse_test = np.sqrt(metrics.mean_squared_error(y_test, predicted))
    
    #K-Fold CV
    kfcv = alg.fit(x_train2, y_train)
    kf = KFold(n_splits=10, shuffle=True, random_state=100)
    KFcv = cross_validate(kfcv, x_2, y, scoring=scoring,
                         cv=kf)
    
    #Repeated K-Fold CV
    rkfcv = alg.fit(x_train2, y_train)
    rkf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
    rKFcv = cross_validate(rkfcv, x_2, y, scoring=scoring,
                         cv=rkf)
    
    #LOOCV
    loomodel=alg.fit(x_train2, y_train)
    loo = LeaveOneOut()
    loocv = cross_validate(loomodel, x_2, y, scoring=scoring,
                         cv=loo)
    
    #Shuffle-Split
    rsmodel=alg.fit(x_train2, y_train)
    rs = ShuffleSplit(n_splits=10, test_size=.20, random_state=100)
    rscv = cross_validate(rsmodel, x_2, y, scoring=scoring,
                         cv=rs)
        
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index,'Algorithm Name'] = MLA_name
    MLA_compare.loc[row_index, 'Original Model-RMSE'] = round(rmse_test,2)
    MLA_compare.loc[row_index, 'KFcv Model'] = round(np.sqrt(-KFcv['test_MSE'].mean()),2)
    MLA_compare.loc[row_index, 'rKFcv Model'] = round(np.sqrt(-rKFcv['test_MSE'].mean()),2)
    MLA_compare.loc[row_index, 'Loocv Model'] = round(np.sqrt(-loocv['test_MSE'].mean()),2)
    MLA_compare.loc[row_index, 'ShuffleSplit Model'] = round(np.sqrt(-rscv['test_MSE'].mean()),2)
    
    row_index+=1

MLA_compare

Unnamed: 0,Algorithm Name,Original Model-RMSE,KFcv Model,rKFcv Model,Loocv Model,ShuffleSplit Model
0,LinearRegression,3.74,3.83,3.67,3.49,3.64


In [8]:
#Bootstrapping
from sklearn.utils import resample
model2 = LinearRegression()
x2 = x
y2 = y

bootx2 = resample(x2, replace=True, n_samples=len(x2), random_state=100)
booty2 = resample(y2, replace=True, n_samples=len(y2), random_state=100)

#Train dataset using 20% test set
from sklearn.model_selection import train_test_split
x_trainboot,x_testboot,y_trainboot,y_testboot=train_test_split(bootx2,booty2,test_size=0.2,random_state=100)

#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_trainboot2 = sc.fit_transform(x_trainboot)
x_testboot2 = sc.fit_transform(x_testboot)

predicted = model2.fit(x_trainboot2, y_trainboot).predict(x_testboot2)
rmse_test = np.sqrt(metrics.mean_squared_error(y_testboot, predicted))
print('Bootstrapping RMSE:',round(rmse_test,2))

Bootstrapping RMSE: 3.28
