In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
#Load Dataset
data=pd.read_csv('./mtcars.csv')
data.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [3]:
#Drop Model
data2=data.drop('model',axis=1)
data2.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [4]:
#VIF Factors
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = data2._get_numeric_data() #This line will drop non-numeric cols
x = data2.drop('mpg',axis=1)

# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif["features"] = x.columns
vif.sort_values('VIF Factor')

Unnamed: 0,VIF Factor,features
7,7.41202,am
6,8.752581,vs
9,32.213836,carb
2,56.047781,hp
1,98.930791,disp
0,112.629828,cyl
8,119.804879,gear
3,132.214353,drat
4,182.948049,wt
5,317.534376,qsec


In [5]:
#Remove high VIF Variables (greater than 100) and Run Multivariate Regression

#Define x and y variable
x2 = data2.drop(['mpg','cyl','gear','drat','wt','qsec'],axis=1).values
y2 = data2['mpg'].values

#Training and Test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x2,y2,test_size=0.2,random_state=100)

#Script for Linear Regression Model
from sklearn.linear_model import LinearRegression
from sklearn import metrics

#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train2 = sc.fit_transform(x_train)
x_test2 = sc.fit_transform(x_test)

x_2 = sc.fit_transform(x2)

In [6]:
#Create Standard Model - Removing VIF Variable
for name,method in [('Linear regression', LinearRegression())]: 
    method.fit(x_train2,y_train)
    predict = method.predict(x_test2)

print('\n Regression Model - using VIF information')
print('\nMethod: {}'.format(name))   

#Coefficents
print('\nIntercept: {:.2f}'.format(float(method.intercept_)))
coeff_table=pd.DataFrame(np.transpose(method.coef_),
                         data2.drop(['mpg','cyl','gear','drat','wt','qsec'],axis=1).columns,
                         columns=['Coefficients'])
print(coeff_table)
    
#R2,MAE,MSE and RMSE
print('\nR2: {:.2f}'.format(metrics.r2_score(y_test,predict)))
print('Mean Absolute Error: {:.2f}'.format(metrics.mean_absolute_error(y_test, predict)))  
print('Mean Squared Error: {:.2f}'.format(metrics.mean_squared_error(y_test, predict)))  
print('Root Mean Squared Error: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test, predict)))) 


 Regression Model - using VIF information

Method: Linear regression

Intercept: 19.87
      Coefficients
disp     -2.281568
hp       -0.136462
vs        0.713722
am        1.982224
carb     -2.017801

R2: 0.88
Mean Absolute Error: 1.53
Mean Squared Error: 3.24
Root Mean Squared Error: 1.80


In [7]:
# Construct some pipelines 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#Create Pipeline

pipeline =[]

pipe_lm = Pipeline([('scl', StandardScaler()),
                    ('clf', LinearRegression())])
pipeline.insert(0,pipe_lm)

# Set grid search params 

modelpara =[]

param_gridlm = {'clf__fit_intercept':[True,False],
             'clf__normalize':[True,False], 
             'clf__copy_X':[True, False]}
modelpara.insert(0,param_gridlm)

In [8]:
#Define Gridsearch Function

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

def Gridsearch_cv(model, params):
    
    #Cross-validation Function
    cv2=RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
        
    #GridSearch CV
    gs_clf = GridSearchCV(model, params, n_jobs=-1, cv=cv2,scoring='neg_mean_squared_error')
    gs_clf = gs_clf.fit(x_train2, y_train)
    model = gs_clf.best_estimator_
    
    #Nested CV
    scoresNested = cross_val_score(gs_clf, x_2, y2, 
                             scoring='neg_mean_squared_error', cv=5,
                             n_jobs= -1)    
      
    # Use best model and test data for final evaluation
    y_pred = model.predict(x_test2)

    #Identify Best Parameters to Optimize the Model
    bestpara=str(gs_clf.best_params_)
   
    #Output Heading
    print('\nOptimized Model')
    print('\nModel Name:',str(pipeline.named_steps['clf']))
        
    #Output Validation Statistics
    print('\nBest Parameters:',bestpara)
    
    #Test data accuracy of model with best params    
    print('\nIntercept: {:.2f}'.format(float(gs_clf.best_estimator_.named_steps['clf'].intercept_)))
    print('\nModel coefficients: ')   
    for name, score in zip(list(data2.drop(['mpg','cyl','gear','drat','wt','qsec'],axis=1)),
                           gs_clf.best_estimator_.named_steps['clf'].coef_):
         print(name, round(score,2)) 
    
    #Print R2
    print('\nR2: {:0.2f}'.format(metrics.r2_score(y_test,y_pred)))
    #Print MSE and RMSE
    print('\nNestedCV MSE:  {:0.2f}'.format(np.mean(-scoresNested)))
    print('NestedCV RMSE:  {:0.2f}'.format(np.sqrt(np.mean(-scoresNested))))  

In [9]:
#Run Models
for pipeline, modelpara in zip(pipeline,modelpara):
    Gridsearch_cv(pipeline,modelpara)




Optimized Model

Model Name: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Best Parameters: {'clf__copy_X': True, 'clf__fit_intercept': True, 'clf__normalize': False}

Intercept: 19.87

Model coefficients: 
disp -2.28
hp -0.14
vs 0.71
am 1.98
carb -2.02

R2: 0.88

NestedCV MSE:  9.54
NestedCV RMSE:  3.09


In [10]:
#Using Partial Least Squares Regression

#Define x and y variable
x3 = data2.drop(['mpg','cyl','gear','drat','wt','qsec'],axis=1).values
y3 = data2['mpg'].values

#Training and Test
from sklearn.model_selection import train_test_split
x_train2,x_test2,y_train2,y_test2=train_test_split(x3,y3,test_size=0.2,random_state=100)

#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train3 = sc.fit_transform(x_train2)
x_test3 = sc.fit_transform(x_test2)

In [11]:
#Determine the Number of Components for PLS
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score

dfcolmns=x3.shape[1]+1
dfcolmns

pls_range = range(1, dfcolmns)
pls_scores = []

#Determine Best Score
for pls in pls_range:
    plsreg = PLSRegression(n_components=pls)
    #obtain cross_val_score for PLS with pls components
    scores = cross_val_score(plsreg, x_train3, y_train2, cv=10,
                                            scoring='neg_mean_squared_error').mean()
    #append mean
    pls_scores.append(scores.mean())
    
#Print Best Score
BestScore = [1 - x for x in pls_scores]
best_pls = pls_range[BestScore.index(min(BestScore))]
print('Optimal Number of Components: {}'.format(best_pls))

Optimal Number of Components: 3


In [12]:
# PLS Model
for name2,method2 in [('PLS regression', PLSRegression(n_components=best_pls))]: 
    method2.fit(x_train3,y_train2)
    predict2 = method2.predict(x_test3)

print('\nPLS Model')
print('\nMethod: {}'.format(name))   
print('\nOptimal Number of Components: {}'.format(best_pls))

#R2,MAE,MSE and RMSE
print('\nR2: {:.2f}'.format(metrics.r2_score(y_test2,predict2)))
print('Mean Absolute Error: {:.2f}'.format(metrics.mean_absolute_error(y_test2, predict2)))  
print('Mean Squared Error: {:.2f}'.format(metrics.mean_squared_error(y_test2, predict2)))  
print('Root Mean Squared Error: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_test2, predict2)))) 


PLS Model

Method: Linear regression

Optimal Number of Components: 3

R2: 0.88
Mean Absolute Error: 1.47
Mean Squared Error: 3.12
Root Mean Squared Error: 1.77
