# Handling Multicollinearity

#### 1. Correlation
#### 2. Variance inflation factor (VIF)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the cars CSV from the relative datasets/ directory and preview the first rows.
cars = pd.read_csv('datasets/cars_processed_new.csv')
cars.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Origin,Age
0,18.0,8,307.0,130,3504,12.0,US,49
1,16.0,8,304.0,150,3433,12.0,US,49
2,17.0,8,302.0,140,3449,10.5,US,49
3,14.0,8,454.0,220,4354,9.0,US,49
4,23.551429,8,440.0,215,4312,8.5,US,49


In [4]:
# Summary statistics of the numeric columns on their original scale.
cars.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,23.672514,5.410853,192.184755,103.645995,2965.387597,15.573643,42.917313
std,7.736579,1.667795,103.703706,38.128651,846.332848,2.74626,3.668715
min,9.0,3.0,68.0,46.0,1613.0,8.0,37.0
25%,17.6,4.0,102.5,75.0,2221.5,13.9,40.0
50%,23.2,4.0,146.0,92.0,2790.0,15.5,43.0
75%,29.0,6.0,260.0,121.0,3589.5,17.05,46.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,49.0


In [10]:
from sklearn import preprocessing

# Standardise each numeric column to zero mean and unit variance.
# One loop replaces seven copy-pasted statements; the per-column call is
# unchanged, so the resulting values are identical.
# NOTE(review): scaling is computed on the full dataset before the train/test
# split, so test-set statistics leak into the features. Consider fitting a
# scaler on the training split only.
numeric_cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
                'Weight', 'Acceleration', 'Age']
for col in numeric_cols:
    cars[col] = preprocessing.scale(cars[[col]].astype('float64'))

In [11]:
# Re-check the summary statistics: after scaling, each column's mean should be ~0 and its std ~1.
cars.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,0.0,3.672055e-17,3.672055e-17,1.101617e-16,1.101617e-16,2.203233e-16,8.078522e-16
std,1.001294,1.001294,1.001294,1.001294,1.001294,1.001294,1.001294
min,-1.898967,-1.447404,-1.199046,-1.513838,-1.600007,-2.761372,-1.615
25%,-0.785926,-0.847034,-0.8659368,-0.752271,-0.8800918,-0.6102152,-0.796216
50%,-0.061154,-0.847034,-0.4459295,-0.3058349,-0.2075007,-0.0268506,0.02256768
75%,0.689501,0.3537065,0.6547792,0.4557326,0.738386,0.5382839,0.8413513
max,2.967354,1.554447,2.53757,3.318176,2.572779,3.363956,1.660135


In [12]:
from sklearn.model_selection import train_test_split

# Target is MPG; the features are every remaining column except the
# categorical 'Origin' column.
y = cars['MPG']
x = cars.drop(columns=['MPG', 'Origin'])

In [14]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score below, reproducible under Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [15]:
from sklearn.linear_model import LinearRegression
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. The features were already standardised in an
# earlier cell, and ordinary least squares predictions do not depend on that
# flag, so it is simply dropped.
linear_model = LinearRegression().fit(x_train, y_train)

In [16]:
# R^2 of the fitted model on the training split.
print('Training Score: ', linear_model.score(x_train, y_train))

Training Score:  0.7871741885644601


In [18]:
# Predict the (scaled) MPG target for the held-out test rows.
y_pred = linear_model.predict(x_test)

In [20]:
from sklearn.metrics import r2_score

# R^2 of the predictions on the held-out test split.
print('Testing score: ', r2_score(y_test, y_pred))

Testing score:  0.7835653828513398


In [21]:
def adjusted_r2(r_square, labels, features):
    """Return the adjusted R^2 for a regression fit.

    Computes ``1 - (1 - R^2) * (n - 1) / (n - p - 1)``, where ``n`` is the
    number of observations and ``p`` the number of predictors.

    Parameters
    ----------
    r_square : float
        Plain R^2 score of the model.
    labels : sized
        Target values; only ``len(labels)`` is used (``n``).
    features : array-like with ``.shape``
        Feature matrix; only ``features.shape[1]`` is used (``p``).

    Returns
    -------
    float
        The adjusted R^2.
    """
    n_samples = len(labels)
    n_predictors = features.shape[1]
    # Same operation order as the textbook formula: scale the unexplained
    # share by (n - 1), then divide by the residual degrees of freedom.
    unexplained = (1 - r_square) * (n_samples - 1)
    residual_dof = n_samples - n_predictors - 1
    return 1 - unexplained / residual_dof

In [31]:
# The value printed is the adjusted R^2 on the test split, so label it as such
# (it was previously labelled as the plain R2 score).
print(' Adjusted R2 Score : ', adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

 R2 Score :  0.7652751335148333


In [32]:
# Pairwise correlation matrix of the feature columns (pandas default method).
features_corr = x.corr()
features_corr

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
Cylinders,1.0,0.922633,0.811466,0.873029,-0.458161,0.32185
Displacement,0.922633,1.0,0.894199,0.932822,-0.526901,0.357047
Horsepower,0.811466,0.894199,1.0,0.863388,-0.67092,0.404458
Weight,0.873029,0.932822,0.863388,1.0,-0.397181,0.299049
Acceleration,-0.458161,-0.526901,-0.67092,-0.397181,1.0,-0.292705
Age,0.32185,0.357047,0.404458,0.299049,-0.292705,1.0


In [35]:
# Flag feature pairs whose absolute correlation exceeds 0.8.
features_corr.abs() > 0.8

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
Cylinders,True,True,True,True,False,False
Displacement,True,True,True,True,False,False
Horsepower,True,True,True,True,False,False
Weight,True,True,True,True,False,False
Acceleration,False,False,False,False,True,False
Age,False,False,False,False,False,True


In [36]:
# Remove three of the four mutually correlated features, keeping Horsepower.
trimmed_features = x.drop(columns=['Cylinders', 'Displacement', 'Weight'])

In [38]:
# Correlation matrix of the remaining features after trimming.
trimmed_features_corr = trimmed_features.corr()
trimmed_features_corr

Unnamed: 0,Horsepower,Acceleration,Age
Horsepower,1.0,-0.67092,0.404458
Acceleration,-0.67092,1.0,-0.292705
Age,0.404458,-0.292705,1.0


In [39]:
# Confirm that no off-diagonal pair is above the 0.8 threshold any more.
trimmed_features_corr.abs() > 0.8

Unnamed: 0,Horsepower,Acceleration,Age
Horsepower,True,False,False
Acceleration,False,True,False
Age,False,False,True


In [41]:
!pip install statsmodels

Collecting statsmodels
[?25l  Downloading https://files.pythonhosted.org/packages/fa/c6/b0dd71340f91beef5f9140221c8d13e19636958b42d1e9e4c36bd0aa8f95/statsmodels-0.10.2-cp38-cp38-manylinux1_x86_64.whl (8.1MB)
[K     |████████████████████████████████| 8.1MB 4.3MB/s eta 0:00:01
[?25hCollecting patsy>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/ea/0c/5f61f1a3d4385d6bf83b83ea495068857ff8dfb89e74824c6e9eb63286d8/patsy-0.5.1-py2.py3-none-any.whl (231kB)
[K     |████████████████████████████████| 235kB 12.6MB/s eta 0:00:01
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.10.2


In [43]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [45]:
# Rule of thumb for reading the VIF values:
#   1      -> not correlated with other features
#   1 - 5  -> moderately correlated with other features
#   > 5    -> highly correlated with other features
vif_scores = [
    variance_inflation_factor(x.values, col_idx)
    for col_idx in range(x.shape[1])
]
vif = pd.DataFrame({'VIF Factor': vif_scores, 'Features': x.columns})
vif

Unnamed: 0,VIF Factor,Features
0,6.841102,Cylinders
1,16.099578,Displacement
2,8.820275,Horsepower
3,10.691363,Weight
4,2.493882,Acceleration
5,1.223454,Age


In [46]:
# Drop the two features with the highest VIF and recompute the factors.
# `x` is rebuilt from `cars` instead of from the previous `x`, so the cell is
# idempotent: re-running it no longer raises KeyError because the columns were
# already dropped.
x = cars.drop(['MPG', 'Origin', 'Displacement', 'Weight'], axis=1)
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]
vif['Features'] = x.columns
vif

Unnamed: 0,VIF Factor,Features
0,3.048996,Cylinders
1,4.559951,Horsepower
2,1.895344,Acceleration
3,1.196806,Age


In [47]:
# Rebuild the feature matrix without the high-VIF columns and split again.
# A fixed random_state makes the split, and the scores computed from it,
# reproducible under Restart & Run All.
x = cars.drop(['MPG', 'Origin', 'Displacement', 'Weight'], axis=1)
y = cars['MPG']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [49]:
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. The features were already standardised in an
# earlier cell, and ordinary least squares predictions do not depend on that
# flag, so it is simply dropped.
linear_model = LinearRegression().fit(x_train, y_train)
print('Training Score:', linear_model.score(x_train, y_train))

Training Score: 0.7180583160240379


In [50]:
# Predict on the held-out test rows and report the plain R^2 of those predictions.
y_pred = linear_model.predict(x_test)
print('R2 Score: ', r2_score(y_test, y_pred))

R2 Score:  0.7506296360402803


In [51]:
# The value printed is the adjusted R^2 on the test split, so label it as such
# (it was previously labelled as the plain R2 score).
print(' Adjusted R2 Score : ', adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

 R2 Score :  0.7369655065082409
