<a href="https://colab.research.google.com/github/shivangi-975/Data-Preparation-for-ML/blob/main/Multicollinearity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing libraries

In [None]:
import pandas as pd

### Loading dataset

In [None]:
# Read the pre-processed cars data.
# NOTE(review): the path is relative, so this presumably expects the notebook to be
# run from the repository root — confirm when running on Colab.
automobile = pd.read_csv('datasets/cars_processed.csv')

# Preview the first rows (head() defaults to 5).
automobile.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Origin,Age
0,18.0,8,307.0,130,3504,12.0,US,49
1,16.0,8,304.0,150,3433,12.0,US,49
2,17.0,8,302.0,140,3449,10.5,US,49
3,14.0,8,454.0,220,4354,9.0,US,49
4,23.551429,8,440.0,215,4312,8.5,US,49


In [None]:
# Summary statistics of the numeric columns, taken before any scaling is applied.
automobile.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,23.672514,5.410853,192.184755,103.645995,2965.387597,15.573643,42.917313
std,7.736579,1.667795,103.703706,38.128651,846.332848,2.74626,3.668715
min,9.0,3.0,68.0,46.0,1613.0,8.0,37.0
25%,17.6,4.0,102.5,75.0,2221.5,13.9,40.0
50%,23.2,4.0,146.0,92.0,2790.0,15.5,43.0
75%,29.0,6.0,260.0,121.0,3589.5,17.05,46.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,49.0


#### Preprocessing the features

In [None]:
from sklearn import preprocessing

# Standardize every numeric predictor to zero mean and unit variance. MPG is the
# target and is deliberately left on its original scale.
# preprocessing.scale works column-wise, so a single call over all predictors
# replaces six copy-pasted per-column calls.
# NOTE(review): the scaling statistics are computed on the full dataset, i.e. before
# the train/test split performed later in the notebook.
scaled_columns = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Age']
automobile[scaled_columns] = preprocessing.scale(automobile[scaled_columns].astype('float64'))

In [None]:
# Sanity-check the scaling: predictor means should be ~0. describe() reports the
# sample standard deviation (ddof=1), so values slightly above 1 are expected.
automobile.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,23.672514,1.095879e-16,8.778508e-17,-1.061454e-17,-2.008155e-18,2.66224e-16,5.817913e-16
std,7.736579,1.001294,1.001294,1.001294,1.001294,1.001294,1.001294
min,9.0,-1.447404,-1.199046,-1.513838,-1.600007,-2.761372,-1.615
25%,17.6,-0.847034,-0.8659368,-0.752271,-0.8800918,-0.6102152,-0.796216
50%,23.2,-0.847034,-0.4459295,-0.3058349,-0.2075007,-0.0268506,0.02256768
75%,29.0,0.3537065,0.6547792,0.4557326,0.738386,0.5382839,0.8413513
max,46.6,1.554447,2.53757,3.318176,2.572779,3.363956,1.660135


In [None]:
# Dimensions of the frame as (rows, columns).
automobile.shape

(387, 8)

### Building a regression model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Predictors: every column except the target (MPG) and the non-numeric 'Origin'.
X = automobile.drop(columns=['MPG', 'Origin'])
Y = automobile['MPG']

# Hold out 20% of the rows for testing. A fixed random_state makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises a TypeError on current releases. The predictors were already
# standardized with preprocessing.scale, so no extra normalization is needed here.
linear_model = LinearRegression().fit(x_train, y_train)

In [None]:
# R² of the fitted model on the rows it was trained on.
training_score = linear_model.score(x_train, y_train)
print("Training_score : ", training_score)

Training_score :  0.7930592969178385


In [None]:
# Predict the target for the held-out test rows.
y_pred = linear_model.predict(x_test)

In [None]:
from sklearn.metrics import r2_score

# R² of the predictions on the held-out test rows.
testing_score = r2_score(y_test, y_pred)
print("Testing_score : ", testing_score)

Testing_score :  0.7556855845195039


### Calculating the adjusted R² value

In [None]:
def adjusted_r2(r_square, labels, features):
    """Return the adjusted R² for a model evaluated on ``labels``/``features``.

    Adjusted R² penalises plain R² for the number of predictors used:

        1 - (1 - R²) * (n - 1) / (n - p - 1)

    where ``n = len(labels)`` and ``p = features.shape[1]``.

    Parameters
    ----------
    r_square : float
        Plain R² of the model on the evaluated sample.
    labels : sized collection
        Target values of the evaluated sample; only its length is used.
    features : object exposing a 2-D ``shape``
        Feature matrix of the evaluated sample; only ``shape[1]`` is used.

    Returns
    -------
    float
        The adjusted R² value.

    Raises
    ------
    ValueError
        If ``n - p - 1`` is not positive. With zero residual degrees of freedom
        the formula divides by zero, and with a negative count it silently
        flips the sign of the penalty, so neither result is meaningful.
    """
    n_samples = len(labels)
    n_predictors = features.shape[1]
    residual_dof = n_samples - n_predictors - 1

    if residual_dof <= 0:
        raise ValueError(
            "adjusted R² is undefined: need more samples than predictors + 1 "
            f"(got {n_samples} samples and {n_predictors} predictors)"
        )

    return 1 - ((1 - r_square) * (n_samples - 1)) / residual_dof

In [None]:
# Adjust the test-set R² for the number of predictors in x_test.
test_r2 = r2_score(y_test, y_pred)
print("Adjusted_r2_score : ", adjusted_r2(test_r2, y_test, x_test))

Adjusted_r2_score :  0.7350392958873493


### Showing correlation matrix for multicollinearity detection
A higher absolute correlation value means the two features are more strongly correlated.

In [None]:
# Pairwise correlation between the predictor columns (the target is not part of X).
features_corr = X.corr()

features_corr

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
Cylinders,1.0,0.922633,0.811466,0.873029,-0.458161,0.32185
Displacement,0.922633,1.0,0.894199,0.932822,-0.526901,0.357047
Horsepower,0.811466,0.894199,1.0,0.863388,-0.67092,0.404458
Weight,0.873029,0.932822,0.863388,1.0,-0.397181,0.299049
Acceleration,-0.458161,-0.526901,-0.67092,-0.397181,1.0,-0.292705
Age,0.32185,0.357047,0.404458,0.299049,-0.292705,1.0


#### Checking for correlation value greater than 0.75
Here we can see that Cylinders, Displacement, Horsepower and Weight are highly correlated with one another.

In [None]:
# Flag every pair whose absolute correlation exceeds 0.75, the threshold named in
# this section's heading (the cell previously used 0.8, contradicting it).
abs(features_corr) > 0.75

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Age
Cylinders,True,True,True,True,False,False
Displacement,True,True,True,True,False,False
Horsepower,True,True,True,True,False,False
Weight,True,True,True,True,False,False
Acceleration,False,False,False,False,True,False
Age,False,False,False,False,False,True


#### So we are dropping the 'Cylinders', 'Displacement' and 'Weight' columns

In [None]:
# Keep only one representative of the highly correlated group (Horsepower),
# together with the remaining predictors.
trimmed_features_df = X.drop(columns=['Cylinders', 'Displacement', 'Weight'])

#### After dropping the columns, every remaining pairwise correlation has an absolute value below 0.75

In [None]:
# Recompute the pairwise correlations on the reduced set of predictors.
trimmed_features_corr = trimmed_features_df.corr()

trimmed_features_corr

Unnamed: 0,Horsepower,Acceleration,Age
Horsepower,1.0,-0.67092,0.404458
Acceleration,-0.67092,1.0,-0.292705
Age,0.404458,-0.292705,1.0


In [None]:
# Apply the 0.75 threshold named in the section headings (the cell previously used
# 0.8). Only the diagonal should remain True.
abs(trimmed_features_corr) > 0.75

Unnamed: 0,Horsepower,Acceleration,Age
Horsepower,True,False,False
Acceleration,False,True,False
Age,False,False,True


### Calculating VIF score for multicollinearity detection
* 1 = not correlated.
* Between 1 and 5 = moderately correlated.
* Greater than 5 = highly correlated

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Compute the variance inflation factor of every predictor column in X.
design_matrix = X.values
vif_scores = [
    variance_inflation_factor(design_matrix, column_idx)
    for column_idx in range(X.shape[1])
]
vif = pd.DataFrame({"VIF Factor": vif_scores})

In [None]:
# Label each VIF value with the name of the column it was computed for.
vif["features"] = X.columns

In [None]:
# Display the VIF table rounded to two decimals.
vif.round(2)

Unnamed: 0,VIF Factor,features
0,6.84,Cylinders
1,16.1,Displacement
2,8.82,Horsepower
3,10.69,Weight
4,2.49,Acceleration
5,1.22,Age


#### Dropping the two columns with the highest VIF ('Displacement' and 'Weight')

In [None]:
# Build the reduced feature set from `automobile` instead of from `X` itself:
# `X = X.drop(...)` raises a KeyError when the cell is run a second time, whereas
# this form yields the same columns and can be re-run safely.
X = automobile.drop(columns=['MPG', 'Origin', 'Displacement', 'Weight'])

In [None]:
# Recompute the variance inflation factors on the current (reduced) columns of X.
design_matrix = X.values
vif_scores = [
    variance_inflation_factor(design_matrix, column_idx)
    for column_idx in range(X.shape[1])
]
vif = pd.DataFrame({"VIF Factor": vif_scores})

In [None]:
# Label each VIF value with the name of the column it was computed for.
vif["features"] = X.columns

In [None]:
# Display the VIF table rounded to two decimals.
vif.round(2)

Unnamed: 0,VIF Factor,features
0,3.05,Cylinders
1,4.56,Horsepower
2,1.9,Acceleration
3,1.2,Age


#### Here we drop the features that cause multicollinearity and then retrain the model
Compare the training, testing and adjusted R² scores of this model with those of the model built at the start of the demo.

In [None]:
# Predictors without the target (MPG), the non-numeric 'Origin', and the two columns
# removed for multicollinearity ('Displacement', 'Weight').
X = automobile.drop(columns=['MPG', 'Displacement', 'Weight', 'Origin'])
Y = automobile['MPG']

# Hold out 20% of the rows for testing. A fixed random_state makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises a TypeError on current releases. The predictors were already
# standardized with preprocessing.scale, so no extra normalization is needed here.
linear_model = LinearRegression().fit(x_train, y_train)

In [None]:
# R² of the refitted model on the rows it was trained on.
training_score = linear_model.score(x_train, y_train)
print("Training_score : ", training_score)

Training_score :  0.7309760422436427


In [None]:
# Predict the target for the held-out test rows.
y_pred = linear_model.predict(x_test)

In [None]:
# R² of the predictions on the held-out test rows.
testing_score = r2_score(y_test, y_pred)
print("Testing_score : ", testing_score)

Testing_score :  0.7009622546471816


In [None]:
# Adjust the test-set R² for the number of predictors in x_test.
test_r2 = r2_score(y_test, y_pred)
print("Adjusted_r2_score : ", adjusted_r2(test_r2, y_test, x_test))

Adjusted_r2_score :  0.6845766247648355
