Building Linear Regression and KNN models with and without PCA

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import seaborn as sns

In [None]:
import os

# Show which files are available in the input directory before loading anything.
input_dir = "../input"
print(os.listdir(input_dir))

In [None]:
# Both input files are read with ';' as the field separator.
csv_separator = ";"
student_data_mat = pd.read_csv("../input/student_math.csv", sep=csv_separator)
student_data_por = pd.read_csv("../input/student_language.csv", sep=csv_separator)

In [None]:
# Preview the first rows of the frame loaded from student_math.csv.
student_data_mat.head()

In [None]:
# (rows, columns) of the frame loaded from student_math.csv.
student_data_mat.shape

In [None]:
# Preview the first rows of the frame loaded from student_language.csv.
student_data_por.head()

In [None]:
# (rows, columns) of the frame loaded from student_language.csv.
student_data_por.shape

In [None]:
# Combine the two course frames with an outer join. No `on=` key is given,
# so pandas joins on the columns the two frames have in common.
# NOTE(review): rows that match on every shared column collapse into one row;
# confirm that a merge (rather than a plain concatenation) is what is wanted.
student_data = student_data_mat.merge(student_data_por, how="outer")
student_data.head()

In [None]:
# (rows, columns) of the merged frame.
student_data.shape

In [None]:
# Column names, non-null counts and dtypes of the merged frame.
student_data.info()

In [None]:
# Columns stored with the generic `object` dtype are treated as the
# string/categorical columns that need encoding.
is_object_column = student_data.dtypes == object
columns_string = student_data.columns[is_object_column]
columns_string

In [None]:
# One-hot encode the string columns; drop_first=True drops the first level
# of each encoded column.
# NOTE: this overwrites `student_data`, so the cell is meant to be run once
# per kernel session (the original string columns no longer exist afterwards).
student_data = pd.get_dummies(
    data=student_data,
    columns=columns_string,
    drop_first=True,
)
student_data.info()

In [None]:
# Preview the frame after one-hot encoding.
student_data.head()

In [None]:
# (rows, columns) after one-hot encoding.
student_data.shape

In [None]:
# Pairwise correlation between the three grade columns.
grade_columns = ["G1", "G2", "G3"]
student_data[grade_columns].corr()

In [None]:
# Remove the G1 column from the modelling frame.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it after G1 is already gone no longer
# raises a KeyError.
student_data = student_data.drop(columns=["G1"], errors="ignore")
student_data.head()

In [None]:
# Split the frame into the regression target (G3) and the remaining
# columns, both as plain NumPy arrays.
target_column = "G3"
label = student_data[target_column].values
predictors = student_data.drop(columns=[target_column]).values

In [None]:
# Display the predictor matrix built from every column except G3.
predictors

# Principal Component Analysis

Now we perform Principal Component Analysis to identify which combinations of the predictors (the principal components) carry the most variance. For that, we first calculate `explained_variance_`, the amount of variance captured by each component.

In [None]:
# Fit a PCA that keeps one component per predictor column
# (all columns of student_data except the G3 target).
n_predictor_columns = len(student_data.columns) - 1
pca = PCA(n_components=n_predictor_columns).fit(predictors)

# Variance captured by each principal component.
variance = pca.explained_variance_
variance

In [None]:
# Fraction of the total variance explained by each component.
print(pca.explained_variance_ratio_)

In [None]:
# Per-component explained variance in percent (rounded to 4 decimals first),
# then accumulated so the curve shows how much variance the first k
# components retain together.
variance_ratio_pct = np.round(pca.explained_variance_ratio_, decimals=4) * 100
variance_ratio_cum_sum = np.cumsum(variance_ratio_pct)
print(variance_ratio_cum_sum)
plt.plot(variance_ratio_cum_sum)

In [None]:
# Project the predictors onto the first 9 principal components.
# NOTE(review): 9 is presumably read off the cumulative-variance curve — confirm.
pca = PCA(n_components=9)
# fit_transform fits the model and returns the projection in one pass, so a
# separate pca.fit(predictors) beforehand would only repeat the same work.
Transformed_vector = pca.fit_transform(predictors)
print(Transformed_vector)

In [None]:
# Column names of the predictors (everything except the G3 target), used to
# label the PCA loadings.
student_data_without_output = student_data.drop(columns=["G3"])
features = student_data_without_output.columns
features

In [None]:
# Visualize the PCA coefficients (rows = principal components,
# columns = original features) using a heat map.

# Build exactly one label per heat-map row. A hand-written list can drift out
# of sync with the number of components (a tick/label count mismatch makes
# matplotlib raise), so the labels are derived from the fitted PCA.
ordinal_names = ["First", "Second", "Third", "Fourth", "Fifth",
                 "Sixth", "Seventh", "Eighth", "Ninth"]
n_components_shown = pca.components_.shape[0]
component_labels = [
    (ordinal_names[i] + " component") if i < len(ordinal_names)
    else ("Component " + str(i + 1))
    for i in range(n_components_shown)
]

plt.figure(figsize=[25,5])
# Passing the labels to heatmap() lets seaborn centre each tick on its cell
# instead of placing it on a cell edge.
sns.heatmap(pca.components_, annot=True, cmap='viridis',
            xticklabels=list(features), yticklabels=component_labels)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("Principal components")

Inference: In the heatmap, the few features whose color differs from the common background color are the ones that contribute most to each principal component.

In [None]:
# Two separate LinearRegression estimators so the raw-feature model and the
# PCA-feature model never share fitted state.
lr_withoutpca = linear_model.LinearRegression()
lr_pca = linear_model.LinearRegression()

In [None]:
# 5-fold cross-validation of linear regression on the raw predictors.
# No `scoring=` is passed, so the estimator's own default score is reported.
score_lr_withoutpca = cross_val_score(lr_withoutpca, predictors, label, cv=5)
# The labels say "Without PCA": this cell evaluates the model trained on the
# untransformed predictors, not the PCA model.
print("Model Without PCA Cross Validation score : " + str(score_lr_withoutpca))
print("Model Without PCA Cross Validation Mean score : " + str(score_lr_withoutpca.mean()))

In [None]:
# 5-fold cross-validation of linear regression on the PCA-projected predictors.
# NOTE(review): Transformed_vector comes from a PCA fitted on the whole
# dataset, so each validation fold has influenced the projection.
score_lr_pca = cross_val_score(estimator=lr_pca, X=Transformed_vector, y=label, cv=5)
mean_score_lr_pca = score_lr_pca.mean()
print("PCA Model Cross Validation score : ", score_lr_pca, sep="")
print("PCA Model Cross Validation Mean score : ", mean_score_lr_pca, sep="")

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Base KNN regressor with default settings; used as the estimator for the grid searches.
knn = KNeighborsRegressor()

In [None]:
# Candidate hyper-parameter values explored by the KNN grid searches.
n_neighbors=[5,7,9,10,11]
# How neighbours are weighted when averaging their targets.
weights=['distance','uniform']
# Distance metrics to compare.
metric =['euclidean','manhattan','chebyshev']


In [None]:
# Exhaustive search over every combination of the candidate values,
# evaluated on the raw predictors.
knn_param_grid = {
    "n_neighbors": n_neighbors,
    "weights": weights,
    "metric": metric,
}
grid = GridSearchCV(estimator=knn, param_grid=knn_param_grid)
grid.fit(predictors, label)

In [None]:
# Best hyper-parameter combination found by the most recently fitted grid search.
grid.best_params_

In [None]:
# Build the final raw-feature KNN model from the hyper-parameters the grid
# search actually selected, instead of hand-copying them from an earlier run
# (hard-coded values silently go stale when the data or the grid changes).
# NOTE: `grid` is reassigned by the later PCA grid search, so this cell must
# run right after the raw-predictor search.
knn_withoutpca = KNeighborsRegressor(**grid.best_params_)
knn_withoutpca.fit(predictors,label)

In [None]:
# Same exhaustive search, this time on the PCA-projected predictors.
# This rebinds `grid`, replacing the result of any earlier search.
knn_param_grid = {
    "n_neighbors": n_neighbors,
    "weights": weights,
    "metric": metric,
}
grid = GridSearchCV(estimator=knn, param_grid=knn_param_grid)
grid.fit(Transformed_vector, label)

In [None]:
# Best hyper-parameter combination found by the most recently fitted grid search.
grid.best_params_

In [None]:
# Build the final PCA-feature KNN model from the hyper-parameters the grid
# search actually selected, instead of hand-copying them from an earlier run
# (hard-coded values silently go stale when the data or the grid changes).
# NOTE: relies on `grid` currently holding the search fitted on Transformed_vector.
knn_withpca = KNeighborsRegressor(**grid.best_params_)
knn_withpca.fit(Transformed_vector,label)

In [None]:
# 5-fold cross-validation of the KNN model on the raw predictors.
score_knn_withoutpca = cross_val_score(knn_withoutpca, predictors, label, cv=5)
# The labels read "Without PCA"; the previous wording ("Model Without Cross
# Validation") wrongly suggested that no cross-validation was performed.
print("Model Without PCA Cross Validation score : " + str(score_knn_withoutpca))
print("Model Without PCA Cross Validation Mean score : " + str(score_knn_withoutpca.mean()))

In [None]:
# 5-fold cross-validation of the KNN model on the PCA-projected predictors.
# NOTE(review): Transformed_vector comes from a PCA fitted on the whole
# dataset, so each validation fold has influenced the projection.
score_knn_withpca = cross_val_score(estimator=knn_withpca, X=Transformed_vector, y=label, cv=5)
mean_score_knn_withpca = score_knn_withpca.mean()
print("PCA Model Cross Validation score : ", score_knn_withpca, sep="")
print("PCA Model Cross Validation Mean score : ", mean_score_knn_withpca, sep="")