# Lab 12: PCA
## Sam Bacon - March 29, 2021
### Using Principal Component Analysis for Dimensionality Reduction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [None]:
# Load the lab dataset from its hosted CSV and preview the first rows.
data_csv = 'https://drive.google.com/uc?export=download&id=1vqMBid4r0C8apwYcpBQfVQ_TwEUNWY8r'
data = pd.read_csv(data_csv)
data.head()  # head() shows 5 rows by default

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
0,2.536741,-3.301172,-0.401756,-1.202885,1.634385,0.32821,0.509842,0.307943,-1.330579,0.690989,2.056854,4.245124,-1.122276,-1.286053,0.570713,0.332914,0.065116,1.965521,-2.677922,0.145463,-0.765567,-0.108297,1.063411,-0.438486,-1.151013,1.475069,0.997267,2.782064,1.896763,-0.039366,0.0
1,2.700367,-2.672759,-0.683936,-2.41057,1.046541,0.06631,-0.602418,1.015624,-2.061458,0.174439,1.189218,3.81656,-0.600531,1.093787,0.323073,0.176187,1.493695,2.650732,-3.26099,0.149962,-0.141244,0.271675,2.083393,0.119871,0.704316,1.143842,-1.723951,2.419307,1.03395,-0.033414,0.0
2,-1.020962,-0.85187,1.270827,-0.921015,-1.20501,-0.552378,0.009268,0.409418,0.62719,-0.683375,0.076625,-1.163256,0.459435,-0.642715,0.376824,0.466007,-0.018943,-0.668694,-0.867758,-1.384927,0.455188,-2.445057,-0.628649,-0.592058,0.149045,1.808181,0.636952,1.196119,1.75184,1.279966,1.0
3,-1.450912,-1.156441,-2.247361,1.74452,-1.183791,-1.837285,3.91536,0.129572,1.739377,1.295061,5.047583,2.291222,3.047817,2.3299,-1.681013,2.591229,-0.429771,0.237015,0.372583,-0.389226,0.087047,-0.059553,-2.438575,-1.691106,-0.258106,-0.306145,-0.884099,1.796661,0.03387,0.763104,0.0
4,2.239896,2.216549,-2.511707,0.581304,1.978495,0.272168,-1.235817,-0.958662,-2.358019,1.147567,-1.565297,1.967257,-0.618975,0.037661,-1.619541,-1.517656,-0.840908,1.65165,1.042925,2.301618,0.242111,-0.232808,2.306771,2.040415,-0.339121,-0.049949,0.015628,-2.624473,-4.741839,-0.050563,0.0


In [None]:
# Separate the attributes (X) from the class label (y), which sits at
# column position 30 of the loaded frame.
X = data.drop(columns=data.columns[30])
y = data.iloc[:, 30]

In [None]:
# Hold out half of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [None]:
# Fit a 2-component PCA on the training rows only (the test rows are
# projected later with this same fitted object).
pca = PCA(n_components=2).fit(Xtrain)
pca  # display the fitted estimator, as the original cell did

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [None]:
# Project the training rows onto the 2 fitted principal components and
# print the shape of the reduced array as a sanity check.
Xtrain2 = pca.transform(Xtrain)
print("shape:", Xtrain2.shape)


shape: (500, 2)


In [None]:
# Explained variance ratio: the fraction of the training data's total
# variance captured by each of the fitted principal components.
pca.explained_variance_ratio_

array([0.35609967, 0.19444318])

In [None]:
# Baseline: 1-nearest-neighbour classifier trained on the original
# (unreduced) attributes and scored on the held-out rows.
model = KNeighborsClassifier(n_neighbors=1).fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94       250
         1.0       0.95      0.93      0.94       250

    accuracy                           0.94       500
   macro avg       0.94      0.94      0.94       500
weighted avg       0.94      0.94      0.94       500



In [None]:
# Same 1-NN classifier, but trained and scored in the 2-component PCA space.
# The test rows are projected with the PCA that was fitted on the training rows.
Xtest2 = pca.transform(Xtest)

model_t = KNeighborsClassifier(n_neighbors=1).fit(Xtrain2, ytrain)
ypred_t = model_t.predict(Xtest2)

print(classification_report(ytest, ypred_t))

              precision    recall  f1-score   support

         0.0       0.59      0.58      0.59       250
         1.0       0.59      0.60      0.59       250

    accuracy                           0.59       500
   macro avg       0.59      0.59      0.59       500
weighted avg       0.59      0.59      0.59       500



The KNN model trained on the full 30-attribute Xtrain reached an accuracy of 0.94 on Xtest, but the model trained on the 2-component projection Xtrain2 reached only 0.59 on Xtest2. This decrease in accuracy is not necessarily surprising: the two components retain only about 55% of the variance (0.356 + 0.194), so it is a clear indication that the dimensionality reduction has discarded information from the training data that is useful for predicting ytest. It would probably make sense to increase the number of components to increase accuracy.

In [None]:
# Sweep the number of principal components kept, refit PCA + 1-NN for each
# setting, and report test-set performance alongside the variance retained.
#
# Loop-local names (pca_k, knn_k, ...) are used on purpose: reusing `pca`,
# `Xtrain2`, `Xtest2`, `model_t` and `ypred_t` here would silently replace the
# 2-component objects built by the earlier cells, leaving those cells' outputs
# stale and making the notebook sensitive to execution order.

# PCA can keep at most min(n_samples, n_features) components, so derive the
# upper bound from the training data instead of hard-coding it.
max_components = min(Xtrain.shape)

sweep_records = []
for n_comp in range(2, max_components + 1):
  pca_k = PCA(n_components=n_comp).fit(Xtrain)
  Xtrain_k = pca_k.transform(Xtrain)
  Xtest_k = pca_k.transform(Xtest)  # project test rows with the train-fitted PCA

  knn_k = KNeighborsClassifier(n_neighbors=1).fit(Xtrain_k, ytrain)
  ypred_k = knn_k.predict(Xtest_k)
  cum_var = np.cumsum(pca_k.explained_variance_ratio_)

  print("n_components =", n_comp)
  print(classification_report(ytest, ypred_k))
  print("Cumulative variance explained by model:")
  print(cum_var)
  print()

  sweep_records.append({
      "n_components": n_comp,
      "accuracy": accuracy_score(ytest, ypred_k),
      "cumulative_variance": cum_var[-1],
  })

# Compact one-row-per-setting summary, so the best n_components can be read
# off directly instead of scanning every report printed by the loop.
sweep_summary = pd.DataFrame(sweep_records).set_index("n_components")
sweep_summary

n_components = 2
              precision    recall  f1-score   support

         0.0       0.59      0.58      0.59       250
         1.0       0.59      0.60      0.59       250

    accuracy                           0.59       500
   macro avg       0.59      0.59      0.59       500
weighted avg       0.59      0.59      0.59       500

Cumulative variance explained by model:
[0.35609967 0.55054285]

n_components = 3
              precision    recall  f1-score   support

         0.0       0.90      0.88      0.89       250
         1.0       0.88      0.91      0.89       250

    accuracy                           0.89       500
   macro avg       0.89      0.89      0.89       500
weighted avg       0.89      0.89      0.89       500

Cumulative variance explained by model:
[0.35609967 0.55054285 0.7000931 ]

n_components = 4
              precision    recall  f1-score   support

         0.0       0.91      0.88      0.89       250
         1.0       0.89      0.91      0.90  

The highest accuracy obtained was 0.96, which occurred when n_components was between 5 and 9. For these models, all of the variance in the original data set is explained by the components (see cumulative explained variances above). Therefore, we clearly do not need all 30 dimensions of the original attribute space to accurately predict ytest.