# Principal Component Analysis (PCA) for Data Visualization

In [1]:
#Imports

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Load the training and validation splits.
# The directory is kept in one place so the notebook can be pointed at another
# location by editing a single line (the resulting paths are unchanged).
DATA_DIR = "C:/Users/fortn"

# NOTE(review): the label column shows up as `Name: 1` in the outputs below, which
# suggests the CSVs may have no header row and the first data row is being consumed
# as column names. If so, pass `header=None` to `read_csv` -- confirm against the files.
datasets_train = pd.read_csv(f"{DATA_DIR}/train_data.csv")
datasets_test = pd.read_csv(f"{DATA_DIR}/valid_data.csv")

# Only the last expression of a cell is displayed, so the train shape has to be
# printed explicitly; previously it was evaluated and silently discarded.
print(datasets_train.shape)
datasets_test.shape

(239, 515)

In [3]:
# Split the training frame into features and label.
# The label is the last column and every other column is a feature. Slicing with
# `:-1` instead of the hard-coded `0:514` keeps the feature block and the label
# disjoint even if the number of columns changes (for a 515-column frame the
# result is identical).
x_train = datasets_train.iloc[:, :-1]
y_train = datasets_train.iloc[:, -1]

# Display the labels (last expression of the cell).
y_train

0      1
1      1
2      1
3      1
4      1
      ..
714    0
715    0
716    0
717    0
718    0
Name: 1, Length: 719, dtype: int64

In [4]:
# Split the validation frame into features and label, mirroring the training split.
# `:-1` selects every column except the last (the label); for the 515-column
# validation frame this is the same as the previous hard-coded `0:514`, but it
# cannot accidentally include the label if the column count changes.
x_test = datasets_test.iloc[:, :-1]
y_test = datasets_test.iloc[:, -1]

# Display the labels (last expression of the cell).
y_test

0      1
1      1
2      1
3      1
4      1
      ..
234    0
235    0
236    0
237    0
238    0
Name: 1, Length: 239, dtype: int64

# Standardize the Data
"""
Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data,
especially if the features were measured on different scales. We therefore transform every feature onto unit scale
(mean=0 and variance=1), using statistics estimated from the training data only so that no information from the
validation data leaks into the transformation.
Standardization is also a requirement for the optimal performance of many machine learning algorithms.
"""

In [5]:
# Fit the scaler on the training features only, then push both splits through
# that same fitted scaler so the validation data is scaled with training statistics.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_st, x_test_st = (scaler.transform(features) for features in (x_train, x_test))

# PCA Projection to 7 Principal Components

In [6]:
# Keep the first 7 principal components.
# With the default `svd_solver="auto"`, scikit-learn may choose the randomized SVD
# solver for data of this size, which makes the projection depend on the random
# state. Fixing the seed (same value as the classifier below) keeps a
# Restart & Run All reproducible.
pca = PCA(n_components=7, random_state=0)

In [7]:
# Learn the principal axes from the standardized training data and project it.
x_train_pca = pca.fit_transform(x_train_st)
# Project the standardized validation data with the already-fitted PCA
# (transform only, no refit), so both splits share the same axes.
x_test_pca = pca.transform(x_test_st)

In [8]:
# Show the shape of each split before and after the PCA projection, one per line:
# train (original), train (projected), validation (original), validation (projected).
print(
    *(arr.shape for arr in (x_train, x_train_pca, x_test, x_test_pca)),
    sep="\n",
)

(719, 514)
(719, 7)
(239, 514)
(239, 7)


In [9]:
# Fraction of the total variance captured by each retained component,
# in component order.
explained_variance = pca.explained_variance_ratio_
# Display the ratios (last expression of the cell).
explained_variance

array([0.71676102, 0.11419757, 0.05090892, 0.04139264, 0.01309308,
       0.01049111, 0.00800012])

In [None]:
"""
It can be seen that the first principal component accounts for 71.68% of the variance.
Similarly, the second principal component accounts for 11.42% of the variance in the dataset.
Collectively, the first two principal components capture 71.68% + 11.42% = 83.10% of the variance in the feature set, and all seven retained components together capture about 95.5%.
"""

In [None]:
###Training and Making Predictions

#In this case, we'll use random forest classification for making the predictions.

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Shallow random forest (depth 2) with a fixed seed, trained on the
# PCA-projected training features.
classifier = RandomForestClassifier(random_state=0, max_depth=2)
# `fit` is the last expression, so the fitted estimator is displayed.
classifier.fit(X=x_train_pca, y=y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [13]:
# Predict labels for the validation set from its PCA-projected features.
y_pred = classifier.predict(x_test_pca)

In [None]:
#Performance Evaluation

In [14]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [15]:
# Confusion matrix of true vs. predicted validation labels.
# Per scikit-learn's convention, rows are true labels and columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[174   0]
 [ 35  30]]


In [16]:
# Overall fraction of validation samples classified correctly.
# NOTE(review): the confusion matrix printed above has rows of 174 and 65 samples,
# and 35 of the 65 samples in the second row are misclassified, so accuracy alone
# overstates performance on that class -- consider reporting per-class metrics too.
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.8535564853556485
