# PCA Performance Evaluation

In [1]:
import numpy as np
import pandas as pd

# Load the iris table from the CSV next to the notebook and preview the first two rows.
iris_csv = "iris.csv"
df = pd.read_csv(iris_csv)
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [43]:
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the label. `columns=` is used instead of a
# positional axis (`df.drop('species', 1)`): the positional form was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 on.
X = df.drop(columns='species')
y = df['species']

# Splitting the dataset into the Training set and Test set.
# random_state is fixed so the same 80/20 partition is reproduced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only, then apply the same scaling to the
# test rows so no test-set statistics leak into the transform.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train[0])

[0.61303014 0.10850105 0.94751783 0.736072  ]


In [9]:
# Applying PCA (unsupervised learning)
from sklearn.decomposition import PCA

# Only the explained-variance ratios are needed here, so the PCA is fitted
# without transforming anything: X_train / X_test keep holding the scaled
# features and this cell can be re-run without changing its inputs.
pca = PCA()  # n_components left at its default, so every component is kept
pca.fit(X_train)

explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.72229951 0.2397406  0.03335483 0.00460506]


The 1st principal component accounts for 72.23% of the variance in the dataset.
The 2nd principal component accounts for 23.97% of the variance in the dataset. 
Together, the first two principal components capture 96.20% (72.23 + 23.97) of the variance in the feature set. Note that PCA is unsupervised, so this measures variance in the features rather than class-separating information directly. The other two components contribute marginally.

In [31]:
# 1 principal component

# Rebuild the scaled split from X / y rather than reading X_train / X_test:
# those names are overwritten with PCA output by the PCA cells, so chaining a
# new PCA on them makes the result depend on which cells ran before. The fixed
# random_state reproduces the same rows the scaler `sc` was fitted on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pca = PCA(n_components=1)
X_train = pca.fit_transform(sc.transform(X_train_raw))
X_test = pca.transform(sc.transform(X_test_raw))  # project with the train-fitted PCA

explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.72229951]


In [32]:
# Train a random forest on the current X_train (1 principal component)
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = classifier.predict(X_test)

In [33]:
# Confusion matrix and accuracy for the 1-principal-component model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

c_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# One print call; sep='\n' puts each item on its own line.
print(c_matrix, 'Accuracy ', accuracy, sep='\n')

[[11  0  0]
 [ 0 12  1]
 [ 0  1  5]]
Accuracy 
0.9333333333333333


In [36]:
# 2 principal components

# Rebuild the scaled split from X / y rather than reading X_train / X_test:
# after the 1-component cell those arrays have a single column, and
# PCA(n_components=2) on them raises ValueError. The fixed random_state
# reproduces the same rows the scaler `sc` was fitted on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pca = PCA(n_components=2)
X_train = pca.fit_transform(sc.transform(X_train_raw))
X_test = pca.transform(sc.transform(X_test_raw))  # project with the train-fitted PCA

explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.72229951 0.2397406 ]


In [37]:
# Train a random forest on the current X_train (2 principal components)

classifier = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = classifier.predict(X_test)

In [38]:
# Confusion matrix and accuracy for the 2-principal-component model

c_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# One print call; sep='\n' puts each item on its own line.
print(c_matrix, 'Accuracy ', accuracy, sep='\n')

[[11  0  0]
 [ 0 10  3]
 [ 0  2  4]]
Accuracy 
0.8333333333333334


In [40]:
# 3 principal components

# Rebuild the scaled split from X / y rather than reading X_train / X_test:
# the preceding PCA cells leave fewer than 3 columns in those arrays, so
# PCA(n_components=3) on them raises ValueError. The fixed random_state
# reproduces the same rows the scaler `sc` was fitted on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pca = PCA(n_components=3)
X_train = pca.fit_transform(sc.transform(X_train_raw))
X_test = pca.transform(sc.transform(X_test_raw))  # project with the train-fitted PCA

explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.72229951 0.2397406  0.03335483]


In [41]:
# Train a random forest on the current X_train (3 principal components)

classifier = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = classifier.predict(X_test)

In [42]:
# Confusion matrix and accuracy for the 3-principal-component model

c_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# One print call; sep='\n' puts each item on its own line.
print(c_matrix, 'Accuracy ', accuracy, sep='\n')

[[11  0  0]
 [ 0 12  1]
 [ 0  1  5]]
Accuracy 
0.9333333333333333


In [44]:
# Performance results for full feature set

# X_train / X_test hold PCA-reduced data once the PCA cells above have run, so
# the baseline rebuilds the scaled, un-projected split from X / y under its own
# names. The fixed random_state reproduces the same rows the scaler `sc` was
# fitted on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train_full = sc.transform(X_train_raw)
X_test_full = sc.transform(X_test_raw)

# Same hyper-parameters as the PCA runs so the accuracies are comparable.
classifier = RandomForestClassifier(max_depth=2, random_state=0, n_estimators=10)
classifier.fit(X_train_full, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test_full)

c_matrix = confusion_matrix(y_test, y_pred)
print(c_matrix)
print('Accuracy ')
print(accuracy_score(y_test, y_pred))

[[11  0  0]
 [ 0 13  0]
 [ 0  2  4]]
Accuracy 
0.9333333333333333


It can be concluded that an optimal level of accuracy is achieved while reducing the number of features in the dataset.

The accuracy with the full feature set (0.93) is the same as with 1 principal component. Adding principal components did not improve accuracy: it dropped to 0.83 with 2 components and returned to 0.93 with 3 components.

It is recommended to choose the number of principal components so that together they explain a significant share of the variance.