In [1]:
# Importing Libraries
import seaborn as sns
import numpy as np
import pandas as pd  
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# import some data to play with. We are loading the popular Iris Data set
irisdata = sns.load_dataset('iris')
irisdata.head()  # have a look at the attributres(=> X) and Labels(=> y) 

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
# Preprocessing data
X = irisdata.drop('species', axis=1)  
y = irisdata['species']

# Train Test Split
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)


#  PCA performs best with a normalized feature set. 
#  We will perform standard scalar normalization to normalize our feature set.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)

# Performing PCA using Scikit-Learn is a two-step process:
# 1. Initialize the PCA class by passing the number of components to the constructor.
# 2. Call the fit and then transform method by passing the feature set to these methods. 
#    The transform method returns the specified number of principal components.

# Applying PCA
from sklearn.decomposition import PCA
pca = PCA()  
X_train = pca.fit_transform(X_train)  
X_test = pca.transform(X_test)  


print(X_train[:5])

[[ 1.86585863  0.66072644 -0.0168809  -0.20120114]
 [ 1.3548494   0.40179469  0.19810987  0.20592094]
 [-2.38483191 -0.45175726  0.20057524  0.04728643]
 [ 0.89710379  0.21504664 -0.46176653 -0.10044891]
 [ 2.04539715  0.90342091  0.22746313 -0.17786793]]


In [3]:
explained_variance = pca.explained_variance_ratio_ 
explained_variance

array([0.73838804, 0.22318067, 0.03408865, 0.00434264])

In [4]:
# Let's first try to use 1 principal component to train our algorithm. 
# To do so, execute the following code:

from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, 
                                                    random_state=100)

#  PCA performs best with a normalized feature set. 
#  We will perform standard scalar normalization to normalize our feature set.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)

from sklearn.decomposition import PCA

pca = PCA(n_components=1)  
X_train = pca.fit_transform(X_train)  
X_test = pca.transform(X_test)  

print(X_train[:5])

[[ 0.01197422]
 [ 0.29176303]
 [ 1.27693979]
 [-2.25154785]
 [-2.15421304]]


In [5]:
# Training and Making Predictions
# In this case we'll use random forest classification 
# for making the predictions.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0, 
                                    n_estimators=10)  
classifier.fit(X_train, y_train)
# Please Note : if n_estimators is not specified in RandomForestClassifier
# default value of 10 is taken. For this you may get FutureWarning
# To avoid the warning, either specify n_estimators or suppress warnings


# Predicting the Test set results
y_pred = classifier.predict(X_test)  


# Performance Evaluation
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print('Accuracy ' ,  accuracy_score(y_test, y_pred))

[[11  0  0]
 [ 0  5  1]
 [ 0  1 12]]
Accuracy  0.9333333333333333


In [6]:
# Try PCA with 2 Principal Components
# pca = PCA(n_components=2)
#------------------------------------------
# all the above steps would have to be repeated.
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, 
                                                    random_state=100)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)

from sklearn.decomposition import PCA

pca=PCA(n_components=2)
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)

print(X_test[:5])

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0, 
                                    n_estimators=10)  
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print('Accuracy ' ,  accuracy_score(y_test, y_pred))

[[ 1.87728863e+00 -1.97725225e-01]
 [-2.32463091e+00  2.28963933e-01]
 [ 3.43464467e+00 -1.29167636e-03]
 [-2.16262897e+00  2.79560533e+00]
 [ 1.39650638e+00 -4.73982558e-01]]
[[11  0  0]
 [ 0  4  2]
 [ 0  5  8]]
Accuracy  0.7666666666666667


In [8]:
# Try PCA with 3 Principal Components
# pca = PCA(n_components=3)
#------------------------------------------
# all the above steps would have to be repeated.


from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=100)
 
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

from sklearn.decomposition import PCA
pca=PCA(n_components=3)
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)

print(X_test[:5])

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0, 
                                    n_estimators=10)  
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print('Accuracy ' ,  accuracy_score(y_test, y_pred))


[[ 1.87728863e+00 -1.97725225e-01  2.18513954e-01]
 [-2.32463091e+00  2.28963933e-01  9.86858462e-02]
 [ 3.43464467e+00 -1.29167636e-03 -7.99173028e-01]
 [-2.16262897e+00  2.79560533e+00  6.51532670e-02]
 [ 1.39650638e+00 -4.73982558e-01 -6.22294960e-02]]
[[11  0  0]
 [ 0  6  0]
 [ 0  9  4]]
Accuracy  0.7
