## Applying PCA on the Iris dataset

We will apply PCA on the Iris dataset.
We will compare the performance of logistic regression on the dataset with and without PCA.

Metrics for performance here are:
1. Speed
2. Score/Accuracy

In [74]:
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [75]:
# Fetch the Iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# Split the bundle into the feature matrix (X) and the class labels (Y)
X, Y = iris.data, iris.target

In [76]:
# Sanity check: dimensions of the feature matrix and of the label vector
X.shape, Y.shape

((150, 4), (150,))

In [77]:
# Hold out part of the data for evaluation; the fixed seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
)

In [78]:
# Inspect the shapes of the training and testing splits
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((112, 4), (112,), (38, 4), (38,))

In [79]:
# Baseline: logistic regression on the original features (no PCA),
# created with the library's default hyper-parameters
clf = LogisticRegression()
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [80]:
# Train the baseline model and time the fit.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be coarse or jump.
start = time.perf_counter()
clf.fit(x_train, y_train)
end = time.perf_counter()



In [81]:
# Report how long the baseline fit took, then its accuracy on the test split
elapsed_without_pca = end - start
print('Time without PCA = ', elapsed_without_pca)
accuracy_without_pca = clf.score(x_test, y_test)
print('Score without PCA = ', accuracy_without_pca)

Time without PCA =  0.002507448196411133
Score without PCA =  0.868421052631579


## Applying PCA

In [82]:
# Feature scaling (disabled: the PCA below is fit on the unscaled features)
#scaler = StandardScaler()
#scaler.fit(x_train)

In [83]:
#scaler.mean_

In [84]:
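# Transform data (disabled). NOTE: if re-enabled, the training result must be
# assigned (x_train = scaler.transform(x_train)); the line below discards it.
#scaler.transform(x_train)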
#x_test = scaler.transform(x_test)

In [85]:
# Fit PCA on the training split without reducing dimensionality (no
# n_components given), so the variance carried by every component is available.
# Those variances are used later to choose how many components are needed to
# retain a given share of the total variance in the dataset.

pca = PCA()
x_train_transformed = pca.fit_transform(x_train)
x_train_transformed

array([[ 4.51491676e-01, -1.26508046e-01,  1.45980696e-01,
         3.43798747e-02],
       [ 1.62351943e-01, -4.31157858e-01, -2.03218243e-01,
        -6.20101585e-03],
       [ 2.11263303e+00,  1.48987433e-01,  3.50122071e-02,
         1.45724571e-01],
       [-2.94949688e+00, -8.89238512e-02,  3.96508566e-02,
         1.07522143e-02],
       [ 1.87495486e+00,  3.45547954e-01,  1.06009743e-01,
         4.91892738e-01],
       [-2.45736119e+00,  2.38864150e-01,  2.72403306e-01,
         1.72566278e-01],
       [-2.54886975e+00,  6.88622040e-01, -7.96051316e-02,
        -1.56731549e-02],
       [-5.96539014e-01, -1.27590863e+00, -2.14619244e-01,
         1.25248024e-02],
       [ 1.89191153e+00, -1.52731482e-02,  5.48928242e-02,
        -1.76618791e-01],
       [ 2.36903557e+00,  2.45646530e-01,  5.01913054e-01,
         2.28901236e-01],
       [ 1.22332113e+00, -8.16245233e-01, -3.07901686e-01,
        -7.75696392e-02],
       [ 2.25580406e+00, -3.23597291e-01, -3.29247439e-01,
      

In [86]:
# Principal axes found by PCA: one row per component, one column per original feature
pca.components_

array([[ 0.37649644, -0.06637905,  0.85134571,  0.35924188],
       [ 0.6240207 ,  0.75538031, -0.18479376, -0.07648543],
       [-0.60667794,  0.57674603,  0.08522779,  0.54040922],
       [ 0.31747515, -0.30390531, -0.48352659,  0.75700273]])

In [87]:
# Amount of variance carried by each principal component
pca.explained_variance_

array([4.45407391, 0.25193059, 0.07478621, 0.0218018 ])

In [88]:
# Determine the smallest number of components that keeps the target share of variance
variance_to_keep = 0.99  # retain at least 99% of the total variance

# Running total of the variance captured by the first 1, 2, ... components
cumulative_variance = np.cumsum(pca.explained_variance_)
total_variance = cumulative_variance[-1]

# Index of the first component at which the cumulative share reaches the target;
# +1 turns that index into a component count. The min() clamp keeps k valid even
# if round-off leaves the final cumulative share marginally below the target,
# so the lookup can never run past the available components.
k = int(np.searchsorted(cumulative_variance / total_variance, variance_to_keep)) + 1
k = min(k, len(cumulative_variance))

# Variance retained by the k selected components
current = cumulative_variance[k - 1]

k

3

In [89]:
# Reduce dimensionality to k components. The projection is learned from the
# training split only and then applied unchanged to the test split.
pca2 = PCA(n_components=k)
x_train_final = pca2.fit_transform(x_train)
x_test_final = pca2.transform(x_test)

# Confirm both splits now have k columns
x_train_final.shape, x_test_final.shape

((112, 3), (38, 3))

In [90]:
# Second logistic-regression model, to be trained on the PCA-reduced features;
# its training time and test accuracy are measured in the following cells
clf2 = LogisticRegression()

In [91]:
# Train on the PCA-reduced features and time the fit.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be coarse or jump.
start_pca = time.perf_counter()
clf2.fit(x_train_final, y_train)
end_pca = time.perf_counter()



In [92]:
# Report how long the fit on reduced features took, then its test accuracy
elapsed_with_pca = end_pca - start_pca
print('Time with PCA = ', elapsed_with_pca)
accuracy_with_pca = clf2.score(x_test_final, y_test)
print('Score with PCA = ', accuracy_with_pca)

Time with PCA =  0.0015025138854980469
Score with PCA =  0.8157894736842105
