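# Introduction

This notebook implements principal component analysis from scratch (`CustomPCA`) and compares its results with scikit-learn's `PCA` on the iris dataset.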

In [105]:
import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.base import TransformerMixin

# Preprocessing

## Loading Dataset

In [106]:
# Fixed seed so the train/test split (and the scaler statistics derived from it)
# are identical on every Restart & Run All.
RANDOM_STATE = 42

# Both toy datasets are loaded; point `dataset` at `breast` to try the other one.
iris = datasets.load_iris()
breast = datasets.load_breast_cancer()

dataset = iris
X = dataset.data
y = dataset.target

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state=RANDOM_STATE)

# Fit the scaler on the training split only and reuse it for the test split,
# so no test-set statistics are used during preprocessing.
sc = StandardScaler().fit(xTrain)
xTrain = sc.transform(xTrain)
xTest = sc.transform(xTest)

# PCA - Principal Component Analysis

In [120]:
class CustomPCA(TransformerMixin):
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Parameters
    ----------
    n_components : int or None, default=None
        Number of leading components to keep. ``None`` keeps one component
        per input feature. Values larger than the number of features are
        clamped to the number of features.

    Attributes (set by ``fit``)
    ---------------------------
    n_components_ : int
        Number of components actually kept for the fitted data.
    mean_ : ndarray of shape (n_features,)
        Per-feature mean of the training data, subtracted in ``transform``.
    components_ : ndarray of shape (n_components_, n_features)
        Unit-length principal axes, one per row, ordered by decreasing variance.
    explained_variance_ : ndarray of shape (n_components_,)
        Eigenvalues of the covariance matrix for the kept components.
    explained_variance_ratio_ : ndarray of shape (n_components_,)
        ``explained_variance_`` divided by the sum of all eigenvalues.
    pairs : list of (eigenvalue, eigenvector) tuples
        Kept for backward compatibility; same information as
        ``explained_variance_`` / ``components_``.
    """

    def __init__(self, n_components=None):
        # Stored untouched; the value resolved against the data lives in
        # ``n_components_`` so the estimator can be refit on data of another width.
        self.n_components = n_components

    def fit(self, X, y=None):
        """Learn the principal axes of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : ignored

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=float)
        self._check_params(X, y)
        self._pca(X, y)

        return self

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its projection onto the kept components."""
        self.fit(X, y)
        return self.transform(X, y)

    def transform(self, X, y=None):
        """Project ``X`` onto the fitted principal axes.

        The training mean is subtracted first: the axes were derived from the
        covariance matrix (i.e. from mean-centred data), so projecting
        un-centred data would shift every score by a constant offset.

        Returns
        -------
        ndarray of shape (n_samples, n_components_)
        """
        X = np.asarray(X, dtype=float)
        return (X - self.mean_).dot(self.components_.T)

    def _pca(self, X, y=None):
        """Eigendecompose the covariance of ``X`` and store the leading components."""
        self.mean_ = X.mean(axis=0)

        # np.cov returns a 0-d array for a single feature; force a 2-D matrix.
        cov_matrix = np.atleast_2d(np.cov(X.T))

        # The covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues/eigenvectors, whereas eig may return complex values.
        eig_values, eig_vectors = np.linalg.eigh(cov_matrix)

        # eigh returns ascending order; PCA wants the largest variance first.
        order = np.argsort(eig_values)[::-1]
        eig_values = eig_values[order]
        eig_vectors = eig_vectors[:, order]

        # An eigenvector is only defined up to sign. Make the entry with the
        # largest absolute value positive so results are deterministic.
        pivot_rows = np.argmax(np.abs(eig_vectors), axis=0)
        signs = np.sign(eig_vectors[pivot_rows, np.arange(eig_vectors.shape[1])])
        signs[signs == 0] = 1.0
        eig_vectors = eig_vectors * signs

        total_variance = eig_values.sum()
        k = self.n_components_

        self.explained_variance_ = eig_values[:k]
        self.explained_variance_ratio_ = self.explained_variance_ / total_variance
        self.components_ = eig_vectors[:, :k].T

        # Legacy attribute: (eigenvalue, eigenvector) tuples in descending order.
        self.pairs = [
            (self.explained_variance_[i], self.components_[i])
            for i in range(len(self.explained_variance_))
        ]

        return

    def _check_params(self, X, y):
        """Resolve ``n_components`` against ``X`` and store it as ``n_components_``.

        Raises
        ------
        ValueError
            If ``n_components`` is given and is smaller than 1.
        """
        n_features = X.shape[1]
        if self.n_components is None:
            self.n_components_ = n_features
            return
        if self.n_components < 1:
            raise ValueError(
                "n_components must be None or >= 1, got %r" % (self.n_components,)
            )
        # Asking for more components than features keeps all of them.
        self.n_components_ = min(self.n_components, n_features)

## CustomPCA vs PCA

In [121]:
# Fit both implementations on the full, unscaled feature matrix X with the
# default n_components (no component limit requested).
skModel = PCA().fit(X)
custModel = CustomPCA().fit(X)

In [122]:
# Share of total variance captured by each component: scikit-learn first,
# CustomPCA second. The two printed arrays are expected to match.
print(skModel.explained_variance_ratio_)
print(custModel.explained_variance_ratio_)

[ 0.92461621  0.05301557  0.01718514  0.00518309]
[ 0.92461621  0.05301557  0.01718514  0.00518309]


In [123]:
# Dimensions of the feature matrix that both models were fitted on.
X.shape

(150, 4)

In [124]:
# Refit both implementations, this time requesting only two components.
# Note: this rebinds skModel / custModel from the earlier cell.
skModel = PCA(n_components=2).fit(X)
custModel = CustomPCA(n_components=2).fit(X)

In [125]:
# Variance ratios of the two-component models: scikit-learn first,
# CustomPCA second. The two printed arrays are expected to match.
print(skModel.explained_variance_ratio_)
print(custModel.explained_variance_ratio_)

[ 0.92461621  0.05301557]
[ 0.92461621  0.05301557]


In [126]:
# Project X with each of the two-component models fitted above.
skX = skModel.transform(X)
custX = custModel.transform(X)

In [127]:
# Compare the shapes of the two projections.
# NOTE: only shapes are compared here; the projected values themselves are not checked.
print(skX.shape)
print(custX.shape)

(150, 2)
(150, 2)
