## Implementation of PCA from Scratch

In [2]:
class myPCA:
    def __init__(self, n_components : int = 2, method : str = 'svd') -> None:
        '''
            The constructor of the Pca alghoritm.
        :param n_components: int, default = 2
            The dimension in which the data will be reduced
        :param method: str, default = 'svd'
            The way used by PCA to reduce the dimensionality of the data
        '''
        self.__n_components = n_components
        if method in ['svd', 'eigen']:
            self.__method  = method
        else:
            raise ValueError(f"'{method}' isn't a method implemented in this model")
    
    def fit(self, X : 'np.array'):
        '''
            The fitting method
        :param X: np.array
            The data on which we want to fit the pca
        '''
        if self.__method == 'svd':
            U, S, V = np.linalg.svd(X)
            self.__V = V[:self.__n_components, :]
        elif self.__method == 'eigen':
            corr_mat = np.corrcoef(X.T)
            
            #Getting the eigenvectors and eigenvalues
            self.eig_vals, self.eig_vecs = np.linalg.eig(corr_mat)
            
            #Sorting the list of tuples (eigenvalue, eigenvector)
            self.eig_pairs = [(np.abs(self.eig_vals[i]), self.eig_vecs[:, i]) for i in range(len(self.eig_vals))]
            
            self.eig_pairs.sort(key = lambda x: x[0], reverse = True)
            
            #Calculating the explainet ration
            total = sum(self.eig_vals)
            self.explained_variance_ratio = [(i/total) * 100 for i in sorted(self.eig_vals, reverse = True)]
            
            self.cumulative_variance_ratio = np.cumsum(self.explained_variance_ratio)
            
            #Creating the projection matrix
            self.matrix_w = np.hstack((self.eig_pairs[i][1].reshape(np.size(X, 1), 1)
                                      for i in range(self.__n_components)))
            
        return self
        
    def transform(self, X : 'np.array') -> 'np.array':
        '''
            The transform function
        :param X: np.array
            The data that we must reduce
        '''
        if self.__method == 'svd':
            return X.dot(self.__V.T)
        elif self.__method == 'eigen':
            return X.dot(self.matrix_w)

## Implementation of PCA from sklearn

In [4]:
from sklearn.datasets import load_boston
X, y = load_boston(return_X_y=True)
print(X.shape)

(506, 13)



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [5]:
import numpy as np

#Let's import the PCA alghoritm from sklearn and reduce to 2 dimensions 
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
pca.fit(X)

#Let's transform the data, pca_x represents the new reduced data
pca_X = pca.transform(X)

In [6]:
pca_X

array([[-119.81884272,   -5.56005586],
       [-168.89015548,   10.11620863],
       [-169.31170747,   14.0805323 ],
       ...,
       [-138.38716306,    0.9380922 ],
       [-137.50517338,    4.2518251 ],
       [-139.19033295,    1.00906423]])

In [9]:
print(pca_X.shape)

(506, 2)
