<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/JNCLectures_Intro_to_ML/blob/main/Week7/Lec7_PCA_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Basic steps:
- Standardize the data: Z = (X-mu)/sigma
- Calculate Covariance matrix
- Diagonalize to get eigenvalues and eignenvectors
- Select vectors that explain p% of total variance
- project data onto those vectors

In [1]:
import pandas as pd
import numpy as np

# Here we are using inbuilt dataset of scikit learn
from sklearn.datasets import load_breast_cancer

In [2]:
# instantiating
cancer = load_breast_cancer(as_frame=True)
# creating dataframe
cancer_df = cancer.frame

# checking shape
print('Original Dataframe shape :', cancer_df.shape)

# Input features
X = cancer_df[cancer['feature_names']]
print('Inputs Dataframe shape   :', X.shape)

Original Dataframe shape : (569, 31)
Inputs Dataframe shape   : (569, 30)


In [4]:
# features ...
cancer['data'].columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

Now, the idea is that you want to build a model with features that take care 80% of the variance in the data


In [9]:
# standardize the data

Z = (X - np.mean(X, axis = 0))/np.std(X, axis = 0)
Z[:5]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,1.88669,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,1.805927,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,1.51187,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,-0.281464,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,1.298575,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971


In [10]:
# covariance matrix
cov_mat = Z.cov()
cov_mat[:5]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
mean radius,1.001761,0.324352,0.999612,0.989095,0.170882,0.507015,0.677955,0.823977,0.148001,-0.312179,...,0.971246,0.297531,0.966836,0.942739,0.119827,0.414191,0.527839,0.745524,0.164242,0.007078
mean texture,0.324352,1.001761,0.330113,0.321651,-0.02343,0.237119,0.30295,0.293981,0.071527,-0.076572,...,0.353194,0.91365,0.35867,0.344151,0.07764,0.278319,0.301555,0.295836,0.105193,0.119415
mean perimeter,0.999612,0.330113,1.001761,0.988244,0.207643,0.557917,0.717396,0.852475,0.183349,-0.261937,...,0.971183,0.303572,0.972095,0.943207,0.150814,0.456577,0.564872,0.772599,0.189448,0.051108
mean area,0.989095,0.321651,0.988244,1.001761,0.17734,0.499379,0.687191,0.824718,0.151559,-0.283608,...,0.964441,0.287995,0.960808,0.960902,0.12374,0.391098,0.513508,0.723288,0.143823,0.003744
mean smoothness,0.170882,-0.02343,0.207643,0.17734,1.001761,0.660284,0.522903,0.55467,0.558757,0.585822,...,0.213495,0.036135,0.239273,0.207082,0.806742,0.4733,0.435691,0.503939,0.395004,0.500195


In [13]:
# Diagonalize

eigenvalues, eigenvectors = np.linalg.eig(cov_mat)

In [18]:
print(eigenvalues)
print(eigenvalues.argsort()[::-1])
eigenvalues[eigenvalues.argsort()[::-1]]

[1.33049908e+01 5.70137460e+00 2.82291016e+00 1.98412752e+00
 1.65163324e+00 1.20948224e+00 6.76408882e-01 4.77456255e-01
 4.17628782e-01 3.51310875e-01 2.94433153e-01 2.61621161e-01
 2.41782421e-01 1.57286149e-01 9.43006956e-02 8.00034045e-02
 5.95036135e-02 5.27114222e-02 4.95647002e-02 1.33279057e-04
 7.50121413e-04 1.59213600e-03 6.91261258e-03 8.19203712e-03
 1.55085271e-02 1.80867940e-02 2.43836914e-02 2.74877113e-02
 3.12142606e-02 3.00256631e-02]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 28 29 27 26 25
 24 23 22 21 20 19]


array([1.33049908e+01, 5.70137460e+00, 2.82291016e+00, 1.98412752e+00,
       1.65163324e+00, 1.20948224e+00, 6.76408882e-01, 4.77456255e-01,
       4.17628782e-01, 3.51310875e-01, 2.94433153e-01, 2.61621161e-01,
       2.41782421e-01, 1.57286149e-01, 9.43006956e-02, 8.00034045e-02,
       5.95036135e-02, 5.27114222e-02, 4.95647002e-02, 3.12142606e-02,
       3.00256631e-02, 2.74877113e-02, 2.43836914e-02, 1.80867940e-02,
       1.55085271e-02, 8.19203712e-03, 6.91261258e-03, 1.59213600e-03,
       7.50121413e-04, 1.33279057e-04])

In [None]:
# Index the eigenvalues in descending order
idx = eigenvalues.argsort()[::-1]

# Sort the eigenvalues in descending order
eigenvalues = eigenvalues[idx]

# sort the corresponding eigenvectors accordingly
eigenvectors = eigenvectors[:,idx]

In [19]:
explained_var = np.cumsum(eigenvalues) / np.sum(eigenvalues)
explained_var

array([0.44272026, 0.63243208, 0.72636371, 0.79238506, 0.84734274,
       0.88758796, 0.9100953 , 0.92598254, 0.93987903, 0.95156881,
       0.961366  , 0.97007138, 0.97811663, 0.98335029, 0.98648812,
       0.98915022, 0.99113018, 0.99288414, 0.9945334 , 0.99453783,
       0.99456279, 0.99461577, 0.99484578, 0.99511837, 0.99563442,
       0.99623625, 0.99704761, 0.99796226, 0.9990009 , 1.        ])

#### Note: 1st four components explain almost 80% of the variance ... so we take only these

In [20]:
# take n components that explain p% variance
p = 0.8
n_components = np.argmax(explained_var >= p) + 1
n_components


5

In [24]:
# filter out these components and put in a df

# PCA component or unit matrix
eigenvectors_chosen = eigenvectors[:,:n_components]
pca_component = pd.DataFrame(eigenvectors_chosen,
                             index = cancer['feature_names'],
                            )

In [35]:
# project original standardized feature matrix along these components

Z_pca = Z @ pca_component

# rename columns
Z_pca.rename(columns = lambda x: "PC_"+ str(x), inplace = True)
Z_pca[:5]

Unnamed: 0,PC_0,PC_1,PC_2,PC_3,PC_4
0,9.192837,1.948583,-1.123166,3.633731,1.19511
1,2.387802,-3.768172,-0.529293,1.118264,-0.621775
2,5.733896,-1.075174,-0.551748,0.912083,0.177086
3,7.122953,10.275589,-3.23279,0.152547,2.960878
4,3.935302,-1.948072,1.389767,2.940639,-0.546747


#### functional programming ...

- return components given X, p
- project components given Z, components

In [40]:
def calc_pca_components(X, p):

  # standardize
  Z = (X - np.mean(X, axis = 0))/np.std(X, axis = 0)

  # covariance mat and eigenvalues, eigenvectors
  eigenval, eigenvec = np.linalg.eig(Z.cov())

  # Index the eigenvalues in descending order
  idx = eigenval.argsort()[::-1]
  # Sort the eigenvalues in descending order
  eigenval = eigenval[idx]
  # sort the corresponding eigenvectors accordingly
  eigenvec = eigenvec[:,idx]

  # number of components required to reach p
  explained_var = np.cumsum(eigenval) / np.sum(eigenval)
  n_components = np.argmax(explained_var >= p) + 1

  # pca components matrix
  eigenvectors_chosen = eigenvec[:,:n_components]
  pca_component = pd.DataFrame(eigenvectors_chosen)

  return pca_component, Z

def project_components(Z, pca_comp):

  # standardize
  Z = (X - np.mean(X, axis = 0))/np.std(X, axis = 0)
  # project
  Z_pca = Z @ pca_component
  # rename columns
  Z_pca.rename(columns = lambda x: "PC_"+ str(x), inplace = True)

  return Z_pca




In [41]:
pca_comp , Z = calc_pca_components(X =X, p = 0.8)
project_components(Z = Z, pca_comp = pca_comp)

Unnamed: 0,PC_0,PC_1,PC_2,PC_3,PC_4
0,9.192837,1.948583,-1.123166,3.633731,1.195110
1,2.387802,-3.768172,-0.529293,1.118264,-0.621775
2,5.733896,-1.075174,-0.551748,0.912083,0.177086
3,7.122953,10.275589,-3.232790,0.152547,2.960878
4,3.935302,-1.948072,1.389767,2.940639,-0.546747
...,...,...,...,...,...
564,6.439315,-3.576817,2.459487,1.177314,0.074824
565,3.793382,-3.584048,2.088476,-2.506028,0.510723
566,1.256179,-1.902297,0.562731,-2.089227,-1.809991
567,10.374794,1.672010,-1.877029,-2.356031,0.033742


#### what if you have missing values ... will it work?
- create a function to randomly select values and delete
- impute by mean
- recalculate components
- as you increase the percentageof missing values, what do you expect to happen?


In [43]:
X.shape

(569, 30)

In [57]:
def remove_vals(X, percentage):
  X_new = X.copy()
  num_vals = int(percentage*(X.shape[0]*X.shape[1]))
  # replace by nan
  for a, b in zip(np.random.randint(0,X.shape[0], num_vals), np.random.randint(0,X.shape[1], num_vals)):
    X_new.iloc[a,b] = np.nan
  return X_new


In [68]:
X_with_null = remove_vals(X = X, percentage = 0.1)
X_with_null.isnull().sum()

mean radius                47
mean texture               54
mean perimeter             55
mean area                  61
mean smoothness            50
mean compactness           65
mean concavity             62
mean concave points        49
mean symmetry              46
mean fractal dimension     46
radius error               56
texture error              46
perimeter error            61
area error                 47
smoothness error           54
compactness error          55
concavity error            58
concave points error       59
symmetry error             54
fractal dimension error    60
worst radius               51
worst texture              48
worst perimeter            56
worst area                 39
worst smoothness           56
worst compactness          53
worst concavity            71
worst concave points       51
worst symmetry             63
worst fractal dimension    57
dtype: int64

In [69]:
column_names = X_with_null.columns
for cols in column_names:
  col_mean = X_with_null[cols].mean()
  X_with_null[cols].fillna(col_mean, inplace = True)

X_with_null.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64

In [70]:
pca_comp , Z = calc_pca_components(X = X_with_null, p = 0.8)

In [71]:
pca_comp

Unnamed: 0,0,1,2,3,4,5
0,-0.225366,-0.224422,-0.003519,0.034068,-0.044408,0.02408
1,-0.108336,-0.052073,0.030564,-0.593015,0.150501,-0.049987
2,-0.229505,-0.21204,0.007166,0.048572,-0.041806,0.026846
3,-0.218516,-0.230167,0.003334,0.036199,-0.038437,0.003748
4,-0.14789,0.191687,-0.09871,0.207192,0.330055,-0.237812
5,-0.239961,0.159815,-0.065295,0.028801,-0.020608,0.01327
6,-0.254236,0.070323,-0.009802,-0.001508,-0.105472,-0.022111
7,-0.261085,-0.030662,-0.011118,0.071937,0.047229,-0.042166
8,-0.140697,0.197425,-0.028469,0.125268,0.259376,0.371711
9,-0.067208,0.368837,-0.033624,0.049679,0.047909,-0.105498
