In [1]:
import pandas as pd
import numpy as np

In [2]:
def mean(x):
  return sum(x)/len(x)

In [3]:
def std(x):
  return (sum((i-mean(x))**2 for i in x)/len(x))**0.5

In [5]:
def standardize_data(x):
  std_data=x-mean(x)/std(x)
  return std_data


In [9]:
def covariance(x):
 cov_mat=(standardize_data(x).T @ standardize_data(x))/(standardize_data(x).shape[0]-1)
 return cov_mat

In [10]:
def eigen_val_vector(x,k):
  eig_vals,eig_vecs = np.linalg.eig(covariance(x))
  sort_idx=np.argsort(eig_vals)[::-1]
  eig_vals=eig_vals[sort_idx]
  eig_vecs=eig_vecs[:,sort_idx]
  components=eig_vecs[:k]
  explained_variance=np.sum(eig_vals[:k])/np.sum(eig_vals)
  return explained_variance

In [11]:
def transform(x,k):
  x-=mean(x)
  return np.dot(x,eigen_val_vector.components(x,k).T)


In [20]:
class PCA:
  def __init__(self,num_components):
    self.num_components=num_components


  def standardize_data(self,x):
    self.mean= np.sum(x)/len(x)
    self.std= (sum((i-mean(x))**2 for i in x)/len(x))**0.5
    std_data=x-mean(x)/std(x)
    return std_data
  def covariance(self,x):
    cov_mat=(self.standardize_data(x).T @ self.standardize_data(x))/(self.standardize_data(x).shape[0]-1)
    return cov_mat
  def fit_n_transform(self,x):
    eig_vals,eig_vecs = np.linalg.eig(self.covariance(x))
    sort_idx=np.argsort(eig_vals)[::-1]
    eig_vals=eig_vals[sort_idx]
    eig_vecs=eig_vecs[:,sort_idx]
    self.components=eig_vecs[:self.num_components]
    explained_variance=np.sum(eig_vals[:self.num_components])/np.sum(eig_vals)
    print("component variance:\n",explained_variance)
    comp=self.components.T
    print("PCA:\n",np.dot(standardize_data(x),comp))

In [29]:
data=pd.read_csv("/content/sample_data/california_housing_train.csv")
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [34]:
x=data.drop('median_house_value',axis=1)
x=np.asarray(x)

In [38]:
x

array([[-114.31  ,   34.19  ,   15.    , ..., 1015.    ,  472.    ,
           1.4936],
       [-114.47  ,   34.4   ,   19.    , ..., 1129.    ,  463.    ,
           1.82  ],
       [-114.56  ,   33.69  ,   17.    , ...,  333.    ,  117.    ,
           1.6509],
       ...,
       [-124.3   ,   41.84  ,   17.    , ..., 1244.    ,  456.    ,
           3.0313],
       [-124.3   ,   41.8   ,   19.    , ..., 1298.    ,  478.    ,
           1.9797],
       [-124.35  ,   40.54  ,   52.    , ...,  806.    ,  270.    ,
           3.0147]])

In [42]:
x.shape

(17000, 8)

In [39]:
xnew=standardize_data(x)

In [43]:
xnew.shape

(17000, 8)

In [41]:
xnew

array([[-5.46812209e+01,  1.75214887e+01,  1.27285821e+01, ...,
         1.01375453e+03,  4.70696464e+02, -5.41711197e-01],
       [-5.48412209e+01,  1.77314887e+01,  1.67285821e+01, ...,
         1.12775453e+03,  4.61696464e+02, -2.15311197e-01],
       [-5.49312209e+01,  1.70214887e+01,  1.47285821e+01, ...,
         3.31754530e+02,  1.15696464e+02, -3.84411197e-01],
       ...,
       [-6.46712209e+01,  2.51714887e+01,  1.47285821e+01, ...,
         1.24275453e+03,  4.54696464e+02,  9.95988803e-01],
       [-6.46712209e+01,  2.51314887e+01,  1.67285821e+01, ...,
         1.29675453e+03,  4.76696464e+02, -5.56111972e-02],
       [-6.47212209e+01,  2.38714887e+01,  4.97285821e+01, ...,
         8.04754530e+02,  2.68696464e+02,  9.79388803e-01]])

In [44]:
PCA(num_components=2).fit_n_transform(x)

component variance:
 0.9977592804937393
PCA:
 [[ 673.97779405 -540.00589271]
 [ 962.51922422 -630.29468679]
 [ 172.55623587 -136.07609091]
 ...
 [ 550.91077689 -494.70940483]
 [ 589.42397248 -521.3590202 ]
 [ 311.01020754 -289.49490706]]
