# performing PCA on some dataset.

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts

In [19]:
# making the function or class for performing the pca from scratch
class PCA_linear:
    def __init__(self, x):
        self.x = x
        self.values = np.array([])
        self.vectors = np.array([])
        # making the data zero mean
        self.x -= np.mean(self.x, axis=0)
        a = self.x.T @ self.x
        # finding the eigen values and eigen vectors
        self.values, self.vectors = np.linalg.eig(a)
        alphas = self.vectors / np.sqrt(self.values)
        scalers = self.x @ alphas
        self.x = scalers @ alphas

    def retention(self):
        retained_variance = np.sum(self.values[:self.x.shape[1]])
        total_variance = np.sum(self.values)
        return (retained_variance / total_variance) * 100

In [8]:
#GPTs corrections 
class PCA_linear_corrected:
    def __init__(self, x):
        self.x = x
        self.mean = np.mean(self.x, axis=0)
        self.centered_x = self.x - self.mean
        self.cov_matrix = np.cov(self.centered_x, rowvar=False)
        self.eigenvalues, self.eigenvectors = np.linalg.eigh(self.cov_matrix)
        # Sort the eigenvalues and eigenvectors in descending order
        idx = np.argsort(self.eigenvalues)[::-1]
        self.eigenvalues = self.eigenvalues[idx]
        self.eigenvectors = self.eigenvectors[:, idx]

    def Wi(self):
        return self.eigenvectors

    def components(self, n_components=None):
        if n_components is None:
            n_components = self.eigenvectors.shape[1]
        return self.centered_x @ self.eigenvectors[:, :n_components]

    def retention(self, n_components=None):
        if n_components is None:
            n_components = len(self.eigenvalues)
        retained_variance = np.sum(self.eigenvalues[:n_components])
        total_variance = np.sum(self.eigenvalues)
        return (retained_variance / total_variance) * 100

In [5]:
# getting the data using kaggle api
import os
from kaggle.api.kaggle_api_extended import KaggleApi

# Set up the Kaggle API client
api = KaggleApi()
api.authenticate()

# Download the dataset
dataset = 'krishd123/high-dimensional-datascape'

# Check if you have the necessary permissions to access the dataset
# If not, request access from the dataset owner or administrator

# Uncomment the following line to download the dataset
api.dataset_download_files(dataset, path='.', unzip=True)

file_path = r'C:\Users\Ankit\Documents\Vedanshi\progg-and-DS-from-IITM\Unsupervised learning\all_data.csv'
df = pd.read_csv(file_path, header=0, on_bad_lines='skip')
df

Dataset URL: https://www.kaggle.com/datasets/krishd123/high-dimensional-datascape


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 527,Unnamed: 528,Unnamed: 529,Unnamed: 530,Unnamed: 531,Unnamed: 532,Unnamed: 533,Unnamed: 534,Unnamed: 535,Label
0,-0.000133,0.000262,0.001099,0.001834,0.002109,0.002223,0.002233,0.002036,0.001582,0.000969,...,0.82953,2.9079,3.7557,1.3344,0.74247,0.22507,0.56249,1.5705,0.79906,0
1,-0.000842,-0.001011,-0.001071,-0.000944,-0.000794,-0.000610,-0.000445,-0.000173,0.000077,0.000285,...,0.84335,3.0110,3.9877,1.2461,0.74423,0.22567,0.61034,1.6645,0.74574,0
2,-0.000766,-0.000535,0.000162,0.000898,0.001287,0.001582,0.001704,0.001659,0.001574,0.001438,...,0.87413,3.0613,3.9749,1.1560,0.52508,0.19934,0.45707,1.3386,0.74574,0
3,-0.000301,-0.000377,-0.000451,-0.000529,-0.000685,-0.000845,-0.000899,-0.000822,-0.000550,-0.000182,...,0.85467,3.3337,3.9205,1.3341,0.46024,0.20031,0.45924,1.7969,0.32451,0
4,-0.000589,-0.000857,-0.001135,-0.001171,-0.001128,-0.001039,-0.000959,-0.000937,-0.000916,-0.000819,...,0.82978,3.5814,3.7667,1.1151,0.44572,0.20538,0.41882,1.4422,0.32451,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,-0.000978,-0.001481,-0.002011,-0.001986,-0.001594,-0.001217,-0.000997,-0.000547,0.000031,0.000534,...,0.84507,3.0606,3.8405,1.3064,1.80800,0.72727,0.81104,2.5046,2.73800,1
226,-0.001599,-0.001084,0.000465,0.002076,0.002883,0.003667,0.004457,0.005221,0.005871,0.006257,...,0.84004,2.8913,3.7049,1.2511,1.41830,0.64878,0.82323,2.3979,2.35770,1
227,-0.000511,-0.001370,-0.002769,-0.003582,-0.003714,-0.003721,-0.003621,-0.003399,-0.003106,-0.002629,...,0.82726,3.2587,3.5905,1.2684,1.56390,0.66174,0.83218,2.4844,2.33220,1
228,-0.000880,-0.001734,-0.003095,-0.003891,-0.004015,-0.003949,-0.003827,-0.003839,-0.003775,-0.003182,...,0.84793,2.9933,3.9598,1.3825,1.59220,0.76657,0.80723,2.5448,2.67160,1


In [20]:
pca=PCA_linear(df)
print(pca.retention())

(100+0j)


In [10]:
pca_corrected = PCA_linear_corrected(df)
print("retention of information: ",pca_corrected.retention())
print("components: ",pca_corrected.components())

retention of information:  100.0
components:            0         1         2         3         4         5         6    \
0   -1.660944  0.048165 -0.199738  0.034583 -0.089599  0.093611 -0.107930   
1   -1.673027  0.217301 -0.159715 -0.123025 -0.203299  0.065293 -0.002006   
2   -1.902329  0.326525 -0.136641  0.206183 -0.230475 -0.018015  0.027604   
3   -2.069695  0.400411 -0.177723 -0.380106 -0.002654 -0.223974 -0.107429   
4   -2.233770  0.570973 -0.001374 -0.058130  0.236687 -0.213569 -0.015431   
..        ...       ...       ...       ...       ...       ...       ...   
225  0.975953  0.313648  0.022250 -0.003942 -0.020807  0.105357  0.035613   
226  0.514758  0.181669 -0.267459  0.069090  0.017570 -0.088862  0.043060   
227  0.564484  0.349614 -0.046955 -0.073946  0.295664 -0.103505 -0.030671   
228  0.879257  0.318559 -0.078813 -0.044340 -0.181314 -0.019109  0.021372   
229  0.514758  0.181669 -0.267459  0.069090  0.017570 -0.088862  0.043060   

          7         8        

### splitting the data before sending

In [None]:
x_train,x_test,y_train,y_test=tts(df,df,test_size=0.2,random_state=42)
