In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [4]:
# Load the dataset
data = pd.read_csv('telescope_data.csv')


In [5]:
# Display the first few rows of the dataset
data.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [6]:
# Module 1.1 Implement PCA from scratch

def compute_covariance_matrix(X):
    # Compute the covariance matrix
    mean_vec = np.mean(X, axis=0)
    cov_matrix = (X - mean_vec).T.dot(X - mean_vec) / (X.shape[0] - 1)
    return cov_matrix

def compute_eigen(matrix):
    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eig(matrix)
    return eigenvalues, eigenvectors

def pca_from_scratch(X, n_components):
    # Standardize the data
    X_std = StandardScaler().fit_transform(X)
    
    # Compute covariance matrix
    cov_matrix = compute_covariance_matrix(X_std)
    
    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = compute_eigen(cov_matrix)
    
    # Sort eigenvalues and corresponding eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    
    # Select the top n_components eigenvectors
    top_eigenvectors = eigenvectors[:, :n_components]
    
    # Project the original data onto the new subspace
    pca_result = X_std.dot(top_eigenvectors)
    
    return pca_result


In [7]:
# Apply PCA from scratch
n_components = 5  # Choose an appropriate number of components
pca_result_scratch = pca_from_scratch(data.iloc[:, :-1], n_components)


In [8]:
# Module 1.2 PCA using scikit-learn
pca_sklearn = PCA(n_components=n_components)
pca_result_sklearn = pca_sklearn.fit_transform(data.iloc[:, :-1])

In [9]:
# Compare results
explained_variance_scratch = np.sum(np.var(pca_result_scratch, axis=0))
explained_variance_sklearn = np.sum(pca_sklearn.explained_variance_)
print(f"Explained Variance (from scratch): {explained_variance_scratch}")
print(f"Explained Variance (scikit-learn): {explained_variance_sklearn}")

Explained Variance (from scratch): 8.504011391091012
Explained Variance (scikit-learn): 16126.792493520641


In [10]:
# Module 2.1 KPCA with RBF Kernel

def kpca_rbf_from_scratch(X, gamma, n_components):
    # Compute the RBF kernel matrix
    kernel_matrix = np.exp(-gamma * np.linalg.norm(X[:, np.newaxis] - X, axis=2)**2)
    
    # Center the kernel matrix
    n = kernel_matrix.shape[0]
    one_n = np.ones((n, n)) / n
    kernel_matrix_centered = kernel_matrix - one_n.dot(kernel_matrix) - kernel_matrix.dot(one_n) + one_n.dot(kernel_matrix).dot(one_n)
    
    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = compute_eigen(kernel_matrix_centered)
    
    # Select the top n_components eigenvectors
    top_eigenvectors = eigenvectors[:, :n_components]
    
    # Project the original data onto the new subspace
    kpca_result = top_eigenvectors.T.dot(kernel_matrix_centered)
    
    return kpca_result.T


In [11]:
# Apply KPCA with RBF Kernel from scratch
gamma_rbf = 0.1  # Choose an appropriate gamma value
kpca_result_rbf_scratch = kpca_rbf_from_scratch(data.iloc[:, :-1].values, gamma_rbf, n_components)

# Module 2.2 KPCA with Polynomial Kernel

def kpca_poly_from_scratch(X, degree, coef0, n_components):
    # Compute the Polynomial kernel matrix
    kernel_matrix = (X.dot(X.T) + coef0)**degree
    
    # Center the kernel matrix
    n = kernel_matrix.shape[0]
    one_n = np.ones((n, n)) / n
    kernel_matrix_centered = kernel_matrix - one_n.dot(kernel_matrix) - kernel_matrix.dot(one_n) + one_n.dot(kernel_matrix).dot(one_n)
    
    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = compute_eigen(kernel_matrix_centered)
    
    # Select the top n_components eigenvectors
    top_eigenvectors = eigenvectors[:, :n_components]
    
    # Project the original data onto the new subspace
    kpca_result = top_eigenvectors.T.dot(kernel_matrix_centered)
    
    return kpca_result.T

# Apply KPCA with Polynomial Kernel from scratch
degree_poly = 3  # Choose an appropriate degree
coef0_poly = 1  # Choose an appropriate coef0 value
kpca_result_poly_scratch = kpca_poly_from_scratch(data.iloc[:, :-1].values, degree_poly, coef0_poly, n_components)

# Module 2.3 KPCA with Linear Kernel

def kpca_linear_from_scratch(X, n_components):
    # Compute the Linear kernel matrix
    kernel_matrix = X.dot(X.T)
    
    # Center the kernel matrix
    n = kernel_matrix.shape[0]
    one_n = np.ones((n, n)) / n
    kernel_matrix_centered = kernel_matrix - one_n.dot(kernel_matrix) - kernel_matrix.dot(one_n) + one_n.dot(kernel_matrix).dot(one_n)
    
    # Compute eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = compute_eigen(kernel_matrix_centered)
    
    # Select the top n_components eigenvectors
    top_eigenvectors = eigenvectors[:, :n_components]
    
    # Project the original data onto the new subspace
    kpca_result = top_eigenvectors.T.dot(kernel_matrix_centered)
    
    return kpca_result.T

# Apply KPCA with Linear Kernel from scratch
kpca_result_linear_scratch = kpca_linear_from_scratch(data.iloc[:, :-1].values, n_components)