In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the data
data = pd.read_csv('telescope_data.csv')


In [3]:
# Define a function to calculate the covariance matrix
def compute_covariance_matrix(X):
    cov_matrix = np.cov(X, rowvar=False)
    return cov_matrix

# Define a function to calculate the eigenvalues and eigenvectors
def compute_eigen(cov_matrix):
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    return eigenvalues, eigenvectors

# Define a function to perform PCA
def pca_from_scratch(X, target_variance_ratio=0.95):
    # Step 1: Calculate the covariance matrix
    cov_matrix = compute_covariance_matrix(X)
    
    # Step 2: Calculate eigenvalues and eigenvectors
    eigenvalues, eigenvectors = compute_eigen(cov_matrix)
    
    # Step 3: Sort eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    
    # Step 4: Determine the number of components to retain
    total_variance = np.sum(eigenvalues)
    variance_ratio = np.cumsum(eigenvalues) / total_variance
    num_components = np.argmax(variance_ratio >= target_variance_ratio) + 1
    
    # Step 5: Retain the top components
    principal_components = eigenvectors[:, :num_components]
    
    # Step 6: Transform the data
    transformed_data = X @ principal_components
    
    return transformed_data


In [4]:
# Apply PCA from scratch to the dataset
data_pca_scratch = pca_from_scratch(data.iloc[:, :-1])


In [5]:
# 1.2 PCA using scikit-learn
from sklearn.decomposition import PCA


In [6]:
# Apply PCA using scikit-learn to the dataset
pca = PCA(n_components=0.95)
data_pca_sklearn = pca.fit_transform(data.iloc[:, :-1])


In [7]:
# Compare results
print("PCA from scratch result shape:", data_pca_scratch.shape)
print("PCA using scikit-learn result shape:", data_pca_sklearn.shape)

PCA from scratch result shape: (201, 5)
PCA using scikit-learn result shape: (201, 5)


In [8]:
from sklearn.metrics.pairwise import pairwise_kernels


In [9]:
def kpca_rbf_from_scratch(X, gamma=0.1, n_components=2):
    # Calculate the RBF kernel matrix
    kernel_matrix = pairwise_kernels(X, metric='rbf', gamma=gamma)
    
    # Center the kernel matrix
    n = kernel_matrix.shape[0]
    one_n = np.ones((n, n)) / n
    kernel_matrix_centered = kernel_matrix - one_n @ kernel_matrix - kernel_matrix @ one_n + one_n @ kernel_matrix @ one_n
    
    # Calculate eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = np.linalg.eigh(kernel_matrix_centered)
    
    # Sort eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    
    # Retain the top components
    principal_components = eigenvectors[:, :n_components]
    
    # Transform the data
    transformed_data = kernel_matrix_centered @ principal_components
    
    return transformed_data

# Apply KPCA with RBF kernel from scratch to the dataset
data_kpca_rbf_scratch = kpca_rbf_from_scratch(data.iloc[:, :-1])


In [10]:
# 2.2 KPCA with Polynomial Kernel
def kpca_poly_from_scratch(X, degree=3, n_components=2):
    # Calculate the Polynomial kernel matrix
    kernel_matrix = pairwise_kernels(X, metric='poly', degree=degree)
    
    # Center the kernel matrix
    n = kernel_matrix.shape[0]
    one_n = np.ones((n, n)) / n
    kernel_matrix_centered = kernel_matrix - one_n @ kernel_matrix - kernel_matrix @ one_n + one_n @ kernel_matrix @ one_n
    
    # Calculate eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = np.linalg.eigh(kernel_matrix_centered)
    
    # Sort eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    
    # Retain the top components
    principal_components = eigenvectors[:, :n_components]
    
    # Transform the data
    transformed_data = kernel_matrix_centered @ principal_components
    
    return transformed_data

# Apply KPCA with Polynomial kernel from scratch to the dataset
data_kpca_poly_scratch = kpca_poly_from_scratch(data.iloc[:, :-1])


In [11]:
# 2.3 KPCA with Linear Kernel
def kpca_linear_from_scratch(X, n_components=2):
    # Calculate the Linear kernel matrix
    kernel_matrix = pairwise_kernels(X, metric='linear')
    
    # Center the kernel matrix
    n = kernel_matrix.shape[0]
    one_n = np.ones((n, n)) / n
    kernel_matrix_centered = kernel_matrix - one_n @ kernel_matrix - kernel_matrix @ one_n + one_n @ kernel_matrix @ one_n
    
    # Calculate eigenvalues and eigenvectors of the centered kernel matrix
    eigenvalues, eigenvectors = np.linalg.eigh(kernel_matrix_centered)
    
    # Sort eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    
    # Retain the top components
    principal_components = eigenvectors[:, :n_components]
    
    # Transform the data
    transformed_data = kernel_matrix_centered @ principal_components
    
    return transformed_data

# Apply KPCA with Linear kernel from scratch to the dataset
data_kpca_linear_scratch = kpca_linear_from_scratch(data.iloc[:, :-1])

In [12]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)

In [13]:
# Apply PCA on the test dataset
data_test_pca_sklearn = pca.transform(X_test.iloc[:, :-1])
data_test_pca_scratch = pca_from_scratch(X_test.iloc[:, :-1])


In [15]:
# Apply KPCA with RBF kernel on the test dataset
data_test_kpca_rbf_scratch = kpca_rbf_from_scratch(X_test.iloc[:, :-1])

# Apply KPCA with Polynomial kernel on the test dataset
data_test_kpca_poly_scratch = kpca_poly_from_scratch(data.iloc[:, :-1])


# Apply KPCA with Linear kernel on the test dataset
data_test_kpca_linear_scratch = kpca_linear_from_scratch(X_test.iloc[:, :-1])


In [16]:
# Display the shapes of transformed test data
print("Test data shape after PCA (scikit-learn):", data_test_pca_sklearn.shape)
print("Test data shape after PCA (from scratch):", data_test_pca_scratch.shape)
print("Test data shape after KPCA with RBF kernel (from scratch):", data_test_kpca_rbf_scratch.shape)
print("Test data shape after KPCA with Polynomial kernel (from scratch):", data_test_kpca_poly_scratch.shape)
print("Test data shape after KPCA with Linear kernel (from scratch):", data_test_kpca_linear_scratch.shape)

Test data shape after PCA (scikit-learn): (41, 5)
Test data shape after PCA (from scratch): (41, 5)
Test data shape after KPCA with RBF kernel (from scratch): (41, 2)
Test data shape after KPCA with Polynomial kernel (from scratch): (201, 2)
Test data shape after KPCA with Linear kernel (from scratch): (41, 2)
