# Principal Component Analysis (PCA) - From Scratch

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the customer table and discard the 'CustomerID' column in one step.
df = pd.read_csv('../../Mall_Customers.csv').drop(columns='CustomerID')

# Replace the 'Male' / 'Female' labels in 'Genre' with the integers 0 / 1 so
# that every remaining column is numeric.
df['Genre'] = df['Genre'].map({'Male': 0, 'Female': 1})

# Pull out the raw feature matrix, then rescale it with StandardScaler so the
# downstream covariance computation works on standardized columns.
X = df.to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
class PCAScratch:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Attributes set by ``fit`` (all ``None`` until then):
        components: array whose rows are the principal axes (eigenvectors of
            the feature covariance matrix), ordered by decreasing eigenvalue
            and truncated to ``n_components`` rows.
        mean: per-feature mean of the training data, subtracted before
            projecting.
        explained_variance: for each kept component, its eigenvalue divided by
            the sum of all eigenvalues (i.e. a variance *ratio*, not the raw
            eigenvalue).
    """

    def __init__(self, n_components):
        """Store the number of leading components to keep."""
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.explained_variance = None

    def fit(self, X):
        """Learn the principal axes of ``X`` (rows = samples, columns = features).

        Returns ``None``; results are stored on the instance.
        """
        X = np.asarray(X, dtype=float)
        self.mean = np.mean(X, axis=0)
        centered = X - self.mean

        # np.cov collapses a single-feature input to a 0-d scalar; promote it
        # back to a 1x1 matrix so the decomposition below still works.
        covariance_matrix = np.atleast_2d(np.cov(centered, rowvar=False))

        # The covariance matrix is symmetric, so use the symmetric solver:
        # unlike np.linalg.eig it always returns real eigenvalues and an
        # orthonormal set of eigenvectors, even for rank-deficient or
        # repeated-eigenvalue cases.
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

        # Sort by decreasing eigenvalue and switch to one eigenvector per row.
        # The sign of each axis is arbitrary (v and -v span the same direction).
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order].T

        self.components = eigenvectors[:self.n_components]
        self.explained_variance = (
            eigenvalues / np.sum(eigenvalues)
        )[:self.n_components]

    def transform(self, X):
        """Center ``X`` with the fitted mean and project it onto the kept axes.

        Returns an array with one column per kept component.
        """
        centered = np.asarray(X, dtype=float) - self.mean
        return np.dot(centered, self.components.T)

In [None]:
# Keep the two leading principal components of the standardized data.
pca = PCAScratch(n_components=2)
# fit() stores the mean, axes and variance ratios on `pca`; it returns nothing.
pca.fit(X_scaled)
# Project the same samples onto the two fitted axes (one column per component).
X_projected = pca.transform(X_scaled)

In [None]:
# Scatter of the samples in the plane of the first two principal components,
# with each point colored by the customer's spending score.
sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))

pc1, pc2 = X_projected[:, 0], X_projected[:, 1]
scatter = plt.scatter(
    pc1,
    pc2,
    c=df['Spending Score (1-100)'],
    cmap='viridis',
    edgecolor='white',
    s=80,
    alpha=0.8,
)

plt.title('PCA Projection: Mall Customers Clustering Structure')
plt.xlabel('Principal Component 1 (PC1)')
plt.ylabel('Principal Component 2 (PC2)')
# The colorbar is built from the scatter artist so it shares its color mapping.
plt.colorbar(scatter, label='Spending Score (1-100)')
plt.show()

In [None]:
# Scree plot: one bar per fitted component showing its share of total variance.
# Bar positions and tick labels are derived from the number of components that
# `pca` actually holds, so the cell keeps working if n_components is changed.
variance_ratios = np.asarray(pca.explained_variance)
positions = list(range(1, len(variance_ratios) + 1))

plt.figure(figsize=(8, 4))
plt.bar(positions, variance_ratios * 100, color='royalblue', alpha=0.7)
plt.xticks(positions, [f'PC{k}' for k in positions])
plt.ylabel('Percentage of Variance Explained (%)')
plt.title('Scree Plot: Explained Variance by Components')
plt.ylim(0, 100)
# Annotate each bar with its percentage, placed just above the bar top.
for pos, ratio in zip(positions, variance_ratios):
    plt.text(pos, ratio * 100 + 1, f"{ratio * 100:.1f}%", ha='center')
plt.show()

In [None]:
# Report the per-component variance ratios and their combined share.
variance_ratio = pca.explained_variance
total_ratio = np.sum(variance_ratio)
print("Explained Variance Ratio:", variance_ratio)
print("Total Variance Explained:", total_ratio)