# Singular Value Decomposition (SVD) - From Scratch

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw table and force every column to a numeric dtype. "?" placeholders
# are turned into NaN first; any other non-numeric token is coerced to NaN too.
df = (
    pd.read_csv('../../data.csv')
    .replace("?", np.nan)
    .apply(pd.to_numeric, errors='coerce')
    .drop(columns=['slope', 'ca', 'thal'])
)

# Fill the remaining gaps with each column's median.
df = df.fillna(df.median())

# Every column except the target forms the feature matrix. The target's name
# carries trailing spaces, so it has to be spelled exactly like this.
X = df.drop(columns=['num       ']).values

# Standardize the features; the fitted scaler is kept around for later reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
class SVDScratch:
    """Truncated SVD wrapper around ``np.linalg.svd``.

    The input matrix is factorised as ``X = U @ diag(S) @ VT`` and only the
    leading ``n_components`` singular triplets are kept. The class does not
    centre or scale the data; pass an already-preprocessed matrix if that is
    required.

    Attributes are ``None`` until ``fit_transform`` has been called:
        U:  left singular vectors, one column per kept component.
        S:  kept singular values, in the order returned by ``np.linalg.svd``.
        VT: right singular vectors, one row per kept component.
    """

    def __init__(self, n_components):
        """Store the number of components to keep.

        Args:
            n_components: Positive integer number of leading components to
                retain, or ``None`` to keep every component. Values larger
                than the number of available components keep all of them.
        """
        self.n_components = n_components
        self.U = None
        self.S = None
        self.VT = None

    def fit_transform(self, X):
        """Factorise ``X`` and return its projection onto the kept components.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Array of shape (n_samples, k) equal to ``U * S``, where ``k`` is
            the number of components actually kept.

        Raises:
            TypeError: If ``n_components`` is neither ``None`` nor an integer.
            ValueError: If ``n_components`` is smaller than 1 or ``X`` is not
                two-dimensional.
        """
        k = self.n_components
        if k is not None:
            if not isinstance(k, (int, np.integer)):
                raise TypeError(
                    f"n_components must be an integer or None, got {type(k).__name__}"
                )
            # A negative value would slice off trailing components instead of
            # keeping leading ones, and zero would yield empty factors.
            if k < 1:
                raise ValueError(f"n_components must be >= 1, got {k}")

        X = np.asarray(X)
        # np.linalg.svd also accepts stacks of matrices, but the slicing
        # below is only meaningful for a single 2-D matrix.
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got ndim={X.ndim}")

        # full_matrices=False gives the economy-size factors.
        U, S, VT = np.linalg.svd(X, full_matrices=False)
        self.U = U[:, :k]
        self.S = S[:k]
        self.VT = VT[:k, :]
        # Broadcasting scales column j of U by singular value j.
        return self.U * self.S

    def reconstruct(self):
        """Return the low-rank approximation ``(U * S) @ VT`` of the fitted matrix.

        Raises:
            RuntimeError: If called before ``fit_transform``.
        """
        if self.U is None or self.S is None or self.VT is None:
            raise RuntimeError(
                "SVDScratch.reconstruct() called before fit_transform()"
            )
        return np.dot(self.U * self.S, self.VT)

In [None]:
# Fit a five-component truncated SVD on the standardized features, keeping
# both the per-sample scores (U * S) and the rank-limited approximation.
svd = SVDScratch(5)
X_reduced = svd.fit_transform(X_scaled)
X_reconstructed = svd.reconstruct()

In [None]:
# Average squared difference between the standardized matrix and its
# truncated-SVD approximation, taken over every entry.
mse = np.square(X_scaled - X_reconstructed).mean()
print("Reconstruction Mean Squared Error:", mse)

In [None]:
sns.set_style("whitegrid")

# Share of the total per-feature variance of X_scaled that falls along each
# retained component (variance of the corresponding column of X_reduced).
total_variance = np.sum(np.var(X_scaled, axis=0))
explained_variance = np.var(X_reduced, axis=0)
explained_variance_ratio = explained_variance / total_variance

# Derive the x positions from the number of retained components rather than
# hard-coding five, so the plot stays valid if n_components is changed.
component_ids = range(1, len(explained_variance_ratio) + 1)

plt.figure(figsize=(10, 6))
plt.bar(component_ids, explained_variance_ratio, color='darkorchid', alpha=0.6, align='center', label='Individual explained variance')
plt.step(component_ids, np.cumsum(explained_variance_ratio), where='mid', color='midnightblue', label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio', fontsize=12)
plt.xlabel('Singular Components', fontsize=12)
plt.title('SVD Variance Explanation: Reducing Feature Redundancy', fontsize=14)
plt.xticks(component_ids)
plt.legend(loc='best')
plt.show()

In [None]:
# Scatter the samples in the plane of the first two SVD components, coloured
# by the target column of df.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    c=df['num       '],
    cmap='plasma',
    s=50,
    alpha=0.7,
    edgecolor='white',
)
fig.colorbar(points, ax=ax, label='Heart Disease (num)')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title('SVD Projection: Top 2 Latent Factors', fontsize=14)
plt.show()