<a href="https://colab.research.google.com/github/raihanewubd/CSE457/blob/main/PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 1. Load and Explore MNIST Dataset
# Pin the OpenML dataset version so every run downloads the same data, and
# request pandas objects explicitly because later cells index X with `.iloc`.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

# `data` holds one row of pixel values per image; `target` holds the labels.
X, y = mnist['data'], mnist['target']
print("Dataset Shape:", X.shape)  # (70000, 784) - 70k images, each with 784 pixels




In [None]:
# Show the first ten samples in a 2x5 grid, each titled with its label
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))
for idx, axis in enumerate(axes.flat):
    # Row idx of the DataFrame is a flat vector; fold it back into a 28x28 image
    pixels = X.iloc[idx].to_numpy().reshape(28, 28)
    axis.imshow(pixels, cmap='binary')
    # Empty tick lists hide the axis ticks so only the image and title remain
    axis.set(xticks=[], yticks=[], title=y[idx])
plt.show()



In [None]:
# 2. Standardize the Data
# Learn the per-feature scaling statistics from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)



In [None]:
# 3. Perform PCA
# n_components=None (the default) keeps all components, so the complete
# explained-variance spectrum can be inspected afterwards.
pca = PCA(n_components=None)
X_pca = pca.fit_transform(X_scaled)



In [None]:
# Share of variance captured by each component, and its running total
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()

# Scree Plot: the curve drawn is the cumulative explained variance against
# the number of components kept (1-based on the x-axis).
component_counts = np.arange(1, explained_variance_ratio.size + 1)
plt.plot(component_counts, cumulative_variance, marker='o')
plt.title('PCA Scree Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()



In [None]:
# Choose the smallest number of components whose cumulative explained
# variance reaches the target fraction (95%).
variance_threshold = 0.95
# argmax over a boolean array returns the index of the first True; adding 1
# turns that zero-based index into a component count. int() gives a plain
# Python integer rather than a NumPy scalar.
n_components = int(np.argmax(cumulative_variance >= variance_threshold)) + 1
print(f"Selected {n_components} components explaining {cumulative_variance[n_components-1]:.2f} of variance")

# Refit with only the selected components. A fixed random_state keeps the
# projection reproducible when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=n_components, random_state=42)
X_reduced = pca.fit_transform(X_scaled)

# 4. Visualize PCA Results
# Scatter the first two principal components, coloured by the label's
# categorical code.
# NOTE(review): assumes the category order is '0'..'9' so codes equal digits - confirm.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y.cat.codes, cmap='viridis', s=10, alpha=0.5) # Convert y to numerical codes
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('MNIST Data After PCA')
plt.colorbar(label='Digit')
plt.show()



In [None]:
# 5. Evaluation (optional)
# Split the raw pixels once so both models are trained and scored on exactly
# the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows only and merely apply it to the
# test rows; fitting on the full dataset would leak test-set statistics into
# the features and make the reported accuracy optimistic.
eval_scaler = StandardScaler().fit(X_train)
X_train_scaled = eval_scaler.transform(X_train)
X_test_scaled = eval_scaler.transform(X_test)

# Same number of components as selected earlier; random_state keeps the
# projection reproducible if a randomized SVD solver is chosen.
eval_pca = PCA(n_components=n_components, random_state=42).fit(X_train_scaled)
X_reduced_train = eval_pca.transform(X_train_scaled)
X_reduced_test = eval_pca.transform(X_test_scaled)

# Logistic Regression on original data (all pixels, standardized so that the
# only difference from the reduced model is the PCA step)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_scaled, y_train)
accuracy_original = clf.score(X_test_scaled, y_test)

# Logistic Regression on reduced data
clf_reduced = LogisticRegression(max_iter=1000)
clf_reduced.fit(X_reduced_train, y_train)
accuracy_reduced = clf_reduced.score(X_reduced_test, y_test)

print("Accuracy (original):", accuracy_original)
print("Accuracy (reduced):", accuracy_reduced)