## Tasks: reduce the dataset’s dimensionality (PCA, t-SNE, LLE, MDS, LDA), then apply classification (Logistic Regression, SVM, Random Forest)

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Fetch the 'mnist_784' dataset from OpenML as a (features, labels) pair.
X, y = fetch_openml('mnist_784', parser='auto', return_X_y=True)

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Divide both splits by 255 (assumes raw pixel intensities in 0-255 — TODO
# confirm). NOTE: re-running this cell divides the data again.
X_train, X_test = X_train / 255.0, X_test / 255.0

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import TSNE, MDS, LocallyLinearEmbedding
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ============================================================================
# 1. LOAD AND PREPARE DATA
# ============================================================================
print("=" * 70, "STEP 1: LOADING MNIST DATASET", "=" * 70, sep="\n")

# Wall-clock origin for the whole run; the final summary subtracts from it.
start_total = time.time()

X, y = fetch_openml('mnist_784', parser='auto', return_X_y=True)
X = X / 255.0  # Normalize to [0, 1] (assumes raw values span 0-255)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"✓ Train set: {X_train.shape}")
print(f"✓ Test set: {X_test.shape}")
print(f"✓ Loading time: {time.time() - start_total:.2f}s\n")

# ============================================================================
# 2. DIMENSIONALITY REDUCTION
# ============================================================================
print("=" * 70, "STEP 2: DIMENSIONALITY REDUCTION", "=" * 70, sep="\n")

# Registries filled in by the reducer sections of this script:
#   reduction_results: method name -> (reduced train matrix, reduced test matrix)
#   reduction_times:   method name -> wall-clock seconds measured for that section
reduction_results = dict()
reduction_times = dict()

# --- 2.1 PCA ---
print("\n[1/6] PCA (50 components)...")
start = time.time()
pca = PCA(random_state=42, n_components=50)
# Fit on the training split only, then project the test split with the
# already-fitted components.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
reduction_times['PCA'] = elapsed = time.time() - start
reduction_results['PCA'] = (X_train_pca, X_test_pca)
variance_kept = pca.explained_variance_ratio_.sum()
print(f"      ✓ Time: {elapsed:.2f}s | Variance: {variance_kept:.1%}")

# --- 2.2 Incremental PCA ---
print("\n[2/6] Incremental PCA (50 components)...")
start = time.time()
# 50 components again, but the estimator is configured with batch_size=200.
ipca = IncrementalPCA(batch_size=200, n_components=50)
X_train_ipca = ipca.fit_transform(X_train)
X_test_ipca = ipca.transform(X_test)
elapsed = time.time() - start
# Register the (train, test) pair so the later comparison loop picks it up.
reduction_results['IncrementalPCA'] = (X_train_ipca, X_test_ipca)
reduction_times['IncrementalPCA'] = elapsed
print(f"      ✓ Time: {elapsed:.2f}s")

# --- 2.3 LDA ---
print("\n[3/6] LDA (9 components - max for 10 classes)...")
start = time.time()
lda = LinearDiscriminantAnalysis(n_components=9)
# Unlike the PCA variants, fitting LDA here is supervised: the training
# labels are passed alongside the features. The test split is only projected.
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)
elapsed = time.time() - start
reduction_results['LDA'] = (X_train_lda, X_test_lda)
reduction_times['LDA'] = elapsed
print(f"      ✓ Time: {elapsed:.2f}s")

# --- 2.4 LLE (sampled) ---
# LLE is fitted on the first 10,000 training rows only (presumably to bound
# runtime). scikit-learn's LocallyLinearEmbedding does expose transform(), so
# the messages below say the test set is *not transformed here* rather than
# claiming it cannot be. Because no test projection is produced, LLE is timed
# but not registered in reduction_results.
print("\n[4/6] LLE (30 components, 10k samples - test set not transformed)...")
start = time.time()
lle = LocallyLinearEmbedding(n_components=30, n_neighbors=10, random_state=42, n_jobs=-1)
X_train_lle = lle.fit_transform(X_train[:10000])
elapsed = time.time() - start
reduction_times['LLE'] = elapsed
print(f"      ✓ Time: {elapsed:.2f}s | Note: Test set not transformed")

# --- 2.5 t-SNE (sampled) ---
print("\n[5/6] t-SNE (2 components, 5k samples - visualization only)...")
start = time.time()
tsne = TSNE(
    n_components=2,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
# Only the first 5,000 training rows are embedded; the result is timed but
# not added to reduction_results (no test-set projection is produced).
tsne_sample = X_train[:5000]
X_train_tsne = tsne.fit_transform(tsne_sample)
reduction_times['t-SNE'] = elapsed = time.time() - start
print(f"      ✓ Time: {elapsed:.2f}s | Note: Cannot transform test set")

# --- 2.6 MDS (sampled) ---
print("\n[6/6] MDS (10 components, 2k samples - very slow)...")
start = time.time()
mds = MDS(
    n_components=10,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
# Embed just the first 2,000 training rows; like t-SNE, this is timed only
# and contributes no (train, test) pair to reduction_results.
mds_sample = X_train[:2000]
X_train_mds = mds.fit_transform(mds_sample)
reduction_times['MDS'] = elapsed = time.time() - start
print(f"      ✓ Time: {elapsed:.2f}s | Note: Cannot transform test set")

# --- Summary Table ---
# One row per timed reduction method, in the order the methods were run.
print("\n" + "="*70)
print("DIMENSIONALITY REDUCTION TIMING SUMMARY")
print("="*70)
timing_df = pd.DataFrame({
    'Method': list(reduction_times),
    'Time (s)': [f"{t:.2f}" for t in reduction_times.values()],
    'Time (min)': [f"{t/60:.2f}" for t in reduction_times.values()],
    # Derived rather than hard-coded: a method gets a check mark exactly when
    # it stored a (train, test) pair in reduction_results. This keeps the
    # column the right length and correctly labelled even if a reducer
    # section is skipped, added, or run in a different order.
    'Can Transform Test': [
        '✓' if method in reduction_results else '✗'
        for method in reduction_times
    ],
})
print(timing_df.to_string(index=False))

# ============================================================================
# 3. CLASSIFICATION
# ============================================================================
print("\n" + "=" * 70, "STEP 3: CLASSIFICATION (using PCA-reduced data)", "=" * 70, sep="\n")

# classifier name -> {'time': seconds for fit + predict, 'accuracy': test accuracy}
classification_results = dict()

# --- 3.1 Logistic Regression ---
print("\n[1/3] Logistic Regression...")
start = time.time()
lr = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_lr = lr.fit(X_train_pca, y_train).predict(X_test_pca)
elapsed = time.time() - start  # covers both training and test-set prediction
acc_lr = accuracy_score(y_test, y_pred_lr)
classification_results['Logistic Regression'] = {'time': elapsed, 'accuracy': acc_lr}
print(f"      ✓ Time: {elapsed:.2f}s | Accuracy: {acc_lr:.1%}")

# --- 3.2 SVM (subset for speed) ---
print("\n[2/3] SVM (5k samples for speed)...")
start = time.time()
svm = SVC(random_state=42, kernel='rbf')
# Train on the first 5,000 PCA-reduced rows and their labels only; the full
# test split is still used for evaluation.
svm_features, svm_labels = X_train_pca[:5000], y_train[:5000]
svm.fit(svm_features, svm_labels)
y_pred_svm = svm.predict(X_test_pca)
elapsed = time.time() - start
acc_svm = accuracy_score(y_test, y_pred_svm)
classification_results['SVM'] = {'time': elapsed, 'accuracy': acc_svm}
print(f"      ✓ Time: {elapsed:.2f}s | Accuracy: {acc_svm:.1%}")

# --- 3.3 Random Forest ---
print("\n[3/3] Random Forest...")
start = time.time()
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
# Trained on the full PCA-reduced training split; timing spans fit + predict.
y_pred_rf = rf.fit(X_train_pca, y_train).predict(X_test_pca)
elapsed = time.time() - start
acc_rf = accuracy_score(y_test, y_pred_rf)
classification_results['Random Forest'] = {'time': elapsed, 'accuracy': acc_rf}
print(f"      ✓ Time: {elapsed:.2f}s | Accuracy: {acc_rf:.1%}")

# --- Classification Summary ---
print("\n" + "=" * 70, "CLASSIFICATION RESULTS SUMMARY", "=" * 70, sep="\n")

# One pre-formatted row per classifier, in the order they were recorded.
clf_rows = [
    (name, f"{stats['time']:.2f}", f"{stats['accuracy']:.1%}")
    for name, stats in classification_results.items()
]
clf_df = pd.DataFrame(clf_rows, columns=['Classifier', 'Time (s)', 'Accuracy'])
print(clf_df.to_string(index=False))

# ============================================================================
# 4. COMPARISON ACROSS REDUCTION METHODS
# ============================================================================
print("\n" + "=" * 70, "STEP 4: LOGISTIC REGRESSION ON ALL REDUCTION METHODS", "=" * 70, sep="\n")

# reduction method name -> test accuracy of a logistic regression trained on
# that method's reduced training matrix.
comparison_results = {}
for method, (train_reduced, test_reduced) in reduction_results.items():
    print(f"\nTesting {method}...")
    # A fresh, identically configured classifier per method keeps the
    # comparison about the reduction rather than the model settings.
    probe = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
    predictions = probe.fit(train_reduced, y_train).predict(test_reduced)
    comparison_results[method] = accuracy_score(y_test, predictions)
    print(f"      ✓ Accuracy: {comparison_results[method]:.1%}")

# --- Comparison Table ---
# One row per reduction method that was evaluated in comparison_results.
print("\n" + "="*70)
print("ACCURACY COMPARISON TABLE")
print("="*70)
comp_df = pd.DataFrame({
    'Reduction Method': list(comparison_results),
    # Read the component count from the width of each stored training matrix
    # instead of hard-coding [50, 50, 9], so the column stays correct (and the
    # right length) when reducers are added, removed, or re-parameterized.
    'Components': [reduction_results[k][0].shape[1] for k in comparison_results],
    'Accuracy': [f"{v:.1%}" for v in comparison_results.values()],
    'Reduction Time': [f"{reduction_times[k]:.2f}s" for k in comparison_results],
})
print(comp_df.to_string(index=False))

# ============================================================================
# FINAL SUMMARY
# ============================================================================
# Elapsed wall-clock time since start_total was set at the top of the pipeline.
total_time = time.time() - start_total
print("\n" + "=" * 70, "✓ ANALYSIS COMPLETE!", "=" * 70, sep="\n")
print(f"Total execution time: {total_time:.2f}s ({total_time/60:.2f} min)")

# Pick each winner once, then look its score up by name.
best_method = max(comparison_results, key=comparison_results.get)
print(f"\nBest reduction method: {best_method} ({comparison_results[best_method]:.1%})")

best_clf = max(
    classification_results,
    key=lambda name: classification_results[name]['accuracy'],
)
print(f"Best classifier: {best_clf} ({classification_results[best_clf]['accuracy']:.1%})")
print("=" * 70)

STEP 1: LOADING MNIST DATASET
✓ Train set: (56000, 784)
✓ Test set: (14000, 784)
✓ Loading time: 24.14s

STEP 2: DIMENSIONALITY REDUCTION

[1/6] PCA (50 components)...
      ✓ Time: 1.90s | Variance: 82.6%

[2/6] Incremental PCA (50 components)...
      ✓ Time: 18.17s

[3/6] LDA (9 components - max for 10 classes)...
      ✓ Time: 19.00s

[4/6] LLE (30 components, 10k samples - no transform available)...
      ✓ Time: 182.74s | Note: Cannot transform test set

[5/6] t-SNE (2 components, 5k samples - visualization only)...
      ✓ Time: 56.66s | Note: Cannot transform test set

[6/6] MDS (10 components, 2k samples - very slow)...
      ✓ Time: 105.12s | Note: Cannot transform test set

DIMENSIONALITY REDUCTION TIMING SUMMARY
        Method Time (s) Time (min) Can Transform Test
           PCA     1.90       0.03                  ✓
IncrementalPCA    18.17       0.30                  ✓
           LDA    19.00       0.32                  ✓
           LLE   182.74       3.05                

## **SUMMARY**
This analysis involved a machine learning pipeline on the MNIST dataset, focusing on dimensionality reduction followed by classification.

Step 1: Data Loading and Preparation. The MNIST_784 dataset was loaded, normalized, and split into training (56,000 samples) and testing (14,000 samples) sets.

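Step 2: Dimensionality Reduction. Several techniques were applied: PCA, Incremental PCA, LDA, LLE, t-SNE, and MDS. PCA (50 components) was the fastest (1.90s) and explained 82.6% of the variance. PCA, Incremental PCA, and LDA were fitted on the full training set and then used to transform the test set; LLE, t-SNE, and MDS were only fitted on training subsamples (10,000, 5,000, and 2,000 samples respectively) and were not applied to the test set.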

Step 3: Classification on PCA-reduced data. Three classifiers were trained: Logistic Regression, SVM, and Random Forest. SVM, trained on a subset of 5,000 samples, achieved the highest accuracy at 96.0% in 3.17s. Logistic Regression achieved 90.8%, and Random Forest achieved 95.3%.

Step 4: Comparison across Reduction Methods. Logistic Regression was re-applied to data reduced by PCA, IncrementalPCA, and LDA. PCA and IncrementalPCA both yielded 90.8% accuracy, while LDA resulted in 88.4% accuracy.

## Final Result and Interpretation

The analysis successfully demonstrated the impact of different dimensionality reduction techniques on classification performance. PCA emerged as the best reduction method for this task, offering a good balance of speed and classification accuracy (90.8% with Logistic Regression). Among the classifiers tested, SVM achieved the highest accuracy (96.0%) on the PCA-reduced data, demonstrating its effectiveness even with a sampled training set. This experiment highlights that simpler, linear reduction methods like PCA can be very efficient and effective for classification when combined with powerful classifiers like SVM, even on large datasets like MNIST.