Train a model on Fashion MNIST and apply PCA to analyze performance differences.

Perform feature selection on the M1 dataset using various methods.

Apply K-Means clustering on the Iris dataset.

In [1]:
import time

import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Fashion MNIST Classification with PCA
# Preprocessing pipeline: convert each image to a tensor, then normalise it
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# Build the train split first and the test split second; both are downloaded
# into ./data on first use and share the same preprocessing pipeline.
train_dataset, test_dataset = (
    datasets.FashionMNIST(root="./data", train=is_train, transform=transform, download=True)
    for is_train in (True, False)
)

# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

100%|██████████| 26.4M/26.4M [00:01<00:00, 20.9MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 344kB/s]
100%|██████████| 4.42M/4.42M [00:00<00:00, 6.22MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 12.5MB/s]


In [2]:
class SimpleNN(nn.Module):
    """Small fully connected classifier: input -> 128 hidden units (ReLU) -> 10 logits.

    Args:
        input_dim: Number of features per sample once everything after the
            batch dimension has been flattened. The default of 784 matches a
            flattened 28x28 image.
    """

    def __init__(self, input_dim=784):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        """Flatten each sample to one row and return the raw class logits.

        Args:
            x: Batch whose first dimension is the batch size; all remaining
                dimensions are flattened and must multiply out to ``input_dim``.

        Returns:
            Tensor of shape ``(batch, 10)`` holding unnormalised scores.
        """
        # reshape behaves like view for contiguous inputs but also accepts
        # non-contiguous ones (e.g. transposed batches), where view raises.
        return self.fc(x.reshape(x.size(0), -1))

def train_model(model, train_loader, epochs=5, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss, then return it.

    The model is moved to CUDA when available (otherwise it stays on the CPU)
    and is switched to training mode before the first batch.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to class logits.
        train_loader: Iterable yielding ``(inputs, labels)`` batches; labels
            must be class indices as expected by ``nn.CrossEntropyLoss``.
        epochs: Number of full passes over ``train_loader``.
        lr: Learning rate handed to the Adam optimiser.

    Returns:
        The same ``model`` instance, trained in place.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Make sure layers such as dropout / batch norm use their training
    # behaviour even if the caller handed over a model left in eval mode.
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for _ in range(epochs):
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()
    return model

In [3]:
# Train without PCA
# Seed PyTorch's global RNG so weight initialisation and the DataLoader's
# shuffle order are repeatable from run to run (on CPU).
torch.manual_seed(42)
model = SimpleNN()

# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
model = train_model(model, train_loader)
end_time = time.perf_counter()
print("Training time without PCA:", end_time - start_time)


Training time without PCA: 117.98651003837585


In [4]:
# Apply PCA
# Read the image and label tensors straight from the dataset objects (not via
# the DataLoaders) and flatten every image into a single row, giving 2-D
# NumPy arrays of shape (n_samples, n_pixels) alongside 1-D label arrays.
train_images = train_dataset.data.flatten(start_dim=1).numpy()
train_labels = train_dataset.targets.numpy()
test_images = test_dataset.data.flatten(start_dim=1).numpy()
test_labels = test_dataset.targets.numpy()


In [5]:
# Standardise every pixel column. The scaler is fitted on the training split
# only and then reused for the test split, so no test statistics leak in.
scaler = StandardScaler()
train_images = scaler.fit_transform(train_images)
test_images = scaler.transform(test_images)

# Project onto the first 50 principal components, again fitting on the
# training split only. random_state pins the result whenever scikit-learn
# selects a randomized SVD solver (the "auto" choice depends on the library
# version and the data shape), keeping the projection reproducible.
pca = PCA(n_components=50, random_state=42)  # Optimal value found experimentally
train_pca = pca.fit_transform(train_images)
test_pca = pca.transform(test_images)

In [6]:
# Train Model with PCA Features
# Seed PyTorch's global RNG so weight initialisation and the DataLoader's
# shuffle order are repeatable from run to run (on CPU).
torch.manual_seed(42)

# Size the input layer from the projected data instead of repeating the PCA
# component count by hand.
model_pca = SimpleNN(input_dim=train_pca.shape[1])

# TensorDataset indexes the two tensors directly, avoiding a Python list that
# holds one (features, label) tuple per sample.
train_dataset_pca = TensorDataset(
    torch.as_tensor(train_pca, dtype=torch.float32),
    torch.as_tensor(train_labels, dtype=torch.long),
)
train_loader_pca = DataLoader(train_dataset_pca, batch_size=64, shuffle=True)

# NOTE(review): the non-PCA run applies the image transform per sample inside
# its DataLoader, while this run iterates over precomputed tensors, so the
# timing gap reflects data-loading overhead as well as the smaller input size.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time_pca = time.perf_counter()
model_pca = train_model(model_pca, train_loader_pca)
end_time_pca = time.perf_counter()
print("Training time with PCA:", end_time_pca - start_time_pca)

Training time with PCA: 7.973690032958984


In [7]:
# K-Means on Iris Dataset
IRIS_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
df = pd.read_csv(IRIS_URL)

# Cluster on the standardised numeric measurements only; the species label is
# dropped so the clustering never sees it.
df_scaled = StandardScaler().fit_transform(df.drop(columns='species'))

# Three clusters, ten restarts, fixed seed for repeatable assignments.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(df_scaled)
df['Cluster'] = kmeans.labels_
print(df.head(5))


   sepal_length  sepal_width  petal_length  petal_width species  Cluster
0           5.1          3.5           1.4          0.2  setosa        1
1           4.9          3.0           1.4          0.2  setosa        1
2           4.7          3.2           1.3          0.2  setosa        1
3           4.6          3.1           1.5          0.2  setosa        1
4           5.0          3.6           1.4          0.2  setosa        1


In [8]:
# Visualise the K-Means clusters in 3D (plotly.express is imported as `px`
# in the notebook's top import cell).
# The integer cluster ids are cast to strings so Plotly treats them as discrete
# categories (one colour per cluster) rather than a continuous colour scale.
# Marker symbols show the true species, so clusters and labels can be compared.
fig = px.scatter_3d(
    df.assign(Cluster=df['Cluster'].astype(str)),
    x='sepal_length',
    y='sepal_width',
    z='petal_length',
    color='Cluster',
    symbol='species',
    title='K-Means clusters on the Iris dataset',
)
fig.show()