<a href="https://colab.research.google.com/github/sagunkayastha/CAI_Workshop/blob/main/Workshop_3/msnist_tut.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard Libraries
import os
import numpy as np
import pandas as pd
import random as rn

# Visualization libraries

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Give every subsequent plot a light grey axes background
sns.set_style({"axes.facecolor": ".95"})

# Modeling and Machine Learning
from IPython.display import Image
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
# from sklearn.externals.six import StringIO
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz


# Seed for reproducibility: one shared value for every RNG used in this notebook
seed = 1234

# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it
# here presumably only affects Python processes launched after this cell.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed NumPy's global generator and Python's built-in `random` module
for seed_fn in (np.random.seed, rn.seed):
    seed_fn(seed)

In [None]:
!wget https://raw.githubusercontent.com/sagunkayastha/CAI_Workshop/main/Workshop_3/Inputs/test.csv
!wget https://raw.githubusercontent.com/sagunkayastha/CAI_Workshop/main/Workshop_3/Inputs/train.csv

In [None]:
# Specify paths for easy data loading
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'

# Load in training and testing data
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Stack the training rows on top of the test rows (train first, so the first
# len(train_df) rows of anything derived from concat_df are the training rows).
# ignore_index=True gives the combined frame unique row labels instead of
# repeating each original index once per source frame.
concat_df = pd.concat([train_df, test_df], ignore_index=True)


In [None]:
def acc(y_true : np.ndarray, y_pred : np.ndarray) -> float:
    """
        Percentage of predictions that match the true labels.

        :param y_true: The ground-truth labels
        :param y_pred: The predicted labels

        :return: the accuracy as a percentage, rounded to two decimal places
    """
    fraction_correct = accuracy_score(y_true, y_pred)
    return round(fraction_correct * 100, 2)

In [None]:
# Bar chart of how many training samples belong to each class
class_counts = train_df['label'].value_counts().sort_index()
ax = class_counts.plot(kind='bar', figsize=(10, 6), rot=0)
ax.set_title('Visualization of class distribution for the MNIST Dataset', fontsize=20, weight='bold')
ax.set_xlabel('Class', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14);

In [None]:
# Show one randomly chosen training image.
# The pixels are selected by column name rather than by position, so the label
# column can never end up inside the 28x28 image regardless of where it sits.
pixel_cols = [col for col in train_df.columns if col.startswith('pixel')]
# Draw from the first 1000 rows (or fewer if the frame is shorter)
sample_idx = np.random.randint(min(1000, len(train_df)))
sample_row = train_df.iloc[sample_idx]
plt.imshow(sample_row[pixel_cols].to_numpy(dtype=float).reshape(28, 28), cmap='gray')
plt.title(f"Label: {sample_row['label']}");

In [None]:
# Display the training DataFrame as the cell output
train_df

In [None]:
# Names of all pixel columns; these are the raw model inputs
features = [name for name in train_df.columns if name.startswith('pixel')]

# Hold out 25% of the training rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    train_df[features],
    train_df['label'],
    test_size=0.25,
    random_state=seed,
)

In [None]:
# Fit a depth-limited decision tree on the current training split as the baseline
clf = DecisionTreeClassifier(random_state=seed, max_depth=10)
clf.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the baseline model on the training and validation splits
train_preds_baseline = clf.predict(X_train)
val_preds_baseline = clf.predict(X_val)

# acc is declared as acc(y_true, y_pred): pass the true labels first
acc_baseline_train = acc(y_train, train_preds_baseline)
acc_baseline_val = acc(y_val, val_preds_baseline)
print(f'Training accuracy for our baseline (using all pixel features): {acc_baseline_train}%')
print(f'Validation accuracy for our baseline (using all pixel features): {acc_baseline_val}%')

In [None]:
from sklearn.decomposition import PCA

# Project the pixel columns of train + test onto 50 principal components.
# random_state is passed explicitly so the result does not depend on how much of
# the global NumPy random state earlier cells have consumed.
# NOTE(review): the projection is fitted on training and test rows together (and
# before the train/validation split) - confirm this is acceptable for the
# comparison being made.
pca = PCA(n_components=50, random_state=seed).fit(concat_df[features])
pca_result = pca.transform(concat_df[features])

# Keep the first two components, plus the label, for a 2-D scatter plot.
# .values copies the labels positionally, independent of concat_df's row index.
pca_df = pd.DataFrame(data=pca_result[:, 0:2], columns=['PC1', 'PC2'])
pca_df['label'] = concat_df['label'].values

# Visualize the first two principal components, coloured by label
plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='label', data=pca_df, palette="Set1", alpha=0.7)
plt.title('PCA - First Two Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()



In [None]:
explained_variance_ratio = pca.explained_variance_ratio_

# Report the share of variance captured by each of the first six components
for rank, share in enumerate(explained_variance_ratio[:6], start=1):
    print(f"Principal Component {rank}: {share:.2%} of the variance")

In [None]:
# Display the PCA-transformed rows as a DataFrame (one column per component)
pd.DataFrame(pca_result)

In [None]:

# Split up the PCA results in training and testing data.
# The column count is read from pca_result so this cell stays correct if the
# number of PCA components is changed; the training rows come first in pca_result.
n_pca_components = pca_result.shape[1]
n_train_rows = len(train_df)
pca_cols = [f'component_{i+1}' for i in range(n_pca_components)]
pca_train = pd.DataFrame(pca_result[:n_train_rows], columns=pca_cols)
pca_test = pd.DataFrame(pca_result[n_train_rows:], columns=pca_cols)

# Perform another split for PCA feature validation
X_train, X_val, y_train, y_val = train_test_split(pca_train,
                                                  train_df['label'],
                                                  test_size=0.25,
                                                  random_state=seed)

# Train model with PCA features
clf = DecisionTreeClassifier(max_depth=10, random_state=seed)
clf.fit(X_train, y_train)

# Evaluate model with the PCA features and compare to the baseline model
train_preds = clf.predict(X_train)
val_preds = clf.predict(X_val)
# acc is declared as acc(y_true, y_pred): pass the true labels first
acc_pca_train = acc(y_train, train_preds)
acc_pca_val = acc(y_val, val_preds)
print(f'Training accuracy with PCA features ({n_pca_components} components): {acc_pca_train}%')
print(f'Validation accuracy with PCA features ({n_pca_components} components): {acc_pca_val}%')
# Check out how it performed compared to the baseline
acc_diff = round(acc_pca_val - acc_baseline_val, 2)
print(f'\nThis is a difference of {acc_diff}% in validation accuracy compared to the baseline.')

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# Embed the PCA-reduced rows in two dimensions with t-SNE.
# n_components=2 is spelled out because the following cells name exactly two
# columns; random_state is passed explicitly so the embedding does not depend on
# how much of the global NumPy random state earlier cells have consumed.
tsne = TSNE(n_components=2, random_state=seed)
transformed = tsne.fit_transform(pca_result)

In [None]:
# Check the dimensions of the t-SNE output array
transformed.shape

In [None]:
# Separate the t-SNE embedding back into its training and testing rows
# (the first len(train_df) rows are the training rows)
tsne_columns = ['component1', 'component2']
n_train_rows = len(train_df)
tsne_train = pd.DataFrame(transformed[:n_train_rows], columns=tsne_columns)
tsne_test = pd.DataFrame(transformed[n_train_rows:], columns=tsne_columns)

In [None]:
# Visualize the results for t-SNE on MNIST.
# tsne_train comes from the preceding cell and plt / sns from the import cell at
# the top of the notebook, so neither is recreated here.
plt.figure(figsize=(14, 14))
plt.title(
    "Visualization of t-SNE results on the MNIST Dataset\n"
    f"Amount of datapoints: {len(tsne_train)}",
    fontsize=24, weight='bold',
)
# Colour each embedded training point by its label
sns.scatterplot(x="component1", y="component2",
                data=tsne_train, hue=train_df['label'],
                palette="Set1", legend="full")
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel("Component 1", fontsize=16)
plt.ylabel("Component 2", fontsize=16)
plt.legend(fontsize=16)
plt.show()  # Explicitly show the plot


In [None]:
# Hold out 25% of the t-SNE training rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    tsne_train,
    train_df['label'],
    test_size=0.25,
    random_state=seed,
)

# Fit a depth-limited decision tree on the two t-SNE components
clf = DecisionTreeClassifier(random_state=seed, max_depth=10)
clf.fit(X=X_train, y=y_train)


In [None]:
# Evaluate model with t-SNE features and compare to the baseline model
train_preds = clf.predict(X_train)
val_preds = clf.predict(X_val)
# acc is declared as acc(y_true, y_pred): pass the true labels first
acc_tsne_train = acc(y_train, train_preds)
acc_tsne_val = acc(y_val, val_preds)
print(f'Training accuracy with t-SNE features: {acc_tsne_train}%')
print(f'Validation accuracy with t-SNE features: {acc_tsne_val}%')
# Compare t-SNE results with the baseline model
acc_diff = round(acc_tsne_val - acc_baseline_val, 2)
print(f'\nThis is an improvement of {acc_diff}% in validation accuracy over the baseline!')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a 3-nearest-neighbour classifier.
# X_train / X_val are whatever the most recent split produced; when the notebook
# is run top to bottom that is the t-SNE feature split, not the raw pixels.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Evaluate the KNN model. Results are stored under KNN-specific names so the
# decision-tree baseline accuracies used by the earlier comparison cells are not
# overwritten.
train_preds_knn = knn.predict(X_train)
val_preds_knn = knn.predict(X_val)
# acc is declared as acc(y_true, y_pred): pass the true labels first
acc_knn_train = acc(y_train, train_preds_knn)
acc_knn_val = acc(y_val, val_preds_knn)
print(f'Training accuracy for KNN (k=3, using t-SNE features): {acc_knn_train}%')
print(f'Validation accuracy for KNN (k=3, using t-SNE features): {acc_knn_val}%')