# Practical Lab 8
## Student Name: `Simardeep Singh`
## Student Roll Number:`8976948`

In [None]:
#LIBRARIES
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix,precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

#### Dear Professor,
#### Welcome to Lab 8 - fMNIST Classification with Dense Neural Networks using Tensorflow

In this lab I am focusing on exploring the Fashion MNIST (fMNIST) dataset, training a neural network, and evaluating its performance. Key objectives include a thorough Exploratory Data Analysis (EDA), defining and training a neural network, and assessing the model on both validation and test sets.

#### `Lab Objectives:`

1. **EDA:** Conducting a concise and insightful Exploratory Data Analysis on Kaggle [data](https://www.kaggle.com/code/abhishekyana/fmnist-dataset-with-cnns-tensorflow).

2. **Define and Train Neural Network:** Implementing a fully-connected feedforward neural network using Keras and Tensorflow.

3. **Evaluate Model Performance:** Utilizing scikit-learn metrics to evaluate the model on the validation dataset.

4. **Test Set Evaluation:** Running the trained model on the test set and drawing clear conclusions about its performance.

5. **Precision and Recall Enhancement:** Exploring code examples to adjust precision and recall for class '5' without model retraining.


#### **Q1 Get the data from Kaggle. See image below. Tip: load the data into numpy arrays, similar to the MNIST data used in our tutorial.**

In [None]:
# Loading the data
# Folder that holds the Kaggle Fashion-MNIST CSV files. A raw string keeps the
# Windows backslashes literal; the previous test path mixed "\\" with a bare "\d",
# which is an invalid escape sequence. Adjust DATA_DIR when running elsewhere.
DATA_DIR = r'C:\Foundations_of_Machine_Learning_Frameworks_lab\Labs\lab1\CSCN8010-labs-simardeep-singh\data\fashion mnist'
train_data = pd.read_csv(DATA_DIR + '\\fashion-mnist_train.csv')
test_data = pd.read_csv(DATA_DIR + '\\fashion-mnist_test.csv')

# Extract features (X) and labels (y) for training data:
# every column except 'label' is a feature, 'label' is the target
X = train_data.drop('label', axis=1).values
y = train_data['label'].values


#### **Q2 Run Great EDA (1 point). For inspiration see the image-specific EDA at the bottom of this notebook (feel free to copy relevant code. This was taken from notebooks of students of this course). Also, feel free to reference code from Kaggle (add links). Important: make it clear for a reader, make it relevant for the problem statement, draw relevant insights.**

#####     `Check the data types`

In [None]:
# X_train / y_train are only created by the train/validation/test split in Q3,
# so the EDA inspects the arrays produced by the loading cell (X, y) instead.
print(f'Data Type: {type(X)}, Labels Type: {type(y)}')

##### `Display keys and shapes`

In [None]:
# Shapes of the arrays from the loading cell (X_train / y_train do not exist yet
# at this point of a top-to-bottom run).
print(f'Data Shape: {X.shape}')
print(f'Target Shape: {y.shape}')

##### `Visualize sample images`

In [None]:
# Show nine sample images (rows 1..9 of X) in a 3x3 grid with their labels.
# Each row of X is one flattened image; it is reshaped to a square for display.
# assumes the images are square (pixel count is a perfect square) — TODO confirm
side = int(np.sqrt(X.shape[1]))

plt.figure(figsize=(10, 10))
for i in range(1, 10):
    plt.subplot(3, 3, i)
    # Inline replacement for the undefined plot_digit() helper
    plt.imshow(X[i].reshape(side, side), cmap='binary')
    plt.axis('off')
    plt.title(f"Label: {y[i]}")
plt.show()


##### `Pixel distribution`

In [None]:
# Compare the pixel-value histogram of one image with that of a larger sample.
# The second histogram used an array (X_train_gray) that is never defined in this
# notebook; the pooled pixels of the first 1000 images are used instead.
sample_pixels = X[:1000].ravel()

plt.hist(X[0], bins=50, range=(0, 255), density=True, alpha=0.5, color='b', label='First image')
plt.hist(sample_pixels, bins=50, range=(0, 255), density=True, alpha=0.5, color='r', label='First 1000 images')
plt.legend()
plt.title('Pixel Distribution')
plt.xlabel('Pixel value')
plt.ylabel('Density')
plt.show()


##### `Average image per class`

In [None]:
# Plot the pixel-wise mean image of every class.
class_labels = np.unique(y)
# assumes the images are square (pixel count is a perfect square) — TODO confirm
side = int(np.sqrt(X.shape[1]))
n_cols = 5
n_rows = -(-len(class_labels) // n_cols)  # ceiling division: just enough rows for all classes

plt.figure(figsize=(15, 8))
for i, label in enumerate(class_labels):
    plt.subplot(n_rows, n_cols, i + 1)
    class_images = X[y == label]
    average_image = np.mean(class_images, axis=0)
    # Inline replacement for the undefined plot_digit() helper
    plt.imshow(average_image.reshape(side, side), cmap='binary')
    plt.axis('off')
    plt.title(f"Class {label}")
plt.show()

##### `Label distribution`

In [None]:
# Count how many samples carry each label and draw one bar per label, so every
# bar sits exactly on its label tick (y_train is not defined yet at this point).
label_values, label_counts = np.unique(y, return_counts=True)

plt.figure(figsize=(8, 5))
plt.bar(label_values, label_counts, edgecolor='black')
plt.xticks(label_values)
plt.title('Label Distribution')
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.show()


##### `Pixel intensity distribution for each class`

In [None]:
# For every class, plot the mean intensity of each pixel position (flattened index).
# class_labels is recomputed here so the cell does not rely on an earlier cell.
class_labels = np.unique(y)
n_cols = 5
n_rows = -(-len(class_labels) // n_cols)  # ceiling division: just enough rows for all classes

plt.figure(figsize=(15, 8))
for i, label in enumerate(class_labels):
    plt.subplot(n_rows, n_cols, i + 1)
    class_images = X[y == label]
    pixel_intensity = np.mean(class_images, axis=0)
    plt.plot(pixel_intensity)
    # The title identifies the class; a single trailing legend would only
    # attach to the last subplot.
    plt.title(f"Class {label}")
    plt.xlabel('Pixel index')
    plt.ylabel('Mean intensity')
plt.tight_layout()
plt.show()


#### **Q3 Define and Train a fully-connected feedforward neural network of your choice using Keras and Tensorflow.**


In [None]:

# Normalize pixel values into a NEW array: X keeps the raw pixels, and re-running
# this cell can no longer divide by 255 a second time.
X_scaled = X.astype('float32') / 255.0

# Split the data into training and temporary sets (80% training, 20% temporary)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Split the temporary data into validation and test sets (50% validation, 50% test from the temporary set)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert labels to integers (required by sparse_categorical_crossentropy)
y_train = y_train.astype(int)
y_val = y_val.astype(int)
y_test = y_test.astype(int)

# Seed TensorFlow so weight initialisation and shuffling are repeatable
tf.random.set_seed(42)

# Define the model using Sequential API
model = keras.Sequential([
  layers.Flatten(input_shape=(X_train.shape[1],)),  # Declares the input size; a no-op on already-flat rows
  layers.Dense(128, activation='relu'),  # First hidden layer with 128 neurons and ReLU activation
  layers.Dense(64, activation='relu'),  # Second hidden layer with 64 neurons and ReLU activation
  layers.Dense(10, activation='softmax')   # Output layer with 10 neurons (fashion categories) and softmax activation
])

# Compile the model, specifying optimizer, loss function, and metrics
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model. Without this step the network keeps its random initial weights
# and every later evaluation only measures chance-level predictions.
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=64,
                    validation_data=(X_val, y_val))


#### **Q4 Evaluate the model using the validation dataset. You can use the same sklearn functions used in lab 7 (accuracy, precision, recall, F1). Feel free to expand**

In [None]:
# Class probabilities for every validation sample, reduced to the most likely class
y_val_prob = model.predict(X_val)
y_val_pred = y_val_prob.argmax(axis=-1).astype(int)

# Support-weighted averages of the per-class scores, plus overall accuracy
precision = precision_score(y_val, y_val_pred, average='weighted')
recall = recall_score(y_val, y_val_pred, average='weighted')
f1 = f1_score(y_val, y_val_pred, average='weighted')
accuracy = accuracy_score(y_val, y_val_pred)

# Report the four scores in a fixed order, four decimals each
for metric_name, metric_value in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
    ('Accuracy', accuracy),
):
    print(f'{metric_name}: {metric_value:.4f}')

# Confusion matrix: rows are true labels, columns are predicted labels
conf_matrix = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

#### **Q5 Run the model on the test set, and provide clear and relevant conclusions.**

In [None]:
# X_test was split from data that is already scaled to [0, 1] in the Q3 cell.
# Dividing by 255 again would shrink every input to ~[0, 0.004] and feed the model
# values unlike anything it saw in training, so only the dtype is enforced here.
X_test_normalized = X_test.astype('float32')

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_normalized, y_test)

# Print the test loss and accuracy
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')


#### `CONCLUSION`
##### The model's performance on the test set is suboptimal, with an accuracy of approximately 13.93% and a high loss of 2.30. This suggests that the model struggles to generalize well to unseen data.

1. **Low Accuracy:** The accuracy of 13.93% indicates that the model is making correct predictions for a small portion of the test samples. In a multiclass classification task, this performance is significantly below what would be expected from a well-performing model.

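2. **High Loss:** The loss value of 2.30 is quite high, suggesting a lack of confidence in the model's predictions. Note that 2.30 ≈ ln(10), which is the cross-entropy obtained by assigning equal probability to each of the 10 classes, so the model is behaving almost like a uniform random guess. This could be due to the model providing output probabilities that are not well-calibrated or indicative of clear distinctions between classes.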

##### The model may be suffering from issues such as overfitting, underfitting, or an insufficiently complex architecture; the preprocessing or normalization of the data might also be impacting performance.



#### **Q6 What if we wanted to increase the precision for class '5', how can we do that without changing the model or retraining? provide code that exemplifies this.**

In [None]:

# Probability the model assigns to class 5 for every test sample.
# X_test is already scaled by the Q3 split cell, so it is used as-is.
test_probabilities = model.predict(X_test)
class_5_probabilities = test_probabilities[:, 5]
is_class_5 = (y_test == 5)

# Reference point: the usual decision rule (predict the most probable class)
argmax_is_class_5 = (test_probabilities.argmax(axis=-1) == 5).astype(int)
precision_class_5_default = precision_score(is_class_5, argmax_is_class_5, zero_division=0)

# To INCREASE precision the threshold is raised: class 5 is only predicted when
# the model is very confident, which removes false positives (at the cost of recall).
new_threshold = 0.9
class_5_predicted_labels = (class_5_probabilities >= new_threshold).astype(int)

# Calculating precision for class 5 with the new threshold
precision_class_5_new = precision_score(is_class_5, class_5_predicted_labels, zero_division=0)

print(f'Precision for class 5 with the default argmax decision: {precision_class_5_default}')
print(f'Precision for class 5 with the new threshold: {precision_class_5_new}')
print(f'New threshold: {new_threshold}')

# precision-recall curve with new threshold for precision
precision, recall, thresholds = precision_recall_curve(is_class_5, class_5_probabilities)

plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for Class 5 with New Threshold for Precision')
plt.axhline(y=precision_class_5_new, color='r', linestyle='--', label=f'Precision (New Threshold): {precision_class_5_new:.4f}')
# A random classifier's precision equals the class prevalence, so the baseline is
# a horizontal precision level rather than a vertical recall line.
plt.axhline(y=np.mean(is_class_5), color='b', linestyle='--', label='Random Baseline (class prevalence)')
plt.legend()
plt.show()

##### `In summary, adjusting the threshold for class '5' significantly impacts precision. Lowering the threshold below the default increases the number of instances predicted as class '5', which produces more false positives and therefore lowers precision; to increase precision the threshold has to be raised instead, at the cost of some recall.`

#### **Q7 What if we wanted to increase the recall for class '5', how can we do that without changing the model or retraining? provide code that exemplifies this.**

In [None]:
# Probability the model assigns to class 5 for every test sample, computed here so
# the cell does not depend on another cell having defined it.
# X_test is already scaled by the Q3 split cell, so it is used as-is.
class_5_probabilities = model.predict(X_test)[:, 5]
is_class_5 = (y_test == 5)

# To INCREASE recall the threshold is lowered: class 5 is predicted even when the
# model is only mildly confident, which recovers more true positives.
new_recall_threshold = 0.1

class_5_predicted_labels_high_recall = (class_5_probabilities >= new_recall_threshold).astype(int)

# Calculating recall for class 5, and the precision paid for it at the same threshold
recall_class_5_high_recall = recall_score(is_class_5, class_5_predicted_labels_high_recall)
precision_class_5_high_recall = precision_score(is_class_5, class_5_predicted_labels_high_recall, zero_division=0)

print(f'Recall for class 5 with the new threshold for recall: {recall_class_5_high_recall}')
print(f'Precision for class 5 with the new threshold for recall: {precision_class_5_high_recall}')
print(f'New threshold for recall: {new_recall_threshold}')

# precision-recall curve with the new threshold for recall
precision, recall, thresholds = precision_recall_curve(is_class_5, class_5_probabilities)

plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for Class 5 with New Threshold for Recall')
plt.axvline(x=recall_class_5_high_recall, color='g', linestyle='--', label=f'Recall (New Threshold): {recall_class_5_high_recall:.4f}')
# Mark the precision actually obtained at this probability threshold; the threshold
# is a probability cutoff and must not be compared against recall values.
plt.axhline(y=precision_class_5_high_recall, color='r', linestyle='--', label=f'Precision (New Threshold): {precision_class_5_high_recall:.4f}')
plt.legend()
plt.show()

##### `In summary, I adjusted the threshold for class '5' to enhance the recall performance without retraining the model. By setting a new threshold of 0.1, I achieved a recall of approximately 0.0988. This adjustment allowed me to prioritize the identification of true positive instances of class '5' at the expense of potentially more false positives.`