<h1 align="center">
    Lab 03
</h1>

In [None]:
# Imports: data handling, image I/O, scikit-learn models and metrics, plotting

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import seaborn as sns
import random
import glob
from PIL import Image
from collections import Counter

In [2]:
# Reusable function to display a confusion matrix
def plot_confusion_matrix(cm, class_names, title):
    """Show a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : array-like
        Square matrix of integer counts (cells are formatted with ``'d'``).
    class_names : sequence
        Tick labels used for both the predicted (x) and true (y) axes.
    title : str
        Title drawn above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

In [None]:
# Step 2: Load the dataset
# The dataset is expected as a directory holding the images plus a CSV index with the labels.

# Dataset root directory and the CSV index that lives inside it
data_dir = 'data/chinese_mnist'
metadata_path = os.path.join(data_dir, 'chinese_mnist.csv')

# Read the index and preview its first rows
metadata = pd.read_csv(metadata_path)
print("Dataset metadata sample:", metadata.head(), sep="\n")

# Basic statistics: number of rows, number of distinct labels, rows per label
print(
    f"Total number of images: {len(metadata)}",
    f"Number of unique classes: {metadata['value'].nunique()}",
    "Class distribution:",
    metadata['value'].value_counts(),
    sep="\n",
)

# Function to load and preprocess images
def load_images(metadata, data_dir, name_column=None):
    """Load the grayscale images listed in ``metadata`` together with their labels.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Index table with at least the columns ``suite_id``, ``sample_id`` and
        ``value``; one row per image.
    data_dir : str
        Dataset root; images are read from ``<data_dir>/images``.
    name_column : str, optional
        Column whose value is the last component of the file name
        ``input_<suite_id>_<sample_id>_<name_column>.jpg``. When omitted,
        ``'code'`` is used if the metadata has such a column, else ``'value'``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(images, labels)``: every image that could be read, and the
        ``value`` entry of the corresponding row. Rows whose file is missing
        or unreadable are left out (their count is printed).
    """
    if name_column is None:
        # NOTE(review): the public Chinese MNIST release names its files by the
        # `code` column (1..15), not by `value`; building the name from `value`
        # would then skip some rows and pair others with the wrong label.
        # Confirm against your copy of the data; pass name_column='value' to
        # force the previous naming.
        name_column = 'code' if 'code' in metadata.columns else 'value'

    images = []
    labels = []
    skipped = 0
    image_dir = os.path.join(data_dir, 'images')

    # Walk the needed columns in parallel instead of iterrows(): each column
    # keeps its own dtype, and the progress counter no longer depends on the
    # DataFrame index being 0..n-1.
    rows = zip(
        metadata['suite_id'],
        metadata['sample_id'],
        metadata[name_column],
        metadata['value'],
    )
    for count, (suite_id, sample_id, name_id, value) in enumerate(rows, start=1):
        file_path = os.path.join(image_dir, f"input_{suite_id}_{sample_id}_{name_id}.jpg")

        # cv2.imread returns None for files it cannot decode
        img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE) if os.path.exists(file_path) else None
        if img is not None:
            images.append(img)
            labels.append(value)
        else:
            skipped += 1

        # Print progress every 1000 rows
        if count % 1000 == 0:
            print(f"Processed {count} images")

    if skipped:
        print(f"Warning: skipped {skipped} of {len(metadata)} rows (file missing or unreadable)")

    return np.array(images), np.array(labels)

# Load all images and their labels
print("Loading images...")
X, y = load_images(metadata, data_dir)
if len(X) == 0:
    # Fail with an actionable message instead of an IndexError on X[0] below
    raise FileNotFoundError(
        f"No images could be loaded from {os.path.join(data_dir, 'images')}; "
        "check the directory and the file naming used by load_images"
    )
print(f"Loaded {len(X)} images with shape {X[0].shape}")

# Visualize some examples (at most 10, fewer if the dataset is smaller)
n_preview = min(10, len(X))
plt.figure(figsize=(15, 6))
for i in range(n_preview):
    plt.subplot(2, 5, i + 1)
    plt.imshow(X[i], cmap='gray')
    plt.title(f"Label: {y[i]}")
    plt.axis('off')
plt.tight_layout()
plt.show()

# Step 3: Split and sample the dataset
# Function to perform stratified sampling
def stratified_sample(X, y, n_samples, random_state=None):
    """Draw the same number of samples from every class, without replacement.

    Parameters
    ----------
    X : array-like
        Samples; the first axis indexes the samples.
    y : array-like
        Class label of each sample, same length as ``X``.
    n_samples : int
        Requested total. Each class contributes ``n_samples // n_classes``
        samples, so the result can be slightly smaller than ``n_samples``.
    random_state : None, int, or object with a ``choice`` method, optional
        ``None`` draws from NumPy's global generator (so ``np.random.seed``
        still controls the result); an int seeds a private ``RandomState``;
        any other object is used as the generator directly.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sampled_X, sampled_y)``, grouped by class in sorted label order.

    Raises
    ------
    ValueError
        If a class has fewer samples than the per-class quota.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    unique_classes = np.unique(y)
    if len(unique_classes) == 0:
        # Nothing to sample from: return empty slices that keep shape and dtype
        return X[:0], y[:0]
    samples_per_class = n_samples // len(unique_classes)

    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    chosen = []
    for cls in unique_classes:
        # Positions of the samples belonging to this class
        indices = np.flatnonzero(y == cls)
        if samples_per_class > len(indices):
            raise ValueError(
                f"Class {cls!r} has only {len(indices)} samples, "
                f"cannot draw {samples_per_class} without replacement"
            )
        chosen.append(rng.choice(indices, size=samples_per_class, replace=False))

    # One fancy-index per array instead of collecting rows in Python lists;
    # this also preserves X's trailing shape and dtype when the quota is zero.
    chosen = np.concatenate(chosen).astype(np.intp)
    return X[chosen], y[chosen]

# Function to perform evaluation with different training set sizes
def _flatten_features(images):
    """Flatten each sample to a 1-D feature vector; scale 8-bit pixels to [0, 1]."""
    flat = images.reshape(images.shape[0], -1)
    if flat.dtype == np.uint8:
        # Gradient-based SGD is sensitive to feature scale; a uniform rescale
        # leaves KNN neighbours and decision-tree splits unchanged.
        flat = flat.astype(np.float32) / 255.0
    return flat


def _plot_metric_comparison(results_df, train_sizes,
                            metrics=('Accuracy', 'Precision', 'Recall', 'F1 Score')):
    """Draw one grouped bar chart per metric: classifiers on x, one bar per train size."""
    classifier_names = list(dict.fromkeys(results_df['Classifier']))
    n_sizes = max(len(train_sizes), 1)
    bar_width = 0.8 / n_sizes

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    for ax, metric in zip(axes.ravel(), metrics):
        for slot, train_size in enumerate(train_sizes):
            subset = results_df[results_df['Train Size'] == train_size]
            ax.bar(
                np.arange(len(subset)) + slot * bar_width,
                subset[metric],
                width=bar_width,
                label=f'Train Size: {train_size}'
            )

        ax.set_title(f'{metric} Comparison')
        # Centre each tick under its group of bars
        ax.set_xticks(np.arange(len(classifier_names)) + bar_width * (n_sizes - 1) / 2)
        ax.set_xticklabels(classifier_names)
        ax.set_ylabel(metric)
        ax.legend()

    fig.tight_layout()
    plt.show()


def evaluate_classifiers(X, y, train_sizes=(5000, 10000), test_size=1000):
    """Train KNN, decision-tree and SGD classifiers at several training-set sizes.

    A class-balanced test set is held out once and reused for every training
    size; each training set is drawn only from the remaining samples, so no
    test sample is ever seen during training.

    Parameters
    ----------
    X : numpy.ndarray
        Samples, first axis indexing the samples; each sample is flattened to
        a feature vector before fitting.
    y : numpy.ndarray
        Class label of each sample.
    train_sizes : sequence of int
        Training-set sizes to evaluate (each rounded down to a multiple of the
        number of classes by ``stratified_sample``).
    test_size : int
        Requested size of the held-out test set.

    Returns
    -------
    pandas.DataFrame
        One row per (classifier, train size) with the columns ``Classifier``,
        ``Train Size``, ``Accuracy``, ``Precision``, ``Recall``, ``F1 Score``
        (precision/recall/F1 are weighted averages over the classes).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    class_names = np.unique(y)
    all_indices = np.arange(len(y))

    # Hold out the test set first. Sampling *indices* (passed as the "X"
    # argument) lets the training pool be defined as everything left over.
    test_indices, y_test_sampled = stratified_sample(all_indices, y, test_size)
    test_indices = np.asarray(test_indices).astype(np.intp)
    train_pool = np.setdiff1d(all_indices, test_indices)

    # Step 4: Perform necessary data reshaping (2D images -> 1D vectors)
    X_test_flattened = _flatten_features(X[test_indices])

    results = []

    for train_size in train_sizes:
        print(f"\nEvaluating with {train_size} training samples and {test_size} test samples")

        # Sample the training data from the pool that excludes the test set
        train_indices, y_train_sampled = stratified_sample(train_pool, y[train_pool], train_size)
        train_indices = np.asarray(train_indices).astype(np.intp)
        X_train_flattened = _flatten_features(X[train_indices])

        # Check the class distribution in training and test sets
        print("Training set class distribution:")
        unique, counts = np.unique(y_train_sampled, return_counts=True)
        print(dict(zip(unique, counts)))

        print("Test set class distribution:")
        unique, counts = np.unique(y_test_sampled, return_counts=True)
        print(dict(zip(unique, counts)))

        # Fresh, unfitted classifiers for every training size
        classifiers = {
            'KNN (k=3)': KNeighborsClassifier(n_neighbors=3),
            'Decision Tree': DecisionTreeClassifier(),
            'SGD (epochs=250)': SGDClassifier(max_iter=250)
        }

        for name, clf in classifiers.items():
            print(f"\nTraining {name}...")

            # Step 6: Fit the classifier to the training data
            clf.fit(X_train_flattened, y_train_sampled)

            # Step 7: Evaluate the trained model on the testing data
            y_pred = clf.predict(X_test_flattened)

            # Step 8: Report the performance metrics.
            # zero_division=0 keeps the score a class gets when it is never
            # predicted (0) but silences the per-call warning.
            accuracy = accuracy_score(y_test_sampled, y_pred)
            precision = precision_score(y_test_sampled, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_test_sampled, y_pred, average='weighted', zero_division=0)
            f1 = f1_score(y_test_sampled, y_pred, average='weighted', zero_division=0)

            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1 Score: {f1:.4f}")

            # Fix the label order so the matrix always matches the tick labels,
            # even if some class is absent from both y_test and y_pred.
            cm = confusion_matrix(y_test_sampled, y_pred, labels=class_names)
            plot_confusion_matrix(cm, class_names, f"Confusion Matrix - {name} (Train Size: {train_size})")

            # Detailed classification report
            print("\nClassification Report:")
            print(classification_report(y_test_sampled, y_pred, zero_division=0))

            # Store results
            results.append({
                'Classifier': name,
                'Train Size': train_size,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1
            })

    # Create a DataFrame with all results for easy comparison
    results_df = pd.DataFrame(results)
    print("\nSummary of all results:")
    print(results_df)

    # Plot comparison of classifier performance (nothing to plot without results)
    if not results_df.empty:
        _plot_metric_comparison(results_df, train_sizes)

    return results_df

# Run the evaluation
SEED = 42
TRAIN_SIZES = [5000, 10000]
TEST_SIZE = 1000

# Seed NumPy's global generator so the sampling done with np.random.choice
# (and estimators left at random_state=None, which fall back to the same
# global generator) gives the same result on every Restart & Run All.
np.random.seed(SEED)
results = evaluate_classifiers(X, y, train_sizes=TRAIN_SIZES, test_size=TEST_SIZE)

# Conclusions and discussion
print("\nConclusions:")
print("1. Effect of increasing training set size:")
# Compare the smallest and largest sizes that were actually evaluated rather
# than repeating the literals from the call above.
smallest_size, largest_size = min(TRAIN_SIZES), max(TRAIN_SIZES)
for classifier in results['Classifier'].unique():
    accuracy_by_size = results[results['Classifier'] == classifier].set_index('Train Size')['Accuracy']
    acc_diff = accuracy_by_size.loc[largest_size] - accuracy_by_size.loc[smallest_size]
    print(f"   - {classifier}: Accuracy change = {acc_diff:.4f}")

print("\n2. Best performing classifier:")
best_acc = results.loc[results['Accuracy'].idxmax()]
print(f"   - {best_acc['Classifier']} with training size {best_acc['Train Size']} achieved the highest accuracy: {best_acc['Accuracy']:.4f}")

print("\n3. Other observations:")
print("   - [Add your observations here based on the actual results]")