# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix



# Read the per-item feature table from disk
data = pd.read_csv(r'40415474/40415474_features.csv')

# Keep only the five classes under study: a, j, smile, sad, xclaim
selected_classes = ['a', 'j', 'smile', 'sad', 'xclaim']
in_scope = data['label'].isin(selected_classes)
# .copy() so the column added below is written to an independent frame
filtered_data = data.loc[in_scope].copy()

# Integer code per class (5-way classification), numbered in the order of
# selected_classes:
#   a -> 0, j -> 1   letters (kept separate as per assignment specs)
#   smile -> 2       happy faces
#   sad -> 3         sad faces
#   xclaim -> 4      exclamation marks
class_mapping = dict(zip(selected_classes, range(len(selected_classes))))
filtered_data['class_code'] = filtered_data['label'].map(class_mapping)

# Feature columns fed to the classifier (justify your choice in report)
# NOTE(review): the columns are passed on unscaled; if their ranges differ a
# lot, the widest one will dominate KNN distances - TODO confirm whether
# standardisation is wanted.
selected_features = ['nr_pix', 'aspect_ratio', 'rows_with_1', 'cols_with_1']
X, y = filtered_data[selected_features], filtered_data['class_code']

# 2.1 KNN scored on the same rows it was fitted on
# (the original note says "all 76 items"; the row count is not checked here)
k_values = [*range(1, 14, 2)]  # odd k: 1, 3, ..., 13
train_accuracies = []

print("=== Section 2.1 ===")
print("Training Accuracies:")
for k in k_values:
    # Fit and score on identical data, so this is a resubstitution accuracy
    knn = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    train_accuracies.append(knn.score(X, y))
    print(f"k={k}: {train_accuracies[-1]:.4f}")

# 2.2 KNN scored with shuffled 5-fold cross-validation
# The integer random_state keeps the fold assignment identical for every k,
# so the per-k means are compared on the same splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_accuracies = []

print("\n=== Section 2.2 ===")
print("Cross-Validated Accuracies:")
for k in k_values:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=cv)
    mean_score = scores.mean()  # average accuracy over the five folds
    cv_accuracies.append(mean_score)
    print(f"k={k}: {mean_score:.4f}")

# 2.3 Confusion Matrix for Best k
# np.argmax returns the first maximum, so ties go to the smallest k.
best_k = k_values[np.argmax(cv_accuracies)]
print("\n=== Section 2.3 ===")
print(f"Best k from CV: {best_k}")

# Fit model with best k using all data
# NOTE(review): predictions are made on the same rows used for fitting, so
# this matrix reflects training-set behaviour (trivially perfect for k=1).
# Confirm whether out-of-fold predictions are wanted instead.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X, y)
y_pred = knn.predict(X)

# Derive display names and their integer codes from class_mapping so the
# matrix rows/columns can never drift from the encoding used for y.
class_names = sorted(class_mapping, key=class_mapping.get)
class_codes = [class_mapping[name] for name in class_names]

# Passing labels= pins the matrix to one row/column per class, in code order,
# even if a class is absent from y or y_pred; without it the matrix shrinks
# and the tick labels and the indexing below would be misaligned.
conf_mat = confusion_matrix(y, y_pred, labels=class_codes)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title(f'Confusion Matrix for k={best_k}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Collect every off-diagonal cell with at least one error as
# (count, actual class, predicted class).
confused_pairs = [
    (conf_mat[row, col], actual, predicted)
    for row, actual in enumerate(class_names)
    for col, predicted in enumerate(class_names)
    if row != col and conf_mat[row, col] > 0
]
# Worst confusions first; the sort is stable, so equal counts keep
# their row-major order.
confused_pairs.sort(key=lambda pair: pair[0], reverse=True)

print("\nMost Difficult Pairs:")
if not confused_pairs:
    print("(none: every item was classified correctly)")
for count, actual, predicted in confused_pairs:
    print(f"{actual} vs {predicted}: {count} misclassifications")

# 2.4 Training accuracy and cross-validated accuracy plotted against k
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, train_accuracies, 'b-o', label='Training Accuracy')
ax.plot(k_values, cv_accuracies, 'r-o', label='CV Accuracy')
ax.set_xlabel('k')
ax.set_ylabel('Accuracy')
ax.set_title('Training vs Cross-Validated Accuracy')
ax.legend()
ax.grid(True)

# Label each marker with its accuracy (two decimals), 5 points above it;
# for every k the training point is labelled first, then the CV point.
for k, accuracy_pair in zip(k_values, zip(train_accuracies, cv_accuracies)):
    for accuracy in accuracy_pair:
        ax.annotate(f'{accuracy:.2f}', (k, accuracy),
                    textcoords="offset points", xytext=(0, 5), ha='center')

plt.show()

# Same two accuracy curves, with 1/k on the horizontal axis as requested
inv_k_values = 1 / np.array(k_values)  # true division, so a float array

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(inv_k_values, train_accuracies, 'b-o', label='Training Accuracy')
ax.plot(inv_k_values, cv_accuracies, 'r-o', label='CV Accuracy')
ax.set_xlabel('1/k')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs 1/k')
ax.legend()
ax.grid(True)

# Tag each marker on both curves with the k it came from, because the
# reciprocal axis no longer shows k directly.
for k, inv_k, accuracy_pair in zip(k_values, inv_k_values,
                                   zip(train_accuracies, cv_accuracies)):
    for accuracy in accuracy_pair:
        ax.annotate(f'k={k}', (inv_k, accuracy),
                    textcoords="offset points", xytext=(0, 5), ha='center')

plt.show()