In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_curve, auc, make_scorer
)
import matplotlib.pyplot as plt
import seaborn as sns

# Load the feature table and derive the binary target:
# is_letter is 1 when the row's label is one of 'a'..'j', otherwise 0.
data = pd.read_csv('40415474_features.csv')
letter_mask = data['label'].isin(list('abcdefghij'))
data['is_letter'] = letter_mask.astype(int)

# Predictors are the `nr_pix` and `aspect_ratio` columns; the target is the
# flag derived above.
feature_columns = ['nr_pix', 'aspect_ratio']
X = data[feature_columns]
y = data['is_letter']

# SECTION 1.1: 80/20 split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the second predict_proba column (the
# probability of class 1, assuming both classes occur in the training split).
y_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)
y_prob = test_proba[:, 1]

# Confusion matrix
# Pin the label order so the matrix is always 2x2 (rows = actual 0/1,
# columns = predicted 0/1), even if one class never appears in the test split.
# The hard-coded tick labels and the FPR computation below rely on that layout.
cm1 = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5,4))
sns.heatmap(cm1, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Pred 0','Pred 1'], yticklabels=['True 0','True 1'])
plt.title('Confusion Matrix (Section 1.1)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

# Metrics
acc1 = accuracy_score(y_test, y_pred)
prec1 = precision_score(y_test, y_pred)
rec1 = recall_score(y_test, y_pred)
f1_1 = f1_score(y_test, y_pred)
# False positive rate = FP / (FP + TN), read from the actual-negative row.
# With no actual negatives the rate is undefined, so report NaN instead of
# dividing by zero.
tn1, fp1 = cm1[0]
fpr1 = fp1 / (fp1 + tn1) if (fp1 + tn1) > 0 else float('nan')

print("=== Section 1.1 Metrics ===")
print(f"Accuracy: {acc1:.2%}")
print(f"Precision: {prec1:.2%}")
print(f"Recall (TPR): {rec1:.2%}")
print(f"False Positive Rate: {fpr1:.2%}")
print(f"F1-score: {f1_1:.2%}")

# SECTION 1.2: 5-fold CV predictions
# Shuffled 5-fold splitter with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold hard predictions for every row.
y_pred_cv = cross_val_predict(model, X, y, cv=cv, method='predict')

# Out-of-fold class probabilities; only the second column is kept for the
# ROC analysis. The same seeded splitter is passed to both calls, so the
# folds are expected to match between them (verify if the seed is removed).
cv_proba = cross_val_predict(model, X, y, cv=cv, method='predict_proba')
y_prob_cv = cv_proba[:, 1]

# Confusion matrix over the out-of-fold predictions.
# Pin the label order so the matrix is always 2x2 (rows = actual 0/1,
# columns = predicted 0/1); the tick labels and the FPR computation below
# rely on that layout.
cm2 = confusion_matrix(y, y_pred_cv, labels=[0, 1])
plt.figure(figsize=(5,4))
sns.heatmap(cm2, annot=True, fmt='d', cmap='Greens', cbar=False,
            xticklabels=['Pred 0','Pred 1'], yticklabels=['True 0','True 1'])
plt.title('Confusion Matrix (5-Fold CV)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

# Metrics computed on the pooled out-of-fold predictions.
acc2 = accuracy_score(y, y_pred_cv)
prec2 = precision_score(y, y_pred_cv)
rec2 = recall_score(y, y_pred_cv)
f1_2 = f1_score(y, y_pred_cv)
# False positive rate = FP / (FP + TN), read from the actual-negative row.
# With no actual negatives the rate is undefined, so report NaN instead of
# dividing by zero.
tn2, fp2 = cm2[0]
fpr2 = fp2 / (fp2 + tn2) if (fp2 + tn2) > 0 else float('nan')

print("\n=== Section 1.2 Metrics (5-Fold CV) ===")
print(f"Accuracy: {acc2:.2%}")
print(f"Precision: {prec2:.2%}")
print(f"Recall (TPR): {rec2:.2%}")
print(f"False Positive Rate: {fpr2:.2%}")
print(f"F1-score: {f1_2:.2%}")

# SECTION 1.3: ROC curves for both
# Curve and area for the hold-out split of Section 1.1.
fpr1_curve, tpr1_curve, _ = roc_curve(y_test, y_prob)
roc_auc1 = auc(fpr1_curve, tpr1_curve)

# Curve and area for the out-of-fold probabilities of the 5-fold CV.
fpr2_curve, tpr2_curve, _ = roc_curve(y, y_prob_cv)
roc_auc2 = auc(fpr2_curve, tpr2_curve)

holdout_legend = f'Section 1.1 (AUC {roc_auc1:.2f})'
cv_legend = f'5-Fold CV (AUC {roc_auc2:.2f})'

# Draw on the figure's current axes explicitly; this is the same axes object
# the pyplot state machine would create implicitly.
roc_ax = plt.figure(figsize=(7,5)).gca()
roc_ax.plot(fpr1_curve, tpr1_curve, label=holdout_legend)
roc_ax.plot(fpr2_curve, tpr2_curve, linestyle='--', label=cv_legend)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0,1], [0,1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
plt.tight_layout()
plt.show()
