# Day 8: Model Evaluation & Assessment - SOLUTIONS

**Duration:** 90 minutes
**Dataset:** Titanic Passenger Data

---

## Part 1: Setup and Data Loading

In [None]:
# Import libraries
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, roc_auc_score,
    mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation/future warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Confirmation message so the reader can see the setup cell ran to completion.
print("✓ Libraries imported successfully!")

In [None]:
# Load and prepare data
# Load the Titanic passenger table through seaborn's dataset loader.
df = sns.load_dataset('titanic')
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"Survival rate: {df['survived'].mean()*100:.1f}%")

# Work on a copy so the raw frame `df` stays untouched and this cell can be
# re-run without depending on earlier mutations.
df_clean = df.copy()

# Impute missing values by assigning the filled column back to the frame.
# Calling `fillna(..., inplace=True)` on `df_clean[col]` is chained assignment:
# it is deprecated in pandas 2.x and does not update `df_clean` at all once
# Copy-on-Write is active, which would silently leave NaNs in the features.
df_clean['age'] = df_clean['age'].fillna(df_clean['age'].median())
df_clean['embarked'] = df_clean['embarked'].fillna(df_clean['embarked'].mode()[0])
df_clean['fare'] = df_clean['fare'].fillna(df_clean['fare'].median())

# Engineered features: numeric sex code, family size (self included) and a
# flag for passengers travelling alone.
df_clean['sex_encoded'] = df_clean['sex'].map({'male': 1, 'female': 0})
df_clean['family_size'] = df_clean['sibsp'] + df_clean['parch'] + 1
df_clean['is_alone'] = (df_clean['family_size'] == 1).astype(int)

# Feature matrix and target used by every classification cell below.
features = ['pclass', 'sex_encoded', 'age', 'fare', 'family_size', 'is_alone']
X = df_clean[features]
y = df_clean['survived']

print("\nFeatures prepared!")
df_clean.head()

---

## Part 2: Train-Test Split - SOLUTION

In [None]:
# SOLUTION: Train-test split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition.
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training set: {n_train} samples")
print(f"Test set: {n_test} samples")

# Compare the share of survivors in each partition.
train_rate_pct = y_train.mean() * 100
test_rate_pct = y_test.mean() * 100
print(f"\nTraining set survival rate: {train_rate_pct:.1f}%")
print(f"Test set survival rate: {test_rate_pct:.1f}%")

print("\n**Answer:** We use random_state=42 for reproducibility - so we get the same split every time.")

---

## Part 3: Bias-Variance Tradeoff - SOLUTIONS

In [None]:
# Train models with different complexities
# `fit` returns the estimator itself, so each model is built and trained in one step.

# Low complexity: linear decision boundary.
model_simple = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Medium complexity: tree capped at three levels.
model_medium = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# SOLUTION: Complex model
# High complexity: no depth limit on the tree.
model_complex = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

print("✓ All models trained!")

In [None]:
# Compare performance
models = {
    'Simple (Logistic)': model_simple,
    'Medium (Tree depth=3)': model_medium,
    'Complex (Tree unlimited)': model_complex,
}


def _accuracy_row(label, estimator):
    """Return one results row: train accuracy, test accuracy and their gap."""
    on_train = estimator.score(X_train, y_train)
    on_test = estimator.score(X_test, y_test)
    return {
        'Model': label,
        'Train Accuracy': on_train,
        'Test Accuracy': on_test,
        'Gap': on_train - on_test,
    }


# One row per model, in the order the models were declared above.
results = [_accuracy_row(label, estimator) for label, estimator in models.items()]

results_df = pd.DataFrame(results)
print("Model Performance Comparison:")
print(results_df.round(4))

print("\n**Analysis:**")
print("- Simple model: Small gap (low variance), decent test performance")
print("- Medium model: Good balance, best test performance")
print("- Complex model: Large gap indicates OVERFITTING (memorizing training data)")

In [None]:
# Visualize
# Grouped bars: one bar series per accuracy column of `results_df`.
bar_specs = [
    ('Training', 'Train Accuracy', 'lightblue'),
    ('Test', 'Test Accuracy', 'darkblue'),
]

fig = go.Figure()
for series_name, column, colour in bar_specs:
    fig.add_trace(
        go.Bar(
            name=series_name,
            x=results_df['Model'],
            y=results_df[column],
            marker_color=colour,
        )
    )
fig.update_layout(
    title='Bias-Variance Tradeoff',
    xaxis_title='Model',
    yaxis_title='Accuracy',
    barmode='group',
    yaxis_tickformat='.0%',
)
fig.show()

print("\n**Question Answer:** Complex model shows overfitting - high training accuracy but lower test accuracy.")
print("**Best choice:** Medium complexity model has best test performance and smallest train-test gap.")

In [None]:
# Learning curves
# Cross-validated learning curve for the depth-3 tree: for each of ten
# training-set fractions (10% .. 100%) the model is refit and scored on the
# training portion and on the held-out fold (5-fold CV).
#
# `random_state` is deliberately not passed: learning_curve only uses it when
# shuffle=True, so with the default shuffle=False it had no effect and merely
# suggested a source of randomness that is not there.
train_sizes, train_scores, test_scores = learning_curve(
    model_medium,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
)

# Average across the CV folds (axis 1) to get one score per training size.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_sizes, y=train_mean, name='Training', mode='lines+markers', line=dict(color='lightblue')))
fig.add_trace(go.Scatter(x=train_sizes, y=test_mean, name='CV', mode='lines+markers', line=dict(color='darkblue')))
fig.update_layout(title='Learning Curve', xaxis_title='Training Size', yaxis_title='Accuracy', yaxis_tickformat='.0%')
fig.show()

print("\n**Answer:** As we add more data, the gap between training and test scores DECREASES.")
print("More data helps the model generalize better!")

---

## Part 4: Regression Metrics - SOLUTIONS

In [None]:
# Regression example
# Predict the ticket fare from class, sex, age and family size.
reg_features = ['pclass', 'sex_encoded', 'age', 'family_size']
X_reg = df_clean[reg_features].copy()
y_reg = df_clean['fare'].copy()

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary linear regression and predict on the held-out rows.
reg_model = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)

# SOLUTION: Calculate all metrics
mae = mean_absolute_error(y_test_reg, y_pred_reg)
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test_reg, y_pred_reg)

# Emit the whole report in one call; the text matches one print per line.
report_lines = [
    "Regression Metrics:",
    f"MAE:  £{mae:.2f} (on average, off by this much)",
    f"MSE:  {mse:.2f}",
    f"RMSE: £{rmse:.2f} (similar to MAE but penalizes large errors)",
    f"R²:   {r2:.3f} (model explains {r2*100:.1f}% of variance)",
]
print("\n".join(report_lines))

In [None]:
# Visualize predictions
# Scatter of actual vs. predicted fares with a y = x reference line.
comparison_df = pd.DataFrame({'Actual': y_test_reg, 'Predicted': y_pred_reg})
max_actual = comparison_df['Actual'].max()

fig = px.scatter(
    comparison_df,
    x='Actual',
    y='Predicted',
    title='Actual vs Predicted Fare',
)

# Reference diagonal from the origin to the largest actual fare.
perfect_line = go.Scatter(
    x=[0, max_actual],
    y=[0, max_actual],
    mode='lines',
    name='Perfect',
    line=dict(color='red', dash='dash'),
)
fig.add_trace(perfect_line)
fig.show()

print("\n**Answer:** Points far from red line = large prediction errors. Perfect predictions would lie on the line.")

---

## Part 5: Classification Metrics - SOLUTIONS

In [None]:
# Train classifier
# Random forest of 100 trees, fitted on the training split and used to label the test split.
clf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = clf_model.predict(X_test)

# SOLUTION: Calculate all metrics
# Each metric compares the true test labels with the predicted labels.
accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    f1_score(y_test, y_pred),
)

# Emit the whole report in one call; the text matches one print per line.
metric_lines = [
    "Classification Metrics:",
    f"Accuracy:  {accuracy:.3f} ({accuracy*100:.1f}% correct overall)",
    f"Precision: {precision:.3f} (when we predict survival, we're right {precision*100:.1f}% of the time)",
    f"Recall:    {recall:.3f} (we catch {recall*100:.1f}% of actual survivors)",
    f"F1-Score:  {f1:.3f} (balanced metric)",
]
print("\n".join(metric_lines))

print("\n**Iceberg Detection Answer:** Prioritize RECALL! Better to have false alarms than miss an iceberg.")

---

## Part 6: Confusion Matrix - SOLUTIONS

In [None]:
# SOLUTION: Confusion matrix
# Rows are actual classes, columns are predicted classes (0 = died, 1 = survived).
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

# Unpack the 2x2 matrix row by row: first row is actual 0, second row is actual 1.
(tn_count, fp_count), (fn_count, tp_count) = cm

print("\nInterpretation:")
print(f"True Negatives (TN):  {tn_count} - Correctly predicted died")
print(f"False Positives (FP): {fp_count} - Predicted survived but died (Type I)")
print(f"False Negatives (FN): {fn_count} - Predicted died but survived (Type II)")
print(f"True Positives (TP):  {tp_count} - Correctly predicted survived")

In [None]:
# Visualize
# Heatmap of the confusion matrix; both axes share the same class labels.
class_labels = ['Died (0)', 'Survived (1)']
axis_titles = dict(x="Predicted", y="Actual", color="Count")

fig = px.imshow(
    cm,
    labels=axis_titles,
    x=class_labels,
    y=class_labels,
    text_auto=True,
    color_continuous_scale='Blues',
    title='Confusion Matrix',
)
fig.show()

In [None]:
# SOLUTION: Manual calculation
# Flatten the 2x2 matrix row by row: [[TN, FP], [FN, TP]] -> TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

# Recompute the headline metrics directly from the four counts.
n_predictions = TP + TN + FP + FN
manual_accuracy = (TP + TN) / n_predictions
manual_precision = TP / (TP + FP)
manual_recall = TP / (TP + FN)

print("Manual Calculations:")
print(f"Accuracy:  {manual_accuracy:.3f}")
print(f"Precision: {manual_precision:.3f}")
print(f"Recall:    {manual_recall:.3f}")

# Check each hand-computed value against the scikit-learn result from Part 5.
verification = {
    "Accuracy matches:  ": np.isclose(manual_accuracy, accuracy),
    "Precision matches: ": np.isclose(manual_precision, precision),
    "Recall matches:    ": np.isclose(manual_recall, recall),
}
print("\nVerification:")
for caption, matches in verification.items():
    print(f"{caption}{matches}")

In [None]:
# SOLUTION: Classification report
# Per-class precision, recall and F1, with readable names for classes 0 and 1.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Died', 'Survived'],
)
print("Classification Report:")
print(report_text)

---

## Part 7: Type I vs Type II Errors - SOLUTIONS

**Answers to scenarios:**

1. **Cancer Screening:** Type II is worse (missing cancer is more dangerous than false alarm)
2. **Credit Card Fraud:** Depends on context, but usually Type II is worse (losing money to fraud)
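3. **Spam Filter:** Type I is worse (with "spam" as the positive class, a false positive sends a legitimate email to the spam folder, so important emails are missed)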

In [None]:
# Error rates
# Type I rate: share of actual negatives predicted positive.
# Type II rate: share of actual positives predicted negative.
actual_negatives = FP + TN
actual_positives = FN + TP
type1_error_rate = FP / actual_negatives
type2_error_rate = FN / actual_positives

print("Error Analysis:")
print(f"\nType I Error Rate:  {type1_error_rate:.3f} - {FP} false positives")
print(f"Type II Error Rate: {type2_error_rate:.3f} - {FN} false negatives")
print(
    "\n**Answer:** In disaster scenario, Type II is worse - predicting death when they could survive\n"
    "means we might not allocate resources to save them!"
)

---

## Part 8: ROC Curve & AUC - SOLUTIONS

In [None]:
# SOLUTION: ROC curve
# Column 1 of predict_proba is the predicted probability of class 1 (survived).
y_pred_proba = clf_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)

print(f"AUC Score: {auc:.3f}")

# Model curve plus the diagonal that a random classifier would follow.
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC (AUC={auc:.3f})',
    line=dict(color='blue', width=2),
)
chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random (0.5)',
    line=dict(color='red', dash='dash'),
)

fig = go.Figure()
fig.add_trace(roc_trace)
fig.add_trace(chance_trace)
fig.update_layout(
    title='ROC Curve',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    width=700,
    height=700,
)
fig.show()

print("\n**Answer 1:** Close to top-left = HIGH true positive rate, LOW false positive rate = EXCELLENT!")
print("**Answer 2:** Following diagonal = random guessing = USELESS model")

In [None]:
# Compare multiple models
# Fit three classifiers on the same training split and overlay their ROC curves.
models_to_compare = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # KNN is distance-based, so unscaled features with large ranges (e.g. fare)
    # would dominate the neighbour search. Standardize inside a pipeline so the
    # scaler is fitted on the training data only.
    'K-Nearest Neighbors': make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
}

fig = go.Figure()
for name, model in models_to_compare.items():
    model.fit(X_train, y_train)
    # Probability of the positive class (survived) for each test passenger.
    y_proba = model.predict_proba(X_test)[:, 1]
    # Loop-specific names keep the `fpr` / `tpr` globals of the single-model ROC
    # cell intact and avoid rebinding `_`, which IPython uses for the last output.
    model_fpr, model_tpr, model_thresholds = roc_curve(y_test, y_proba)
    auc_score = roc_auc_score(y_test, y_proba)
    fig.add_trace(go.Scatter(x=model_fpr, y=model_tpr, mode='lines', name=f'{name} (AUC={auc_score:.3f})'))

# Diagonal reference: the curve a random classifier would follow.
fig.add_trace(go.Scatter(x=[0,1], y=[0,1], mode='lines', name='Random', line=dict(color='red', dash='dash')))
fig.update_layout(title='ROC Comparison', xaxis_title='FPR', yaxis_title='TPR', width=700, height=700)
fig.show()

print("\n**Answer:** Best model = highest AUC score (curve most towards top-left)")

---

## Part 9: Cross-Validation - SOLUTIONS

In [None]:
# SOLUTION: Cross-validation
# 5-fold cross-validation of the random forest on the full feature matrix.
cv_scores = cross_val_score(clf_model, X, y, cv=5)

cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print("Cross-Validation Results:")
print(f"Scores: {cv_scores}")
print(f"\nMean: {cv_mean:.3f}")
print(f"Std Dev: {cv_std:.3f}")
# mean ± 1.96·SD describes the spread of the individual fold scores. It is not
# a confidence interval for the mean (that would use the standard error), so
# it is labelled as a range rather than a "95% CI".
print(f"Approx. 95% range of fold scores (mean ± 1.96 SD): {cv_mean:.3f} ± {1.96*cv_std:.3f}")

# Derive the bar labels from the scores so they stay correct if `cv` changes.
fold_labels = [f'Fold {fold_no}' for fold_no in range(1, len(cv_scores) + 1)]

fig = go.Figure()
fig.add_trace(go.Bar(x=fold_labels, y=cv_scores, marker_color='lightblue'))
fig.add_hline(y=cv_mean, line_dash="dash", line_color="red", annotation_text=f"Mean: {cv_mean:.3f}")
fig.update_layout(title='CV Scores', xaxis_title='Fold', yaxis_title='Accuracy', yaxis_tickformat='.0%')
fig.show()

print("\n**Answer:** CV is more reliable because it tests on multiple different splits,")
print("reducing the chance that results are due to a lucky/unlucky single split.")

---

## Part 10: Ethical AI - SOLUTIONS

In [None]:
# Bias detection
# Score the fitted classifier separately on female (0) and male (1) test passengers.
gender_names = {0: 'Female', 1: 'Male'}

results_by_gender = []
for code, gender_name in gender_names.items():
    in_group = X_test['sex_encoded'] == code
    X_group = X_test[in_group]
    y_group = y_test[in_group]

    # Skip a group with no test rows; there is nothing to score.
    if len(y_group) == 0:
        continue

    group_pred = clf_model.predict(X_group)
    results_by_gender.append({
        'Gender': gender_name,
        'Count': len(y_group),
        'Accuracy': accuracy_score(y_group, group_pred),
        # zero_division=0 yields 0 instead of a warning when a denominator is zero.
        'Precision': precision_score(y_group, group_pred, zero_division=0),
        'Recall': recall_score(y_group, group_pred, zero_division=0),
    })

bias_df = pd.DataFrame(results_by_gender)
print("Performance by Gender:")
print(bias_df.round(3))

# Grouped bars: one bar series per metric, one group per gender.
fig = go.Figure()
for metric_name in ('Accuracy', 'Precision', 'Recall'):
    fig.add_trace(go.Bar(name=metric_name, x=bias_df['Gender'], y=bias_df[metric_name]))
fig.update_layout(title='Bias Analysis', barmode='group', yaxis_tickformat='.0%')
fig.show()

print("\n**Answer 1:** Model performs differently for men vs women, reflecting historical bias.")
print("**Answer 2:** Yes, it reflects Titanic's 'women and children first' policy. In this HISTORICAL")
print("context, it's accurate. But using sex for modern survival predictions would be problematic!")

**Trustworthy AI Scenario Answers:**

1. **Black box hospital AI:** Violates **Transparency** (doctors can't explain decisions)
2. **Biased facial recognition:** Violates **Diversity & Fairness** (unequal accuracy across groups)
3. **Discriminatory loan AI:** Violates **Fairness** and **Privacy/Data Governance** (perpetuates bias)
4. **No-appeal hiring AI:** Violates **Human Agency**, **Transparency**, and **Accountability**

In [None]:
# Feature importance
# Pair each feature name with the forest's importance score, largest first.
importance_table = pd.DataFrame({
    'Feature': features,
    'Importance': clf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('Importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

# Horizontal bar chart of the same table.
fig = px.bar(
    feature_importance,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Model Feature Importance',
)
fig.show()

print("\n**Ethics Answer:** For HISTORICAL analysis (Titanic), using sex is appropriate - it reflects")
print("the actual evacuation policy. For MODERN applications, using sex/gender would be discriminatory.")
print("\n**Explanation:** 'Our model found that being female, in 1st class, with a higher fare increased")
print("survival odds, reflecting the evacuation priority given to women and upper-class passengers.'")

---

## Summary

**Key Metrics:**
- Our Random Forest's test accuracy is printed in Part 5 and its AUC in Part 8 (fill in the values from your own run)
- Always evaluate multiple metrics, not just accuracy
- Cross-validation showed consistent performance across folds
- Detected performance differences between genders

**Best Practices:**
1. Always use train-test split
2. Evaluate multiple metrics appropriate for your problem
3. Check for bias across sensitive attributes
4. Use cross-validation for robust estimates
5. Prioritize interpretability and fairness

**Congratulations on completing the course!** 🎉