# Day 6 — Solutions Notebook

*Auto-generated notebook based on provided lecture slides.*

## Solutions — Day 6: Intro to ML
Includes an alternative model (Decision Tree) and short explanations.

In [None]:
# Setup: optional installs, then imports.
# In a managed environment (e.g. Google Colab), uncomment the %pip line below if a package is missing.
# %pip (unlike !pip) installs into the environment of the running kernel.
# %pip install pandas numpy seaborn plotly scikit-learn matplotlib

import pandas as pd, numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
# Use seaborn's 'whitegrid' theme as the plotting default
sns.set_theme(style='whitegrid')

# Titanic passenger data shipped with seaborn; this dataset is shared across all notebooks.
# The untouched load is kept as df_original, and df is the working copy derived from it.
df_original = sns.load_dataset('titanic')
df = df_original.copy()
print('Loaded titanic dataset with shape:', df.shape)
df.head()


In [None]:
# Prepare X, y (solution)
RANDOM_STATE = 42                   # one seed for the split and the tree, so reruns match
NUMERIC_FEATURES = ['age', 'fare']  # numeric columns that get median imputation

# One-hot encode the categorical columns. drop_first=True drops one level per
# column. With get_dummies' default dummy_na=False, a missing category becomes
# all-zero dummies, i.e. it is indistinguishable from the dropped baseline level.
df_encoded = pd.get_dummies(df, columns=['sex', 'embarked', 'class'], drop_first=True)
features = NUMERIC_FEATURES + [
    c for c in df_encoded.columns if c.startswith(('sex_', 'embarked_', 'class_'))
]
y = df_encoded['survived']

# Split BEFORE imputing: the fill values must be learned from the training rows
# only, otherwise statistics of the test rows leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded[features], y, test_size=0.2, random_state=RANDOM_STATE
)
train_medians = X_train[NUMERIC_FEATURES].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Keep the full imputed frame and feature matrix available under their usual
# names. Only the columns listed in train_medians are filled.
df_ml = df_encoded.fillna(train_medians)
X = df_ml[features]

# Logistic Regression
clf = LogisticRegression(max_iter=300)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

# Decision Tree alternative (depth capped at 4 to keep the tree small)
dt = DecisionTreeClassifier(max_depth=4, random_state=RANDOM_STATE)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print('\nDecision Tree Accuracy:', accuracy_score(y_test, y_pred_dt))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred_dt))

# Short explanation
print('\nNotes: Logistic regression is linear; decision tree can capture nonlinear interactions. Small dataset and limited features limit performance.')


### Optional: two-solution approach
- Option A: simple preprocessing + logistic regression (fast, interpretable)
- Option B: add interaction features / use tree-based model (more flexible, less interpretable)
