# Day 2 — Solutions Notebook

*Auto-generated notebook based on provided lecture slides.*

## Solutions — Day 2: Introduction & Visualization
This notebook contains worked solutions and short explanations.

In [None]:
# Setup: installs (uncomment the !pip lines if needed) and imports
# If running in a managed environment (e.g. Google Colab), uncomment the pip installs below.
# !pip install pandas numpy seaborn plotly scikit-learn matplotlib

import pandas as pd, numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
# Load dataset (seaborn's titanic dataset) - we'll use this across all notebooks
df = sns.load_dataset('titanic')
# Untouched deep copy of the freshly loaded frame, kept as a pristine reference
# in case `df` is modified later.
df_original = df.copy(deep=True)

# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set_theme(style='whitegrid')

print('Loaded titanic dataset with shape:', df.shape)
# Last expression of the cell: renders the first five rows as the cell output.
df.head(5)


### Exploration (solution)

In [None]:
# Structural overview of the frame: column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

# include='all' summarises every column (numeric and non-numeric alike), so the
# heading says so rather than claiming a numeric-only summary.
print('\nSummary statistics (all columns):')
print(df.describe(include='all'))

# Count of NaN / missing entries in each column.
print('\nMissing values per column:')
print(df.isna().sum())


### Visualizations (solutions with alternatives)

In [None]:
# Survival rate per class (solution)
# The mean of 'survived' within each class is used as the survival rate; a
# percent version of it is added as a separate column for plotting.
surv_by_class = (
    df.groupby('class', observed=True)['survived']
    .mean()
    .reset_index()
    .assign(survived_pct=lambda rates: rates['survived'] * 100)
)
print(surv_by_class)

# One bar per class, with the bar height given by the percent column.
fig = px.bar(
    data_frame=surv_by_class,
    x='class',
    y='survived_pct',
    title='Survival rate per class (percent)',
)
fig.update_yaxes(title='Survival rate (%)')
fig.show()

# Alternative: stacked bar with counts
# observed=True matches the survival-rate groupby on 'class' and avoids the
# pandas FutureWarning about the changing default when grouping on a
# categorical column.
counts = (
    df.groupby(['class', 'survived'], observed=True)
    .size()
    .reset_index(name='count')
)

# Plotly Express maps a numeric colour column to a continuous colour scale,
# which would draw a gradient colour bar instead of two distinct stacked
# segments. A string-labelled column makes the colour discrete. The original
# 'survived' column is left untouched.
# Assumes 'survived' only takes the values 0 and 1.
counts = counts.assign(
    outcome=counts['survived'].map({0: 'Did not survive', 1: 'Survived'})
)

fig = px.bar(
    counts,
    x='class',
    y='count',
    color='outcome',
    barmode='stack',
    title='Counts: survived vs not',
    labels={'class': 'Class', 'count': 'Number of passengers', 'outcome': 'Outcome'},
)
fig.show()

# Example of misleading visualization (demonstration)
# Create a truncated y-axis to show how it can be misleading.
# `go` (plotly.graph_objects) is imported with the other libraries in the
# setup cell at the top of the notebook.
fig = go.Figure(go.Bar(x=['A', 'B', 'C'], y=[10, 11, 12]))
fig.update_layout(
    title='Truncated y-axis example (misleading)',
    xaxis_title='Category',
    yaxis_title='Value',
)
# Deliberately start the axis at 9.5 instead of 0 - this is the "mistake"
# being demonstrated.
fig.update_yaxes(range=[9.5, 12])
fig.show()

# Explain: truncating the y-axis makes small differences look large.
# Here the visible bar heights are 0.5, 1.5 and 2.5 units, so C appears five
# times as tall as A even though its value (12) is only 1.2 times A's (10).

### Notes for instructors
- Emphasise labeling axes, adding titles, and choosing appropriate charts.
- For beginners, stick to bar charts, histograms, box plots, and scatter plots; avoid 3D charts.