In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [77]:
df = sns.load_dataset('titanic')

In [78]:
df.head(15)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [79]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [80]:
list(df.columns)

['survived',
 'pclass',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'embarked',
 'class',
 'who',
 'adult_male',
 'deck',
 'embark_town',
 'alive',
 'alone']

In [81]:
df.shape

(891, 15)

In [82]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error

In [84]:
features = ['pclass','sex','age','sibsp','parch']
targets = ['survived','fare']

In [85]:
df_small = df[features+targets].dropna()

In [86]:
df_small.isna().sum()

pclass      0
sex         0
age         0
sibsp       0
parch       0
survived    0
fare        0
dtype: int64

In [87]:
X = df_small[features]
y_class = df_small['survived']
y_reg = df_small['fare']

In [88]:
X

Unnamed: 0,pclass,sex,age,sibsp,parch
0,3,male,22.0,1,0
1,1,female,38.0,1,0
2,3,female,26.0,0,0
3,1,female,35.0,1,0
4,3,male,35.0,0,0
...,...,...,...,...,...
885,3,female,39.0,0,5
886,2,male,27.0,0,0
887,1,female,19.0,0,0
889,1,male,26.0,0,0


In [121]:
X_train,X_test,y_train_c,y_test_c = train_test_split(X,y_class,test_size = 0.25)

In [123]:
_, _, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.25)

In [91]:
## Preprocess: one_hot encode 'sex', pass through numeric features



preprocess = ColumnTransformer(transformers=[
    ('cat',OneHotEncoder(drop='if_binary'),['sex']),
    ('num','passthrough',['pclass','age','sibsp','parch'])
])

In [92]:
## Models


clf = Pipeline(steps=[
    ("prep", preprocess),
    ("model", LogisticRegression(max_iter=1000))
])

reg = Pipeline(steps=[
    ("prep", preprocess),
    ("model", LinearRegression())
])

In [125]:
clf.fit(X_train, y_train_c)
pred_c = clf.predict(X_test)
acc = accuracy_score(y_test_c, pred_c)

reg.fit(X_train, y_train_r)
pred_r = reg.predict(X_test)
rmse = mean_squared_error(y_test_r, pred_r, squared=False)

print("Rows used:", len(df_small))
print("Classification Accuracy:", round(acc, 4))
print("Regression RMSE:", round(rmse, 4))

Rows used: 714
Classification Accuracy: 0.7709
Regression RMSE: 59.2014




## SMALL data vs LARGE data

In [128]:

def run_experiment(X, y_class, y_reg, sample_frac):
    # 1) sample data
    df_sample = X.copy()
    df_sample["survived"] = y_class
    df_sample["fare"] = y_reg
    
    df_sample = df_sample.sample(frac=sample_frac)

    X_s = df_sample[features]
    y_c = df_sample["survived"]
    y_r = df_sample["fare"]

    

    # 2) train/test split
    X_train, X_test, y_train_c, y_test_c = train_test_split(
        X_s, y_c, test_size=0.25
    )

    _, _, y_train_r, y_test_r = train_test_split(
        X_s, y_r, test_size=0.25
    )

    # 3) fit models
    clf.fit(X_train, y_train_c)
    acc = accuracy_score(y_test_c, clf.predict(X_test))

    reg.fit(X_train, y_train_r)
    rmse = mean_squared_error(
        y_test_r, reg.predict(X_test), squared=False
    )

    return round(acc, 4), round(rmse, 2)


## 10% of the data

In [178]:
print("-----10% DATA-----")



for i in range(5):
    acc, rmse = run_experiment(X,y_class, y_reg,sample_frac=0.10)
    print(f"Run {i+1}: Accuracy = {acc}, RMSE = {rmse}")

-----10% DATA-----
Run 1: Accuracy = 0.7222, RMSE = 27.64
Run 2: Accuracy = 0.8889, RMSE = 33.96
Run 3: Accuracy = 0.7778, RMSE = 120.95
Run 4: Accuracy = 0.6667, RMSE = 28.78
Run 5: Accuracy = 0.7778, RMSE = 35.96




## 100% of the data

In [176]:
print("-------100% DATA-------")



for i in range(5):
    acc,rmse = run_experiment(X,y_class,y_reg,sample_frac=1.00)
    print(f"Run {i+1}: Accuracy = {acc}, RMSE = {rmse}")

-------100% DATA-------
Run 1: Accuracy = 0.8268, RMSE = 68.01
Run 2: Accuracy = 0.7989, RMSE = 46.32
Run 3: Accuracy = 0.8324, RMSE = 40.09
Run 4: Accuracy = 0.7877, RMSE = 42.68
Run 5: Accuracy = 0.8436, RMSE = 40.73


