# Titanic.csv

* Explore: Bagaimana setiap feature berhubungan dengan apakah seseorang survive/ alive.
* Splitting: 80-20, stratify: y, random state 2020
* Preprocessing: 
>* drop deck
>* Isi missing value menggunakan simple imputer 
>* onehot encoding: sex, alone  
>* ordinal encoding: class  
>* binary encoding: embarked town  

* Model selection:
>* evaluation metric yang dipakai: F1_score
>* Logreg, KNN, DecisionTreeClassifier, RandomForestClassifier
>* Hyperparameter tuning 2 model yang menurut kalian terbaik
>* Buat summary untuk hasil evaluasi, dan kesimpulan mana model yang terbaik untuk titanic.csv

Kalau bisa, gunakan pipeline ketika diperlukan untuk menghindari data leaking.

email hasil modelling kalian ke Brigita.gems@gmail.com dengan subject: titanic


In [284]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# PREPROCESSING
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer # untuk transformers
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce # untuk ordinal dan binary encoder
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV  

# Model machine learninng
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier # voting


from sklearn.metrics import classification_report, f1_score, recall_score, precision_score

In [285]:
# load dataset
df = pd.read_csv('titanic.csv')
df.head(3)

Unnamed: 0,sex,age,parch,fare,class,deck,embark_town,alive,alone
0,male,22.0,0,7.25,Third,,Southampton,no,False
1,female,38.0,0,71.2833,First,C,Cherbourg,yes,False
2,female,26.0,0,7.925,Third,,Southampton,yes,True


In [286]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          891 non-null    object 
 1   age          714 non-null    float64
 2   parch        891 non-null    int64  
 3   fare         891 non-null    float64
 4   class        891 non-null    object 
 5   deck         203 non-null    object 
 6   embark_town  889 non-null    object 
 7   alive        891 non-null    object 
 8   alone        891 non-null    bool   
dtypes: bool(1), float64(2), int64(1), object(5)
memory usage: 56.7+ KB


In [287]:
df.describe()

Unnamed: 0,age,parch,fare
count,714.0,891.0,891.0
mean,29.699118,0.381594,32.204208
std,14.526497,0.806057,49.693429
min,0.42,0.0,0.0
25%,20.125,0.0,7.9104
50%,28.0,0.0,14.4542
75%,38.0,0.0,31.0
max,80.0,6.0,512.3292


In [288]:
df.isna().sum()

sex              0
age            177
parch            0
fare             0
class            0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

# 1. Preprocessing

## Drop 

In [289]:
# drop column 'deck' karena missing value terlalu banyak
df = df.drop(columns='deck')

## Pipeline dan Transformer

In [290]:
df['class'].unique()

array(['Third', 'First', 'Second'], dtype=object)

In [291]:
# pipeline berisi imputing lalu binary encoding untuk 'embark_town' nanti
binary_encoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('binary encoder', ce.BinaryEncoder())
])

# ordinal mapping untuk 'class' nanti
ordinal_mapping = [
    {'col':'class',
    'mapping':{None:0, 'First':1, 'Second':2, 'Third':3}}
]

ordinal_encoder = ce.OrdinalEncoder(cols=['class'], mapping=ordinal_mapping)

# transformer 
transformer = ColumnTransformer([
    ('imputer', SimpleImputer(strategy='median'), ['age']),
    ('one hot encoder', OneHotEncoder(drop='first'), ['sex','alone']),
    ('ordinal encoder', ordinal_encoder, ['class']),
    ('binary encoder', binary_encoder_pipeline, ['embark_town'])
], remainder='passthrough')

## Split Data

In [292]:
# ganti target ('alive') jadi 0-1 ('label)
df['label'] = np.where(df['alive']=='yes', 1, 0)

# drop column 'alive'
df = df.drop(columns='alive')

df.head(3)

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
0,male,22.0,0,7.25,Third,Southampton,False,0
1,female,38.0,0,71.2833,First,Cherbourg,False,1
2,female,26.0,0,7.925,Third,Southampton,True,1


In [293]:
df['label'].value_counts()

0    549
1    342
Name: label, dtype: int64

In [294]:
# define X dan y
# X drop alive dan label
X = df.drop(columns='label')
y = df['label']

In [295]:
# split data
# X_train di sini maksudnya adalah X_train_val
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2020
)

## Data Transforming

In [296]:
# # transformer difit
# X_train = transformer.fit_transform(X_train)
# X_test = transformer.transform(X_test)

In [297]:
# X_train = pd.DataFrame(X_train)
# X_train.head(3)

In [298]:
# X_test = pd.DataFrame(X_test)
# X_test.head(3)

In [299]:
# transformer.transformers_

In [300]:
# # mengambil nama feature
# onehot_col = transformer.transformers_[1][1].get_feature_names()
# ordinal_col = transformer.transformers_[2][1].get_feature_names()
# binary_col = transformer.transformers_[3][1][1].get_feature_names()

# # menamai features
# features = ['age'] + list(onehot_col) + ordinal_col + binary_col + ['parch','fare']

# X_train.columns = features
# X_test.columns = features


In [301]:
# X_train.head(3)

In [302]:
# X_test.head(3)

# 2. Model Selection

In [303]:
# define model2 logreg KNN DT RF 
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()


### Coba lihat score model logreg

In [304]:
# buat pipeline berisi transformer (dari yg di atas) lalu model (logreg dll)

# Logisitic regression
logreg_pipeline = Pipeline([
    ('transformer', transformer),
    ('clf', logreg)
])

# KNN
knn_pipeline = Pipeline([
    ('transformer', transformer),
    ('clf', knn)
])

# DecisionTree
dt_pipeline = Pipeline([
    ('transformer', transformer),
    ('clf', dt)
])

# RandomForest
rf_pipeline = Pipeline([
    ('transformer', transformer),
    ('clf', rf)
])

## Cross Validation

In [305]:

# define skf 
skf = StratifiedKFold(n_splits=5)

# cross val score (pipeline, X_train, y_train, cv=skf, scoring='f1')
model_cv = cross_val_score(logreg_pipeline, X_train, y_train, cv=skf, scoring='f1')

# lihat scorenya, mean nya dan std nya
print('CrossVal model:', model_cv)
print('CrossVal model mean:', model_cv.mean())
print('CrossVal model std:', model_cv.std())


CrossVal model: [0.62135922 0.75471698 0.77358491 0.73043478 0.74747475]
CrossVal model mean: 0.7255141280353733
CrossVal model std: 0.053885679207528986


### Lihat score semua model (flexible metrics)

In [306]:
# cross validation
def model_evaluation_cv(model):

    # model.fit(X_train, y_train)

    skf = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1')

    return model_cv

# print score dari cross validation
def print_result(model_cv):
    print('Model:', str(model))
    print('CV score',model_cv)
    print('CV score mean',model_cv.mean())
    print('CV score std',model_cv.std())
    print()

In [307]:
# logistic regression
model = logreg_pipeline
model_cv = model_evaluation_cv(model)
print_result(model_cv)

Model: Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['age']),
                                                 ('one hot encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'alone']),
                                                 ('ordinal encoder',
                                                  OrdinalEncoder(cols=['class'],
                                                                 mapping=[{'col': 'class',
                                                                           'mapping': {None: 0,
                                                                                       'First': 1,
                                           

In [308]:
# KNN
model = knn_pipeline
model_cv = model_evaluation_cv(model)
print_result(model_cv)

Model: Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['age']),
                                                 ('one hot encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'alone']),
                                                 ('ordinal encoder',
                                                  OrdinalEncoder(cols=['class'],
                                                                 mapping=[{'col': 'class',
                                                                           'mapping': {None: 0,
                                                                                       'First': 1,
                                           

In [309]:
# Decision Tree
model = dt_pipeline
model_cv = model_evaluation_cv(model)
print_result(model_cv)

Model: Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['age']),
                                                 ('one hot encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'alone']),
                                                 ('ordinal encoder',
                                                  OrdinalEncoder(cols=['class'],
                                                                 mapping=[{'col': 'class',
                                                                           'mapping': {None: 0,
                                                                                       'First': 1,
                                           

In [310]:
# Random Forest
model = rf_pipeline
model_cv = model_evaluation_cv(model)
print_result(model_cv)

Model: Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['age']),
                                                 ('one hot encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'alone']),
                                                 ('ordinal encoder',
                                                  OrdinalEncoder(cols=['class'],
                                                                 mapping=[{'col': 'class',
                                                                           'mapping': {None: 0,
                                                                                       'First': 1,
                                           

## Kesimpulan Cross Validation

### berdasarkan score (mean tertinggi) dan stabilitas (std terendah), maka diambil 2 model terbaik:

- Logistic Regression
- Random Forest Classifier

In [311]:
# fit model_pipeline dengan X_train, y_train
logreg_pipeline.fit(X_train, y_train)

# predict dgn X_test
y_pred_logreg = logreg_pipeline.predict(X_test)

# lihat f1 score (classification report)
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       110
           1       0.72      0.71      0.72        69

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [312]:
# fit model_pipeline dengan X_train, y_train
rf_pipeline.fit(X_train, y_train)

# predict dgn X_test
y_pred_rf = rf_pipeline.predict(X_test)

# lihat f1 score (classification report)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.68      0.74        69

    accuracy                           0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179



# 3. Hyperparameter Tuning 

## Randomize Search

## A. Logistic Regression

In [313]:
# Hyperparameter Space
hyperparam_space = {
    'clf__C':[1000, 500, 100, 10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
    'clf__max_iter':[100,200,300,400,500,1000],
    'clf__solver':['liblinear','newton-cg']
}

# SKfold
skf = StratifiedKFold(n_splits=5)

# Hyperparameter Tuning
randomize_search = RandomizedSearchCV(
    logreg_pipeline,
    param_distributions= hyperparam_space,
    cv= skf,
    scoring= 'f1',
    n_jobs= -1
)

In [314]:
# fitting hyperparam tuning (randomize search)
randomize_search.fit(X_train, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('transformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('imputer',
                                                                               SimpleImputer(strategy='median'),
                                                                               ['age']),
                                                                              ('one '
                                                                               'hot '
                                                                               'encoder',
                                                                               OneHotEncoder(drop='first'),
                                                                               ['sex',
                

In [315]:
# lihat score dan parameter terbaik
print('best score', randomize_search.best_score_)
print('best param', randomize_search.best_params_)

best score 0.7226028766741196
best param {'clf__solver': 'liblinear', 'clf__max_iter': 400, 'clf__C': 1}


### Comparison model: before and after hyperparameter tuning

### Before tuning

In [316]:
model = logreg_pipeline
model.fit(X_train, y_train)
y_pred_1 = model.predict(X_test)
print(classification_report(y_test, y_pred_1))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       110
           1       0.72      0.71      0.72        69

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



### After tuning

In [317]:
randomize_search.best_estimator_

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['age']),
                                                 ('one hot encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'alone']),
                                                 ('ordinal encoder',
                                                  OrdinalEncoder(cols=['class'],
                                                                 mapping=[{'col': 'class',
                                                                           'mapping': {None: 0,
                                                                                       'First': 1,
                                                  

In [318]:
model = randomize_search.best_estimator_
model.fit(X_train, y_train)
y_pred_2 = model.predict(X_test)
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

           0       0.81      0.82      0.81       110
           1       0.71      0.70      0.70        69

    accuracy                           0.77       179
   macro avg       0.76      0.76      0.76       179
weighted avg       0.77      0.77      0.77       179



## B. Random Forest

In [319]:
# Hyperparameter Space
hyperparam_space = {
    'clf__bootstrap': [True, False],
    'clf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], #
    'clf__max_features': ['auto', 'sqrt'],
    'clf__min_samples_leaf': [1, 2, 4], #
    'clf__min_samples_split': [2, 5, 10], #
    'clf__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000] #
 }

# SKfold
skf = StratifiedKFold(n_splits=5)

# Hyperparameter Tuning
randomize_search = RandomizedSearchCV(
    rf_pipeline,
    param_distributions= hyperparam_space,
    cv= skf,
    scoring= 'f1',
    n_jobs= -1
)

In [320]:
# fitting hyperparam tuning (randomize search)
randomize_search.fit(X_train, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=Pipeline(steps=[('transformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('imputer',
                                                                               SimpleImputer(strategy='median'),
                                                                               ['age']),
                                                                              ('one '
                                                                               'hot '
                                                                               'encoder',
                                                                               OneHotEncoder(drop='first'),
                                                                               ['sex',
                

In [321]:
# lihat score dan parameter terbaik
print('best score', randomize_search.best_score_)
print('best param', randomize_search.best_params_)

best score 0.7467747215333972
best param {'clf__n_estimators': 800, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 1, 'clf__max_features': 'sqrt', 'clf__max_depth': 90, 'clf__bootstrap': True}


### Comparison before vs after tuning

### Before Tuning

In [322]:
model = rf_pipeline
model.fit(X_train, y_train)
y_pred_3 = model.predict(X_test)
print(classification_report(y_test, y_pred_3))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



### After Tuning

In [323]:
randomize_search.best_estimator_

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer',
                                                  SimpleImputer(strategy='median'),
                                                  ['age']),
                                                 ('one hot encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'alone']),
                                                 ('ordinal encoder',
                                                  OrdinalEncoder(cols=['class'],
                                                                 mapping=[{'col': 'class',
                                                                           'mapping': {None: 0,
                                                                                       'First': 1,
                                                  

In [324]:
model = randomize_search.best_estimator_
model.fit(X_train, y_train)
y_pred_4 = model.predict(X_test)
print(classification_report(y_test, y_pred_4))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       110
           1       0.84      0.68      0.75        69

    accuracy                           0.83       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.83      0.83      0.82       179



## Perbandingan F1 score 

In [325]:
f1_1 = f1_score(y_test, y_pred_1)
f1_2 = f1_score(y_test, y_pred_2)
f1_3 = f1_score(y_test, y_pred_3)
f1_4 = f1_score(y_test, y_pred_4)

In [327]:
score_list = [f1_1, f1_2, f1_3, f1_4] 
model_names = ['LogReg','RandomForest', 'Logreg with Tuning', 'RandomForest with Tuning']
df_summary = pd.DataFrame({
    'method':model_names,
    'score':score_list
})
df_summary

Unnamed: 0,method,score
0,LogReg,0.715328
1,RandomForest,0.70073
2,Logreg with Tuning,0.724409
3,RandomForest with Tuning,0.752


# Kesimpulan

dari 2 model yang digunakan yaitu Logistic Regresion dan Random Forest, dan setelah dilakukan hyperparameter tuning, model terbaik adalah:

Random Forest dengan hyperparameter tuning dengan F1 score 0.75