# DEPLOYMENT TITANIC.CSV

In [252]:
# Basic Operations
import pandas as pd
import numpy as np

# ML Models
from sklearn.linear_model import LogisticRegression

# Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
import category_encoders as ce # untuk ordinal dan binary encoder
from sklearn.impute import SimpleImputer

# Evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score

# Model
import pickle
import joblib

In [253]:
# Read the raw Titanic data from disk and display the resulting frame.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
df

Unnamed: 0,sex,age,parch,fare,class,deck,embark_town,alive,alone
0,male,22.0,0,7.2500,Third,,Southampton,no,False
1,female,38.0,0,71.2833,First,C,Cherbourg,yes,False
2,female,26.0,0,7.9250,Third,,Southampton,yes,True
3,female,35.0,0,53.1000,First,C,Southampton,yes,False
4,male,35.0,0,8.0500,Third,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...
886,male,27.0,0,13.0000,Second,,Southampton,no,True
887,female,19.0,0,30.0000,First,B,Southampton,yes,True
888,female,,2,23.4500,Third,,Southampton,no,False
889,male,26.0,0,30.0000,First,C,Cherbourg,yes,True


# 1. Preprocessing

In [254]:
# Drop the 'deck' column because it has too many missing values.
# errors='ignore' keeps this cell idempotent: re-running it after 'deck' has
# already been removed no longer raises a KeyError.
df = df.drop(columns='deck', errors='ignore')

In [255]:
# Recode the target: 'alive' ('yes'/other) becomes a 0/1 'label' column,
# 'alone' is stored with object dtype, and the original 'alive' column is dropped.
df = (
    df
    .assign(
        label=np.where(df['alive'] == 'yes', 1, 0),
        alone=df['alone'].astype(object),
    )
    .drop(columns='alive')
)

df.head(3)

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
0,male,22.0,0,7.25,Third,Southampton,False,0
1,female,38.0,0,71.2833,First,Cherbourg,False,1
2,female,26.0,0,7.925,Third,Southampton,True,1


In [256]:
# Summarise column dtypes and non-null counts of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          891 non-null    object 
 1   age          714 non-null    float64
 2   parch        891 non-null    int64  
 3   fare         891 non-null    float64
 4   class        891 non-null    object 
 5   embark_town  889 non-null    object 
 6   alone        891 non-null    object 
 7   label        891 non-null    int32  
dtypes: float64(2), int32(1), int64(1), object(4)
memory usage: 52.3+ KB


In [257]:
# Separate the feature matrix (X) from the target vector (y).
X, y = df.drop(columns=['label']), df['label']

In [258]:
# Stratified 80/20 split on the target, with a fixed seed for reproducibility.
split_parts = train_test_split(X, y, test_size=0.2, random_state=2020, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [259]:
# Check dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 317 to 792
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          712 non-null    object 
 1   age          569 non-null    float64
 2   parch        712 non-null    int64  
 3   fare         712 non-null    float64
 4   class        712 non-null    object 
 5   embark_town  710 non-null    object 
 6   alone        712 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 44.5+ KB


In [260]:
# Build the training frame (features + target) as a NEW DataFrame.
# The previous `titanic_train = X_train` only created an alias, so adding
# 'label' wrote the target column into X_train itself (and mutated a slice
# returned by train_test_split). `.assign` returns a copy, leaving X_train
# feature-only.
#   - 'alone' is cast to string
#   - 'label' is taken from y_train (aligned on the row index)
titanic_train = X_train.assign(
    alone=X_train['alone'].astype(str),
    label=y_train,
)

titanic_train

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
317,male,54.0,0,14.0000,Second,Southampton,True,0
80,male,22.0,0,9.0000,Third,Southampton,True,0
510,male,29.0,0,7.7500,Third,Queenstown,True,1
298,male,,0,30.5000,First,Southampton,True,1
188,male,40.0,1,15.5000,Third,Queenstown,False,0
...,...,...,...,...,...,...,...,...
572,male,36.0,0,26.3875,First,Southampton,True,1
284,male,,0,26.0000,First,Southampton,True,0
224,male,38.0,0,90.0000,First,Southampton,False,1
406,male,51.0,0,7.7500,Third,Southampton,True,0


Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
317,male,54.0,0,14.0000,Second,Southampton,True,0
80,male,22.0,0,9.0000,Third,Southampton,True,0
510,male,29.0,0,7.7500,Third,Queenstown,True,1
298,male,,0,30.5000,First,Southampton,True,1
188,male,40.0,1,15.5000,Third,Queenstown,False,0
...,...,...,...,...,...,...,...,...
572,male,36.0,0,26.3875,First,Southampton,True,1
284,male,,0,26.0000,First,Southampton,True,0
224,male,38.0,0,90.0000,First,Southampton,False,1
406,male,51.0,0,7.7500,Third,Southampton,True,0


In [261]:
# Build the test frame (features + target) as a NEW DataFrame.
# The previous `titanic_test = X_test` only created an alias, so adding
# 'label' wrote the target column into X_test itself (and mutated a slice
# returned by train_test_split). `.assign` returns a copy, leaving X_test
# feature-only.
#   - 'alone' is cast to string
#   - 'label' is taken from y_test (aligned on the row index)
titanic_test = X_test.assign(
    alone=X_test['alone'].astype(str),
    label=y_test,
)

titanic_test

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
560,male,,0,7.7500,Third,Queenstown,True,0
130,male,33.0,0,7.8958,Third,Cherbourg,True,0
551,male,27.0,0,26.0000,Second,Southampton,True,0
587,male,60.0,1,79.2000,First,Cherbourg,False,1
2,female,26.0,0,7.9250,Third,Southampton,True,1
...,...,...,...,...,...,...,...,...
818,male,43.0,0,6.4500,Third,Southampton,True,0
113,female,20.0,0,9.8250,Third,Southampton,False,0
605,male,36.0,0,15.5500,Third,Southampton,False,0
642,female,2.0,2,27.9000,Third,Southampton,False,0


In [262]:
# Check dtypes and non-null counts of the training frame (features + label).
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 317 to 792
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          712 non-null    object 
 1   age          569 non-null    float64
 2   parch        712 non-null    int64  
 3   fare         712 non-null    float64
 4   class        712 non-null    object 
 5   embark_town  710 non-null    object 
 6   alone        712 non-null    object 
 7   label        712 non-null    int32  
dtypes: float64(2), int32(1), int64(1), object(4)
memory usage: 47.3+ KB


In [263]:
# Check dtypes and non-null counts of the test frame (features + label).
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 560 to 206
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          179 non-null    object 
 1   age          145 non-null    float64
 2   parch        179 non-null    int64  
 3   fare         179 non-null    float64
 4   class        179 non-null    object 
 5   embark_town  179 non-null    object 
 6   alone        179 non-null    object 
 7   label        179 non-null    int32  
dtypes: float64(2), int32(1), int64(1), object(4)
memory usage: 11.9+ KB


In [264]:
# Persist both splits as CSV files, without writing the index column.
for csv_name, split_frame in [
    ('titanic_train.csv', titanic_train),
    ('titanic_test.csv', titanic_test),
]:
    split_frame.to_csv(csv_name, index=False)

In [265]:
# Reload the training split from disk and store 'alone' as strings.
titanic_train = pd.read_csv('titanic_train.csv').astype({'alone': str})

titanic_train

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
0,male,54.0,0,14.0000,Second,Southampton,True,0
1,male,22.0,0,9.0000,Third,Southampton,True,0
2,male,29.0,0,7.7500,Third,Queenstown,True,1
3,male,,0,30.5000,First,Southampton,True,1
4,male,40.0,1,15.5000,Third,Queenstown,False,0
...,...,...,...,...,...,...,...,...
707,male,36.0,0,26.3875,First,Southampton,True,1
708,male,,0,26.0000,First,Southampton,True,0
709,male,38.0,0,90.0000,First,Southampton,False,1
710,male,51.0,0,7.7500,Third,Southampton,True,0


In [266]:
# Reload the test split from disk and store 'alone' as strings.
titanic_test = pd.read_csv('titanic_test.csv').astype({'alone': str})

titanic_test

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
0,male,,0,7.7500,Third,Queenstown,True,0
1,male,33.0,0,7.8958,Third,Cherbourg,True,0
2,male,27.0,0,26.0000,Second,Southampton,True,0
3,male,60.0,1,79.2000,First,Cherbourg,False,1
4,female,26.0,0,7.9250,Third,Southampton,True,1
...,...,...,...,...,...,...,...,...
174,male,43.0,0,6.4500,Third,Southampton,True,0
175,female,20.0,0,9.8250,Third,Southampton,False,0
176,male,36.0,0,15.5500,Third,Southampton,False,0
177,female,2.0,2,27.9000,Third,Southampton,False,0


In [267]:
# Re-check dtypes and non-null counts of the training frame after reloading it from CSV.
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          712 non-null    object 
 1   age          569 non-null    float64
 2   parch        712 non-null    int64  
 3   fare         712 non-null    float64
 4   class        712 non-null    object 
 5   embark_town  710 non-null    object 
 6   alone        712 non-null    object 
 7   label        712 non-null    int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 44.6+ KB


In [268]:
# Re-check dtypes and non-null counts of the test frame after reloading it from CSV.
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sex          179 non-null    object 
 1   age          145 non-null    float64
 2   parch        179 non-null    int64  
 3   fare         179 non-null    float64
 4   class        179 non-null    object 
 5   embark_town  179 non-null    object 
 6   alone        179 non-null    object 
 7   label        179 non-null    int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 11.3+ KB


# 2. Model Training

In [269]:
# Build the preprocessing + classifier pipeline and tune it with a
# cross-validated grid search on the training split.

# 1. DATA
# titanic_train is the frame reloaded from 'titanic_train.csv' in an earlier cell.
y = titanic_train['label']
X = titanic_train.drop(columns='label')

# 2. PREPROCESSING
# 'embark_town': fill missing values with the most frequent one, then binary-encode.
binary_encoder_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('binary encoder', ce.BinaryEncoder()),
])

# 'class': explicit ordinal mapping (None is mapped to 0).
ordinal_mapping = [{
    'col': 'class',
    'mapping': {None: 0, 'First': 1, 'Second': 2, 'Third': 3},
}]
ordinal_encoder = ce.OrdinalEncoder(mapping=ordinal_mapping, cols=['class'])

# Per-column transformations; columns not listed here are passed through as-is.
transformer = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(strategy='median'), ['age']),
        ('one hot encoder', OneHotEncoder(drop='first'), ['sex', 'alone']),
        ('ordinal encoder', ordinal_encoder, ['class']),
        ('binary encoder', binary_encoder_pipeline, ['embark_town']),
    ],
    remainder='passthrough',
)

# 3. MODEL
model = LogisticRegression(random_state=2020, solver='liblinear')
estimator = Pipeline(steps=[('prepocess', transformer), ('clf', model)])

# 4. HYPERPARAMETER SEARCH
# Grid over the regularisation strength C and the solver of the 'clf' step.
hyperparam_space = dict(
    clf__C=[100, 10, 1, 0.1, 0.01, 0.001],
    clf__solver=['liblinear', 'newton-cg'],
)

skfold = StratifiedKFold(n_splits=5)

# 5-fold stratified CV, scored with F1, using all available cores.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=hyperparam_space,
    scoring='f1',
    cv=skfold,
    n_jobs=-1,
)

grid_search.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('prepocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('imputer',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['age']),
                                                                        ('one '
                                                                         'hot '
                                                                         'encoder',
                                                                         OneHotEncoder(drop='first'),
                                                                         ['sex',
                                                                          'alone']),

# 3. Prediction 

## a. Pickle

In [270]:
# Final model: fit the best pipeline found by the grid search on the training split.
# NOTE(review): GridSearchCV above is constructed without a `refit` argument; if
# its default already refits best_estimator_ on (X, y), this call is redundant —
# confirm before removing.
grid_search.best_estimator_.fit(X, y)

# Save the fitted pipeline with pickle. The `with` block guarantees the file
# handle is flushed and closed (the previous bare `open(...)` was never closed).
filename = 'Titanic_Final5.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(grid_search.best_estimator_, model_file)

In [271]:
# The test split (titanic_test) was already loaded from 'titanic_test.csv' above.
# titanic_test = pd.read_csv('titanic_test.csv')

# Load the pickled pipeline. The `with` block closes the file handle once the
# model is loaded (the previous bare `open(...)` was never closed).
# Only unpickle files you trust: pickle.load can execute arbitrary code.
# This file is the one written by the save cell of this notebook.
filename = 'Titanic_Final5.sav'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [272]:
# Predict on the test split (features only: the target column is removed first).
test_features = titanic_test.drop(columns=['label'])
loaded_model.predict(test_features)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0], dtype=int64)

In [273]:
# F1 score of the loaded model on the test split.
test_features = titanic_test.drop(columns=['label'])
y_pred = loaded_model.predict(test_features)
f1_score(y_true=titanic_test['label'], y_pred=y_pred)

0.7153284671532847

In [274]:
# Peek at the first three rows of the training frame.
titanic_train.head(3)

Unnamed: 0,sex,age,parch,fare,class,embark_town,alone,label
0,male,54.0,0,14.0,Second,Southampton,True,0
1,male,22.0,0,9.0,Third,Southampton,True,0
2,male,29.0,0,7.75,Third,Queenstown,True,1


In [275]:
# List the distinct values of 'embark_town' in the training frame.
titanic_train['embark_town'].unique()

array(['Southampton', 'Queenstown', 'Cherbourg', nan], dtype=object)

In [276]:
# Try the loaded model on a single, hand-made new observation.
# 'alone' is given as the string 'True' to match the string-typed training column.
new_passenger = {
    'sex': 'male',
    'age': 25,
    'parch': 0,
    'fare': 500,
    'class': 'First',
    'embark_town': 'Southampton',
    'alone': 'True',
}
df_observ = pd.DataFrame([new_passenger])

predicted_class = loaded_model.predict(df_observ)
predicted_proba = loaded_model.predict_proba(df_observ)
print('predict class: ', predicted_class)
print('predict proba: ', predicted_proba)


predict class:  [1]
predict proba:  [[0.33349238 0.66650762]]


## b. Joblib

In [277]:
# saving model
# joblib.dump(grid_search.best_estimator_,'model_joblib')

In [278]:
# load model
# model_joblib = joblib.load('model_joblib')

In [279]:
# predict on the new observation
# print('predict class', model_joblib.predict(df_observ))
# print('predict proba', model_joblib.predict_proba(df_observ))