# Titanic survival classification problem
- Download the Titanic dataset.
- Train a neural network with 2 hidden layers of 5 neurons each (not counting the input and output layers) for 5 epochs with a batch size of 64.
- Save the model by logging it to the MLflow registry.

## DL model

In [None]:
import boto3 # required in case we store the artifacts on s3
import mlflow
import numpy as np
import os
import pandas as pd
import seaborn as sns
import sklearn.model_selection as ms

from keras import regularizers
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, accuracy_score,recall_score, auc, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam

## Get MLflow server URI

In [None]:
# The MLflow endpoint is read from the environment; stop immediately when it
# is missing or empty, since the trained model could not be logged later on.
registry_uri = os.environ.get('REGISTRY_URI')
if not registry_uri:
    raise Exception('REGISTRY_URI env variable should be defined on the system in order to log the generated model')

## Prepare dataset

In [None]:
# data load: fetch the Titanic table and split it 70/30 into train and test
# (fixed seed so the split is reproducible)
dataset = sns.load_dataset("titanic")
dataframe, test_dataframe = ms.train_test_split(
    dataset,
    random_state=1,
    train_size=0.7,
)
dataframe.head()

In [None]:
# format data: cast the 'deck' column to plain strings in both splits
deck_as_text = {"deck": str}
dataframe, test_dataframe = (
    frame.astype(deck_as_text) for frame in (dataframe, test_dataframe)
)

In [None]:
# data processing
feature_columns = ['sex', 'pclass', 'age', 'deck']

# Unknown decks show up as the literal string 'nan' after the str cast; map
# them (and any value still genuinely missing) to the placeholder deck 'Z'.
dataframe['deck'] = dataframe['deck'].replace('nan', 'Z').fillna('Z')
test_dataframe['deck'] = test_dataframe['deck'].replace('nan', 'Z').fillna('Z')

# Impute missing ages with the median of the TRAINING split only. The test
# split reuses that same value so that no statistic computed on test data
# leaks into preprocessing (consistent with the age normalization, which is
# also derived from the training split).
train_median = dataframe['age'].median()
dataframe['age'] = dataframe['age'].fillna(train_median)
test_dataframe['age'] = test_dataframe['age'].fillna(train_median)

# Explicit copies: the feature frames are modified later on and must not be
# treated by pandas as views of the source frames.
X = dataframe[feature_columns].copy()
y = dataframe[['alive']]
X_ts = test_dataframe[feature_columns].copy()
y_ts = test_dataframe[['alive']]
X_ts.head(10)

In [None]:
# normalization: standardize 'age' with the mean/std of the training split
normalization = [X['age'].mean(), X['age'].std()]
print("Age normalization --> " + str(normalization))

# assign() builds new frames instead of writing into a possible slice of the
# source dataframe, which avoids pandas' SettingWithCopyWarning.
X = X.assign(age=(X['age'] - normalization[0]) / normalization[1])
X_ts = X_ts.assign(age=(X_ts['age'] - normalization[0]) / normalization[1])

# One-hot encode the categorical columns. dtype=float keeps the whole matrix
# numeric (newer pandas versions emit bool dummies by default).
X_dum = pd.get_dummies(X, dtype=float)
# The test split may lack (or add) categories seen in training, e.g. a deck
# letter; align it to the training columns so the model always receives the
# same features in the same order. Missing dummies are filled with 0.
X_ts_dum = pd.get_dummies(X_ts, dtype=float).reindex(columns=X_dum.columns, fill_value=0.0)

In [None]:
# label encoding of the target column
label_encoder = LabelEncoder()

# Fit the encoder on the training labels only ...
integer_encoded = label_encoder.fit_transform(y.values.ravel())
y = integer_encoded.reshape(len(integer_encoded), 1)

# ... and reuse the fitted mapping for the test labels. Re-fitting on the
# test split could assign different integers to the same class if the set of
# labels present in the two splits ever differed.
integer_encoded_ts = label_encoder.transform(y_ts.values.ravel())
y_ts = integer_encoded_ts.reshape(len(integer_encoded_ts), 1)

## Model implementation

In [None]:
opt = Adam(learning_rate=0.01)

# model definition: two hidden ReLU layers of 5 units each and a single
# sigmoid unit that outputs a probability for the binary target.
# The input shape is passed as an explicit tuple (one entry per encoded
# feature column), and the tensors are named `inputs`/`outputs` so the
# builtin `input` is not shadowed.
inputs = Input(shape=(X_dum.shape[1],))
layer_1 = Dense(5, activation='relu')(inputs)
layer_2 = Dense(5, activation='relu')(layer_1)
outputs = Dense(1, activation='sigmoid')(layer_2)

model = Model(inputs, outputs)
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
model.summary()

In [None]:
# train the network; 10% of the training data is reserved for validation
hist = model.fit(
    x=X_dum,
    y=y,
    epochs=5,
    batch_size=64,
    shuffle=True,
    validation_split=0.1,
)

In [None]:
# model.save("titanic_DeepLearn_model")

In [None]:
# show which series were recorded in the training history returned by fit()
hist.history.keys()

In [None]:
# evaluate on the test split
probabilities = model.predict(X_ts_dum)
fpr, tpr, _ = roc_curve(y_ts, probabilities)

# The result is stored under the name `auc` because later cells read it.
# Computing it with roc_auc_score (same value as auc(fpr, tpr)) rather than
# calling the imported `auc` function keeps this cell re-runnable: the
# original `auc = auc(fpr, tpr)` overwrote the function with a float, so a
# second execution failed with "'float' object is not callable".
auc = roc_auc_score(y_ts, probabilities)
print("ROC AUC:")
print(auc)

In [None]:
# register the classifier: point MLflow at the configured server and select
# the experiment the run is recorded under
mlflow.set_tracking_uri(registry_uri)
mlflow.set_experiment('NeuralNetwork')

# a single run stores the test AUC and the trained Keras model, which is
# also registered under the name 'neural_network'
with mlflow.start_run(run_name='forest_gump'):
    mlflow.log_metric("auc", auc)
    mlflow.keras.log_model(
        keras_model=model,
        artifact_path='',
        registered_model_name='neural_network',
    )

In [None]:
# threshold the predicted probabilities at 0.5 to get hard 0/1 predictions,
# then tabulate them against the true test labels
predictions = (probabilities > 0.5).astype(int)
cm = confusion_matrix(y_ts, predictions)

In [None]:
# The confusion matrix rows/columns follow the encoded classes in ascending
# order: 0 first, then 1. LabelEncoder sorts the original labels, so the
# 'alive' values 'no'/'yes' become 0/1 -> index 0 is "Dead" and index 1 is
# "Survivor". (The previous ['Survivor', 'Dead'] order labelled the axes
# the wrong way round.)
labels = ['Dead', 'Survivor']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()

In [None]:
def plot_curves(history):
    """Draw training vs. validation curves, one figure per metric.

    `history` is indexed with the keys 'loss', 'val_loss', 'accuracy' and
    'val_accuracy'; each value is passed straight to `plt.plot`.
    The first figure shows the loss, the second one the accuracy.
    """
    # (y-axis label, training key, validation key, extra legend options)
    panels = (
        ('Error', 'loss', 'val_loss', {}),
        ('Accuracy', 'accuracy', 'val_accuracy', {'loc': 'lower right'}),
    )
    for y_label, train_key, val_key, legend_options in panels:
        plt.figure()
        plt.xlabel('Épocas')
        plt.ylabel(y_label)
        plt.plot(history[train_key])
        plt.plot(history[val_key])
        plt.legend(['Entrenamiento', 'Validación'], **legend_options)


plot_curves(hist.history)

In [None]:
# ROC curve on the test split, with the diagonal drawn as a dashed reference
roc_label = 'ROC curve (area = %0.2f)' % auc

plt.figure()
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], linestyle='--')

plt.title('Receiver operating characteristic example')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# a little headroom above 1.0 so the top of the curve is not clipped
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)

plt.legend(loc="lower right")
plt.show()