<h1 align = "center">Training a DAE on the Titanic Dataset</h1>

---

In [None]:
import os
import logging
import sys
import tqdm

In [25]:
import numpy as np
import pandas as pd

from tabdae.models.model import DAE

from sklearn.linear_model import RidgeClassifierCV
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
ROOT = ".." # the document root is one level up, that contains all code structure
DATA = os.path.join(ROOT, "data") # the directory contains all data files, subdirectory (if any) can also be used/defined
PROCESSED_DATA = os.path.join(DATA, "processed")

### Data 

This is a classification problem, the goal is to predict whether an passenger survived the tragic. 

In [26]:
df = pd.read_csv(f'{DATA}/titanic.csv')
print(df.head())

   pclass  survived                                             name     sex  \
0     1.0         1                    Allen, Miss. Elisabeth Walton  female   
1     1.0         1                   Allison, Master. Hudson Trevor    male   
2     1.0         0                     Allison, Miss. Helen Loraine  female   
3     1.0         0             Allison, Mr. Hudson Joshua Creighton    male   
4     1.0         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000    0.0    0.0   24160  211.3375       B5        S    2    NaN   
1   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St 

In [None]:
y = df['survived']
df.drop('survived', axis=1, inplace=True)

### DAE model

+ By default, the `DAE` model class uses a `Deep Stacked AutoEncoder` network. 

In [27]:
dae = DAE()  
dae.fit(df, verbose=1)

epoch    0 - train loss 1.7711 - valid loss 1.6766
epoch   10 - train loss 0.8435 - valid loss 0.6317
epoch   20 - train loss 0.8224 - valid loss 0.5992
epoch   30 - train loss 0.7894 - valid loss 0.6190
epoch   40 - train loss 0.7613 - valid loss 0.6010
Epoch 00045: reducing learning rate of group 0 to 3.0000e-05.
epoch   50 - train loss 0.7514 - valid loss 0.6045
epoch   60 - train loss 0.7597 - valid loss 0.5665
Epoch 00063: reducing learning rate of group 0 to 3.0000e-06.
epoch   70 - train loss 0.7399 - valid loss 0.5723
Epoch 00075: reducing learning rate of group 0 to 3.0000e-07.
epoch   80 - train loss 0.7453 - valid loss 0.6087
Epoch 00090: reducing learning rate of group 0 to 3.0000e-08.
epoch   90 - train loss 0.7622 - valid loss 0.5766
epoch  100 - train loss 0.7699 - valid loss 0.6197
Epoch 00101: reducing learning rate of group 0 to 3.0000e-09.
epoch  110 - train loss 0.7557 - valid loss 0.5963
epoch  120 - train loss 0.7441 - valid loss 0.6221
epoch  130 - train loss 0.7

### Extract Hidden Representations

In [28]:
features = dae.transform(df)
print(features.shape)
print(features[:5, :5])

(1309, 384)
[[0.34836283 0.36228737 0.5858515  0.63929695 0.        ]
 [0.         0.5106081  0.50572485 1.0189637  0.        ]
 [0.         0.5026131  0.37263662 0.9599643  0.        ]
 [0.         0.38337716 0.4179449  0.5958852  0.        ]
 [0.         0.31011987 0.35858983 0.7810749  0.        ]]


### Use the Hidden Representation for a Classifier

In [35]:
classifier = RidgeClassifierCV(alphas=[1, 5, 10, 20], cv=5).fit(features, y)
print('5 Fold CV Accuracy: {:4.2f}%'.format(np.round(classifier.best_score_ * 100, 4)))

5 Fold CV Accuracy: 95.26%


### Similarity Query

In [30]:
similarity_matrix = cosine_similarity(features)
np.fill_diagonal(similarity_matrix, 0)

In [32]:
pd.concat([df.iloc[0, :].T, df.iloc[similarity_matrix[0, :].argmax(), :]])

pclass                                     1.0
name             Allen, Miss. Elisabeth Walton
sex                                     female
age                                       29.0
sibsp                                      0.0
parch                                      0.0
ticket                                   24160
fare                                  211.3375
cabin                                       B5
embarked                                     S
boat                                         2
body                                       NaN
home.dest                         St Louis, MO
pclass                                     1.0
name         Madill, Miss. Georgette Alexandra
sex                                     female
age                                       15.0
sibsp                                      0.0
parch                                      1.0
ticket                                   24160
fare                                  211.3375
cabin        

In [34]:
pd.concat([df.iloc[42, :].T, df.iloc[similarity_matrix[42, :].argmax(), :]])

pclass                                                  1.0
name         Brown, Mrs. John Murray (Caroline Lane Lamson)
sex                                                  female
age                                                    59.0
sibsp                                                   2.0
parch                                                   0.0
ticket                                                11769
fare                                                51.4792
cabin                                                  C101
embarked                                                  S
boat                                                      D
body                                                    NaN
home.dest                                       Belmont, MA
pclass                                                  1.0
name          Appleton, Mrs. Edward Dale (Charlotte Lamson)
sex                                                  female
age                                     