In [1]:
# Reference : https://www.kaggle.com/chriszou/titanic-with-pytorch-nn-solution

In [2]:
import numpy as np
import pandas as pd

import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the Titanic train and test CSV files (paths are relative to the working directory).
train = pd.read_csv('../../_data/titanic/train.csv')
test = pd.read_csv('../../_data/titanic/test.csv')

In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Compare the (rows, columns) of the two frames.
(train.shape, test.shape)

((891, 12), (418, 11))

In [8]:
# Stack train on top of test so both are encoded with the same category codes.
# `test` has no `Survived` column, so that column is NaN for the test rows.
# ignore_index=True gives the combined frame unique row labels 0..N-1; without it
# the labels of `test` would repeat those of `train`.
all_df = pd.concat([train, test], sort=False, ignore_index=True)
print(all_df.shape)
all_df.head()

(1309, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Preprocess the data
def preprocess(df, cat_cols):
    """Return a model-ready copy of ``df``.

    Steps:
      1. Drop the identifier/free-text columns ``PassengerId``, ``Name``,
         ``Ticket`` and ``Cabin`` (columns that are already absent are skipped,
         so the function can safely be applied to its own output).
      2. Label-encode every column listed in ``cat_cols`` to integer codes.
      3. Fill remaining missing values with the mean of their column.

    Args:
        df: input ``pandas.DataFrame``; it is not modified.
        cat_cols: names of the columns to label-encode.

    Returns:
        A new ``pandas.DataFrame`` with the encoded and imputed columns.

    Note:
        Step 3 applies to every numeric column that still contains NaN,
        including a label column if one is present in ``df``.
    """
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

    for cat_col in cat_cols:
        if cat_col in ['Embarked']:
            # Cast to str first so missing values become their own 'nan' category
            # instead of mixing NaN with strings inside the encoder.
            df[cat_col] = LabelEncoder().fit_transform(df[cat_col].astype(str))
        else:
            df[cat_col] = LabelEncoder().fit_transform(df[cat_col])

    # numeric_only=True: non-numeric columns that were not encoded are left
    # untouched instead of making mean() raise on recent pandas versions.
    df = df.fillna(df.mean(numeric_only=True))
    return df

In [12]:
# Columns treated as categorical; preprocess() label-encodes each of them.
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
# Rebinds all_df to the preprocessed frame (the raw combined frame is no longer reachable under this name).
all_df = preprocess(all_df, cat_cols)
all_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0.0,2,1,22.0,1,0,7.25,2
1,1.0,0,0,38.0,1,0,71.2833,0
2,1.0,2,0,26.0,0,0,7.925,2
3,1.0,0,0,35.0,1,0,53.1,2
4,0.0,2,1,35.0,0,0,8.05,2


In [18]:
# The training rows are the first len(train) rows of the combined frame,
# so a positional slice recovers them after preprocessing.
print(train.shape)
print(all_df.shape)
train_df = all_df.iloc[:train.shape[0]]
train_df.shape

(891, 12)
(1309, 8)


(891, 8)

In [19]:
class TabularDataset(Dataset):
    """Dataset over a DataFrame that serves continuous and categorical features separately.

    Args:
        df: source ``pandas.DataFrame``.
        categorical_columns: names of the columns served as ``int64`` codes
            (may be empty or ``None``).
        output_column: name of the label column, or ``None`` for unlabeled data.

    Every column that is neither categorical nor the output column is treated
    as continuous. Items are ``[label, cont_x, cat_x]`` when a label column is
    given, otherwise ``[cont_x, cat_x]``. When one of the two feature groups is
    empty, a single all-zero placeholder column of the matching dtype is served
    for it.
    """
    def __init__(self, df, categorical_columns, output_column = None):
        super().__init__()
        self.len = df.shape[0]

        # Split the columns into categorical and continuous data.
        self.categorical_columns = list(categorical_columns) if categorical_columns else []
        excluded = set(self.categorical_columns)
        if output_column is not None:
            excluded.add(output_column)
        self.continous_columns = [col for col in df.columns if col not in excluded]

        if self.continous_columns:
            self.cont_X = df[self.continous_columns].astype(np.float32).values
        else:
            # Placeholder keeps the same dtype as real continuous data.
            self.cont_X = np.zeros((self.len, 1), dtype=np.float32)

        if self.categorical_columns:
            self.cat_X = df[self.categorical_columns].astype(np.int64).values
        else:
            # Placeholder keeps the same dtype as real categorical codes.
            self.cat_X = np.zeros((self.len, 1), dtype=np.int64)

        if output_column is not None:
            self.has_label = True
            # Shape (n, 1) so each item yields a length-1 label vector.
            self.label = df[output_column].astype(np.float32).values.reshape(-1, 1)
        else:
            self.has_label = False

    def __len__(self):
        """Number of rows in the source frame."""
        return self.len

    def __getitem__(self, index):
        """Return ``[label, cont_x, cat_x]`` (labeled) or ``[cont_x, cat_x]`` (unlabeled) for one row."""
        if self.has_label:
            return [self.label[index], self.cont_X[index], self.cat_X[index]]
        else:
            return [self.cont_X[index], self.cat_X[index]]

In [20]:
# Wrap the training frame as a labeled dataset and iterate it in shuffled mini-batches of 64.
train_ds = TabularDataset(df=train_df, categorical_columns=cat_cols, output_column='Survived')
train_dl = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)

In [63]:
# Inspect one sample: [label, continuous features, categorical codes].
train_ds[5]

[array([0.], dtype=float32),
 array([29.881138,  8.4583  ], dtype=float32),
 array([2, 1, 0, 0, 1])]

In [50]:
class TitanicNet(nn.Module):
    """Feed-forward binary classifier over embedded categorical and continuous features.

    Args:
        emb_dims: list of ``(num_categories, embedding_size)`` pairs, one per
            categorical column, in the column order of ``cat_data``.
        n_cont: number of continuous input features.
        lin_layer_sizes: sizes of the hidden linear layers, in order.
        output_size: number of output units.

    The forward pass embeds each categorical column, batch-normalises the
    continuous features, concatenates both, applies ``Linear -> ReLU ->
    BatchNorm`` for every hidden layer and ends with a sigmoid, so outputs lie
    in ``(0, 1)``.
    """
    def __init__(self, emb_dims, n_cont, lin_layer_sizes, output_size):
        super().__init__()

        # One embedding table per categorical column.
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        self.n_embs = sum([y for x, y in emb_dims])
        self.n_cont = n_cont

        # linear layers
        first_lin_layer = nn.Linear(self.n_embs + self.n_cont, lin_layer_sizes[0])

        self.lin_layers = nn.ModuleList(
            [first_lin_layer] +
            [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i+1]) for i in range(len(lin_layer_sizes) - 1)]
        )

        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.n_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])

    def forward(self, cont_data, cat_data):
        """Return sigmoid outputs of shape ``(batch, output_size)``.

        Args:
            cont_data: float tensor of continuous features, ``(batch, n_cont)``.
            cat_data: integer tensor of category codes, one column per
                embedding layer.
        """
        features = []

        if self.n_embs != 0:
            # Column i of cat_data indexes embedding table i.
            features.extend(emb_layer(cat_data[:, i]) for i, emb_layer in enumerate(self.emb_layers))

        if self.n_cont != 0:
            # Always feed the *normalised* continuous data forward, also when
            # there are no embeddings to concatenate it with.
            features.append(self.first_bn_layer(cont_data))

        # torch cat : concatenate data (embeddings first, then continuous features)
        x = torch.cat(features, 1)

        for lin_layer, bn_layer in zip(self.lin_layers, self.bn_layers):
            x = torch.relu(lin_layer(x))
            x = bn_layer(x)

        x = self.output_layer(x)
        x = torch.sigmoid(x)
        return x

In [51]:
# Number of distinct codes in each categorical column, in cat_cols order.
cat_dims = all_df[cat_cols].nunique().astype(int).tolist()
cat_dims

[3, 2, 7, 8, 4]

In [52]:
# Pair each cardinality with its embedding width: half the cardinality (rounded up), capped at 50.
emb_dims = [(n_levels, min(50, (n_levels + 1) // 2)) for n_levels in cat_dims]
emb_dims

[(3, 2), (2, 1), (7, 4), (8, 4), (4, 2)]

In [53]:
# Fix the random seed so that results are the same on every run.
torch.manual_seed(2)

<torch._C.Generator at 0x118e2bad0>

In [49]:
# Build the model (two continuous inputs plus the embedded categoricals) and train it
# with Adam on binary cross-entropy; the model already ends in a sigmoid, so BCELoss
# receives probabilities.
model = TitanicNet(emb_dims, n_cont=2, lin_layer_sizes=[50, 100, 50], output_size = 1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
epochs = 10
criterion = nn.BCELoss()

# Make sure batch-norm layers use batch statistics during training.
model.train()

for epoch in range(epochs):
    # Plain Python floats: accumulating the loss tensor itself would keep every
    # batch's autograd history alive until the end of the epoch.
    epoch_loss = 0.0
    epoch_accuracy = 0.0
    for y, cont_x, cat_x in train_dl:
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y)
        epoch_loss += loss.item()

        # Fraction of this batch classified correctly at a 0.5 threshold.
        accuracy = ((preds > 0.5).float() == y).float().mean()
        epoch_accuracy += accuracy.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reported values are averages over the batches of the epoch.
    print("Epoch ", epoch, ", loss:", epoch_loss/len(train_dl), "accuracy: ", epoch_accuracy/len(train_dl))

Epoch  0 , loss: 0.7211196081978934 accuracy:  0.6221814155578613
Epoch  1 , loss: 0.5557212488991874 accuracy:  0.7241222517830985
Epoch  2 , loss: 0.5144580772944859 accuracy:  0.7575098446437291
Epoch  3 , loss: 0.46396309988839285 accuracy:  0.7945293698992048
Epoch  4 , loss: 0.4428294726780483 accuracy:  0.8002988951546806
Epoch  5 , loss: 0.42421419279915945 accuracy:  0.8123865127563477
Epoch  6 , loss: 0.4009763513292585 accuracy:  0.8301490374973842
Epoch  7 , loss: 0.40278901372637066 accuracy:  0.8163778441292899
Epoch  8 , loss: 0.40320140974862234 accuracy:  0.8236417770385742
Epoch  9 , loss: 0.39810265813555035 accuracy:  0.8273683275495257
