In [2]:
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd

import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [4]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [5]:
train_data = train_data.set_index('PassengerId')
test_data = test_data.set_index('PassengerId')

In [6]:
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

train_data['Age'] = num_imputer.fit_transform(train_data[['Age']])
train_data['Embarked'] = cat_imputer.fit_transform(train_data[['Embarked']])

test_data['Age'] = num_imputer.fit_transform(test_data[['Age']])
test_data['Fare'] = num_imputer.fit_transform(test_data[['Fare']])

In [7]:
male_survivor = train_data[train_data['Sex'] == 'male']['Survived']
female_survivor = train_data[train_data['Sex'] == 'female']['Survived']

In [8]:
train_data['AgeBand'] = train_data['Age'] // 10
train_data['Fellow'] = train_data['SibSp'] + train_data['Parch']

test_data['AgeBand'] = test_data['Age'] // 10
test_data['Fellow'] = test_data['SibSp'] + test_data['Parch']

In [9]:
train_data['Honorific'] = train_data['Name'].str.extract('([A-Za-z]+)\.', expand=False)
test_data['Honorific'] = test_data['Name'].str.extract('([A-Za-z]+)\.', expand=False)

In [10]:
train_data['Honorific'] = train_data['Honorific'].replace(['Mlle', 'Ms'], 'Miss')
train_data['Honorific'] = train_data['Honorific'].replace('Mme', 'Mrs')
train_data['Honorific'] = train_data['Honorific'].replace(['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Lady', 'Countess'], 'etc')

test_data['Honorific'] = test_data['Honorific'].replace('Ms', 'Miss')
test_data['Honorific'] = test_data['Honorific'].replace(['Col', 'Dona', 'Dr', 'Rev', 'Sir', 'Jonkheer'], 'etc')

In [11]:
train_data['Alone'] = train_data['Fellow'].apply(lambda x: 1 if x == 0 else 0)
test_data['Alone'] = test_data['Fellow'].apply(lambda x: 1 if x == 0 else 0)

In [12]:
num = ['Age', 'Fare', 'Fellow']
cat = ['Sex', 'Embarked', 'Pclass', 'Honorific', 'Alone', 'AgeBand']

pipeline = ColumnTransformer([
        ('num', StandardScaler(), num),
        ('cat', OneHotEncoder(), cat),
        ])

In [13]:
x_train = pipeline.fit_transform(train_data[num + cat])
y_train = train_data['Survived']

In [14]:
x_test = pipeline.transform(test_data[num + cat])

In [15]:
x_train = torch.tensor(x_train, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)

x_test = torch.tensor(x_test, dtype=torch.float32).to(device)

In [16]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        return x, y

In [17]:
dataset = CustomDataset(x_train, y_train)

In [18]:
dataset_size = len(dataset)
train_size = int(dataset_size * 0.8)
validation_size = dataset_size - train_size

train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])

In [19]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=16, shuffle=True, drop_last=True)

In [20]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.linear1 = nn.Linear(27, 64)
        self.linear2 = nn.Linear(64, 64)
        self.linear3 = nn.Linear(64, 64)
        self.linear4 = nn.Linear(64, 64)
        self.linear5 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.3)

        nn.init.kaiming_uniform_(self.linear1.weight)
        nn.init.kaiming_uniform_(self.linear2.weight)
        nn.init.kaiming_uniform_(self.linear3.weight)
        nn.init.kaiming_uniform_(self.linear4.weight)
        nn.init.kaiming_uniform_(self.linear5.weight)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.linear2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.linear3(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.linear4(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.linear5(x)
        x = self.sigmoid(x)

        return x

In [21]:
epochs = 2000
learning_rate = 0.001

In [None]:
model = Net().to(device)

criterion = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.train()
for epoch in range(1, epochs + 1):
    epoch_loss = 0

    for x, y in train_dataloader:
        x, y = x.to(device), y.to(device).type(torch.cuda.FloatTensor)

        optimizer.zero_grad()

        y_hat = model(x)

        loss = criterion(y_hat, y)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch:4d}/{epochs} Cost: {epoch_loss/len(train_dataloader):.6f}')

Epoch  100/2000 Cost: 0.363292
Epoch  200/2000 Cost: 0.331866
Epoch  300/2000 Cost: 0.279307
Epoch  400/2000 Cost: 0.265127
Epoch  500/2000 Cost: 0.243283
Epoch  600/2000 Cost: 0.251929
Epoch  700/2000 Cost: 0.226956
Epoch  800/2000 Cost: 0.218499
Epoch  900/2000 Cost: 0.224821
Epoch 1000/2000 Cost: 0.216364
Epoch 1100/2000 Cost: 0.214792
Epoch 1200/2000 Cost: 0.230335
Epoch 1300/2000 Cost: 0.209024
Epoch 1400/2000 Cost: 0.195622
Epoch 1500/2000 Cost: 0.197668
Epoch 1600/2000 Cost: 0.205466
Epoch 1700/2000 Cost: 0.210533
Epoch 1800/2000 Cost: 0.196618
Epoch 1900/2000 Cost: 0.198150
Epoch 2000/2000 Cost: 0.195491


In [None]:
with torch.no_grad():
    model.eval()
    num_correct = 0
    num_samples = 0
    for x, y in validation_dataloader:
        x = x.to(device)
        y = y.to(device)
        
        y_hat = model(x)

        prediction = (y_hat > 0.5).float()
        num_correct += (prediction == y).sum().item()
        num_samples += y.size(0)

    print('Accuracy:', num_correct / num_samples)

Accuracy: 0.8352272727272727


In [None]:
pred = model(x_test)
pred = pd.DataFrame(pred.tolist())

pred[0] = pred[0].apply(lambda x : 1 if x>=0.5 else 0)

In [None]:
submission = pd.DataFrame({
    'PassengerId': test_data.index
    })
submission['Survived'] = pred
submission.to_csv('submission_nn.csv', index=False)