In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
import os
from torchvision.transforms import transforms
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import math
from torch.utils.data.sampler import RandomSampler

In [2]:
file_path = '/home/max/titanic/train.csv'
file = pd.read_csv(file_path)
file.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
        data = data.drop(column, axis=1)
    return data

In [4]:
class TitanicDataset(Dataset):
    
    def __init__(self, root_dir, file_dir, transform = None, train = True):
        self.train = train
        dummy_columns = ['Pclass', 'Embarked']
        self.file = pd.read_csv(os.path.join(root_dir, file_dir))
        if not train:
            self.indices = self.file['PassengerId'] 
        list_to_drop = ['PassengerId', 'Name' , 'Ticket', 'Fare', 'Cabin']
        self.file.drop(list_to_drop, axis = 'columns', inplace=True)
        self.file['Age'] = self.file['Age'].fillna(self.file['Age'].mean())
        self.file['Embarked']  = self.file['Embarked'].fillna('S')
        self.file = dummy_data(self.file, dummy_columns)
        columns = ['Age', 'SibSp', 'Parch']
        if train:
            self.labels = self.file.loc[:, :'Survived']
            self.file.drop(['Survived'], axis = 'columns', inplace = True)
        self.file.loc[self.file['Sex'] == 'male', ['Sex']] = 1
        self.file.loc[self.file['Sex'] == 'female', ['Sex']] = 0
        for column in columns:
            self.file[column] = (self.file[column] - self.file[column].mean())/ self.file[column].std()
        self.transform = transform

    
    def __len__(self):
        return len(self.file)
    
    def __getitem__(self, idx):
        inputs = self.file.iloc[idx].values.astype(np.float32)
        inputs = torch.from_numpy(inputs)
        if self.train:
            labels = self.labels.iloc[idx].values.astype(np.float32)
            labels = torch.from_numpy(labels)
            return inputs, labels
        return self.indices[idx], inputs
        
        

In [5]:
train_set = TitanicDataset('/home/max/titanic/', 'train.csv')

In [6]:
indices = np.array(list(range(len(train_set))))
split = 0.33
np.random.shuffle(indices)
val_idx = indices[:math.floor(len(train_set) * split)]
train_idx = indices[math.floor(len(train_set) * split):]
train_sampler = RandomSampler(train_idx)
val_sampler = RandomSampler(val_idx)

In [7]:
train_loader = DataLoader(train_set, num_workers=2, batch_size= 32, drop_last=True, sampler=train_sampler)
val_loader = DataLoader(train_set, num_workers=2, batch_size = 32, drop_last=True, sampler = val_sampler)

In [8]:
import torch.nn.functional as F

In [9]:
class ResBlock(nn.Module):
    
    def __init__(self, input_features, output_features, downsample = None):
        super(ResBlock, self).__init__()
        self.downsample = downsample
        if downsample:
            self.downsample = nn.Linear(input_features, output_features)
        self.linear1 = nn.Linear(input_features, output_features)
        self.bn1 = nn.BatchNorm1d(output_features)
        self.linear2 = nn.Linear(output_features, output_features)
        self.bn2 = nn.BatchNorm1d(output_features)

    def forward(self, x):
        residual = x
        
        out = F.relu(self.bn1(self.linear1(x)))
        out = self.bn2(self.linear2(out))
        
        if self.downsample:
            residual = self.downsample(residual)
            
        out += residual
        out = F.relu(out)
        return out        
    

In [10]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net,self).__init__()
        self.block1 = ResBlock(10, 32, downsample=True)
        self.block2 = ResBlock(32, 16, downsample=True)
        self.block3 = ResBlock(16, 8, downsample=True)
        #self.block4 = ResBlock(16, 8, downsample=True)
        self.linear = nn.Linear(8, 2)
    
    def forward(self, x):
        
        out = self.block1(x)
        out = self.block2(out)
        out = self.block3(out)
        #out = self.block4(out)
        out = self.linear(out)
        return out

net2 = Net()
        
            

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net2.parameters(), lr = 0.01)

In [12]:
epochs = 100
best_acc = 0
print(len(val_loader) * val_loader.batch_size)
for i in range(epochs):
    curr_loss = 0
    sum_ = 0
    net2.train()
    for idx, data in enumerate(train_loader):
        
        optimizer.zero_grad()
        
        inputs, labels = data
        inputs = Variable(inputs.float())
        labels = Variable(labels.long())
        outputs = net2(inputs)
        labels = labels.squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        
        optimizer.step()
        curr_loss += loss.data[0]
        _, ans = torch.max(outputs, 1)
        correct = [labels[i].float() == ans[i].float() for i in range(len(labels))]
        sum_ += sum(correct).data[0]
        if idx % 20 == 19:
            #print('curr. loss: ', curr_loss / 20)
            curr_loss = 0
    print('epoch train accuracy: ', sum_ / (len(train_loader) * train_loader.batch_size))
    
    sum_ = 0
    net2.eval()
    for idx, data in enumerate(val_loader):
        
        inputs, labels = data
        inputs = Variable(inputs.float())
        labels = Variable(labels.long())
        
        outputs = net2(inputs)
        _, ans = torch.max(outputs, 1)
        
        correct = [labels[i].float() == ans[i].float() for i in range(len(labels))]
        sum_ += sum(correct).data[0]
    print('val accuracy is: ', sum_ / (len(val_loader) * val_loader.batch_size))
    if sum_ / (len(val_loader) * val_loader.batch_size) > best_acc:
        best_acc = sum_ / (len(val_loader) * val_loader.batch_size)
        torch.save(net2.state_dict(), '/home/max/titanic_model.pth')

288
epoch train accuracy:  0.6736111111111112
val accuracy is:  0.7708333333333334
epoch train accuracy:  0.8107638888888888
val accuracy is:  0.8159722222222222
epoch train accuracy:  0.8125
val accuracy is:  0.8263888888888888
epoch train accuracy:  0.8055555555555556
val accuracy is:  0.8263888888888888
epoch train accuracy:  0.8142361111111112
val accuracy is:  0.8229166666666666
epoch train accuracy:  0.8072916666666666
val accuracy is:  0.8298611111111112
epoch train accuracy:  0.8211805555555556
val accuracy is:  0.8333333333333334
epoch train accuracy:  0.8368055555555556
val accuracy is:  0.8229166666666666
epoch train accuracy:  0.8211805555555556
val accuracy is:  0.8229166666666666
epoch train accuracy:  0.8298611111111112
val accuracy is:  0.8263888888888888
epoch train accuracy:  0.828125
val accuracy is:  0.8368055555555556
epoch train accuracy:  0.8177083333333334
val accuracy is:  0.8229166666666666
epoch train accuracy:  0.8368055555555556
val accuracy is:  0.83680555

In [13]:
print(best_acc)

0.8715277777777778


In [14]:
net2.load_state_dict(torch.load('/home/max/titanic_model.pth'))

In [15]:
test_data = TitanicDataset('/home/max/titanic/', 'test.csv', train = False)

In [16]:
test_loader = DataLoader(test_data, batch_size = 1, num_workers= 2)

In [17]:
import csv
net2.eval()
csv_data = [['PassengerId', 'Survived']]
for idx,data in enumerate(test_loader):
    num, inputs = data
    #print(num, inputs)
    inputs = Variable(inputs.float())
    outputs = net2(inputs)
    _, ans = torch.max(outputs, 1)
    #print(ans)
    csv_data.append([str(num[0]), str(ans.data[0])])
#print(csv_data)
with open("/home/max/titanic.csv", 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(csv_data)
csvfile.close()    

In [18]:
print(pd.read_csv('/home/max/titanic.csv'))

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
5            897         0
6            898         1
7            899         0
8            900         0
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         1
19           911         1
20           912         0
21           913         1
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         1
392         1284         1
3