In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
import os
from torchvision.transforms import transforms
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import math
from torch.utils.data.sampler import RandomSampler

In [2]:
file_path = '/home/max/titanic/train.csv'
file = pd.read_csv(file_path)
file.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
        data = data.drop(column, axis=1)
    return data

In [3]:
class TitanicDataset(Dataset):
    
    def __init__(self, root_dir, file_dir, transform = None, train = True):
        self.train = train
        dummy_columns = ['Pclass', 'Embarked']
        self.file = pd.read_csv(os.path.join(root_dir, file_dir))
        if not train:
            self.indices = self.file['PassengerId'] 
        list_to_drop = ['PassengerId', 'Name' , 'Ticket', 'Fare', 'Cabin']
        self.file.drop(list_to_drop, axis = 'columns', inplace=True)
        self.file['Age'] = self.file['Age'].fillna(self.file['Age'].mean())
        self.file['Embarked']  = self.file['Embarked'].fillna('S')
        self.file = dummy_data(self.file, dummy_columns)
        columns = ['Age', 'SibSp', 'Parch']
        if train:
            self.labels = self.file.loc[:, :'Survived']
            self.file.drop(['Survived'], axis = 'columns', inplace = True)
        self.file.loc[self.file['Sex'] == 'male', ['Sex']] = 1
        self.file.loc[self.file['Sex'] == 'female', ['Sex']] = 0
        for column in columns:
            self.file[column] = (self.file[column] - self.file[column].mean())/ self.file[column].std()
        self.transform = transform

    
    def __len__(self):
        return len(self.file)
    
    def __getitem__(self, idx):
        inputs = self.file.iloc[idx].values.astype(np.float32)
        inputs = torch.from_numpy(inputs)
        if self.train:
            labels = self.labels.iloc[idx].values.astype(np.float32)
            labels = torch.from_numpy(labels)
            return inputs, labels
        return self.indices[idx], inputs
        
        

In [4]:
train_set = TitanicDataset('/home/max/titanic/', 'train.csv')

In [5]:
indices = np.array(list(range(len(train_set))))
split = 0.33
np.random.shuffle(indices)
val_idx = indices[:math.floor(len(train_set) * split)]
train_idx = indices[math.floor(len(train_set) * split):]
train_sampler = RandomSampler(train_idx)
val_sampler = RandomSampler(val_idx)

In [6]:
train_loader = DataLoader(train_set, num_workers=2, batch_size= 16, drop_last=True, sampler=train_sampler)
val_loader = DataLoader(train_set, num_workers=2, batch_size = 16, drop_last=True, sampler = val_sampler)

In [7]:
train_set[0][0].shape

torch.Size([10])

In [8]:
net = nn.Sequential(nn.Linear(10,32),
                    nn.BatchNorm1d(num_features=32),
                    nn.ReLU(),
                    nn.Linear(32, 16),
                    nn.ReLU(),
                    nn.BatchNorm1d(num_features=16),
                    nn.Linear(16,2)
                   )

In [21]:
import torch.nn.functional as F

In [24]:
class Net(nn.Module):
    
    def __init__(self, downsample = False):
        super(Net,self).__init__()
        self.downsample = None
        if downsample:
            self.downsample = nn.Linear(10, 2)
        self.linear1 = nn.Linear(10, 32)
        self.bn1 = nn.BatchNorm1d(32)
        self.linear2 = nn.Linear(32, 16)
        self.bn2 = nn.BatchNorm1d(16)
        self.linear3 = nn.Linear(16,2)
        self.bn3 = nn.BatchNorm1d(2)
    
    def forward(self, x):
        residual = x
        out = F.relu(self.bn1(self.linear1(x)))
        out = F.relu(self.bn2(self.linear2(out)))
        out = self.bn3(self.linear3(out))
        if self.downsample:
            residual = self.downsample(residual)
        out += residual
        return out

net2 = Net()
        
            

In [22]:
if ruf:
    print('fjp')
else:
    print('dfjsd')

NameError: name 'ruf' is not defined

In [25]:
pred = net2(Variable(train_set[0][0]))

RuntimeError: invalid argument 2: dimension 1 out of range of 1D tensor at /opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/TH/generic/THTensor.c:24

In [26]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr = 0.01)

In [27]:
epochs = 100
best_acc = 0
print(len(val_loader) * val_loader.batch_size)
for i in range(epochs):
    curr_loss = 0
    sum_ = 0
    for idx, data in enumerate(train_loader):
        
        optimizer.zero_grad()
        
        inputs, labels = data
        inputs = Variable(inputs.float())
        labels = Variable(labels.long())
        outputs = net2(inputs)
        labels = labels.squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        
        optimizer.step()
        curr_loss += loss.data[0]
        _, ans = torch.max(outputs, 1)
        correct = [labels[i].float() == ans[i].float() for i in range(len(labels))]
        sum_ += sum(correct).data[0]
        if idx % 20 == 19:
            #print('curr. loss: ', curr_loss / 20)
            curr_loss = 0
    print('epoch train accuracy: ', sum_ / (len(train_loader) * train_loader.batch_size))
    
    sum_ = 0
    for idx, data in enumerate(val_loader):
        
        inputs, labels = data
        inputs = Variable(inputs.float())
        labels = Variable(labels.long())
        
        outputs = net(inputs)
        _, ans = torch.max(outputs, 1)
        
        correct = [labels[i].float() == ans[i].float() for i in range(len(labels))]
        sum_ += sum(correct).data[0]
    print('val accuracy is: ', sum_ / (len(val_loader) * val_loader.batch_size))
    if sum_ / (len(val_loader) * val_loader.batch_size) > best_acc:
        best_acc = sum_ / (len(val_loader) * val_loader.batch_size)
        torch.save(net.state_dict(), '/home/max/titanic_model.pth')

288


RuntimeError: The expanded size of the tensor (2) must match the existing size (10) at non-singleton dimension 1. at /opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/TH/generic/THTensor.c:309

In [12]:
print(best_acc)

0.8472222222222222


In [13]:
net.load_state_dict(torch.load('/home/max/titanic_model.pth'))

In [14]:
test_data = TitanicDataset('/home/max/titanic/', 'test.csv', train = False)

In [15]:
test_loader = DataLoader(test_data, batch_size = 1, num_workers= 2)

In [22]:
kek = pd.read_csv('/home/max/titanic/gender_submission.csv')

In [23]:
kek.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [16]:
import csv

with open("/home/max/titanic.csv", 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(csv_data)
csvfile.close()

In [17]:
print(pd.read_csv('/home/max/titanic.csv'))

Empty DataFrame
Columns: [PassengerId, Survived]
Index: []


In [18]:
import csv
net.eval()
csv_data = [['PassengerId', 'Survived']]
for idx,data in enumerate(test_loader):
    num, inputs = data
    #print(num, inputs)
    inputs = Variable(inputs.float())
    outputs = net(inputs)
    _, ans = torch.max(outputs, 1)
    #print(ans)
    csv_data.append([str(num[0]), str(ans.data[0])])
print(csv_data)
with open("/home/max/titanic.csv", 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(csv_data)
csvfile.close()    

[['PassengerId', 'Survived'], ['892', '0'], ['893', '0'], ['894', '0'], ['895', '0'], ['896', '0'], ['897', '0'], ['898', '1'], ['899', '0'], ['900', '1'], ['901', '0'], ['902', '0'], ['903', '0'], ['904', '1'], ['905', '0'], ['906', '1'], ['907', '1'], ['908', '0'], ['909', '0'], ['910', '0'], ['911', '0'], ['912', '1'], ['913', '1'], ['914', '1'], ['915', '0'], ['916', '1'], ['917', '0'], ['918', '1'], ['919', '0'], ['920', '0'], ['921', '0'], ['922', '0'], ['923', '0'], ['924', '0'], ['925', '0'], ['926', '0'], ['927', '0'], ['928', '0'], ['929', '0'], ['930', '0'], ['931', '0'], ['932', '0'], ['933', '0'], ['934', '0'], ['935', '1'], ['936', '1'], ['937', '0'], ['938', '0'], ['939', '0'], ['940', '1'], ['941', '0'], ['942', '0'], ['943', '0'], ['944', '1'], ['945', '1'], ['946', '0'], ['947', '0'], ['948', '0'], ['949', '0'], ['950', '0'], ['951', '1'], ['952', '0'], ['953', '0'], ['954', '0'], ['955', '1'], ['956', '0'], ['957', '1'], ['958', '1'], ['959', '0'], ['960', '0'], ['96

In [19]:
print(pd.read_csv('/home/max/titanic.csv'))

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         0
19           911         0
20           912         1
21           913         1
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         1
392         1284         1
3