In [106]:
import torch
import random
import torch.nn.functional as F
import pandas as pd

from torch import nn, optim


# Prepare the data

train_file = './train.csv'
test_file = './test.csv'

def prepare_data(data, test_sample=False):
    data = data.fillna(0)
    
    if test_sample:
        persons = data[['PassengerId','Name','Age','Sex']]
    else:
        persons = data[['PassengerId','Name','Age','Sex', 'Survived']]
        
    data = pd.get_dummies(data.drop(columns=['Cabin','Ticket','Pclass']), columns=['Sex', 'Embarked']).fillna(0)
    
    features = data[['Age', 'SibSp', 'Parch', 'Fare', 'Sex_female','Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']].copy()
    features['Fare'] /= features['Fare'].max()
    features['Age'] /= features['Age'].max()
    features['Fare'] /= features['Fare'].max()
    features['SibSp'] /= features['SibSp'].max()
    
    features = torch.tensor(features.values, dtype=torch.float)
    persons = persons.values
    
    if(test_sample):
        return zip(features, persons)
    
    targets = data['Survived']
    targets = torch.tensor(targets.values, dtype=torch.long).view(-1,1)
    
    return zip(features, persons, targets)

def random_training_pair(train_list):
    choice = train_list[random.randint(0, len(train_list) - 1)]
    features, person, target = choice
    
    return features, person, target


    

In [107]:
data = prepare_data(pd.read_csv(train_file))
test_data = prepare_data(pd.read_csv(test_file), True)
test_data = list(test_data)

titanic_data = list(data)

n_data = len(titanic_data)
n_validation = 100
n_train = n_data - n_validation

validation_data = titanic_data[n_train:]
train_data = titanic_data[:-n_validation]

In [108]:
class Classifier(nn.Module):
    
    def __init__(self):
        super(Classifier, self).__init__()
        
        self.fc1 = nn.Linear(9, 6)
        self.fc2 = nn.Linear(6, 3)
        self.fc3 = nn.Linear(3, 2)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, X):
        X = self.dropout( F.relu( self.fc1(X) ) )
        X = self.dropout( F.relu( self.fc2(X) ) )
        X = self.dropout( F.log_softmax( self.fc3(X).view(1,2), dim=1 ) )
        
        return X
    

In [109]:
features, person, target = random_training_pair(train_data)
classifier = Classifier()
logits = classifier(features)
print(logits, target)

tensor([[-0.5092, -0.0000]], grad_fn=<MulBackward0>) tensor([0])


In [110]:
classifier = Classifier()

lr = 0.001
momentum = 0.5
criterion = nn.NLLLoss()
optimizer = optim.SGD(classifier.parameters(), lr=lr, momentum=momentum)

def train(features, target):
    """
        Training the data set
        Params:
            features: the expected category result
            target: the expected result
    """
    classifier.train()
    
    optimizer.zero_grad()
    
    output = classifier(features)

    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    
    return output, loss.item()

In [113]:
epochs = 100000
print_every = 10000
current_loss = 0
accuracy = 0

for epoch in range(1, epochs + 1):
    features, person, target = random_training_pair(train_data)
    output, item_loss = train(features, target)
    
    top_v, top_i = output.topk(1, dim=1)
    accuracy += 1 if top_i == target[0] else 0
    current_loss += item_loss
    
    if epoch % print_every == 0:
        print(f'Epoch {epoch}/{epochs}: accuracy: {accuracy/epoch:.3f} and loss: {current_loss/epoch:.3f}')
        
    

Epoch 10000/100000: accuracy: 0.672 and loss: 0.513
Epoch 20000/100000: accuracy: 0.674 and loss: 0.516
Epoch 30000/100000: accuracy: 0.674 and loss: 0.519
Epoch 40000/100000: accuracy: 0.674 and loss: 0.519
Epoch 50000/100000: accuracy: 0.675 and loss: 0.522
Epoch 60000/100000: accuracy: 0.674 and loss: 0.522
Epoch 70000/100000: accuracy: 0.676 and loss: 0.520
Epoch 80000/100000: accuracy: 0.677 and loss: 0.518
Epoch 90000/100000: accuracy: 0.676 and loss: 0.519
Epoch 100000/100000: accuracy: 0.676 and loss: 0.519


In [75]:
def validate():
    classifier.eval()
    
    with torch.no_grad():
        accuracy = 0;
        epochs = len(validation_data)
        
        for epoch in range(1, epochs + 1):
            features, person, target = random_training_pair(validation_data)
            output = classifier(features)

            top_v, top_i = output.topk(1, dim=1)
            correct = '✓' if top_i == target[0] else f'✗ {target[0]}'
            accuracy += 1 if top_i == target[0] else 0
            
            print(f'Person {person}')
            print(f'{top_i.item()} {correct} \n')
        else:
            print(f'Accuracy: {accuracy/epochs:.3f}')
        

In [114]:
classifier.to('cpu')
torch.save(classifier.state_dict(), './checkpoint.pth')

In [116]:
validate()

Person [885 'Sutehall, Mr. Henry Jr' 25.0 'male' 0]
0 ✓ 

Person [887 'Montvila, Rev. Juozas' 27.0 'male' 0]
0 ✓ 

Person [823 'Reuchlin, Jonkheer. John George' 38.0 'male' 0]
0 ✓ 

Person [867 'Duran y More, Miss. Asuncion' 27.0 'female' 1]
1 ✓ 

Person [855 'Carter, Mrs. Ernest Courtenay (Lilian Hughes)' 44.0 'female' 0]
1 ✗ 0 

Person [874 'Vander Cruyssen, Mr. Victor' 47.0 'male' 0]
0 ✓ 

Person [799 'Ibrahim Shawah, Mr. Yousseff' 30.0 'male' 0]
0 ✓ 

Person [796 'Otter, Mr. Richard' 39.0 'male' 0]
0 ✓ 

Person [869 'van Melkebeke, Mr. Philemon' 0.0 'male' 0]
0 ✓ 

Person [871 'Balkic, Mr. Cerin' 26.0 'male' 0]
0 ✓ 

Person [874 'Vander Cruyssen, Mr. Victor' 47.0 'male' 0]
0 ✓ 

Person [829 'McCormack, Mr. Thomas Joseph' 0.0 'male' 1]
0 ✗ 1 

Person [881 'Shelley, Mrs. William (Imanita Parrish Hall)' 25.0 'female' 1]
1 ✓ 

Person [847 'Sage, Mr. Douglas Bullen' 0.0 'male' 0]
0 ✓ 

Person [862 'Giles, Mr. Frederick Edward' 21.0 'male' 0]
0 ✓ 

Person [848 'Markoff, Mr. Marin' 35.0 '

In [103]:
def predict():
    classifier.eval()
    
    with torch.no_grad():
        prediction_data = pd.DataFrame([], columns=['PassengerId','Survived'])
        n_predictions = len(test_data) 

        for i in range(n_predictions):
            features, person = test_data[i]
            output = classifier(features)
            top_v, top_i = output.topk(1, dim=1)
            prediction_data.loc[i] = [person[0], top_i.item()]
    
    return prediction_data

        

In [118]:
prediction_data = predict()
prediction_data.to_csv(r'./titanic_submission.csv',index=False)
print(prediction_data.head())

418
  PassengerId Survived
0         892        0
1         893        1
2         894        0
3         895        0
4         896        1
