# Titanic

## Preapare Dataset

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch

# Read the data from csv using pandas
df_train = pd.read_csv('./train.csv')
df_test  = pd.read_csv('./test.csv')
df_sub   = pd.read_csv('./gender_submission.csv')

# drop useless info
df_train.drop(['Name','Ticket','Cabin','PassengerId'],axis=1,inplace=True) # inplace表示不产生副本，直接执行
df_test.drop( ['Name','Ticket','Cabin','PassengerId'],axis=1,inplace=True)

# 指标处理
sex      = pd.get_dummies(df_train['Sex'],drop_first=True) # 指标化处理
embark   = pd.get_dummies(df_train['Embarked'],drop_first=True)
df_train = pd.concat([df_train,sex,embark],axis=1)

df_train.drop(['Sex','Embarked'],axis=1,inplace=True) # 删除Sex和Embarked项

sex     = pd.get_dummies(df_test['Sex'],drop_first=True)
embark  = pd.get_dummies(df_test['Embarked'],drop_first=True)
df_test = pd.concat([df_test,sex,embark],axis=1)

df_test.drop(['Sex','Embarked'],axis=1,inplace=True)

# 填充NaN
df_train.fillna(df_train.mean(),inplace=True)
df_test.fillna(df_test.mean(),inplace=True)

# 取index
train_columns = df_train.columns
test_columns  = df_test.columns


df_train.columns = train_columns
df_test.columns  = test_columns

features = df_train.iloc[:,2:].columns.tolist()
target   = df_train.loc[:, 'Survived'].name

X_train = df_train.iloc[:,2:].values
y_train = df_train.loc[:, 'Survived'].values

In [52]:
df_train.to_csv("./train_x.csv",index=False)
df_test.to_csv("./test_y.csv",index=False)

In [57]:
csv_path="./train_x.csv"
class Titanicdata(Dataset):
    def __init__(self, csv_path, transforms=None):
        """
        Args:
            csv_path (string): csv 文件路径
            height (int): 图像高度
            width (int): 图像宽度
            transform: transform 操作
        """
        self.data = pd.read_csv(csv_path)
        self.y_data = torch.from_numpy(np.asarray(self.data.iloc[:, 0]))
        self.x_data = torch.from_numpy(np.asarray(self.data.iloc[:, 0:]))
        self.len=y_data.shape[0]
 
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index] # 返回对应样本即可
 
    def __len__(self):
        return self.len

dataset = Titanicdata('./train_x.csv')
train_loader = DataLoader(dataset=dataset, 
                          batch_size=32, 
                          shuffle=True, 
                          num_workers=2)

## Design model using Class

In [58]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable

class TitanicNet(torch.nn.Module):
    def __init__(self):
        super(TitanicNet, self).__init__()
        self.fc1 = torch.nn.Linear(8, 512)
        self.fc2 = torch.nn.Linear(512, 512)
        self.fc3 = torch.nn.Linear(512, 2)
        self.dropout = torch.nn.Dropout(0.2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
    
model = TitanicNet()
print(model)

TitanicNet(
  (fc1): Linear(in_features=8, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


## Construct loss and optimizer

In [59]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

## Training Cycle

In [60]:
len(x_data)

891

In [61]:
batch_size = 64
n_epochs = 500
batch_no = len(x_data) // batch_size

train_loss = 0
train_loss_min = np.Inf
for epoch in range(n_epochs):
    for i in range(batch_no):
        start = i * batch_size
        end   = start + batch_size
        x_var = Variable(torch.FloatTensor(x_data[start:end]))
        y_var = Variable(torch.LongTensor(y_data[start:end])) 
        
        optimizer.zero_grad()
        output = model(x_var)
        loss   = criterion(output,y_var)
        loss.backward()
        optimizer.step()
        
        values, labels = torch.max(output, 1)
        num_right   = np.sum(labels.data.numpy() == y_train[start:end])
        train_loss += loss.item()*batch_size
    
    train_loss = train_loss / len(X_train)
    if train_loss <= train_loss_min:
        print("Validation loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min,train_loss))
        torch.save(model.state_dict(), "model.pt")
        train_loss_min = train_loss
    
    if epoch % 200 == 0:
        print('')
        print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}".format(epoch+1, train_loss,num_right / len(y_train[start:end]) ))
print('Training Ended! ')

Validation loss decreased (   inf ===> 2.835552). Saving the model...

Epoch: 1 	Train Loss: 2.835552074291088 	Train Accuracy: 0.328125
Validation loss decreased (2.835552 ===> 0.603854). Saving the model...
Validation loss decreased (0.603854 ===> 0.587723). Saving the model...
Validation loss decreased (0.587723 ===> 0.582707). Saving the model...
Validation loss decreased (0.582707 ===> 0.578575). Saving the model...
Validation loss decreased (0.578575 ===> 0.574395). Saving the model...
Validation loss decreased (0.574395 ===> 0.571202). Saving the model...
Validation loss decreased (0.571202 ===> 0.569773). Saving the model...
Validation loss decreased (0.569773 ===> 0.565235). Saving the model...
Validation loss decreased (0.565235 ===> 0.560236). Saving the model...
Validation loss decreased (0.560236 ===> 0.558283). Saving the model...
Validation loss decreased (0.558283 ===> 0.552859). Saving the model...
Validation loss decreased (0.552859 ===> 0.550355). Saving the model...

## Predict

In [64]:
X_test     = df_test.iloc[:,0:].values
X_test_var = Variable(torch.FloatTensor(X_test), requires_grad=False) 
with torch.no_grad():
    test_result = model(X_test_var)
values, labels = torch.max(test_result, 1)
survived = labels.data.numpy()

In [65]:
submission = pd.DataFrame({'PassengerId': df_sub['PassengerId'], 'Survived': survived})
submission.to_csv('submission.csv', index=False)

In [72]:
labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,