# Kaggle Titantic Data Set
[Titanic Data Set](https://www.kaggle.com/c/titanic)

## 目標
* 訓練簡單的分類器以及簡單的深度學習程式 (keras、pytorch)
* 學會使用 kaggle 平台

## 環境
* python 3
* Keras 2.1.2
* sklearn
* pandas
* tensorflow

In [None]:
import torch
print(torch.__version__)

## 準備資料集

### 下載資料集

In [None]:
!mkdir data
!kg download -c titanic
!mv *.csv data/

### 資料集長相
[10 分鐘入門 pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html)

In [69]:
import pandas as pd
train = pd.read_csv("data/train.csv")
test  = pd.read_csv("data/test.csv")

In [70]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [71]:
test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### 資料集前處理

#### 抽取特徵

In [72]:
# 刪除不需要的特徵
features = list(train.columns.values)
# Remove unwanted features
features.remove('Name')
features.remove('PassengerId')
features.remove('Survived')
print(features)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


#### 把 文字 編碼成 數字，並填上 無資料的部分

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
test['Sex'] = le.fit_transform(test['Sex'])

train['Fare'] = train['Fare'].fillna(train['Fare'].mean())
test['Fare'] = train['Fare'].fillna(train['Fare'].mean())

train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = train['Age'].fillna(train['Age'].mean())

train['Embarked'] = train['Embarked'].fillna("S")
test['Embarked'] = test['Embarked'].fillna("S")
train['Embarked'] = le.fit_transform(train['Embarked'])
test['Embarked'] = le.fit_transform(test['Embarked'])

train['Cabin'] = train['Cabin'].fillna("None")
test['Cabin'] = test['Cabin'].fillna("None")
train['Cabin'] = le.fit_transform(train['Cabin'])
test['Cabin'] = le.fit_transform(test['Cabin'])

train['Ticket'] = le.fit_transform(train['Ticket'])
test['Ticket'] = le.fit_transform(test['Ticket'])

In [74]:
# 拉出特徵資料
y = train['Survived']
x = train[list(train.columns.values)]
# test_x = test[features]

In [75]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25,random_state=32)

In [76]:
X_train.to_csv("train_set.csv")
X_test.to_csv("valid_set.csv")

In [374]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
class TitanicDataset(Dataset):
    def __init__(self, csv_path, transform = None,mode = "train"):
        self.csv = pd.read_csv(csv_path)
        self.transform = transform
        self.mode = mode
        
        self.csv['Sex'] = le.fit_transform(self.csv['Sex'])


        self.csv['Fare'] = self.csv['Fare'].fillna(self.csv['Fare'].mean())


        self.csv['Age'] = self.csv['Age'].fillna(self.csv['Age'].mean())


        self.csv['Embarked'] = self.csv['Embarked'].fillna("S")

        self.csv['Embarked'] = le.fit_transform(self.csv['Embarked'])
    

        self.csv['Cabin'] = self.csv['Cabin'].fillna("None")

        self.csv['Cabin'] = le.fit_transform(self.csv['Cabin'])


        self.csv['Ticket'] = le.fit_transform(self.csv['Ticket'])

        
    def __len__(self):
        return len(self.csv)
        
    def __getitem__(self,index):
        
        sample = {}
        if self.mode == "train":
            sample["y"] = self.csv['Survived'][index]
#             sample["y"] = torch.from_numpy(sample["y"])
#         sample["x"] = self.csv.iloc[index,1:].as_matrix()
        sample["x"] = self.csv[features].iloc[index,1:].as_matrix()
#         sample["x"] = torch.from_numpy(sample["x"])
        
        if self.transform:
            sample = self.transform(sample)
            
        return sample

In [375]:
train = TitanicDataset("./train_set.csv")
valid = TitanicDataset("./valid_set.csv")
train[1]

{'x': array([  0.        ,  29.69911765,   0.        ,   0.        ,
        238.        ,   7.75      , 123.        ,   1.        ]), 'y': 1}

In [376]:
dataloader = DataLoader(train, batch_size=12400,
                        shuffle=True, num_workers=4)
# for i_batch, sample_batched in enumerate(dataloader):
#     if (i_batch == 1):
#         print(i_batch, sample_batched)

## Pytorch

In [377]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [378]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.fc1 = nn.Linear(8,64)
        self.fc2 = nn.Linear(64,64)
        self.fc3 = nn.Linear(64,64)
        self.fc4 = nn.Linear(64,64)
        self.fc5 = nn.Linear(64,8)
        self.fc6 = nn.Linear(8,2)
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.dropout(x)
        x = F.relu(self.fc2(x))
#         x = F.relu(self.fc3(x))
#         x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
#         x = F.dropout(x)
#         x = F.relu(self.fc6(x))
        x = self.fc6(x)
        x = F.softmax(x,dim=1)
#         return F.softmax(x,dim=1)
        return torch.max(x,1)

In [379]:
import torch.optim as optim
from torch.autograd import Variable
net = Net()
print(net)
optimizer = optim.SGD(net.parameters(), lr = 0.001)
# net.cuda()
# optimizer.cuda()
criterion = nn.CrossEntropyLoss()


for i in range(1000):
    net.train()
    for i_batch, sample_batched in enumerate(dataloader):
        x = sample_batched["x"].float()
        x = Variable(x)
    #     print(x)
        y = sample_batched["y"]
        y = Variable(y).float()
#         x , y = x.cuda(),y.cuda()
        optimizer.zero_grad()
    #     print(x)
        output = net(x)
#         print(output)
#         print(output,y)
#         loss = F.nll_loss(output,y)
#         loss = criterion(output,y)
        loss = F.mse_loss(output[0],y)
#         loss = loss.cuda()

        #反向传播
        loss.backward()

        #更新参数
        optimizer.step()
#         print(loss.data[0])
#         print(output,y)
#         if  i == 1:
#             print("output is ",output,y)
#             print("loss ",loss.data[0])
    print(i,loss.data[0])
# print("output is ",output,y)
# print("loss ",loss.data[0])

Net(
  (fc1): Linear(in_features=8, out_features=64)
  (fc2): Linear(in_features=64, out_features=64)
  (fc3): Linear(in_features=64, out_features=64)
  (fc4): Linear(in_features=64, out_features=64)
  (fc5): Linear(in_features=64, out_features=8)
  (fc6): Linear(in_features=8, out_features=2)
)
0 0.561735987663269
1 0.5575776696205139
2 0.5525520443916321
3 0.5463574528694153
4 0.5381177067756653
5 0.5271546840667725
6 0.5132617354393005
7 0.49320998787879944
8 0.46695998311042786
9 0.44027334451675415
10 0.4132519066333771
11 0.38299259543418884
12 0.3525990843772888
13 0.3288706839084625
14 0.3100356459617615
15 0.295142263174057
16 0.28305819630622864
17 0.27302977442741394
18 0.2649058997631073
19 0.25826865434646606
20 0.25325891375541687
21 0.2512857913970947
22 0.2508273720741272
23 0.2506580352783203
24 0.2506104111671448
25 0.25057968497276306
26 0.2505510747432709
27 0.2505253851413727
28 0.2505001723766327
29 0.25047314167022705
30 0.25043126940727234
31 0.25035834312438965

In [380]:
net.eval()

Net(
  (fc1): Linear(in_features=8, out_features=64)
  (fc2): Linear(in_features=64, out_features=64)
  (fc3): Linear(in_features=64, out_features=64)
  (fc4): Linear(in_features=64, out_features=64)
  (fc5): Linear(in_features=64, out_features=8)
  (fc6): Linear(in_features=8, out_features=2)
)

In [381]:
dataloader = DataLoader(valid, batch_size=128,
                        shuffle=True, num_workers=4)
itea = iter(dataloader)
sample_batched = itea.next()
x = sample_batched["x"].float()
x = Variable(x)
y = sample_batched["y"].float()
y = Variable(y)
# x , y = x.cuda(),y.cuda()

output = net(x)
# loss = F.nll_loss(output,y)
loss = F.mse_loss(output[0],y)
# print(output,y,loss)

for i_batch, sample_batched in enumerate(dataloader):
    x = sample_batched["x"].float()
    x = Variable(x)
#     print(x)
    y = sample_batched["y"].float()
    y = Variable(y)
#     x , y = x.cuda(),y.cuda()
    optimizer.zero_grad()
#     print(x)
    output = net(x)
#     print(output)
#         print(output,y)
#     loss = F.nll_loss(output,y)
#         loss = criterion(output,y)
    loss = F.mse_loss(output[0],y)
#     loss = loss.cuda()

    print(loss.data[0])

0.25107279419898987
0.2610432207584381


In [382]:
test = TitanicDataset("./data/test.csv",mode="test")
dataloader = DataLoader(test, batch_size=128)

In [383]:
import numpy as np
classes = []
for i_batch, sample_batched in enumerate(dataloader):
    x = sample_batched["x"].float()
    x = Variable(x)
#     print(x)
#     y = sample_batched["y"].float()
#     y = Variable(y)
#     x , y = x.cuda(),y.cuda()
#     optimizer.zero_grad()
#     print(x)
    output = net(x)
#     _, predicted = torch.max(output, 1)
#     print(output)
    classes.append(output[1].data.numpy())
#     print(output)
#         print(output,y)
#     loss = F.nll_loss(output,y)
#         loss = criterion(output,y)
#     loss = F.mse_loss(output,y)
#     loss = loss.cuda()

#     print(loss.data[0])
# classes = np.append(classes[0],classes[1],classes[2],classes[3])
result = []
for i in classes:
    result = np.append(result,i)
print(result.astype(int))

[1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1
 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0
 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 1 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1
 0 1 0 1 0 1 1 0 1 1 1]


In [384]:
testset = pd.read_csv("./data/test.csv")
submission = pd.DataFrame({
    "PassengerId": testset["PassengerId"],
    "Survived": result.astype(int)})
print(submission)

submission.to_csv('titanic_lin.csv', index=False)

     PassengerId  Survived
0            892         1
1            893         1
2            894         1
3            895         1
4            896         1
5            897         1
6            898         1
7            899         1
8            900         1
9            901         1
10           902         1
11           903         1
12           904         0
13           905         1
14           906         0
15           907         1
16           908         1
17           909         1
18           910         1
19           911         1
20           912         0
21           913         1
22           914         1
23           915         0
24           916         0
25           917         1
26           918         0
27           919         1
28           920         1
29           921         1
..           ...       ...
388         1280         1
389         1281         1
390         1282         0
391         1283         1
392         1284         1
3

In [385]:
!kg submit titanic_lin.csv -c titanic -m "My First Titanic Pytorch output"

list indices must be integers or slices, not str
