In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch.nn as nn
from sklearn import datasets

In [2]:
# Read the passenger train/test CSVs from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Both frames live in one list so each preprocessing step can be applied to
# train and test alike. The list stores references, so the frames must only be
# mutated in place (never rebound) for later loops to keep seeing them.
train_test_data = [train, test]

# Pull the honorific out of each name: the run of letters that follows a space
# and ends with a period. The pattern is a raw string so that `\.` reaches the
# regex engine as-is instead of being an invalid Python string escape.
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Keep the three most common titles and fold every other one into "etc".
title_mapping = {"Mr": "Mr", "Miss": "Miss", "Mrs": "Mrs", 
                 "Master": "etc", "Dr": "etc", "Rev": "etc", "Col": "etc", "Major": "etc", "Mlle": "etc","Countess": "etc",
                 "Ms": "etc", "Lady": "etc", "Jonkheer": "etc", "Don": "etc", "Dona" : "etc", "Mme": "etc","Capt": "etc","Sir": "etc" }
for dataset in train_test_data:
    # A title that is absent from the mapping (or a name with no title) would
    # map to NaN and end up as a NaN model input, so it falls back to "etc".
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna("etc")

# The raw name is no longer needed once the title is extracted; drop it in
# place so the references held by train_test_data stay valid.
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

In [4]:
# Keep the test passenger ids before the column is removed.
test_PassengerId = test["PassengerId"]

# Ticket and PassengerId are removed in place from both frames.
for frame in (train, test):
    frame.drop(columns=['Ticket', 'PassengerId'], inplace=True)

In [5]:
# Deck letter -> group code.
cabin_mapping = {"A": 0, "B": 0, "C": 0, "D": 0, "E": 0, "T": 0, "F": 1, "G": 1}

# Reduce every cabin code to its first character (the deck letter) and map it,
# for BOTH frames. Taking the first letter for train only would leave full
# codes such as "C85" in test; those never match a mapping key, so every test
# cabin would turn into NaN and be encoded differently from train.
for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1].map(cabin_mapping)

In [6]:
# Fill missing values.
# Every result is assigned back to its column: `frame[col].fillna(..., inplace=True)`
# works on an intermediate object and does not update the frame under pandas
# Copy-on-Write (it already emits a warning on recent pandas 2.x releases).

# Cabin: rows without a cabin code get 1 for first-class passengers, 0 otherwise.
train['Cabin'] = train['Cabin'].fillna((train['Pclass'] == 1) * 1)
test['Cabin'] = test['Cabin'].fillna((test['Pclass'] == 1) * 1)

# Age: median age of the rows sharing the same Title, computed per frame.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

# Fare: a fare of 0 is treated as missing ...
train.replace({"Fare": {0: np.nan}}, inplace=True)
test.replace({"Fare": {0: np.nan}}, inplace=True)

# ... and then filled with the median fare of the same passenger class.
train["Fare"] = train["Fare"].fillna(train.groupby("Pclass")["Fare"].transform("median"))
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

# Embarked: missing ports default to 'S'.
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [7]:
# Binning: replace the continuous Age and Fare values with ordinal bucket codes.
# Buckets are right-closed, i.e. (lower, upper], and codes count up from 0.
age_bin_edges = [float('-inf'), 10, 18, 22, 34, 46, 62, float('inf')]
fare_bin_edges = [float('-inf'), 10, 30, 100, float('inf')]

for dataset in train_test_data:
    for column, bin_edges in (('Age', age_bin_edges), ('Fare', fare_bin_edges)):
        bucket_codes = pd.cut(dataset[column], bins=bin_edges, labels=False)
        # Store the codes as floats (0.0, 1.0, ...); missing values stay NaN.
        dataset[column] = bucket_codes.astype(float)

In [8]:
# A model cannot be trained on raw strings, so every value is mapped to a number.
sex_mapping = dict(male=0, female=1)
for frame in train_test_data:
    frame['Sex'] = frame['Sex'].map(sex_mapping)

In [9]:
# Map the port of embarkation to a number: S -> 0, C -> 1, Q -> 2.
embarked_mapping = {port: code for code, port in enumerate(("S", "C", "Q"))}
for frame in train_test_data:
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

In [10]:
# Map the grouped title strings to integer codes: Mr -> 0, Miss -> 1, Mrs -> 2, etc -> 3.
title_mapping = {title: code for code, title in enumerate(("Mr", "Miss", "Mrs", "etc"))}
for frame in train_test_data:
    frame['Title'] = frame['Title'].map(title_mapping)

In [11]:
# Preview the first five rows of the encoded training frame.
train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,0,2.0,1,0,0.0,0.0,0,0
1,1,1,1,4.0,1,0,2.0,0.0,1,2
2,1,3,1,3.0,0,0,0.0,0.0,0,1
3,1,1,1,4.0,1,0,2.0,0.0,0,2
4,0,3,0,4.0,0,0,0.0,0.0,0,0


In [12]:
# Preview the first five rows of the encoded test frame.
test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,3,0,4.0,0,0,0.0,0.0,2,0
1,3,1,5.0,1,0,0.0,0.0,0,2
2,2,0,5.0,0,0,0.0,0.0,2,0
3,3,0,3.0,0,0,0.0,0.0,0,0
4,3,1,2.0,1,1,1.0,0.0,0,2


In [13]:
# Feature matrix: every training column except the Survived target.
train_data = train.drop(columns=['Survived'])
train_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,3,0,2.0,1,0,0.0,0.0,0,0
1,1,1,4.0,1,0,2.0,0.0,1,2
2,3,1,3.0,0,0,0.0,0.0,0,1
3,1,1,4.0,1,0,2.0,0.0,0,2
4,3,0,4.0,0,0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...
886,2,0,3.0,0,0,1.0,0.0,0,3
887,1,1,2.0,0,0,1.0,0.0,0,1
888,3,1,2.0,1,2,1.0,0.0,0,1
889,1,0,3.0,0,0,1.0,0.0,1,0


In [14]:
# Target vector: the Survived column of the training frame.
train_label = train.loc[:, 'Survived']
train_label

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [15]:
# Network dimensions: number of feature columns, hidden units, output classes.
input_size, hidden_size, num_classes = 9, 56, 2
# Optimisation settings: number of training passes and optimizer learning rate.
num_epochs, learning_rate = 100, 0.01

In [16]:
class NeuralNet(nn.Module):
    """Fully connected classifier with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Define the forward path; returns the raw output of the last linear layer."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [17]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split of the labelled data (fixed random_state, split
# proportions left at the scikit-learn defaults).
split_parts = train_test_split(
    train_data,
    train_label,
    random_state=42,
    stratify=train_label,
)

# train_test_split returns [X_train, X_test, y_train, y_test]; features become
# float tensors and labels become long tensors.
X_train, X_test = (torch.FloatTensor(part.values) for part in split_parts[:2])
y_train, y_test = (torch.LongTensor(part.values) for part in split_parts[2:])

In [18]:
# Seed PyTorch before the model is built so the random weight initialisation,
# and therefore the loss curve and the accuracy reported afterwards, are the
# same on every Restart & Run All.
torch.manual_seed(42)

# Instantiate the network defined above. It runs on the CPU; to use a GPU,
# move both the model and the tensors with `.to(device)`.
model = NeuralNet(input_size, hidden_size, num_classes)

# CrossEntropyLoss applies the softmax internally, so the model outputs raw scores.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Loss value of every epoch, kept for later inspection.
loss_list = []

model.train()
for epoch in range(num_epochs):
    # Full-batch forward pass: the whole training split is fed at once.
    pred = model(X_train)
    loss = loss_function(pred, y_train)

    loss_list.append(loss.item())

    # Clear gradients left from the previous step, back-propagate, update weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch : [{epoch + 1}/{num_epochs}], Loss : {loss.item():.4f}")

Epoch : [1/100], Loss : 0.6865
Epoch : [2/100], Loss : 0.6133
Epoch : [3/100], Loss : 0.5793
Epoch : [4/100], Loss : 0.5414
Epoch : [5/100], Loss : 0.5095
Epoch : [6/100], Loss : 0.4926
Epoch : [7/100], Loss : 0.4808
Epoch : [8/100], Loss : 0.4665
Epoch : [9/100], Loss : 0.4538
Epoch : [10/100], Loss : 0.4478
Epoch : [11/100], Loss : 0.4452
Epoch : [12/100], Loss : 0.4411
Epoch : [13/100], Loss : 0.4363
Epoch : [14/100], Loss : 0.4342
Epoch : [15/100], Loss : 0.4344
Epoch : [16/100], Loss : 0.4341
Epoch : [17/100], Loss : 0.4323
Epoch : [18/100], Loss : 0.4304
Epoch : [19/100], Loss : 0.4289
Epoch : [20/100], Loss : 0.4267
Epoch : [21/100], Loss : 0.4234
Epoch : [22/100], Loss : 0.4204
Epoch : [23/100], Loss : 0.4181
Epoch : [24/100], Loss : 0.4164
Epoch : [25/100], Loss : 0.4146
Epoch : [26/100], Loss : 0.4130
Epoch : [27/100], Loss : 0.4121
Epoch : [28/100], Loss : 0.4115
Epoch : [29/100], Loss : 0.4107
Epoch : [30/100], Loss : 0.4099
Epoch : [31/100], Loss : 0.4092
Epoch : [32/100],

In [19]:
# Score the model on the held-out split.
# eval() is a no-op for this architecture, but keeps the evaluation correct if
# dropout or batch-norm layers are ever added to the network.
model.eval()
with torch.no_grad():  # inference only, so gradient tracking is switched off
    # One batched forward pass replaces the per-row Python loop; argmax over
    # the class dimension picks the higher-scoring class for each row.
    predicted = model(X_test).argmax(dim=1)

    correct = (predicted == y_test).sum().item()
    total = y_test.size(0)

    # The data is a tabular validation split, not images.
    print(f"Accuracy of the Network on the Validation Set : {100*correct/total}%")

Accuracy of the Network on the Test Images : 80.71748878923766%


### Submit

In [20]:
# Convert the full feature frame, the test frame and the labels to tensors.
# The names are rebound from DataFrame/Series to tensors, so this cell cannot
# be run a second time without re-running the cells that build the frames.
train_data, test = (
    torch.FloatTensor(frame.values) for frame in (train_data, test)
)
train_label = torch.LongTensor(train_label.values)

In [21]:
# Seed PyTorch before the model is built so the random weight initialisation,
# and therefore the submitted predictions, are the same on every run.
torch.manual_seed(42)

# Fresh network for the final fit. It runs on the CPU; to use a GPU, move both
# the model and the tensors with `.to(device)`.
model = NeuralNet(input_size, hidden_size, num_classes)

# CrossEntropyLoss applies the softmax internally, so the model outputs raw scores.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.train()
for epoch in range(num_epochs):
    # Full-batch forward pass over all labelled rows.
    pred = model(train_data)
    loss = loss_function(pred, train_label)

    # Clear gradients left from the previous step, back-propagate, update weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch : [{epoch + 1}/{num_epochs}], Loss : {loss.item():.4f}")

Epoch : [1/100], Loss : 0.7175
Epoch : [2/100], Loss : 0.6662
Epoch : [3/100], Loss : 0.6215
Epoch : [4/100], Loss : 0.5814
Epoch : [5/100], Loss : 0.5522
Epoch : [6/100], Loss : 0.5265
Epoch : [7/100], Loss : 0.5025
Epoch : [8/100], Loss : 0.4841
Epoch : [9/100], Loss : 0.4716
Epoch : [10/100], Loss : 0.4611
Epoch : [11/100], Loss : 0.4530
Epoch : [12/100], Loss : 0.4487
Epoch : [13/100], Loss : 0.4461
Epoch : [14/100], Loss : 0.4435
Epoch : [15/100], Loss : 0.4423
Epoch : [16/100], Loss : 0.4409
Epoch : [17/100], Loss : 0.4381
Epoch : [18/100], Loss : 0.4350
Epoch : [19/100], Loss : 0.4325
Epoch : [20/100], Loss : 0.4292
Epoch : [21/100], Loss : 0.4258
Epoch : [22/100], Loss : 0.4233
Epoch : [23/100], Loss : 0.4210
Epoch : [24/100], Loss : 0.4187
Epoch : [25/100], Loss : 0.4172
Epoch : [26/100], Loss : 0.4161
Epoch : [27/100], Loss : 0.4147
Epoch : [28/100], Loss : 0.4134
Epoch : [29/100], Loss : 0.4119
Epoch : [30/100], Loss : 0.4100
Epoch : [31/100], Loss : 0.4083
Epoch : [32/100],

In [22]:
# Predict a class for every row of the test tensor.
with torch.no_grad():  # inference only, so gradient tracking is switched off
    # One batched forward pass; argmax over the class dimension yields the
    # predicted label per row, converted to a plain list of Python ints.
    prediction = model(test).argmax(dim=1).tolist()

len(prediction)

418

In [25]:
# Load the existing gender_submission.csv to inspect its layout.
submission = pd.read_csv(filepath_or_buffer='../data/gender_submission.csv')
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [26]:
# Build the submission from the passenger ids saved off the test frame, so each
# prediction is paired with its own id instead of relying on the row order of
# a separately loaded file. The DataFrame constructor raises if the two
# lengths disagree.
submission = pd.DataFrame({
    "PassengerId": test_PassengerId.to_numpy(),
    "Survived": prediction,
})

# Write to a separate file: saving over gender_submission.csv would destroy
# the input file that an earlier cell reads.
submission.to_csv('../data/submission.csv', index=False)