## Kaggle Notebook

In [1]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Any results you write to the current directory are saved as output.
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

# Import the PyTorch modules used below
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR

from sklearn.model_selection import train_test_split

In [None]:
# Read the train, test and sample-submission CSVs from the working directory.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')
submission = pd.read_csv('./gender_submission.csv')

In [None]:
# Show the (rows, columns) shape of the train and test frames side by side.
df_train.shape, df_test.shape

In [None]:
# Stack the train rows on top of the test rows and renumber the index 0..n-1,
# giving one frame that is used for the exploratory cells below.
dataset = pd.concat([df_train, df_test], axis=0, ignore_index=True)
dataset.head()

In [None]:
# Print per-column dtypes and non-null counts for the combined frame.
dataset.info()

In [None]:
# Count missing values per column in the combined frame.
dataset.isnull().sum()

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Summary statistics for the numeric columns of the training frame.
df_train.describe()

In [None]:
# Summary statistics for the numeric columns of the test frame.
df_test.describe()

> ### Preprocessing

In [None]:
# Numeric columns: replace missing values with the column mean, then
# standardise with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Extract the title from each name: take the piece after the first comma,
# keep what precedes the first period in it, and trim surrounding whitespace.
dataset_title = list(map(
    lambda full_name: full_name.split(",")[1].split(".")[0].strip(),
    dataset["Name"],
))
dataset["Title"] = pd.Series(dataset_title)
dataset["Title"]

In [None]:
# For each extracted title, count the rows that have a non-null Survived value.
dataset[['Survived', 'Title']].groupby(['Title']).count()

In [None]:
class TitleSelector(BaseEstimator, TransformerMixin):
    """Replace each value of the ``Name`` column with an integer title code.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    looks every name up in ``dict_title``.
    """

    def __init__(self):
        # Title -> integer code: 1 = Mr, 2 = Miss, 3 = Mrs, 4 = Master or Dr,
        # 0 = every other listed title. Iteration order matters because
        # transform() uses the first key that matches.
        self.dict_title = {
            "Capt": 0,
            "Col": 0,
            "Don": 0,
            "Dona": 0,
            "Dr": 4,
            "Jonkheer": 0,
            "Lady": 0,
            "Major": 0,
            "Master": 4,
            "Miss": 2,
            "Mlle": 0,
            "Mme": 0,
            "Mr": 1,
            "Mrs": 3,
            "Ms": 0,
            "Rev": 0,
            "Sir": 0,
            "the Countess": 0
        }

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the transformer fits in a Pipeline."""
        return self

    def _code_for(self, name):
        """Return the code of the first ``dict_title`` key contained in ``name``.

        Raises:
            ValueError: if no key of ``dict_title`` occurs in ``name``.
        """
        # NOTE(review): this is a substring test against the whole name, in
        # dictionary order. "Mr" is listed before "Mrs", so every "Mrs." name
        # is coded 1 and code 3 is never produced; a key can also match inside
        # a surname. Switching to exact title matching would change the set of
        # codes and therefore the width of the one-hot output downstream, so
        # the matching rule is intentionally left unchanged here.
        for title, code in self.dict_title.items():
            if title in name:
                return code
        raise ValueError("no known title found in name: {!r}".format(name))

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``Name`` column holds integer title codes.

        Args:
            X: DataFrame with a ``Name`` column of strings.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            A new DataFrame with the same index and columns as ``X``; the
            input frame is left untouched.

        Raises:
            ValueError: if a name contains none of the known titles.
        """
        codes = [self._code_for(name) for name in X["Name"]]

        # Work on a copy and assign the whole column at once. The previous
        # element-wise ``X["Name"][i] = ...`` was chained assignment (a silent
        # no-op under pandas Copy-on-Write) and used the positional counter
        # ``i`` as an index label, which is only correct for a 0..n-1 index.
        encoded = X.copy()
        encoded["Name"] = codes
        return encoded

# Name column: map each name to its title code, then one-hot encode the code.
name_transformer = Pipeline([
    ('name', TitleSelector()),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Log-transform Fare on the combined frame: positive fares become log(fare),
# zero or negative fares become 0, and missing fares stay missing. Previously
# a missing fare failed the `i > 0` test and was silently turned into 0.
# NOTE(review): this cell is not idempotent (re-running it applies the log
# again), and it only changes `dataset`; the preprocessing cell further down
# reads Fare from df_train / df_test, which are not modified here.
fare = dataset["Fare"]
dataset["Fare"] = np.log(fare.where(fare > 0)).mask(fare <= 0, 0.0)

In [None]:
# Histogram with a KDE overlay of the non-missing Fare values.
g = sns.displot(dataset["Fare"].dropna(), kde=True)

In [None]:
# Preview the combined frame after the Fare transformation.
dataset.head()

In [None]:
# Remove the exploratory Title column. errors='ignore' keeps this cell
# re-runnable: if Title is already gone the drop is a no-op instead of a
# KeyError.
dataset = dataset.drop(columns=['Title'], errors='ignore')
dataset.head()

In [None]:
# Column groups routed to the three sub-pipelines.
num_cols = ["Age", "Fare"]
cat_cols = ["Pclass", "Sex", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"]
cols = [*num_cols, *cat_cols, "Name"]


# Output columns are laid out in transformer order: numeric, name, categorical.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, num_cols),
    ('name', name_transformer, ["Name"]),
    ('cat', categorical_transformer, cat_cols),
])

# Fit the preprocessing on the training frame only and keep the labels as a
# NumPy array.
train_features = df_train[cols]
X_train = preprocessor.fit_transform(train_features)
y_train = df_train["Survived"].values

In [None]:
# Shape of the encoded training matrix: (rows, encoded feature columns).
X_train.shape

In [None]:
# Encode the test frame with the preprocessor fitted on the training frame
# (transform only, no re-fit) and show the resulting shape.
X_test = preprocessor.transform(df_test[cols])
X_test.shape

### Dataset, DataLoader

In [None]:
# Hold out 10% of the training rows for validation. This is a single split,
# not cross-validation. A fixed random_state makes the split reproducible on
# re-run, and stratifying on the labels keeps the class ratio the same in
# both parts.
X_trn, X_val, y_trn, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

In [None]:
# Convert the training split to a dense array and show its shape.
X_trn.toarray().shape

In [None]:
# Display the repr of the encoded test matrix.
X_test

In [None]:
class CustomDataset(Dataset):
    """In-memory dataset of float feature rows with optional float targets.

    Args:
        x: Feature matrix. Either an object exposing ``toarray()`` (for
            example a SciPy sparse matrix) or anything ``np.asarray`` can
            turn into a numeric array.
        y: Optional array-like of targets, one per row of ``x``. When
            omitted, ``__getitem__`` yields ``0`` as a placeholder target so
            the dataset can still be consumed as ``(features, target)`` pairs.
    """

    def __init__(self, x, y=None):
        # Sparse inputs are converted via toarray(); dense inputs are used
        # directly (previously they failed because .toarray() was called
        # unconditionally).
        dense = x.toarray() if hasattr(x, "toarray") else np.asarray(x)
        self.x = torch.from_numpy(dense).to(torch.float)
        if y is None:
            self.y = None
        else:
            self.y = torch.from_numpy(np.asarray(y)).to(torch.float)

    def __getitem__(self, index):
        """Return ``(features, target)`` for ``index``; target is 0 when unlabeled."""
        if self.y is None:
            return self.x[index], 0
        return self.x[index], self.y[index]

    def __len__(self):
        """Number of rows in the feature matrix."""
        return self.x.shape[0]

In [None]:
# Wrap each split in a CustomDataset; the test split is built without targets.
train_dataset = CustomDataset(x=X_trn, y=y_trn)
valid_dataset = CustomDataset(x=X_val, y=y_val)
test_dataset = CustomDataset(x=X_test)

In [None]:
# Batches of 32 for every split. Training batches are shuffled and the final
# partial batch is dropped; validation and test keep their order and every row.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Modeling

In [None]:
class TitanicModel(nn.Module):
    """Fully connected binary classifier that outputs a probability per row.

    Three hidden blocks (Linear -> BatchNorm1d -> ReLU -> Dropout) of widths
    64, 32 and 16, followed by a single-unit Linear layer and a Sigmoid.

    Args:
        in_features: Number of input columns. Defaults to 856, the value that
            was previously hard-coded (presumably the width of the encoded
            training matrix; verify against ``X_train.shape[1]``).
        dropout: Drop probability used in every Dropout layer. Defaults to
            the previously hard-coded 0.8.
    """

    def __init__(self, in_features=856, dropout=0.8):
        super().__init__()

        # Layer order is unchanged, so state_dict keys stay compatible.
        self.model = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(True),
            nn.Dropout(dropout),
            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(True),
            nn.Dropout(dropout),

            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` probabilities."""
        return self.model(x)

In [None]:
# Seed PyTorch's global RNG so weight initialisation, dropout masks and
# DataLoader shuffling are repeatable across Restart & Run All.
torch.manual_seed(42)

model = TitanicModel()

# The model ends in a Sigmoid, so the loss takes probabilities directly.
criterion = nn.BCELoss()

optimizer = optim.Adam(model.parameters(), lr=0.02)
# Multiply the learning rate by 0.7 after every 20 scheduler steps.
scheduler = StepLR(optimizer, step_size=20, gamma=0.7)

### Training

In [None]:
# Train the model, printing loss and accuracy on the training and validation
# sets after every epoch.
epochs = 100
dry_run = False  # when True, stop after the first training epoch (smoke test)

for epoch in range(1, epochs+1):
    # ---- training ----
    model.train()
    train_loss = 0.0
    correct = 0
    seen = 0  # rows actually processed; the train loader drops the last partial batch
    for data, target in train_loader:
        target = target.view(-1, 1)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        n_rows = data.size(0)
        # criterion returns the batch mean; weight it by the batch size so
        # that dividing by `seen` below gives a true per-sample average.
        train_loss += loss.item() * n_rows
        # The model emits one probability per row. Threshold it at 0.5:
        # argmax over a single-column output is always 0.
        pred = (output >= 0.5).to(target.dtype)
        correct += pred.eq(target).sum().item()
        seen += n_rows

    train_loss /= max(seen, 1)
    print('Train Epoch: {}/{}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)'.format(
        epoch, epochs, train_loss, correct, seen,
        100. * correct / max(seen, 1)))
    if dry_run:
        break

    # ---- validation ----
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in valid_loader:
            target = target.view(-1, 1)
            output = model(data)
            # Same per-sample weighting as in the training loop.
            test_loss += criterion(output, target).item() * data.size(0)
            pred = (output >= 0.5).to(target.dtype)
            correct += pred.eq(target).sum().item()

    n_valid = len(valid_loader.dataset)
    test_loss /= max(n_valid, 1)

    print('Valid set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, n_valid,
        100. * correct / max(n_valid, 1)))

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

### Prediction

In [None]:
# Predict a 0/1 label for every test row. Make sure dropout and batch norm
# are in inference mode even if the training cell stopped early.
model.eval()
predictions = []
with torch.no_grad():
    for data, _ in test_loader:
        output = model(data)
        # Threshold the single probability at 0.5: argmax over a one-column
        # output is always 0, which made every prediction 0.
        pred = (output >= 0.5).long()
        predictions += pred.reshape(-1).tolist()


In [None]:
# Build the submission table: PassengerId from the test frame next to the
# predicted Survived label for the same row.
df_pred = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": predictions,
})
df_pred.head()

In [None]:
# Write the predictions to submission.csv in the working directory, without
# the index column.
df_pred.to_csv("submission.csv", index=False)