# Spaceship Titanic

- 두번째 시도

## Import modules

In [None]:
import os
from datetime import datetime
from zipfile import ZipFile
from io import BytesIO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression

## Set envs

In [None]:
# Input/output locations; every run writes into its own timestamped folder
# underneath PATH_OUTPUT.
PATH_INPUT = './data/in/'
PATH_OUTPUT = './data/out/'
NOW_STR = f'{datetime.now():%Y%m%d_%H%M%S}'
PATH_OUTPUT_NOW = f'{PATH_OUTPUT}{NOW_STR}/'

## Check Data

### Get Data

In [None]:
# Read the three competition CSVs straight out of the zip archive, without
# extracting them to disk.
with ZipFile(PATH_INPUT + 'spaceship-titanic.zip') as archive:
    print(archive.namelist())

    df_train, df_test, df_sample = (
        pd.read_csv(BytesIO(archive.read(member)))
        for member in ('train.csv', 'test.csv', 'sample_submission.csv')
    )

### Print Data

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Preview the first rows of the sample submission file.
df_sample.head()

- 목표<br>
3개 데이터를 비교한 결과 PassengerId는 key값이며 다른 여러 값들을 학습시켜 Transported를 유추해야함을 파악할 수 있다.<br>
학습 데이터가 주어져 있고, 종속변수는 Transported 하나이며 그 값은 True/False 두 가지로 나뉜다.<br>
따라서 이는 `이진분류`문제이다.<br><br>--> 활성화함수를 Sigmoid로 사용하여 학습하는 것이 좋겠다<br>

## Data Preprocessing

In [None]:
# Summary statistics of the training data.
df_train.describe()

In [None]:
# Column dtypes and non-null counts of the training data.
df_train.info()

### Delete Null Data

In [None]:
# Number of missing values in each training column.
df_train.isnull().sum()

- HomePlanet,CryoSleep,Cabin,Destination,VIP: null -> delete

In [None]:
# Rows with a missing value in any of these columns are removed from the
# training frame (in place) instead of being imputed.
required_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
df_train.dropna(axis=0, how='any', subset=required_cols, inplace=True)

- Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck: null -> fill with the most frequent value (mode)

In [None]:
# Impute missing values in the numeric columns with each column's most
# frequent value (mode).
columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in columns:
    most_freq = df_train[col].mode()[0]
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on the `df_train[col]` selection is chained
    # assignment: pandas deprecates it, and under Copy-on-Write it leaves
    # df_train unchanged.
    df_train[col] = df_train[col].fillna(most_freq)

### Data Bias

In [None]:
# Bar chart of how many training rows carry each distinct Transported value.
x = df_train['Transported'].unique()
y = [int((df_train['Transported'] == label).sum()) for label in x]
plt.bar(x, y)
plt.title('Transported')
plt.show()

### String Data

In [None]:
# For each text column, print a centered header followed by its distinct
# values and how many there are.
non_num = ['HomePlanet', 'Cabin', 'Destination']
for col in non_num:
    distinct = df_train[col].unique()
    print(f"{col:=^40}")
    print(f'{distinct}  ({len(distinct)})')

-> HomePlanet, Destination : One-hot Encoding

In [None]:
# One-hot encode HomePlanet and Destination; each is replaced by indicator
# columns named "<column>_<value>".
df_train = pd.get_dummies(df_train, columns=['HomePlanet', 'Destination'])

-> Cabin: Label Encoding

In [None]:
# Label-encode Cabin: each distinct cabin string is numbered by its position
# in the array of unique training values.
cabin_uniq = df_train['Cabin'].unique()
vocab = {"Cabin": {cabin: code for code, cabin in enumerate(cabin_uniq)}}

# The mapping stays in `vocab` so the same codes can be looked up again later.
df_train['Cabin'] = df_train['Cabin'].map(vocab['Cabin'].__getitem__)

-> CryoSleep, VIP, Transported: Label Encoding (True: 1, False: 0)

In [None]:
# Cast the True/False columns to integers (True -> 1, False -> 0).
for flag_col in ("CryoSleep", "VIP", "Transported"):
    df_train[flag_col] = df_train[flag_col].astype(int)

In [None]:
# Display the training frame after encoding.
df_train

## Train

### Set Input data

In [None]:
# Split the columns into the target and the features fed to the model.
exception_cols = ['PassengerId', 'Name']  # left out of the feature set
dependants = ['Transported']
excluded = set(dependants) | set(exception_cols)
independents = [name for name in df_train.columns if name not in excluded]
# 'CryoSleep', 'Cabin', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e'

X = df_train.loc[:, independents]
Y = df_train.loc[:, dependants]

In [None]:
# Shapes of the feature frame and the target frame.
X.shape, Y.shape

### Build model

In [None]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

### Train model

In [None]:
# fit() expects a 1-D target. Y is a single-column DataFrame, so flatten it;
# passing the column vector makes scikit-learn emit a DataConversionWarning.
# Note: LogisticRegression.fit returns the fitted estimator itself, so
# `history` is the model, not a per-epoch training log.
history = model.fit(X, np.ravel(Y))

## Result

### Model history graph

In [None]:
def plot_graphs(history, string):
    """Plot a recorded metric per epoch, plus its validation curve if present.

    Args:
        history: An object exposing a ``history`` dict that maps metric names
            to per-epoch values (Keras-style), or such a dict itself.
        string: Name of the metric to plot, e.g. ``'loss'``. The validation
            curve is looked up under ``'val_' + string``.

    Returns:
        None. When nothing is recorded under ``string`` (for example when
        ``history`` is an estimator without a training log), a notice is
        printed and nothing is drawn.
    """
    records = getattr(history, 'history', history)
    if not isinstance(records, dict) or string not in records:
        # Nothing to draw: report it rather than failing with an
        # AttributeError/KeyError in the middle of the notebook.
        print(f"plot_graphs: no '{string}' history available to plot")
        return

    val_key = 'val_' + string
    plt.plot(records[string])
    labels = [string]
    # The validation curve only exists when validation data was used.
    if val_key in records:
        plt.plot(records[val_key], '')
        labels.append(val_key)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(labels)
    plt.show()

In [None]:
# Plot the 'loss' curve from the training result.
# NOTE(review): `history` is the return value of LogisticRegression.fit, i.e.
# the estimator itself, which has no Keras-style `.history` — confirm this
# cell is still wanted.
plot_graphs(history, 'loss')

In [None]:
# Plot the 'accuracy' curve from the training result.
# NOTE(review): `history` is the return value of LogisticRegression.fit, i.e.
# the estimator itself, which has no Keras-style `.history` — confirm this
# cell is still wanted.
plot_graphs(history, 'accuracy')

### Save result data

In [None]:
# Persist the fitted estimator for this run.
# The model is a scikit-learn LogisticRegression, which has no Keras-style
# load_weights()/save() methods, so it is serialized with pickle instead.
import pickle

data_out_path = f'./data/out/{NOW_STR}/'
model_dir = os.path.join(data_out_path, "models")

# Creates the run folder and the "models" subfolder in one call; exist_ok
# makes re-running the cell harmless.
os.makedirs(model_dir, exist_ok=True)

with open(os.path.join(model_dir, "model.pkl"), "wb") as fp:
    pickle.dump(model, fp)

## Validation

### Predict

In [None]:
# Preview the test data before preprocessing.
df_test.head()

### Preprocess test data

- HomePlanet, Destination: One-hot Encoding

In [None]:
# One-hot encode HomePlanet and Destination in the test data.
# NOTE(review): the indicator columns produced depend on the values present in
# df_test — confirm they match the columns generated for the training data.
df_test = pd.get_dummies(df_test, columns=['HomePlanet', 'Destination'])

- Cabin: Label Encoding

In [None]:
# Reuse the vocabulary built during training. Cabins that never appeared in
# training are appended with the next free code; numbering them by their
# enumerate() position would restart at 0 and collide with codes that are
# already assigned to training cabins.
cabin_vocab = vocab['Cabin']
for val in df_test['Cabin'].dropna().unique():
    if val not in cabin_vocab:
        # Assumes the existing codes are the dense range 0..len-1, as produced
        # by enumerate() when the vocabulary was first built.
        cabin_vocab[val] = len(cabin_vocab)

# Missing cabins have no vocabulary entry; they get the reserved code -1 so
# the column stays integer-typed.
df_test['Cabin'] = df_test['Cabin'].map(cabin_vocab).fillna(-1).astype(int)

- CryoSleep, VIP: Label Encoding (True: 1, False: 0)

In [None]:
# Encode True/False as 1/0. Missing values are filled with the most frequent
# value of the same column in the (already encoded) training data; a plain
# truthiness test would turn NaN into 1, because NaN is truthy.
for flag_col in ("CryoSleep", "VIP"):
    fallback = int(df_train[flag_col].mode()[0])
    df_test[flag_col] = (
        df_test[flag_col].map({True: 1, False: 0}).fillna(fallback).astype(int)
    )

In [None]:
# Preview the test data after encoding.
df_test.head()

In [None]:
# Build the test feature matrix with the same columns used for training.
X_test = df_test[independents].copy()

# scikit-learn rejects NaN inputs, so fill any remaining gaps with the most
# frequent training value of that column, mirroring the training imputation.
for feature in X_test.columns[X_test.isna().any()]:
    X_test[feature] = X_test[feature].fillna(X[feature].mode()[0])

# LogisticRegression.predict takes only the feature matrix; there is no
# batch_size argument (and no `hyper_param` is defined in this notebook).
predictions = model.predict(X_test)

#### Export prediction to csv file

In [None]:
# Build the submission table: the passenger key plus the predicted label.
# The model was trained on Transported encoded as 1/0, so convert the
# predictions back to True/False.
# NOTE(review): column names presumably must match sample_submission.csv
# (PassengerId, Transported) — confirm against df_sample.columns.
output = pd.DataFrame({
    "PassengerId": df_test['PassengerId'].to_list(),
    "Transported": np.asarray(predictions).ravel().astype(bool),
})
output_dir = os.path.join(PATH_OUTPUT_NOW, "predict")
os.makedirs(output_dir, exist_ok=True)

# index=False keeps the pandas row index out of the CSV.
output.to_csv(os.path.join(output_dir, 'predict.csv'), index=False)