In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        file_path = os.path.join(root, fname)
        print(file_path)

## データ読み込み

In [None]:
# Training data: read the CSV and discard every row that contains a NaN.
train = pd.read_csv('../input/spaceship-titanic/train.csv').dropna()

# Test data (left untouched here; its missing values are handled later).
test = pd.read_csv('../input/spaceship-titanic/test.csv')

# Submission template.
sample_submission = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")

# Row counts of the two frames, as a quick sanity check.
print(len(train), len(test))

## データ統合

In [None]:
# Stack the training rows on top of the test rows so that the feature
# transformations that follow are applied to both sets in one pass and the two
# end up with the same columns.
data = pd.concat((train, test), axis=0, sort=False)
data.tail(10)

## 特徴量エンジニアリング

In [None]:
# Replace string / boolean categories with integer codes.
# Each result is assigned back to its column: calling
# `data[col].replace(..., inplace=True)` is chained assignment through an
# inplace method, which pandas warns about and which no longer updates `data`
# once Copy-on-Write is in effect.
data['HomePlanet'] = data['HomePlanet'].replace(['Earth', 'Europa', 'Mars'], [0, 1, 2])
data['Destination'] = data['Destination'].replace(['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22'], [0, 1, 2])
data['Transported'] = data['Transported'].replace([False, True], [0, 1])
data['CryoSleep'] = data['CryoSleep'].replace([False, True], [0, 1])
data['VIP'] = data['VIP'].replace([False, True], [0, 1])

In [None]:
# Break "Cabin" apart on "/" and store the three pieces in new columns
# "deck", "num" and "side".
cabin_parts = data['Cabin'].str.split('/', expand=True)
data[['deck', 'num', 'side']] = cabin_parts

# The raw Cabin column is no longer needed once it has been split.
data = data.drop(columns=['Cabin'])

# Swap the deck and side letters for integer codes.
deck_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
side_codes = {'P': 0, 'S': 1}
data['deck'] = data['deck'].replace(deck_codes)
data['side'] = data['side'].replace(side_codes)

data.head()

In [None]:
# Keep only the surname: the second space-separated token of "Name".
data['Name'] = data['Name'].str.split(' ').str[1]

# Encode each surname as its relative frequency over all rows of `data`
# (occurrences / total number of rows); rows with no surname get 0.
# value_counts + map does this in one pass, replacing a per-row list.count()
# scan, and the result is assigned back to the column rather than written
# through a chained `replace(..., inplace=True)`, which does not update `data`
# under pandas Copy-on-Write.
surname_counts = data['Name'].value_counts()
data['Name'] = data['Name'].map(surname_counts).fillna(0) / len(data)
data.head()

In [None]:
# Keep only the second "_"-separated token of PassengerId.
passenger_id_tokens = data['PassengerId'].str.split('_')
data['PassengerId'] = passenger_id_tokens.str.get(1)

In [None]:
# Cast every column of the frame to float, then print the resulting
# dtypes and non-null counts.
data = data.astype(float)
data.info()

In [None]:
# Fill every remaining missing value with the mean of its column.
# NOTE(review): the original comment described this as filling with the mode,
# but the code uses the mean — confirm which one is intended.
column_means = data.mean()
data = data.fillna(column_means)
data.head(10)

In [None]:
# Undo the earlier concatenation by position: the first len(train) rows of
# `data` are the training rows, everything after them is the test set.
n_train = len(train)
train = data.iloc[:n_train]
test = data.iloc[n_train:]

# Target column and feature matrices for the model.
Y_train = train['Transported']
X_train = train.drop(columns='Transported')
X_test = test.drop(columns='Transported')

In [None]:
# Import TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Size the input layer from the feature frame instead of hard-coding 15, so the
# network stays consistent with X_train if a feature is added or removed.
n_features = X_train.shape[1]

# Small fully connected binary classifier: 16 -> 8 -> 1 units, with a sigmoid
# output so predictions fall in [0, 1].
# NOTE(review): no random seed is set, so results vary between runs.
model = keras.Sequential()
model.add(keras.Input(shape=(n_features,)))
model.add(keras.layers.Dense(16, bias_initializer='zeros', activation='softsign'))
model.add(keras.layers.Dense(8, bias_initializer='zeros', activation='tanh'))
model.add(keras.layers.Dense(1, bias_initializer='zeros', activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=128)

In [None]:
# Run the fitted model on the test features and show the first 20 outputs.
y_pred = model.predict(x=X_test)
y_pred[:20]

In [None]:
# Convert a value to bool
def booling(n):
    """Return True when ``n`` is greater than or equal to 0.5, otherwise False."""
    return bool(n >= 0.5)
# `sub` is the same object as `sample_submission`, so the template frame itself
# receives the predicted column.
sub = sample_submission
# Threshold each prediction with booling() to get the boolean labels.
sub['Transported'] = list(map(booling, y_pred))
# Write the submission file without the index column and preview it.
sub.to_csv("submission.csv", index=False)
sub.head(10)