In [None]:
#匯入套件
import numpy as np #線性運算
import pandas as pd #資料處理
import tensorflow as tf #建立類神經網路
import plotly.express as px #畫圖
import plotly.graph_objects as go
from plotly.offline import iplot
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [None]:
pd.set_option('display.max_rows', None)  # do not truncate rows when a DataFrame is displayed
pd.set_option('display.max_columns', None)  # do not truncate columns when a DataFrame is displayed
# Shared Plotly template (font + figure width) passed to every figure's update_layout below.
# The font family was previously misspelled as 'Frankling Gothic', so the intended
# font could never be matched by the renderer.
_template = dict(layout=go.Layout(font=dict(family='Franklin Gothic', size=12), width=1000))
print(tf.__version__)

In [None]:
# Load the training CSV into a DataFrame.
raw_data = pd.read_csv(filepath_or_buffer='../input/spaceship-titanic/train.csv')
# Preview the first five rows.
raw_data.head(n=5)

In [None]:
# Drop the identifier columns ('PassengerId', 'Name'); every other column,
# including the 'Transported' target, stays in train_dataset.
train_dataset = raw_data.drop(columns=['PassengerId', 'Name'])
# Keep a handle on the target column.
y_train = train_dataset.loc[:, 'Transported']

In [None]:
# Show the last 10 rows.
train_dataset.iloc[-10:]

In [None]:
# Summary statistics, transposed so each feature is a row, rounded to 2 decimals.
summary_stats = train_dataset.describe()
summary_stats.T.round(2)

In [None]:
# Number of missing values in each column.
train_dataset.isna().sum()

# Visualization of Feature Distributions

In [None]:
# Box plot of Age, split by the Transported flag, with every point drawn.
fig = (
    px.box(train_dataset, y='Age', color='Transported', points='all')
    .update_layout(template=_template, title='Age Distribution')
)
fig.show()

In [None]:
# Notched box plot of RoomService spending, split by the Transported flag.
room_service_colors = {False: 'rgb(255, 0, 153)', True: 'rgb(52, 0, 255)'}
fig = (
    px.box(
        train_dataset,
        y='RoomService',
        color='Transported',
        points='all',
        notched=True,
        color_discrete_map=room_service_colors,
    )
    .update_traces(quartilemethod='exclusive')
    .update_layout(template=_template, title='RoomService Distribution')
)
fig.show()

In [None]:
# Notched box plot of FoodCourt spending, split by the Transported flag.
food_court_colors = {False: 'rgb(255, 0, 153)', True: 'rgb(227, 255, 0)'}
fig = (
    px.box(
        train_dataset,
        y='FoodCourt',
        color='Transported',
        points='all',
        notched=True,
        color_discrete_map=food_court_colors,
    )
    .update_traces(quartilemethod='exclusive')
    .update_layout(template=_template, title='Foodcourt Distribution')
)
fig.show()

In [None]:
# Notched box plot of ShoppingMall spending, split by the Transported flag.
fig = px.box(train_dataset, y='ShoppingMall', color='Transported', points='all', notched=True,
             color_discrete_map={False:'rgb(255, 133, 3)', True:'rgb(52, 0, 255)'})
fig.update_traces(quartilemethod='exclusive')
# The title previously read 'RoomService Distribution' (copy-paste slip); this figure plots ShoppingMall.
fig.update_layout(template=_template, title='ShoppingMall Distribution')
fig.show()

In [None]:
# Bar chart of the VIP column, coloured by the Transported flag.
fig = (
    px.bar(train_dataset, y='VIP', color='Transported')
    .update_traces(marker_line_width=0)  # no outline around the bars
    .update_layout(template=_template, title='VIP Distribution')
)
fig.show()

In [None]:
# Bar chart of the CryoSleep column, coloured by the Transported flag.
fig = (
    px.bar(train_dataset, y='CryoSleep', color='Transported')
    .update_traces(marker_line_width=0)  # no outline around the bars
    .update_layout(template=_template, title='CryoSleep Distribution')
)
fig.show()

In [None]:
# Bar chart of the HomePlanet column, coloured by the Transported flag.
fig = (
    px.bar(train_dataset, y='HomePlanet', color='Transported')
    .update_traces(marker_line_width=0)  # no outline around the bars
    .update_layout(template=_template, title='HomePlanet Distribution')
)
fig.show()

# Feature Engineering

In [None]:
# Helper that converts a raw age into a coarse age-group code.
def LabelAge(age):
    """Map a raw age to an integer age-group code.

    Codes by range: (80, inf) -> 1, (60, 80] -> 2, (40, 60] -> 2,
    (10, 40] -> 4, [0, 10] -> 1.

    Age 0 previously fell through every branch and was implicitly returned
    as None, i.e. treated as a missing value; it is now placed in the
    youngest bin.

    NOTE(review): the codes are not ordinal (1, 2, 2, 4, 1) and two ranges
    share code 2. That looks unintentional, but it is kept unchanged here
    because the intended coding cannot be determined from this notebook.

    Parameters
    ----------
    age : float
        Raw age; may be NaN.

    Returns
    -------
    int or None
        The age-group code, or None when ``age`` is NaN or negative
        (every comparison is False for NaN).
    """
    if age > 80:
        return 1
    elif age > 60:
        return 2
    elif age > 40:
        return 2
    elif age > 10:
        return 4
    elif age >= 0:
        return 1
    # NaN (or a negative value): report it as missing, explicitly.
    return None

In [None]:
# Build the engineered training frame from raw_data instead of mutating
# train_dataset in place. The in-place version was not idempotent: re-running
# the cell binned Age a second time and raised a KeyError on the already
# dropped 'Cabin' column.
train_dataset = (
    raw_data
    .drop(columns=['PassengerId', 'Name', 'Cabin'])  # identifiers and Cabin are not used as features
    .assign(
        Age=lambda frame: frame['Age'].map(LabelAge),  # replace raw age with its age-group code
        VIP=lambda frame: frame['VIP'].astype('float64'),  # cast to float64
        CryoSleep=lambda frame: frame['CryoSleep'].astype('float64'),  # cast to float64
    )
)
train_dataset.tail(10)  # show the last 10 rows

In [None]:
# Split features and target by column name rather than by position, so the
# split does not depend on 'Transported' being the last column.
y_train = train_dataset['Transported']
x_train = train_dataset.drop(columns=['Transported'])
# Build the column lists from the feature frame only, so the target can never
# end up in a transformer's column list.
categorical_columns = list(x_train.select_dtypes(include='object').columns)  # object-dtype feature columns
numerical_columns = list(x_train.select_dtypes(include='float64').columns)  # float64-dtype feature columns

In [None]:
# Numeric columns: fill missing values with the median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('Scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category,
# encode categories as integers (unseen categories become -1), then standardize.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ('scaler', StandardScaler()),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# Route each column group to its own pipeline; categorical output comes first.
column_routes = [
    ('cat', categorical_transformer, categorical_columns),
    ('num', numerical_transformer, numerical_columns),
]
preprocessing = ColumnTransformer(transformers=column_routes)

In [None]:
# Fit the preprocessing on the training features, then transform those same features.
preprocessing.fit(x_train)
x = preprocessing.transform(x_train)

In [None]:
# Fully connected binary classifier: input -> 16 -> 32 -> 8 -> 1.
# The input width is read from the preprocessed matrix instead of being
# hard-coded to 10, so the model keeps matching the data if the feature set changes.
n_features = x.shape[1]

the_model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dense(units=32, activation='relu'),
    # L2 weight penalty on this layer only.
    tf.keras.layers.Dense(units=8, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    # A single sigmoid unit outputs a value in (0, 1).
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

the_model.compile(optimizer='adam',
                 loss=tf.keras.losses.BinaryCrossentropy(),
                 metrics=['accuracy'])

the_model.summary()

In [None]:
# Guard against overfitting (and avoid wasting time on pointless epochs).
# Author's note: this is the only part I added myself.
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Stop once the monitored metric has failed to improve by at least 0.001 for
# 10 consecutive epochs, then roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',  # Keras' default, spelled out for readers
    restore_best_weights=True,
    patience=10,
    min_delta=0.001,
)

In [None]:
# Final training run.
fit_options = dict(
    epochs=100,                  # upper bound on the number of epochs
    validation_split=0.3,        # fraction of the training data held out for validation
    callbacks=[early_stopping],  # stop early
)
history = the_model.fit(x, y_train, **fit_options)

In [None]:
# Plot the loss and accuracy curves recorded during training.
hist = pd.DataFrame(history.history)
metric_colors = {'val_accuracy': 'rgb(0, 197, 255)', 'accuracy': 'rgb(255, 23, 4)'}
fig = (
    px.line(hist, color_discrete_map=metric_colors)
    .update_layout(template=_template, title='Metrics')
)
fig.show()

# TEST Submission

In [None]:
# Load the test CSV.
test_data = pd.read_csv(filepath_or_buffer='../input/spaceship-titanic/test.csv')
# Keep the first column (a column, not a row) for the submission file.
id_col = test_data.iloc[:, 0]
# Remove 'Cabin', 'Name' and 'PassengerId' from the test frame.
test_data.drop(columns=['Cabin', 'Name', 'PassengerId'], inplace=True)
test_data.head(n=5)  # preview the first five rows

In [None]:
# Apply the same feature engineering the training frame received (Age binned
# with LabelAge; VIP and CryoSleep cast to float64). Previously the test Age was
# left as raw years, so it did not match the feature the model was trained on.
test_features = test_data.assign(
    Age=test_data['Age'].map(LabelAge),
    VIP=test_data['VIP'].astype('float64'),
    CryoSleep=test_data['CryoSleep'].astype('float64'),
)
# Reuse the preprocessing fitted on the training data: transform only.
# fit_transform would re-learn the imputation values, category codes and
# scaling statistics from the test set.
x_test = preprocessing.transform(test_features)

In [None]:
# Predict on the test features and flatten the output to one dimension.
prediction = the_model.predict(x_test).reshape(-1)
# Pair each passenger id with its predicted value.
df = pd.DataFrame({'PassengerId': id_col, 'Transported': prediction})
df.head(n=5)  # preview the first five rows

In [None]:
# Threshold the predicted values at 0.5 to get boolean labels.
df['Transported'] = df['Transported'] >= 0.5
# Write the submission file without the index column.
df.to_csv('submission.csv', index=False)