In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from keras.layers import Dropout, Dense
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, Callback, LearningRateScheduler


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# overview
# Read both splits and discard the 'Name' column. 'PassengerId' is kept in the
# frames (it is parsed into group features later); the test ids are also set
# aside here for the submission file.
train_data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv').drop(columns='Name')

test_data = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
test_id = test_data['PassengerId']
test_data = test_data.drop(columns='Name')

train_data.head()

In [None]:
# fix missing data
# numeric use mean

def fix_missing(data):
    """Fill missing values in every column of ``data`` and return the frame.

    Numeric columns (any integer/float width) are filled with the column mean;
    all other columns are filled with their most frequent value.

    The frame is updated in place (each column is reassigned) and the same
    object is returned, so callers may use either the argument or the result.

    Columns without a single non-missing value are left unchanged, because
    there is no mean or mode to fill them with.
    """
    numeric_cols = list(data.select_dtypes(include="number").columns)
    categoric_cols = list(data.select_dtypes(exclude="number").columns)

    for col in numeric_cols:
        # Reassign the column rather than calling fillna(inplace=True) on
        # ``data[col]``: the chained in-place form operates on a temporary
        # under pandas copy-on-write and would leave ``data`` unfilled.
        data[col] = data[col].fillna(data[col].mean())

    for col in categoric_cols:
        counts = data[col].value_counts()
        if counts.empty:
            # Every value is missing: no mode exists, nothing to fill with.
            continue
        # value_counts() sorts by frequency, so the first index is the mode.
        data[col] = data[col].fillna(counts.index[0])
    return data
        
# Each split is filled using its own statistics. fix_missing returns the frame
# it was given, so the *_nonull names refer to the same objects as
# train_data / test_data.
train_data_nonull = fix_missing(train_data)
test_data_nonull = fix_missing(test_data)

In [None]:

# processing 
def processing_data(data):
    """Turn a cleaned passenger frame into numeric model features.

    Expects ``data`` to contain 'PassengerId' ("<group>_<index>"), 'Cabin'
    ("<deck>/<num>/<side>"), 'HomePlanet', 'Destination', 'CryoSleep' and
    'VIP' without missing values. The input frame is not modified.

    Returns a new frame whose columns are, in order:
      * the remaining input columns, with 'CryoSleep' and 'VIP' mapped to 0/1;
      * one-hot 'Des_*' and 'Home_*' indicator columns;
      * 'group_num', 'in_group_num' (from 'PassengerId'), 'num' and 'side'
        (from 'Cabin'; side 'P' -> 0, 'S' -> 1);
      * one-hot 'deck_*' indicator columns.
    'Destination', 'HomePlanet', 'Cabin' and 'PassengerId' are dropped.
    """
    # Indicator columns are forced to uint8. Recent pandas versions default to
    # bool, and a frame mixing bool with numeric columns yields an object
    # array from ``.values``, which cannot be fed to the network.
    destination = pd.get_dummies(data['Destination'], prefix='Des', dtype='uint8')
    home = pd.get_dummies(data['HomePlanet'], prefix='Home', dtype='uint8')

    # Vectorised splits instead of one Python-level lambda call per row.
    passenger_parts = data['PassengerId'].str.split('_', expand=True)
    cabin_parts = data['Cabin'].str.split('/', expand=True)

    deck = pd.get_dummies(cabin_parts[0], prefix='deck', dtype='uint8')

    engineered = pd.DataFrame(
        {
            'group_num': passenger_parts[0].astype('int'),
            'in_group_num': passenger_parts[1].astype('int'),
            'num': cabin_parts[1].astype('int'),
            'side': cabin_parts[2].map({'P': 0, 'S': 1}),
        },
        index=data.index,
    )

    # assign() overwrites the existing columns in their current position.
    base = data.drop(
        columns=['Destination', 'HomePlanet', 'Cabin', 'PassengerId']
    ).assign(
        CryoSleep=data['CryoSleep'].map({False: 0, True: 1}),
        VIP=data['VIP'].map({False: 0, True: 1}),
    )

    return pd.concat([base, destination, home, engineered, deck], axis=1)

# Encode each split independently; the one-hot columns of a split are derived
# from the category values present in that split.
data_new_train = processing_data(train_data_nonull)
data_new_test = processing_data(test_data_nonull)


In [None]:
def scaler_data(train_data,test_data):
    """Separate the target and standardise the numeric feature columns.

    Parameters
    ----------
    train_data : DataFrame containing the features and a 'Transported' column.
    test_data : DataFrame containing the same feature columns.

    Returns
    -------
    (feature_data, target_data, feature_data_t)
        Scaled copy of the training features, the 'Transported' column, and a
        scaled copy of the test features. Neither input frame is modified.

    Only columns selected by the "float"/"int" dtypes are scaled; other
    columns pass through unchanged. The scaling statistics are computed on
    the training and test features stacked together.
    """
    target_data = train_data['Transported']
    feature_data = train_data.drop(columns='Transported')
    feature_data_t = test_data.copy()
    all_data = pd.concat([feature_data, feature_data_t])

    float_col = list(all_data.select_dtypes(["float", "int"]).columns)
    if float_col:
        # StandardScaler standardises each feature independently, so a single
        # fit over all numeric columns replaces the per-column refits.
        scaler = StandardScaler().fit(all_data[float_col].values)
        # transform() takes only the data (plus an optional ``copy`` flag);
        # the fitted scaler itself must not be passed as a second argument.
        scaled_train = scaler.transform(feature_data[float_col].values)
        scaled_test = scaler.transform(feature_data_t[float_col].values)
        for position, col in enumerate(float_col):
            feature_data[col] = scaled_train[:, position]
            feature_data_t[col] = scaled_test[:, position]
    return feature_data,target_data,feature_data_t
# Split off the target and standardise the numeric features of both splits.
scaled_train_feature, scaled_train_target, scaled_test_feature = scaler_data(data_new_train,data_new_test)
scaled_train_feature.head()

In [None]:
class LossHistory(Callback):
    """Callback that records training and validation accuracy after each epoch.

    Despite their names, ``loss`` and ``val_loss`` collect the
    'categorical_accuracy' and 'val_categorical_accuracy' entries of ``logs``
    (not loss values). The attribute names are kept because the plotting code
    reads them. A metric missing from ``logs`` is recorded as ``None``.
    """

    def on_train_begin(self, logs=None):
        # Start every training run with empty histories.
        self.loss = []
        self.val_loss = []

    def on_epoch_end(self, batch, logs=None):
        # ``batch`` receives the first positional argument of the epoch-end
        # hook; the parameter name is kept for compatibility.
        # ``None`` default instead of a shared mutable ``{}``.
        logs = logs or {}
        self.loss.append(logs.get('categorical_accuracy'))
        self.val_loss.append(logs.get('val_categorical_accuracy'))

In [None]:
# nn model and train_try
def nn_model(input_data):
    """Build and compile a dense softmax classifier sized to ``input_data``.

    The input width is the number of columns of ``input_data``. The network is
    three 64-unit ReLU layers (dropout 0.25 after the first two) followed by a
    2-unit softmax output, compiled with categorical cross-entropy, the Adam
    optimizer and the 'categorical_accuracy' metric.
    """
    n_features = len(input_data.columns)
    layers = [
        Dense(64, input_dim=n_features, activation='relu'),
        Dropout(0.25),
        Dense(64, activation='relu'),
        Dropout(0.25),
        Dense(64, activation='relu'),
        Dense(2, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        loss="categorical_crossentropy",
        optimizer='adam',
        metrics=['categorical_accuracy'],
    )
    return model


# Trial run: train on 90% of the training set and validate on the remaining 10%.
nn=nn_model(scaled_train_feature)

# Plain arrays for Keras; the target is one-hot encoded into 2 classes to
# match the 2-unit softmax output of nn_model.
x_train_data = scaled_train_feature.values
y_train_data = scaled_train_target.values
y_train_data = to_categorical(y_train_data, 2)

# The held-out 10% (x_test_nn / y_test_nn) is passed to fit() below as
# validation data. random_state fixes the split only.
x_train_nn, x_test_nn, y_train_nn, y_test_nn = train_test_split(x_train_data, y_train_data, 
                                                                test_size=0.1, 
                                                                shuffle=True, 
                                                                random_state = 1)
# Stop once validation accuracy has not improved for 50 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=50, mode='max')
# Collects per-epoch accuracy for the plot in the next cell.
history = LossHistory()
callback_list = [early_stopping,history]
nn.fit(x_train_nn, 
       y_train_nn, 
       epochs=500, 
       batch_size=512,
       validation_data=(x_test_nn, y_test_nn),
       callbacks = callback_list, 
       shuffle=True
       )


In [None]:
# plot result
# LossHistory stores accuracy values in its `loss` / `val_loss` attributes.
train_acc = history.loss
test_acc = history.val_loss

# Label the curves directly (instead of placeholder labels that the legend
# call overrides) and name the axes so the figure can be read on its own.
plt.plot(train_acc, label='acc', color='r')
plt.plot(test_acc, label='val_acc', color='b')
plt.xlabel('epoch')
plt.ylabel('categorical accuracy')
plt.title('Training vs. validation accuracy')
plt.legend()
plt.show()


In [None]:
# full nn
# Train a fresh model on the complete training set (no validation split).
nn_full = nn_model(scaled_train_feature)
# NOTE(review): this callback is constructed but never passed to fit() — the
# `callbacks` argument below is commented out, so training always runs for
# the full 50 epochs.
early_stopping_full = EarlyStopping(monitor='categorical_accuracy', patience=50, mode='max')
nn_full.fit(x_train_data, 
            y_train_data, 
            epochs=50, 
            batch_size=512,
            #callbacks = [early_stopping_full], 
            shuffle=True
            )
# Persist the trained model; the prediction cell loads this file back.
nn_full.save('./nn_model.h5')

In [None]:
# predict with nn
# Predict with the model reloaded from disk, so the submission is produced by
# the saved artifact rather than by whatever model is still in kernel memory.
model_saved = tf.keras.models.load_model('./nn_model.h5')
input_x = scaled_test_feature.values
result = model_saved.predict(input_x, batch_size=512)

# A passenger is predicted as transported when the class-1 probability is
# strictly greater than the class-0 probability (ties resolve to False).
result_array = result[:, 1] > result[:, 0]
pred_nn = pd.Series(result_array)

submission_nn = pd.DataFrame({"PassengerId": test_id.values, "Transported": pred_nn})
submission_nn.to_csv('submission_nn.csv', index=False)