# Define Data

In [None]:
import numpy as np
import pandas as pd
import os,random,warnings
warnings.simplefilter('ignore')

import sklearn
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz, DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.exceptions import NotFittedError

# import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import optimizers
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Input CSV locations and the file the predictions are written to.
_INPUT_DIR = "../input/spaceship-titanic"
TRAIN_PATH = f"{_INPUT_DIR}/train.csv"
TEST_PATH = f"{_INPUT_DIR}/test.csv"
SAMPLE_SUBMISSION_PATH = f"{_INPUT_DIR}/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"

# Column roles.
ID = "PassengerId"                 # row identifier, excluded from the features
TARGET = "Transported"             # label column to predict
DELETE_COL = ["Name", "Cabin"]     # dropped from train and test after loading
BOOL_COL = ["CryoSleep", "VIP"]    # cast to str before label encoding

SEED = 2022


def seed_everything(seed=SEED):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator with ``seed``, and writes ``seed`` to the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator. Defaults to ``SEED``.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # affects only processes launched afterwards, not the running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


seed_everything()

# --- training loop -----------------------------------------------------------
MODEL_VALIDATION_SIZE = 0.25   # fraction of the training rows held out for validation
MODEL_BATCH_SIZE = 16
MODEL_EPOCHS = 200             # upper bound; early stopping may end training sooner

# --- network shape -----------------------------------------------------------
TARGET_NUM = 1                              # units in the output layer
MODEL_DENSE = [32, 64, 64, 128, 128]        # units of the five Dense layers, in order
MODEL_DROPOUT = [0.2] * 4                   # dropout rate after Dense layers 2-5

# --- layer settings ----------------------------------------------------------
MODEL_KERNER_ITNITIALIZER = "glorot_uniform"
MODEL_ACTIVATION = "relu"                   # activation of every Dense layer but the last
MODEL_LAST_ACTIVATION = "sigmoid"           # activation of the output layer

# --- compile settings --------------------------------------------------------
MODEL_OPTIMIZER = "adam"
MODEL_LOSS = "binary_crossentropy"
MODEL_METRICS = ["accuracy"]
MODEL_LR = 0.001                            # learning rate for the Adam optimizer object

# Preprocess Data

In [None]:
# Load both CSVs and discard the columns listed in DELETE_COL right away.
train = pd.read_csv(TRAIN_PATH).drop(columns=DELETE_COL)
test = pd.read_csv(TEST_PATH).drop(columns=DELETE_COL)

# check null
def checkNull_fillData(df):
    """Fill missing values of ``df`` in place, column by column.

    Columns without any null are left untouched. In a ``float64`` or
    ``int64`` column the nulls are replaced by that column's median; in any
    other column they are replaced by the string ``"Missing"``.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None. ``df`` itself is mutated.
    """
    for name in df.columns:
        missing = df[name].isnull()
        if not missing.any():
            continue

        kind = df[name].dtype
        if kind == "float64" or kind == "int64":
            # median() skips nulls, so it reflects only the observed values.
            fill_value = df[name].median()
        else:
            fill_value = "Missing"
        df.loc[missing, name] = fill_value
                
# Fill missing values in both frames (each frame uses its own medians).
for frame in (train, test):
    checkNull_fillData(frame)

# The target becomes an integer label.
train[TARGET] = train[TARGET].astype(int)

# The columns in BOOL_COL are turned into strings in both frames so that they
# are picked up by the label-encoding step further down.
for col in BOOL_COL:
    for frame in (train, test):
        frame[col] = frame[col].astype(str)

# Drop training rows whose feature values repeat an earlier row; the ID and
# target columns are ignored for the comparison and the first occurrence stays.
feature = [col for col in train.columns if col not in (ID, TARGET)]
train = train[~train[feature].duplicated()]

# Standardise every non-object feature column. The scaler is fitted on the
# training frame only and then applied to both frames.
num_col = [
    col for col in train.columns
    if col not in (TARGET, ID) and train[col].dtypes != "object"
]

scaler = StandardScaler().fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])
test[num_col] = scaler.transform(test[num_col])

# Label-encode every object-typed feature column, one encoder per column.
str_col = [
    col for col in train.columns
    if col not in (TARGET, ID) and train[col].dtypes == "object"
]

for col in str_col:
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])

    # Labels that appear only in the test frame are appended to the encoder's
    # known classes so that transform() does not reject them.
    unseen = [
        label for label in np.unique(test[col])
        if label not in encoder.classes_
    ]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    test[col] = encoder.transform(test[col])

# Build Model

In [None]:
# Features are everything except the ID and the target column.
X = train.drop(columns=[ID, TARGET])
y = train[TARGET]

# Hold out MODEL_VALIDATION_SIZE of the rows for validation, keeping the
# class ratio of y in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=MODEL_VALIDATION_SIZE,
    random_state=SEED,
    stratify=y,
)

# Fully connected binary classifier: an input Dense layer, a stack of
# Dense + Dropout pairs, and a single-unit output layer.
ann = Sequential()

# first layer: takes one input per feature column of X
ann.add(Dense(units=MODEL_DENSE[0],
              kernel_initializer=MODEL_KERNER_ITNITIALIZER,
              activation=MODEL_ACTIVATION,
              input_shape=(len(X.columns),)))

# middle layers: each remaining entry of MODEL_DENSE is followed by the
# Dropout rate at the same position in MODEL_DROPOUT
for units, rate in zip(MODEL_DENSE[1:], MODEL_DROPOUT):
    ann.add(Dense(units=units,
                  kernel_initializer=MODEL_KERNER_ITNITIALIZER,
                  activation=MODEL_ACTIVATION))
    ann.add(Dropout(rate=rate))

# last layer
ann.add(Dense(units=TARGET_NUM,
              kernel_initializer=MODEL_KERNER_ITNITIALIZER,
              activation=MODEL_LAST_ACTIVATION))

# The optimizer object has to exist before compile() and be handed to it;
# otherwise MODEL_LR is never applied. `learning_rate` replaces the deprecated
# `lr` keyword. MODEL_OPTIMIZER ("adam") names this same optimizer.
opt = optimizers.Adam(learning_rate=MODEL_LR)

ann.compile(optimizer=opt,
            loss=MODEL_LOSS,
            metrics=MODEL_METRICS)

# Stop training once validation accuracy has gone 20 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=20,
)

# Train on the training split and evaluate on the held-out split each epoch.
history = ann.fit(
    x=X_train,
    y=y_train,
    batch_size=MODEL_BATCH_SIZE,
    epochs=MODEL_EPOCHS,
    validation_data=(X_valid, y_valid),
    callbacks=[callback],
)

In [None]:
# Print the layer-by-layer overview of the network built above.
ann.summary()

# Predict Test Data

In [None]:
# Score the test features and write the predictions into the sample
# submission frame. This assumes the sample submission lists its rows in the
# same order as the test file (TODO confirm).
X_test = test.drop(columns=[ID])
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)

# Sigmoid outputs above 0.5 are reported as True.
predicted = ann.predict(X_test)
sub[TARGET] = (predicted > 0.5).astype(bool)

sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()