In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import metrics
import datetime

### **Load and prepare training data**

In [None]:
# Read the competition's train and test CSV files; the cell then shows the
# (rows, columns) shape of the test set.
train_csv = "../input/spaceship-titanic/train.csv"
test_csv = "../input/spaceship-titanic/test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)
test.shape

In [None]:
# Number of missing values in each column of the raw training data
train.isna().sum()

In [None]:
# Number of missing values in each column of the raw test data
test.isna().sum()

### **EDA, Data cleansing and dealing with NA values**

In [None]:
# Impute value with most frequent
from sklearn.impute import SimpleImputer
train_columns = train.columns
test_columns = test.columns

# Impute missing values with the most frequent value of each column.
# The replacement values are learned from the training data only and then
# applied to the test data, so both frames are filled consistently and the
# test set never influences preprocessing.
imputer = SimpleImputer(strategy = "most_frequent")
df_train = imputer.fit_transform(train)  # ndarray, columns ordered as train_columns

# `train` and `test` do not have identical columns, so a second imputer is
# fitted on the training columns that also exist in the test set.
# NOTE(review): assumes every test column is also present in `train`.
test_imputer = SimpleImputer(strategy = "most_frequent")
test_imputer.fit(train[test_columns])
df_test = test_imputer.transform(test)   # ndarray, columns ordered as test_columns

In [None]:
# At this point df_train is the bare array returned by the imputer, so the
# column labels are not shown here.
print(df_train)

In [None]:
# Wrap the imputed arrays back into DataFrames carrying the original column labels
df_train, df_test = (
    pd.DataFrame(values, columns = labels)
    for values, labels in ((df_train, train_columns), (df_test, test_columns))
)

In [None]:
# Per-column missing-value counts of the imputed test frame
df_test.isna().sum()

In [None]:
# Per-column missing-value counts of the imputed training frame
df_train.isna().sum()

In [None]:
# Break the compound 'Cabin' and 'PassengerId' strings into separate feature columns
cabin_columns = ['Deck','Deck Number','Side']
id_columns = ['Passenger Group','Passenger Number']


def _split_field(frame, field, separator, names):
    """Split the string column `field` of `frame` on `separator`.

    Returns a new DataFrame with one column per piece, labelled with `names`.
    """
    pieces = frame[field].str.split(separator, n = -1, expand = True)
    pieces.columns = names
    return pieces


# Training set pieces
sepr_cabin = _split_field(df_train, 'Cabin', '/', cabin_columns)
sepr_id = _split_field(df_train, 'PassengerId', '_', id_columns)

# Test set pieces
sepr_cabin_test = _split_field(df_test, 'Cabin', '/', cabin_columns)
sepr_id_test = _split_field(df_test, 'PassengerId', '_', id_columns)

# Append the new columns to each frame and discard the compound originals
df_train = (
    pd.concat([df_train, sepr_cabin, sepr_id], axis=1)
    .drop(columns = ['PassengerId','Cabin'])
)
df_test = (
    pd.concat([df_test, sepr_cabin_test, sepr_id_test], axis=1)
    .drop(columns = ['PassengerId','Cabin'])
)

In [None]:
# Show the first five rows of the current training frame
df_train.head(5)

In [None]:
# Summary statistics restricted to the object-typed columns
df_train.describe(include=['O'])

In [None]:
# Relationship between HomePlanet and Transported: one bar per HomePlanet
# value, with Transported on the y axis
sns.catplot(x="HomePlanet", y="Transported", kind="bar", data=df_train)

In [None]:
#Data preprocessing
# Encoding categorical data
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder

In [None]:
# Columns that are ordinal-encoded before modelling
cat_columns = ['HomePlanet','CryoSleep','Destination','VIP','Deck','Side','Passenger Number']
# Columns concatenated to the encoded ones as-is.
# Columns in neither list (e.g. 'Deck Number', 'Passenger Group') do not
# reach the feature matrices.
num_columns = ['Age','RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [None]:
# Encoding categorical data
# One encoder is fitted on the training set and reused for the test set, so a
# given category maps to the same integer code in both frames. Fitting a
# separate encoder on the test set would silently shift the codes whenever
# the two sets do not contain exactly the same categories.
# Categories that appear only in the test set are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)

encoder_train = ordinal_encoder.fit_transform(df_train[cat_columns])
encoder_train = pd.DataFrame(encoder_train,columns = cat_columns)

encoder_test = ordinal_encoder.transform(df_test[cat_columns])
encoder_test = pd.DataFrame(encoder_test,columns = cat_columns)

In [None]:
# Feature frames: the ordinal-encoded categoricals side by side with the numeric columns
x_train, x_test = (
    pd.concat([encoded, frame[num_columns]], axis=1)
    for encoded, frame in ((encoder_train, df_train), (encoder_test, df_test))
)

# Target: integer-encode 'Transported' and keep it as a one-column frame.
# Despite its name, `label_encoder` holds the encoded label array.
label_encoder = LabelEncoder().fit_transform(df_train['Transported'])
y_train = pd.DataFrame({'Transported': label_encoder})

In [None]:
# Show the first rows of the encoded target
y_train.head()

### **Modeling**

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout,Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import TensorBoard,EarlyStopping

In [None]:
# define the model
# Fully connected binary classifier: five ReLU hidden layers (512-512-256-128-64),
# a light dropout after the first one, and a single sigmoid output unit.
model = Sequential([
    Dense(512, activation = 'relu'),
    Dropout(0.01),
    Dense(512, activation = 'relu'),
    Dense(256, activation = 'relu'),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid'),
])

In [None]:
# Adam optimizer with learning rate 0.0005, binary cross-entropy loss,
# and accuracy reported as the training metric.
opt = Adam(learning_rate = 0.0005)
model.compile(
    loss = 'binary_crossentropy',
    optimizer = opt,
    metrics = ['accuracy'],
)

In [None]:
# TensorBoard logs go to a directory named after the current timestamp
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Early stopping on val_loss with a patience of 3 epochs, restoring the best weights
earlystopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0,
    mode='auto',
    baseline=None,
    verbose=2,
    restore_best_weights=True,
)

### **Validate the model**

In [None]:
# Convert the training features and labels to float32 NumPy arrays before
# they are handed to model.fit
x_train = np.asarray(x_train).astype('float32')
y_train = np.asarray(y_train).astype('float32')

In [None]:
# Train for 30 epochs in batches of 200, holding out 20% of the rows for
# validation. The returned History object is kept for plotting.
history = model.fit(
    x_train,
    y_train,
    batch_size = 200,
    epochs = 30,
    validation_split = 0.2,
    #callbacks = [tensorboard_callback, earlystopping_callback],
)

In [None]:
# Learning curves: loss on the left panel, accuracy on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
loss_ax, accuracy_ax = ax

history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot.line(ax=loss_ax)
history_df[['accuracy', 'val_accuracy']].plot.line(ax=accuracy_ax);

### **Make prediction**

In [None]:
# Convert the test features to a float32 NumPy array, the same conversion
# applied to the training features
x_test = np.asarray(x_test).astype('float32')

In [None]:
# Run the trained model on the test features; the values come from the
# model's final sigmoid layer
y_test = model.predict(x_test)

In [None]:
# Shape of the test feature array
x_test.shape

In [None]:
# Raw model output for the first test row
y_test[0]

In [None]:
# Wrap the raw predictions in a DataFrame whose single column is named 'Transported'
y_test = pd.DataFrame(y_test, columns = ['Transported'])

In [None]:
# Threshold the predicted probabilities at 0.5 to obtain boolean labels.
# A plain comparison keeps the column boolean. Passing the string "True" and
# the bool False to np.where instead relies on implicit str/bool promotion
# and raises a TypeError if the cell is executed a second time.
y_test["Transported"] = y_test["Transported"] >= 0.5

In [None]:
# Display the thresholded predictions
y_test

In [None]:
# First PassengerId values of the original (unmodified) test frame
test.PassengerId.head()

In [None]:
# Put each test PassengerId next to its predicted label; the two pieces are
# aligned on their index.
final_prediction = pd.concat([test["PassengerId"], y_test], axis = "columns")
final_prediction

In [None]:
# First rows of the submission frame
final_prediction.head()

In [None]:
# (rows, columns) of the submission frame
final_prediction.shape

In [None]:
# Write the submission frame to CSV in the working directory, without the index column
final_prediction.to_csv('my_submission_4.csv', index = False)
print("Your submission was successfully saved!")