In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, names in os.walk('/kaggle/input'):
    # Print the full path of every file found under the input directory.
    for entry in names:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

# Read the training split of the Spaceship Titanic data and display it.
train_csv_path = '/kaggle/input/spaceship-titanic/train.csv'
data = pd.read_csv(train_csv_path)
data

In [None]:
# Print pandas' summary of the training frame (DataFrame.info).
data.info()

In [None]:
# Labels of the columns that contain at least one missing value.
has_missing = data.isna().any()
data.columns[has_missing]

In [None]:
# Number of missing values per column, limited to columns that have any.
data.isna().sum().loc[lambda counts: counts > 0]

In [None]:
# Percentage of rows with a missing value, for each column that has any.
data.isna().sum().loc[lambda counts: counts > 0] * 100 / len(data)

In [None]:
# Share of rows whose target is True, relative to all non-null target values.
is_transported = data['Transported'] == True
is_transported.sum() / data['Transported'].count()

In [None]:
# Replace missing passenger names with a placeholder so that every value in
# the column is a string.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `data.Name`: that form is a chained in-place update,
# which pandas warns about and which leaves `data` unmodified once
# Copy-on-Write is active.
data['Name'] = data['Name'].fillna('No_name')

In [None]:
# Surname = last whitespace-separated token of the name (a single-token value
# such as the 'No_name' placeholder maps to itself).
# The vectorised string accessor replaces the row-by-row Python loop and
# yields NaN, instead of raising, for a missing or blank name.
data['Surname'] = data['Name'].str.split().str[-1]

In [None]:
# Sum the five amenity bills per passenger. skipna=False keeps the total
# missing whenever any single bill is missing, exactly as chained `+` does.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
data['Total Spending'] = data[spending_columns].sum(axis=1, skipna=False)

In [None]:
# Remove the passenger identifier and the raw name column from the frame.
data = data.drop(columns=['PassengerId', 'Name'])

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows for
# validation. The fixed random_state makes the split repeatable.
y = data['Transported']
X = data.drop(columns='Transported')

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)

In [None]:
# Columns of the training features stored with object dtype.
X_train.select_dtypes(include=["object"]).columns

In [None]:
# Columns of the training features stored as float64 or int64.
X_train.select_dtypes(include=['float64', 'int64']).columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# Numeric features: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

# Categorical features: fill gaps with the most frequent value, map each
# category to an integer code (categories not seen during fit become -1),
# then standardise the codes.
cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ("scaler", StandardScaler()),
    ])

from sklearn.compose import ColumnTransformer

num_attribs = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Total Spending']
cat_attribs = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Surname']

# The transformer emits the numeric block first and the categorical block
# second, so its output columns are ordered as num_attribs + cat_attribs.
preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

# Fit on the training split only; the validation split reuses the fitted
# imputers, encoder and scalers.
X_train_prepared = preprocess_pipeline.fit_transform(X_train[num_attribs + cat_attribs])
X_valid_prepared = preprocess_pipeline.transform(X_valid[num_attribs + cat_attribs])
# Backward-compatible alias: this array holds the transformed *validation
# features* (not training targets), but the next cell reads it under this name.
y_train_prepared = X_valid_prepared

In [None]:
# Wrap the transformed arrays in DataFrames.
# The ColumnTransformer outputs its columns in transformer order
# (num_attribs followed by cat_attribs), which differs from the column order
# of X_train / X_valid, so the labels must come from those two lists.
# The original row index is kept so that index-aligned operations against
# y_train / y_valid pair the correct rows.
prepared_columns = num_attribs + cat_attribs
X_train_prepared = pd.DataFrame(X_train_prepared, columns=prepared_columns, index=X_train.index)
# `y_train_prepared` holds the transformed validation features (see the cell above).
X_valid_prepared = pd.DataFrame(y_train_prepared, columns=prepared_columns, index=X_valid.index)

In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

def make_mi_scores(X, y, random_state=0):
    """Return the mutual information between each feature and a class target.

    The target of this notebook is a binary label, so the classification
    estimator is used instead of the regression one.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels become the index of the result.
    y : array-like
        Class labels, one per row of ``X``.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_classif`` so repeated calls return
        the same scores.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", sorted from highest to lowest.
    """
    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

mi = pd.DataFrame(make_mi_scores(X_train_prepared, y_train))

# corrwith pairs rows by index label. Rebuild the target on the feature
# frame's own index so rows are paired by position (both come from the same
# training split, in the same order) regardless of which index the prepared
# frame carries. The labels are cast to float so the correlation is numeric.
target = pd.Series(y_train.to_numpy(dtype=float), index=X_train_prepared.index)
corr = X_train_prepared.corrwith(target).to_frame(name='Correlation')

# One row per feature: mutual information next to linear correlation.
relation = mi.join(corr)
relation

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Input width is taken from the prepared training matrix instead of being
# hard-coded, so it follows any change to the feature lists.
n_features = X_train_prepared.shape[1]

# Five identical hidden blocks: Dense(512, relu) -> BatchNorm -> Dropout(0.3).
hidden_layers = []
for _block in range(5):
    hidden_layers.extend([
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
    ])

# The input shape is declared once through an explicit Input; it was
# previously repeated on the first Dense layer, where it is redundant.
model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    layers.BatchNormalization(),
    *hidden_layers,
    layers.Dense(1, activation='sigmoid'),  # probability of the positive class
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

# Stop when the monitored validation loss has not improved by at least 0.001
# for 10 consecutive epochs, and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

In [None]:
# `epochs` is only an upper bound: the early-stopping callback ends training.
# batch_size is left at the Keras default.
history = model.fit(
    x=X_train_prepared,
    y=y_train,
    validation_data=(X_valid_prepared, y_valid),
    epochs=1000,
    callbacks=[early_stopping],
)

# Learning curves: one figure for the loss, one for the accuracy.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].plot(title="Cross-entropy")
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")

In [None]:
# Load the test split and work on an independent (deep) copy, so the frame as
# read from disk stays untouched.
test_csv_path = '/kaggle/input/spaceship-titanic/test.csv'
data_test = pd.read_csv(test_csv_path)
data_test_copy = data_test.copy()

In [None]:
# Replace missing passenger names with the same placeholder used for the
# training data. The result is assigned back to the column rather than
# calling fillna(inplace=True) on `data_test_copy.Name`: that form is a
# chained in-place update, which pandas warns about and which leaves the
# frame unmodified once Copy-on-Write is active.
data_test_copy['Name'] = data_test_copy['Name'].fillna('No_name')

In [None]:
# Surname = last whitespace-separated token of the name (a single-token value
# such as the 'No_name' placeholder maps to itself).
# The vectorised string accessor replaces the row-by-row Python loop and
# yields NaN, instead of raising, for a missing or blank name.
data_test_copy['Surname'] = data_test_copy['Name'].str.split().str[-1]

In [None]:
# Sum the five amenity bills per passenger. skipna=False keeps the total
# missing whenever any single bill is missing, exactly as chained `+` does.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
data_test_copy['Total Spending'] = data_test_copy[spending_columns].sum(axis=1, skipna=False)

In [None]:
# Remove the passenger identifier and the raw name column from the working copy.
data_test_copy = data_test_copy.drop(columns=['PassengerId', 'Name'])

In [None]:
# Apply the already-fitted preprocessing to the test features (transform only).
test_features = data_test_copy[num_attribs + cat_attribs]
data_test_prepared = preprocess_pipeline.transform(test_features)

In [None]:
# Wrap the transformed test array in a DataFrame.
# The preprocessing output is ordered as num_attribs followed by cat_attribs,
# which differs from the column order of data_test_copy, so the labels must
# come from those two lists. The original row index is kept as well.
data_test_prepared = pd.DataFrame(
    data_test_prepared,
    columns=num_attribs + cat_attribs,
    index=data_test_copy.index,
)

In [None]:
# Run the trained network on the prepared test features and display its raw
# outputs (the final layer is a single sigmoid unit).
pred_data_test_prepared = model.predict(data_test_prepared)
pred_data_test_prepared

In [None]:
# Threshold the sigmoid outputs in place at 0.5. The model was trained on the
# boolean `Transported` target, so 0 stands for False and 1 for True.
pred_data_test_prepared[pred_data_test_prepared <= 0.5] = 0 # False
pred_data_test_prepared[pred_data_test_prepared > 0.5] = 1 # True
pred_data_test_prepared