In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display
import matplotlib.pyplot as plt
import warnings

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Silence every warning for the rest of the session.
# NOTE(review): this filter is not restricted to a category or module, so it
# also hides warnings raised by the libraries used below; consider narrowing it.
warnings.filterwarnings('ignore') 

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. load data

In [None]:
# Read the train and test CSV files, then preview both frames.
train_df, test_df = map(
    pd.read_csv,
    ('/kaggle/input/spaceship-titanic/train.csv',
     '/kaggle/input/spaceship-titanic/test.csv'),
)

display(train_df, test_df)

# 2. Preprocessing

## 2.1 impute null values

In [None]:
# The "Name" column is not used by any later step, so remove it from both splits.
train_df, test_df = (frame.drop(columns='Name') for frame in (train_df, test_df))

In [None]:
# Per-column count of missing values: train first, then test.
train_missing = train_df.isna().sum()
test_missing = test_df.isna().sum()

display(train_missing)
print('\n')
display(test_missing)

In [None]:
# Summary statistics of both splits.
display(train_df.describe(), test_df.describe())

In [None]:
# impute missing values
from sklearn.impute import SimpleImputer

# Imputer configured to treat NaN as the missing marker and to fill it with
# the mean of the data it was last fitted on.
mean_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [None]:
# Fill the gaps of each numeric feature with that feature's training-set mean.
# The imputer is re-fitted per feature on train only; test reuses the same fit.
for feature in ["Age", "RoomService", "FoodCourt","ShoppingMall","Spa", "VRDeck"]:
    train_column = train_df[feature].to_numpy().reshape(-1, 1)
    test_column = test_df[feature].to_numpy().reshape(-1, 1)

    mean_imputer.fit(train_column)
    train_df[feature] = mean_imputer.transform(train_column).ravel().tolist()
    test_df[feature] = mean_imputer.transform(test_column).ravel().tolist()

In [None]:
# Re-check the per-column count of missing values after imputation.
train_missing_after = train_df.isna().sum()
test_missing_after = test_df.isna().sum()

display(train_missing_after)
print('\n')
display(test_missing_after)

In [None]:
# Summary statistics of both splits after imputation.
display(train_df.describe(), test_df.describe())

## 2.2 split "cabin"

In [None]:
# Split "Cabin" into its three "/"-separated parts.
# `n` is the maximum number of *splits*, not of pieces: n=2 caps the result at
# three columns, so the three-column assignment below cannot fail on a value
# that contains an extra "/" (n=3 would produce a fourth column for it).
cabin_parts = ["Cabin1", "Cabin2", "Cabin3"]
train_df[cabin_parts] = train_df["Cabin"].str.split("/", n=2, expand=True)
test_df[cabin_parts] = test_df["Cabin"].str.split("/", n=2, expand=True)

# The raw column is redundant once its parts are stored separately.
train_df = train_df.drop(columns = ['Cabin'])
test_df = test_df.drop(columns = ['Cabin'])

In [None]:
# Cast the "Cabin2" part to float in both splits so it can be compared
# numerically in the binning step.
for frame in (train_df, test_df):
    frame["Cabin2"] = frame["Cabin2"].astype(float)

In [None]:
# Smallest and largest "Cabin2" value in each split.
# Series.min()/max() skip NaN. The builtin min()/max() compare element by
# element, so they return NaN whenever the first value is missing and ignore
# NaN otherwise -- the answer would depend on row order.
print(train_df["Cabin2"].min(), train_df["Cabin2"].max())
print(test_df["Cabin2"].min(), test_df["Cabin2"].max())

In [None]:
# Bucket "Cabin2" into 100-wide bins labelled "<lower>_<upper>" ("0_100",
# "100_200", ...). The label is derived from each value instead of a fixed
# list of 38 ranges, so a value outside 0-3799 gets a real label rather than
# silently keeping the '' placeholder.
CABIN_BIN_WIDTH = 100


def _cabin_bin_labels(numbers):
    """Return the "<lower>_<upper>" bin label for every value of ``numbers``.

    ``numbers`` is a float Series; ``lower`` is the value rounded down to a
    multiple of CABIN_BIN_WIDTH and ``upper`` is ``lower + CABIN_BIN_WIDTH``.
    Missing values are passed through as NaN.
    """
    lower_edges = (numbers // CABIN_BIN_WIDTH) * CABIN_BIN_WIDTH
    return lower_edges.map(
        lambda lower: f"{int(lower)}_{int(lower) + CABIN_BIN_WIDTH}",
        na_action='ignore',  # keep NaN as NaN instead of formatting it
    )


train_df["Cabin2_bin"] = _cabin_bin_labels(train_df["Cabin2"])
test_df["Cabin2_bin"] = _cabin_bin_labels(test_df["Cabin2"])

# Only the binned version is kept as a feature.
train_df = train_df.drop(columns = ['Cabin2'])
test_df = test_df.drop(columns = ['Cabin2'])

In [None]:
# Preview the columns from position 10 onwards in both splits.
display(train_df.iloc[:, 10:], test_df.iloc[:, 10:])

## 2.3 create dummy variables

In [None]:
# create dummy variables for Transported
def create_dummy_from_TF(df, col):
    """Add an integer 0/1 column named ``<col>_dummy`` to ``df`` in place.

    The new column is 1 where ``df[col]`` equals True and 0 everywhere else,
    including rows where the value is missing. Nothing is returned.
    """
    df[col + "_dummy"] = df[col].eq(True).astype('int64')
    
# Add the 0/1 target column, then discard the original "Transported" column.
create_dummy_from_TF(train_df, 'Transported')
train_df = train_df.drop(columns='Transported')

In [None]:
# One-hot encode the categorical features of both splits with identical
# settings; dummy_na=True adds a separate indicator column for missing values.
categorical_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Cabin1", "Cabin3", "Cabin2_bin"]
dummy_options = dict(dummy_na=True, columns=categorical_cols, dtype=int)

train_df_dummy = pd.get_dummies(data=train_df, **dummy_options)
test_df_dummy = pd.get_dummies(data=test_df, **dummy_options)

In [None]:
# Give the test frame exactly the training columns, in training order:
# columns missing from test are created as all-zero, columns that exist only
# in test are discarded, and the target column (train only) is removed again.
cols = train_df_dummy.columns

test_df_dummy = (
    test_df_dummy
    .reindex(columns=cols, fill_value=0)
    .drop(columns=["Transported_dummy"])
)

In [None]:
# Preview the encoded train and test frames.
display(train_df_dummy, test_df_dummy)

## 2.4 convert dataframes into numpy arrays

In [None]:
# Target vector: the 0/1 "Transported_dummy" column of the training frame.
y_train = train_df['Transported_dummy'].to_numpy(copy=True)

# Training features: every encoded column except the target and the id.
train_df_dummy = train_df_dummy.drop(columns=['Transported_dummy', 'PassengerId'])
x_train = train_df_dummy.to_numpy()

# Keep the test ids for the submission file, then remove them from the features.
test_PassengerId = test_df_dummy['PassengerId'].tolist()
test_df_dummy = test_df_dummy.drop(columns='PassengerId')
x_test = test_df_dummy.to_numpy()

In [None]:
# Shapes of the target vector, the training matrix and the test matrix.
for array in (y_train, x_train, x_test):
    print(array.shape)

# 3. Exploratory analysis

In [None]:
# correlation
import seaborn as sns
# Pairwise correlation of the first six columns of the encoded training frame.
leading_corr = train_df_dummy.iloc[:, :6].corr()
sns.heatmap(leading_corr);

In [None]:
# Class balance of the target: the sum of the 0/1 vector is the positive count.
n_transported = np.sum(y_train)
n_not_transported = y_train.shape[0] - n_transported

print("number of people transported: ", n_transported)
print("number of people not transported: ", n_not_transported)

In [None]:
# One box plot per numeric feature, split by the 0/1 target, side by side.
numeric_features = ["Age", "RoomService", "FoodCourt","ShoppingMall","Spa", "VRDeck"]

fig, axes = plt.subplots(nrows=1,
                         ncols=len(numeric_features),
                         figsize=(12, 3.5),
                         layout='constrained')

for ax, feature in zip(axes, numeric_features):
    sns.boxplot(x='Transported_dummy', y=feature, data=train_df, ax=ax)

plt.show()

In [None]:
# One stacked bar chart per categorical feature. For every category (missing
# values form their own group) the bar stacks the number of transported rows
# and the number of not-transported rows.
categorical_features = ["HomePlanet", "CryoSleep", "VIP", "Cabin1", "Cabin3", "Cabin2_bin"]

fig, axes = plt.subplots(nrows=1, ncols=6, figsize=(16, 6), layout='constrained')

for ax, col in zip(axes, categorical_features):
    target_by_category = train_df.groupby([col], dropna=False)["Transported_dummy"]

    # Sum of the 0/1 target = transported rows; count - sum = the rest.
    transported = target_by_category.sum()
    outcome_counts = pd.DataFrame({
        "Transported_dummy": transported,
        "Not Transported": target_by_category.count() - transported,
    })

    outcome_counts.plot(kind='bar', stacked=True, ax=ax)

plt.show()

In [None]:
# Number of training rows in each category of every categorical feature
# (missing values are kept as their own group).
for col in ["HomePlanet", "CryoSleep", "Destination", "VIP", "Cabin1", "Cabin3","Cabin2_bin"]:
    category_counts = train_df.groupby([col], dropna=False)[["Transported_dummy"]].count()
    display(category_counts)

# 4. Train & Predict

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

def display_cv_result(grid_search_cv):
    """Display the searched parameter combinations ranked by mean test score.

    Reads ``cv_results_['params']`` and ``cv_results_['mean_test_score']``
    from ``grid_search_cv``, builds one row per parameter combination and
    shows the table sorted by score, best first. Nothing is returned.
    """
    results = grid_search_cv.cv_results_
    ranked = (
        pd.DataFrame(results['params'])
        .assign(mean_test_score=results['mean_test_score'])
        .sort_values(by='mean_test_score', ascending=False)
    )

    display(ranked)

# Shuffled 5-fold splitter used for the grid search. The fixed random_state
# makes the shuffle -- and therefore the fold assignment and the CV scores --
# identical on every run; without it each run evaluates on different folds.
cv_splitter = KFold(n_splits = 5, shuffle = True, random_state = 42)

## 4.1 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seeded so that the forest's internal randomness is repeatable: the same
# data and parameters now give the same scores and predictions on every run.
RandomForest = RandomForestClassifier(random_state = 42)

# Grid: 40, 60, ..., 400 trees x both split criteria, no class re-weighting.
param = {'n_estimators': list(range(40, 401, 20)),
         'criterion': ["gini", "entropy"],
         'class_weight': [None],
        }

# Exhaustive search over the grid, scored by accuracy on the folds produced
# by cv_splitter.
RandomForest_select = GridSearchCV(estimator = RandomForest,
                                   param_grid = param,
                                   scoring = "accuracy",
                                   cv = cv_splitter,
                                   return_train_score = True
                                  )

RandomForest_select.fit(x_train, y_train)

In [None]:
# Show every parameter combination of the grid search, best mean CV score first.
display_cv_result(RandomForest_select)

In [None]:
# Predict the 0/1 target for the test matrix with the fitted grid search
# (GridSearchCV.predict delegates to the best estimator it found).
prediction = RandomForest_select.predict(x_test)

In [None]:
# Submission frame: one row per test passenger, with the 0/1 prediction
# written as the text 'True' / 'False'.
transported_labels = np.where(np.asarray(prediction).astype(int) == 1, 'True', 'False')

pred_df = pd.DataFrame({"PassengerId": test_PassengerId,
                        "Transported": transported_labels})

display(pred_df)

pred_df.to_csv('/kaggle/working/Spaceship Titanic-Random Forest.csv', index = False)