## Table Of Contents
* **[EDA](#eda)**
* **[Fix Missing Values](#fixmiss)**
* **[Encode Values](#encode)**
* **[CatBoost Model Training](#cat)**
* **[XGB Model Training](#xgb)**
* **[RFC Model Training](#rfc)**
* **[ETC Model Training](#etc)**
* **[Model Voting](#voting)**
* **[Submission](#submit)**

In [None]:
import numpy as np
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier

# Check whether the kernel is running on Kaggle (e.g. in batch/submission mode):
# the presence of KAGGLE_KERNEL_RUN_TYPE in the environment is used as the signal.
kaggle = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

# Input folders for the competition data and for the name/gender lookup table.
data_dir = '/kaggle/input/spaceship-titanic/' if kaggle else 'Data/'
name_data_dir = '../input/gendername-dataset/' if kaggle else 'Data/'

In [None]:
# Make sure I don't forget to put a random seed on something.
def seed_everything(seed):
    """Apply one seed to every global source of randomness used here.

    Seeds Python's ``random`` module and NumPy's global generator, and writes
    the seed to the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int
        Seed value to apply.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(50)

In [None]:
# Read the training data, then shuffle all rows with a fixed seed so the
# row order is reproducible.
train = pd.read_csv(data_dir + 'train.csv')
train = train.sample(frac=1, random_state=123)

test = pd.read_csv(data_dir + 'test.csv')

# Name -> gender lookup table, used to derive a Gender feature from first names.
name_gender = pd.read_csv(name_data_dir + 'name_gender.csv')

In [None]:
# Split the lookup table into its male ('M') and female ('F') rows.
M_names = name_gender[name_gender['gender'].eq('M')]
F_names = name_gender[name_gender['gender'].eq('F')]

# EDA <a id="eda"></a>

In [None]:
train.shape, test.shape

In [None]:
train.head()

In [None]:
M_names.head()

In [None]:
name_gender.head(10)

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

In [None]:
train.info()

In [None]:
train.duplicated().sum()

In [None]:
test.duplicated().sum()

**Looking at what values the columns have**

*Looking at PassengerId*

* What do the two numbers in PassengerId mean?

In [None]:
# Keep the PassengerId column of the training data under a short name for the
# inspection and slicing cells that follow.
pass_id = train['PassengerId']

In [None]:
pass_id

* With this we can see the first number is the id. It is unique except for cases where the second number is different.

In [None]:
# Slice the two numeric parts out of PassengerId:
# characters 0-3 (first number) and characters 5-6 (second number).
group_id = pass_id.str[:4]
person_in_group_id = pass_id.str[5:7]

In [None]:
sns.histplot(person_in_group_id)

* If we look at the last names of people who share the first number, they are the same, which suggests that these people are related.

In [None]:
train.iloc[[2, 3]][['PassengerId', 'Name']]

* This suggests that the first number is the unique **group** id and the second number is the person's position within that group (01, 02, ...), not the size of the group.

*Looking at HomePlanet*

In [None]:
train['HomePlanet'].value_counts()

In [None]:
sns.histplot(train['HomePlanet'])

*Looking at CryoSleep*

In [None]:
train['CryoSleep'].value_counts()

*Looking at Cabin*

In [None]:
train['Cabin']

*Looking at Destination*

In [None]:
train['Destination']

In [None]:
train['Destination'].value_counts()

*Looking at Age*

In [None]:
train['Age']

In [None]:
train['Age'].value_counts()

*Looking at VIP*

In [None]:
train['VIP']

In [None]:
train['VIP'].value_counts()

*Looking at RoomService*

In [None]:
train['RoomService']

*Looking at FoodCourt*

In [None]:
train['FoodCourt']

*Looking at ShoppingMall*

In [None]:
train['ShoppingMall']

*Looking at Spa*

In [None]:
train['Spa']

*Looking at VRDeck*

In [None]:
train['VRDeck']

In [None]:
# Keep only numeric/boolean columns before correlating: the frame still holds
# string columns here, and DataFrame.corr() raises on them in pandas >= 2.0
# (older versions dropped them silently).
numeric_train = train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr().abs())

In [None]:
# Correlate every numeric/boolean column with the target. String columns are
# excluded explicitly because corrwith() no longer skips them in pandas >= 2.0;
# the target is cast to 0/1 so the correlation is computed on numbers.
train.select_dtypes(include=['number', 'bool']).corrwith(train['Transported'].astype(int))

# Fix Missing Values <a id="fixmiss"></a>

In [None]:
def fixMiss(data, age_fill=None):
    """Fill the missing values of a passenger table in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns HomePlanet, CryoSleep, Cabin, Destination,
        Age, VIP, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck and Name.
        The frame is modified in place.
    age_fill : int, optional
        Replacement for missing ages. When omitted, the most frequent age of
        the global ``train`` frame is used (the smallest one on ties), so the
        test data is filled with a statistic taken from the training data.

    Returns
    -------
    None
    """
    if age_fill is None:
        # mode() returns a Series (several entries when ages tie); pick the
        # first one explicitly instead of calling int() on the whole Series.
        age_fill = int(train['Age'].mode().iloc[0])

    fill_values = {
        'HomePlanet': 'None',
        'CryoSleep': False,
        # Placeholder cabin in the usual deck/num/side layout.
        'Cabin': 'A/-1/A',
        'Destination': 'None',
        'Age': age_fill,
        'VIP': False,
        'RoomService': 0,
        'FoodCourt': 0,
        'ShoppingMall': 0,
        'Spa': 0,
        'VRDeck': 0,
        'Name': 'ABCD',
    }

    for column, value in fill_values.items():
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on data[column] mutates a temporary object and
        # is not guaranteed to update `data` under pandas copy-on-write.
        data[column] = data[column].fillna(value)

In [None]:
fixMiss(train)
train.isna().sum()

# Encode Values <a id="encode"></a>

In [None]:
def encode(data, female_names=None, male_names=None):
    """Turn a passenger table (already passed through ``fixMiss``) into model features, in place.

    Every feature is derived from ``data`` itself and stays aligned with its
    rows, whatever the frame's index looks like (e.g. after shuffling).

    Adds the columns PersonInGroupId, SpentMuch, Deck, Num, Side, the one-hot
    columns Deck_A..Deck_T and Side_A/Side_P/Side_S, and Gender; integer-encodes
    HomePlanet and Destination; casts CryoSleep, the spending columns and Age
    to int; finally drops Cabin, Name, PassengerId and VIP.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame without missing values in the columns used here. Modified in place.
    female_names, male_names : list-like of str, optional
        First names treated as female / male. When omitted, the ``name``
        column of the global ``F_names`` / ``M_names`` frames is used.

    Returns
    -------
    None
    """
    if female_names is None:
        female_names = F_names['name']
    if male_names is None:
        male_names = M_names['name']

    # First token of the name, kept on the same index as `data`.
    first_names = data['Name'].str.split(' ').str[0]

    # Second number of this frame's own PassengerId (characters 5-6).
    data['PersonInGroupId'] = data['PassengerId'].str[5:7].astype(int)

    # VIP is dropped at the end, so it is not cast.
    for column in ('CryoSleep', 'RoomService', 'FoodCourt',
                   'ShoppingMall', 'Spa', 'VRDeck'):
        data[column] = data[column].astype(int)

    # Flag passengers whose total spending across all amenities exceeds 7500.
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    total_spent = data[spend_columns].sum(axis=1)
    data['SpentMuch'] = (total_spent > 7500).astype(int)

    data['HomePlanet'] = data['HomePlanet'].map({'Earth': 0, 'Europa': 1,
                                                 'Mars': 2, 'None': 3})

    # Cabin has the form "deck/num/side".
    cabin_parts = data['Cabin'].str.split('/', expand=True)
    data['Deck'] = cabin_parts[0]
    data['Num'] = cabin_parts[1].astype(int)
    data['Side'] = cabin_parts[2]

    # One-hot encode against a fixed list of levels so that every frame gets
    # the same columns in the same order, even when a letter is absent from it.
    # ('A' as a side only comes from the 'A/-1/A' placeholder cabin.)
    fixed_levels = {
        'Deck': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
        'Side': ['A', 'P', 'S'],
    }
    for column, levels in fixed_levels.items():
        as_category = data[column].astype(pd.CategoricalDtype(categories=levels))
        dummies = pd.get_dummies(as_category, prefix=column)
        data[dummies.columns] = dummies

    data['Destination'] = data['Destination'].map({'55 Cancri e': 0, 'PSO J318.5-22': 1,
                                                   'TRAPPIST-1e': 2, 'None': 3})

    data['Age'] = data['Age'].astype(int)

    # 0 = female first name, 1 = male first name, 2 = unknown.
    is_female = first_names.isin(female_names)
    is_male = first_names.isin(male_names)
    data['Gender'] = np.where(is_female, 0, np.where(is_male, 1, 2))

    data.drop(['Cabin', 'Name', 'PassengerId', 'VIP'], axis=1, inplace=True)

In [None]:
# Build the feature columns on the training frame (in place).
encode(train)
# Store the target as 0/1 integers instead of booleans.
train['Transported'] = train['Transported'].astype(int)
train.info()

In [None]:
# Class balance of the target per HomePlanet, Destination and CryoSleep,
# drawn side by side on one 1x3 grid (axes passed explicitly).
_, ax = plt.subplots(1, 3, figsize=(15, 5))
sns.countplot(data=train[['HomePlanet', 'Transported']], x='HomePlanet',
              hue='Transported', ax=ax[0])
sns.countplot(data=train[['Destination', 'Transported']], x='Destination',
              hue='Transported', ax=ax[1])
sns.countplot(data=train[['Transported', 'CryoSleep']], x='CryoSleep',
              hue='Transported', ax=ax[2])

In [None]:
# Class balance of the target per cabin Side, Gender and Deck,
# drawn side by side on one 1x3 grid (axes passed explicitly).
_, ax = plt.subplots(1, 3, figsize=(15, 5))
sns.countplot(data=train[['Transported', 'Side']], x='Side',
              hue='Transported', ax=ax[0])
sns.countplot(data=train[['Transported', 'Gender']], x='Gender',
              hue='Transported', ax=ax[1])
sns.countplot(data=train[['Transported', 'Deck']], x='Deck',
              hue='Transported', ax=ax[2])

In [None]:
# Feature matrix: everything except the target and the raw Deck/Side letters
# (their one-hot columns stay in).
X = train.drop(columns=['Transported', 'Deck', 'Side'])
# Target vector.
y = train['Transported']

In [None]:
train.info()

# CatBoost Model Training<a id="cat"></a>

In [None]:
# Hyper-parameter grid for CatBoost. Each value appears once: a repeated entry
# (50 used to be listed twice) makes GridSearchCV refit identical settings for
# every fold without changing the outcome.
params = {'iterations': [50, 60, 70, 80, 90, 100, 120, 130, 150],
          'depth': [4, 5, 6],
          'learning_rate': [0.1, 0.15, 0.2]}
model_cat = CatBoostClassifier(silent=True)
# Exhaustive search scored by accuracy with 10-fold cross-validation.
grid_cat = GridSearchCV(model_cat, param_grid=params, cv=10, scoring='accuracy')
grid_cat.fit(X, y)

print('Score: ', grid_cat.best_score_)
print('Parameters:', grid_cat.best_params_)
print(grid_cat.best_estimator_.feature_importances_)

# XGB Model Training <a id="xgb"></a>

In [None]:
# params = {'n_estimators': [50, 60, 70, 80, 90], 'learning_rate': [0.095, 0.085, 0.075]}
# model_xgb1 = xgb.XGBClassifier(use_label_encoder=False, verbosity=0, n_jobs=-1)
# grid_xgb1 = GridSearchCV(model_xgb1, param_grid=params, cv=15, scoring='accuracy')
# grid_xgb1.fit(X, y)

# print('Score: ', grid_xgb1.best_score_)
# print('Parameters:', grid_xgb1.best_params_)
# print(grid_xgb1.best_estimator_.feature_importances_)

In [None]:
# params = {'n_estimators': [50, 60, 70, 80, 90], 'learning_rate': [0.095, 0.085, 0.075], 'max_depth': [3, 4, 5, 6, 7]}
# model_xgb2 = xgb.XGBClassifier(use_label_encoder=False, verbosity=0, n_jobs=-1)
# grid_xgb2 = GridSearchCV(model_xgb2, param_grid=params, cv=3, scoring='accuracy')
# grid_xgb2.fit(X, y)

# print('Score: ', grid_xgb2.best_score_)
# print('Parameters:', grid_xgb2.best_params_)
# print(grid_xgb2.best_estimator_.feature_importances_)

In [None]:
# params = {'n_estimators': [50, 60, 70, 80, 90], 'learning_rate': [0.095, 0.085, 0.075]}
# model_xgb3 = xgb.XGBClassifier(use_label_encoder=False, verbosity=0, n_jobs=-1)
# grid_xgb3 = GridSearchCV(model_xgb3, param_grid=params, cv=5, scoring='accuracy')
# grid_xgb3.fit(X, y)

# print('Score: ', grid_xgb3.best_score_)
# print('Parameters:', grid_xgb3.best_params_)
# print(grid_xgb3.best_estimator_.feature_importances_)

# RFC Model Training <a id="rfc"></a>

**Does not help**

In [None]:
# params = {'n_estimators': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200],
#          'max_depth': [7, 8, 9, 10, None],
#          'min_samples_leaf': [1, 2, 3, 4, 5]}
# model_rfc = RandomForestClassifier(n_jobs=-1)
# grid_rfc = GridSearchCV(model_rfc, param_grid=params, random_state=1234, cv=5, scoring='accuracy', n_jobs=-1)
# grid_rfc.fit(X, y)

# print('Score: ', grid_rfc.best_score_)
# print('Parameters:', grid_rfc.best_params_)

# ETC Model Training <a id="etc"></a>

**Does not help**

In [None]:
# params = {'n_estimators': [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200],
#          'max_depth': [4, 5, 7, 8, 9, 10, None],
#          'min_samples_leaf': [1, 2, 3, 4, 5]}
# model_etc = ExtraTreesClassifier(n_jobs=-1)
# grid_etc = GridSearchCV(model_etc, param_grid=params, random_state=1234, cv=5, scoring='accuracy')
# grid_etc.fit(X, y)

# print('Score: ', grid_etc.best_score_)
# print('Parameters:', grid_etc.best_params_)

# Model Voting <a id="voting"></a>

In [None]:
# vote = VotingClassifier(estimators = [('XGB1', grid_xgb1), ('XGB2', grid_xgb2), ('XGB3', grid_xgb3)], voting='soft', 
#                        weights=[0.6, 0.1, 0.3], n_jobs=-1)
# grid_vote = GridSearchCV(vote, param_grid={}, cv=5, scoring='accuracy')
# grid_vote.fit(X, y)

# print('Score: ', grid_vote.best_score_)

# Submission <a id="submit"></a>

In [None]:
# Keep the ids for the submission file before encode() drops the column.
pass_ids = test['PassengerId']
fixMiss(test)
encode(test)
# Remove the raw Deck/Side letter columns. drop() needs `columns=` (the default
# axis targets row labels and raises KeyError) and returns a new frame, so the
# result has to be assigned back. Reindexing to X's columns guarantees the
# model sees the same features in the same order as during training; a one-hot
# column absent from the test frame is filled with 0.
test = test.drop(columns=['Deck', 'Side']).reindex(columns=X.columns, fill_value=0)

In [None]:
# Predict on the prepared test frame with the fitted CatBoost grid search and
# wrap the predictions in a Series.
y_test = pd.Series(grid_cat.predict(test))

In [None]:
# Submission table: one row per test passenger, on a 0..n-1 index, with the
# passenger id and the prediction converted to a boolean.
submit = pd.DataFrame(
    {
        'PassengerId': pass_ids,
        'Transported': y_test.astype(bool),
    },
    index=range(test.shape[0]),
)

In [None]:
submit

In [None]:
submit.to_csv('submission.csv', index=False)