## Import libraries

In [None]:
# data analysis and wrangling

import pandas as pd
import numpy as np
import random as rnd
import re

# visualization

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
%matplotlib inline

# machine learning

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Load the Data

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Print column dtypes and non-null counts for train, then for test,
# with a blank line and a dashed divider in between.
train.info()
print()
print('----------'*6)
test.info()

We can see that there are a lot of missing values, so it is important to handle them well.

In [None]:
# List all column names, then show the value distribution of each column
# of interest, with a dashed divider after every section.
print(train.columns.values)
print('-----'*5)
for column in ['HomePlanet', 'Destination', 'VIP', 'Transported', 'Cabin', 'CryoSleep', 'Age']:
    print(train[column].value_counts())
    print('-----'*5)

### Plan for replacing missing values

* HomePlanet: Since most of the data is Earth, we will replace missing values with Earth.
* Destination: Since most of the data is TRAPPIST-1e, we will replace the missing values with TRAPPIST-1e.
* VIP: Most of them are False so we will replace the missing values with False.
* Cabin: Cabin consists of Deck, Num, Side. It can be divided and analyzed
* CryoSleep: Missing values will be replaced as False.

## Handling Missing Values

In [None]:
# Columns whose missing values are imputed below.
Missing_features = [
    'FoodCourt', 'Spa', 'ShoppingMall', 'RoomService', 'VRDeck',
    'Cabin', 'CryoSleep', 'VIP', 'HomePlanet', 'Destination', 'Age',
]

for column in Missing_features:
    train_column = train[column]
    # Age is imputed with the training mean; every other column with its most
    # frequent training value. The same replacement is applied to test.
    replacement = train_column.mean() if column == 'Age' else train_column.value_counts().index[0]
    train[column] = train_column.fillna(replacement)
    test[column] = test[column].fillna(replacement)

## Divide and Analyze Cabins

In [None]:
def extract_deck(s):
    """Return the first '/'-separated field of a cabin string (the deck)."""
    fields = s.split('/')
    return fields[0]

def extract_num(s):
    """Return the second '/'-separated field of a cabin string (the number, as text)."""
    fields = s.split('/')
    return fields[1]

def extract_side(s):
    """Return the third '/'-separated field of a cabin string (the side)."""
    fields = s.split('/')
    return fields[2]

# Split Cabin into its Deck, Num and Side components on both frames
# (train first, then test; columns are created in that order).
cabin_extractors = {'Deck': extract_deck, 'Num': extract_num, 'Side': extract_side}
for frame in (train, test):
    for column, extractor in cabin_extractors.items():
        frame[column] = frame['Cabin'].apply(extractor)

## Convert categorical features into numerical

In [None]:
# Categorical columns that are expanded into one indicator column per value.
features_cat = ['HomePlanet', 'Destination', 'Deck', 'Side']
for feature in features_cat:
    # Build each frame's indicator columns once instead of calling
    # get_dummies twice (once for the column names, once for the values).
    train_dummies = pd.get_dummies(train[feature], prefix=feature)
    test_dummies = pd.get_dummies(test[feature], prefix=feature)
    # Keep test's indicator columns identical to train's (same set, same
    # order) so both frames expose the same features to the models. This is
    # a no-op when both frames contain the same categories.
    test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)
    train[train_dummies.columns] = train_dummies
    test[test_dummies.columns] = test_dummies

To handle the name data, we combine the train and test datasets. We will split them again later.

In [None]:
# Stack train (restricted to the columns test also has) on top of test.
# ignore_index is not passed, so each frame keeps its own row labels; the
# later cells split `data` back into train and test rows by position.
data = pd.concat([train[test.columns], test])
data

## Handling name

In [None]:
def extract_last_name(s):
    """Return the last space-separated token of a passenger name.

    A missing name is returned as NaN. Converting it with str() would yield
    the literal surname 'nan', which would make every passenger without a
    name look like a member of one large family.
    """
    if pd.isna(s):
        return np.nan
    return str(s).split(' ')[-1]

data['LastName'] = data['Name'].apply(extract_last_name)

# Number of passengers per surname across train and test combined.
# value_counts() skips NaN, so missing names are not counted as a surname.
dict_names = data['LastName'].value_counts().to_dict()

def same_name(s):
    """Return how many other passengers share the surname `s`.

    Returns 0 when the surname is missing.
    """
    if pd.isna(s):
        return 0
    return dict_names[s] - 1

data['SameName'] = data['LastName'].apply(same_name)

We will add the `SameName` column to the train and test sets.

In [None]:
# train was concatenated first, so the first len(train) rows of `data` are
# the training passengers. Using len(train) avoids a hard-coded row count.
to_train = data.iloc[:len(train)]
# Assign by position: the slice has exactly one row per training row, in order.
train['SameName'] = to_train['SameName'].to_numpy()

In [None]:
# Every row of `data` after the first len(train) rows belongs to test.
# Using len(train) avoids a hard-coded row count.
to_test = data.iloc[len(train):]
# Assign by position: the slice has exactly one row per test row, in order.
test['SameName'] = to_test['SameName'].to_numpy()

## Handling with age

Let us replace age with an ordinal group based on the following age ranges.

In [None]:
def age_group(s):
    """Map an age to an ordinal age group.

    Returns -1 for an age of exactly 0, 1 for (0, 11], 2 for (11, 22],
    3 for (22, 33], 4 for (33, 45], 5 for (45, 56], 6 for (56, 67],
    7 for (67, 79] and 8 for any age above 79. Values that cannot be
    binned (negative or NaN) return None.
    """
    if s == 0:
        return -1
    if not s > 0:
        # Negative or NaN: no group applies.
        return None
    upper_bounds = (11, 22, 33, 45, 56, 67, 79)
    for group, upper in enumerate(upper_bounds, start=1):
        if s <= upper:
            return group
    # Open-ended top group, so ages above 80 are binned as well.
    return 8


# These assignments must run at cell level. Indented inside age_group they
# would sit after the return branches and never create the column.
train['Age_Group'] = train['Age'].apply(age_group)
test['Age_Group'] = test['Age'].apply(age_group)

## Drop Columns

* PassengerId is a unique ID for each passenger. It is not important when building the model.
* We converted HomePlanet into 3 numerical columns, so we don't need the original column.
* We have already analysed Cabin, Destination, Name, and Age, so we don't need the original columns.

In [None]:
# Columns removed from train before modelling (dropped in place).
obsolete_columns = ['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name', 'Age', 'Deck', 'Side']
train.drop(columns=obsolete_columns, inplace=True)

In [None]:
# Columns removed from test before modelling (dropped in place).
obsolete_columns = ['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name', 'Age', 'Deck', 'Side']
test.drop(columns=obsolete_columns, inplace=True)

## Convert False to 0 and True to 1

In [None]:
# Cast CryoSleep, VIP and Num (the cabin number, still text after the split)
# to integers on both frames.
for frame in (train, test):
    for column in ('CryoSleep', 'VIP', 'Num'):
        frame[column] = frame[column].astype(int)

In [None]:
# Separate the target from the predictors; the test frame is used as-is.
y_train = train['Transported']
x_train = train.drop(columns=['Transported'])
x_test = test

In [None]:
# Display the (rows, columns) shapes of the test and train feature frames.
x_test.shape, x_train.shape

## Apply Models

### Random Forest

In [None]:
# Hyper-parameter grid explored for the random forest.
parameter = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_depth': [3, 5, 7, 9, 11, 12],
    'min_samples_leaf': [18, 19, 20],
    'min_samples_split': [8, 9, 10]
}

# A fixed random_state makes the forest, and therefore the selected
# parameters and reported accuracy, reproducible across runs.
rfc_model = RandomForestClassifier(random_state = 42)
# 5-fold cross-validated accuracy over every grid combination, using all cores.
rfc_grid = GridSearchCV(rfc_model, param_grid = parameter, cv = 5, scoring = 'accuracy', n_jobs = -1)
rfc_grid.fit(x_train, y_train)
print('Best parameters:', rfc_grid.best_params_)
print()
print('Best Accuracy:', rfc_grid.best_score_)

In [None]:
# Train the final forest with the hyper-parameters chosen by rfc_grid rather
# than values pasted from an earlier run, so the model always matches the
# search result. The fixed random_state keeps the fit reproducible.
rf = RandomForestClassifier(random_state = 42, **rfc_grid.best_params_)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

## XGBClassifier

In [None]:
# Hyper-parameter grid for XGBoost. The target has two classes, so the
# binary 'logloss' metric is used; 'mlogloss' is the multiclass variant.
param_grid = {'n_estimators': [10, 25, 50, 75,100], 'learning_rate': [0.2, 0.15, 0.1, 0.05],
             'eval_metric': ['logloss']}
# 5-fold cross-validated accuracy over every grid combination.
grid = GridSearchCV(XGBClassifier(), param_grid = param_grid, cv = 5, scoring = 'accuracy')
grid.fit(x_train, y_train)
best_params = grid.best_params_
print('Best score of cross validation: {:.2f}'.format(grid.best_score_))
print('Best parameters:', best_params)

In [None]:
# Build the classifier directly from the grid-search winners and fit it
# on the full training set.
xgb = XGBClassifier(**best_params)
xgb.fit(x_train, y_train)

In [None]:
# Predict the target for the test rows with the fitted XGBoost model.
y_pred_xgb = xgb.predict(x_test)

## Submission

In [None]:
# Load the sample submission file; its Transported column is overwritten
# with the model predictions in the next cell.
subs = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
subs

In [None]:
# Depending on the xgboost version, predict() returns either the original
# labels or 0/1 integers. Casting to bool writes True/False in both cases.
# NOTE(review): assumes the competition expects boolean Transported values,
# as in the training target - confirm against the sample submission.
subs['Transported'] = np.asarray(y_pred_xgb).astype(bool)
# NOTE(review): the output path has no '.csv' extension - confirm this is intended.
subs.to_csv('./Transported', index = False)