# Load Dataset


In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import re

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier


<b>- Categorical Feature : HomePlanet, CryoSleep, Cabin, Destination, VIP, Name</b>

<b>- Numerical Feature : Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck</b>
<br>

* (PassengerId is a unique ID for each passenger. It is not useful as a feature for building the model.)

In [None]:
# Read the train and test CSV files into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/spaceship-titanic/train.csv',
                     '../input/spaceship-titanic/test.csv')
)

# Preview the first rows of the training frame.
train.head()

In [None]:
# Descriptive statistics of the training frame (pandas summarises the numeric columns by default).
train.describe()

<b>Both the train and test data contain a lot of NaN values, so we have to do some preprocessing.</b>

In [None]:
# Dtypes and non-null counts of both frames, separated by a blank line and a divider.
train.info()
print('\n' + '--------'*6)
test.info()

- HomePlanet : Since most of the data is Earth, we will replace NaN values with Earth.
- Destination : Since most of the data is TRAPPIST-1e, we will replace NaN values with TRAPPIST-1e.
- VIP : Most of the values are False, so we will replace NaN values with False as well.
<br>

- Cabin : Cabin consists of Deck, Num and Side. It can be split into these parts and analyzed.
- CryoSleep : NaN values are replaced with False as well.

In [None]:
# Print the column names, then the value distribution of each column of
# interest, each preceded by a divider line.
divider = '------'*6
print(train.columns.values)
series_to_count = [
    train['HomePlanet'],
    train['Destination'],
    train['VIP'],
    train['Transported'],
    train['Cabin'].str[0],  # first character of the cabin string
    train['CryoSleep'],
    train['Age'],
]
for series in series_to_count:
    print(divider)
    print(series.value_counts())

In [None]:
# Mean of Transported per HomePlanet, highest first.
(
    train[['HomePlanet', 'Transported']]
    .groupby(['HomePlanet'], as_index=False)
    .mean()
    .sort_values(by='Transported', ascending=False)
)

In [None]:
# Mean of Transported per Destination, highest first.
(
    train[['Destination', 'Transported']]
    .groupby(['Destination'], as_index=False)
    .mean()
    .sort_values(by='Transported', ascending=False)
)

In [None]:
# Mean of Transported per VIP value, highest first.
(
    train[['VIP', 'Transported']]
    .groupby(['VIP'], as_index=False)
    .mean()
    .sort_values(by='Transported', ascending=False)
)

In [None]:
# Columns whose NaN values are imputed. The fill value is always computed
# from the training frame and then applied to both train and test.
Missing_features = [
    'FoodCourt', 'Spa', 'ShoppingMall', 'RoomService', 'VRDeck',
    'Cabin', 'CryoSleep', 'VIP', 'HomePlanet', 'Destination', 'Age',
]
for feature in Missing_features:
    train_col = train[feature]
    # Age -> training mean; every other column -> most frequent training value.
    fill = train_col.mean() if feature == 'Age' else train_col.value_counts().index[0]
    train[feature] = train_col.fillna(fill)
    test[feature] = test[feature].fillna(fill)


## Handling Cabin

In [None]:
def extract_deck(s):
    """Return the first '/'-separated field of the cabin string `s`."""
    fields = s.split('/')
    return fields[0]

def extract_num(s):
    """Return the second '/'-separated field of the cabin string `s` (as a string)."""
    fields = s.split('/')
    return fields[1]

def extract_side(s):
    """Return the third '/'-separated field of the cabin string `s`."""
    fields = s.split('/')
    return fields[2]

# Derive the Deck / Num / Side columns from Cabin, for train first and then test.
cabin_extractors = {'Deck': extract_deck, 'Num': extract_num, 'Side': extract_side}
for frame in (train, test):
    for new_column, extractor in cabin_extractors.items():
        frame[new_column] = frame['Cabin'].apply(extractor)


In [None]:
# One-hot encode the categorical features.
features_cat = ['HomePlanet', 'Destination', 'Deck', 'Side']
for feature in features_cat:
    # Take the set of levels from the training data and use it for BOTH frames.
    # Encoding each frame independently yields different dummy columns whenever
    # one split lacks a level, which later breaks model.predict on the test frame.
    # A test value unseen in training becomes all-zero dummies.
    levels = pd.CategoricalDtype(sorted(train[feature].dropna().unique()))
    train_dummies = pd.get_dummies(train[feature].astype(levels), prefix=feature)
    test_dummies = pd.get_dummies(test[feature].astype(levels), prefix=feature)
    train[train_dummies.columns] = train_dummies
    test[test_dummies.columns] = test_dummies


In [None]:
# Stack train (restricted to the columns present in test) on top of test.
shared_columns = test.columns
data = pd.concat([train.loc[:, shared_columns], test])
data

## Handling Name

In [None]:
def extract_last_name(s):
    """Return the last space-separated token of the name `s`.

    Missing names (NaN/None) are returned as NaN rather than the string
    'nan', so they are not mistaken for one shared family name.
    """
    if pd.isna(s):
        return np.nan
    return str(s).split(' ')[-1]

data['LastName'] = data['Name'].apply(extract_last_name)

# value_counts() skips NaN, so passengers without a name are not counted.
dict_names = data['LastName'].value_counts().to_dict()

def same_name(s):
    """Return how many OTHER rows of `data` share the last name `s`.

    Unknown or missing last names yield 0.
    """
    return dict_names.get(s, 1) - 1

data['SameName'] = data['LastName'].apply(same_name)


In [None]:
# Dtypes and non-null counts of the combined frame, now including LastName and SameName.
data.info()

In [None]:
# Dtypes and non-null counts of the training frame at this point.
train.info()

In [None]:
# `data` was built as train stacked on top of test, so its first len(train)
# rows are the training passengers. Slice by length, not a hard-coded row count.
lnam = data.iloc[:len(train)]
train['SameName'] = lnam['SameName']


In [None]:
# `data` was built as train stacked on top of test, so the rows after the
# first len(train) are the test passengers.
ltest = data.iloc[len(train):]
# Use the TEST slice here. Assigning the training slice would give every test
# passenger the SameName value of an unrelated training row.
test['SameName'] = ltest['SameName']


## Handling Age

In [None]:
def age_group(s):
    """Map an age to an ordinal age-group code.

    Returns -1 for age 0, 1..7 for the buckets (0, 11], (11, 22], (22, 33],
    (33, 45], (45, 56], (56, 67], (67, 79], and 8 for any age above 79.
    Negative or NaN ages have no bucket and return None.
    """
    if s == 0:
        return -1
    if not s > 0:
        # Negative values and NaN (every comparison with NaN is False).
        return None
    upper_bounds = (11, 22, 33, 45, 56, 67, 79)
    for group, upper in enumerate(upper_bounds, start=1):
        if s <= upper:
            return group
    # The last bucket is open-ended, so ages above 80 no longer return None.
    return 8
    
# Bucket Age into Age_Group for train first, then test.
for frame in (train, test):
    frame['Age_Group'] = frame['Age'].apply(age_group)


In [None]:
# Dtypes and non-null counts of the training frame after adding Age_Group.
train.info()

In [None]:
# Dtypes and non-null counts of the test frame after adding Age_Group.
test.info()

In [None]:
# Remove the identifier and the raw columns that have been replaced by derived features.
train.drop(
    columns=['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name', 'Age', 'Deck', 'Side'],
    inplace=True,
)


In [None]:
# Remove the identifier and the raw columns that have been replaced by derived features.
test.drop(
    columns=['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name', 'Age', 'Deck', 'Side'],
    inplace=True,
)


## Convert False to 0 and True to 1

In [None]:
# Cast the boolean flags to 0/1 and the cabin number (a string after the
# Cabin split) to int, for train first and then test.
for frame in (train, test):
    frame['CryoSleep'] = frame['CryoSleep'].astype(int)
    frame['VIP'] = frame['VIP'].astype(int)
    # Convert Num from its own values. Assigning VIP here would silently
    # overwrite the cabin number with a duplicate of the VIP flag.
    frame['Num'] = frame['Num'].astype(int)


In [None]:
# Separate the target column from the predictors; test is used as-is.
y_train = train['Transported']
x_train = train.drop(columns=['Transported'])
x_test = test


In [None]:
# (rows, columns) of the test and train feature matrices.
(x_test.shape, x_train.shape)


## Apply Models

### RandomForest

In [None]:
# Hyper-parameter grid for the random forest.
parameters = {
    "n_estimators": [5, 10, 15, 20, 25],
    "max_depth": [3, 5, 7, 9, 11, 12],
    "min_samples_leaf": [18, 19, 20],
    "min_samples_split": [8, 9, 10],
}

# Fix the forest's seed so the search, and therefore the reported best
# parameters and accuracy, is repeatable from run to run.
RFC_Model = RandomForestClassifier(random_state=42)
RFC_grid = GridSearchCV(
    RFC_Model,
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
)
RFC_grid.fit(x_train, y_train)
print('Best Parameters : ', RFC_grid.best_params_)
print()
print('Best Accuracy : ', RFC_grid.best_score_)


In [None]:
# Fit a random forest with fixed hyper-parameters and predict the test set.
# NOTE(review): these values look like the result of an earlier grid-search
# run - confirm they still match RFC_grid.best_params_.
rf = RandomForestClassifier(
    max_depth=12,
    min_samples_leaf=18,
    min_samples_split=9,
    n_estimators=25,
    random_state=42,  # fixed seed so the predictions are repeatable
)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)


### XGBClassifier

In [None]:
# Grid-search the number of boosting rounds and the learning rate for XGBoost
# with 5-fold cross-validation, scored by accuracy.
param_grid = {
    'n_estimators': [10, 25, 50, 75, 100],
    'learning_rate': [0.2, 0.15, 0.1, 0.05],
    'eval_metric': ['mlogloss'],
}
grid = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(x_train, y_train)
best_params = grid.best_params_
print('Best score of cross validation: {:.2f}'.format(grid.best_score_))
print('Best parameters:', best_params)


In [None]:
# Build an XGBoost classifier with the best grid-search parameters and fit it
# on the full training set.
xgb = XGBClassifier().set_params(**best_params)
xgb.fit(x_train, y_train)


In [None]:
# Predict the target for the test set with the fitted XGBoost model.
y_pred_xgb = xgb.predict(x_test)

In [None]:
#pip install flaml


In [None]:
#from sklearn.metrics import mean_absolute_error
#from flaml.automl import AutoML
#automl = AutoML()


In [None]:
#automl.fit(x_train, y_train, task="classification",metric='ap',time_budget=300)


In [None]:
#print('Best ML leaner:', automl.best_estimator)
#print('Best hyperparmeter config:', automl.best_config)
#print('Best ap on validation data: {0:.4g}'.format(1-automl.best_loss))
#print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))


In [None]:
#y_pred = automl.predict(test)


## Submission

In [None]:

# Load the sample submission file; its Transported column is overwritten with predictions below.
subs = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
subs

In [None]:
# Depending on the installed xgboost version, predict() may return 0/1 class
# indices instead of the original boolean labels. Cast so the column is
# boolean either way (a no-op if it already is).
subs['Transported'] = np.asarray(y_pred_xgb).astype(bool)
# NOTE(review): the output file has no '.csv' extension; Kaggle notebooks
# conventionally write 'submission.csv' - confirm the intended file name.
subs.to_csv('./Transported', index=False)