# Before

In [None]:
import numpy as np
import pandas as pd
import os,random

import gc

from IPython.display import clear_output
!pip install -q -U keras-tuner
clear_output()

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler,MaxAbsScaler,RobustScaler,LabelEncoder
import kerastuner as kt

from sklearn import ensemble

from sklearn import metrics
from sklearn import model_selection
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

from scipy import stats
from scipy.stats import norm, skew
from subprocess import check_output

# Input CSV locations and the name of the submission file written at the end.
TRAIN_PATH = "../input/spaceship-titanic/train.csv"
TEST_PATH = "../input/spaceship-titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/spaceship-titanic/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"

# Column roles used throughout preprocessing and model building.
ID = "PassengerId"  # row identifier; excluded from the feature matrix
TARGET = "Transported"  # label column; cast to int for training, back to bool on output
DELETE_COL = ["Name","Cabin"]  # dropped from both train and test right after loading
BOOL_COL = ["CryoSleep","VIP"]  # cast to str so they are label-encoded with the other text columns

# Default seed shared by every random number generator seeded below.
SEED = 2022


def seed_everything(seed=SEED):
    """Seed Python's and NumPy's global random number generators.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): that variable is presumably only honoured by interpreters
    started after this call (i.e. child processes) - confirm if hash
    determinism in the current process matters.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()

# Hyperparameter search settings consumed when the keras-tuner oracle is built.
MAX_TRIALS = 20  # passed as max_trials to the Bayesian optimisation oracle
MODEL_METRIC = "score"  # objective name handed to kt.Objective
MODEL_METRIC_DIRECTION = "max"  # objective direction handed to kt.Objective

# Preprocess

In [None]:
# load 
# Read the raw train/test tables and immediately discard the columns
# listed in DELETE_COL, which are not used as features.
train = pd.read_csv(TRAIN_PATH).drop(columns=DELETE_COL)
test = pd.read_csv(TEST_PATH).drop(columns=DELETE_COL)

# check null
def checkNull_fillData(df):
    """Fill missing values in every column of ``df`` in place.

    Columns stored with a NumPy integer or floating-point dtype (any width,
    not only ``int64``/``float64``) are filled with the column median. All
    other columns are filled with the most frequent value. Columns without
    missing values are left untouched, and a non-numeric column with no
    observed value at all is skipped because there is nothing to impute from.

    Args:
        df: pandas DataFrame to impute; modified in place.

    Returns:
        None.
    """
    for col in df.columns:
        # Compute the null mask once and reuse it for the check and the write.
        missing = df[col].isnull()
        if not missing.any():
            continue

        dtype = df[col].dtype
        # Restrict the median branch to plain NumPy numeric dtypes so that
        # extension dtypes keep going through the mode branch as before.
        if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
            fill_value = df[col].median()
        else:
            modes = df[col].mode()
            if modes.empty:
                # Every entry is null: no mode exists, so leave the column as is.
                continue
            fill_value = modes.iloc[0]

        df.loc[missing, col] = fill_value
                
# Impute missing values in both tables; each table is filled from its own
# statistics because the helper works on one frame at a time.
for frame in (train, test):
    checkNull_fillData(frame)

# object -> int
# The target becomes 0/1 integers for the classifier.
train[TARGET] = train[TARGET].astype(int)

# Columns in BOOL_COL are turned into strings in both tables so that the
# label-encoding step further down treats them like any other text column.
for frame in (train, test):
    for col in BOOL_COL:
        frame[col] = frame[col].astype(str)

# check duplicated data
# Rows are compared on feature columns only: the identifier and the target
# are ignored, and the first occurrence of each duplicate group is kept.
feature = [col for col in train.columns if col not in (ID, TARGET)]
# Take an explicit copy of the filtered rows: the frame is assigned into
# later (scaling, label encoding), and writing into a boolean-mask slice
# makes pandas emit SettingWithCopyWarning.
train = train.loc[~train[feature].duplicated()].copy()

# # #standard scaler 
# Every non-object column other than the identifier and the target is
# treated as a numeric feature.
num_col = [
    col
    for col in train.columns
    if col not in (TARGET, ID) and train[col].dtypes != "object"
]

# Standardise using statistics learned from the training table only, then
# apply the same transformation to the test table.
scaler = StandardScaler()
scaler.fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])
test[num_col] = scaler.transform(test[num_col])

# label encoding
# Object-typed feature columns (identifier and target excluded) are mapped
# to integer codes.
str_col = [
    col
    for col in train.columns
    if train[col].dtypes == "object" and col not in (TARGET, ID)
]

for col in str_col:
    # One encoder per column, fitted on the training values.
    encoder = LabelEncoder().fit(train[col])
    train[col] = encoder.transform(train[col])

    # Labels that occur only in the test table are appended to the known
    # classes so they receive a code of their own when test is transformed.
    unseen = [
        label for label in np.unique(test[col])
        if label not in encoder.classes_
    ]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    test[col] = encoder.transform(test[col])

# Build

In [None]:
# Separate the training table into the feature frame and the label vector;
# the test table only needs its identifier column removed.
X = train.drop(columns=[ID, TARGET])
y = train[TARGET].to_numpy()
X_test = test.drop(columns=[ID])

def build_random_forest(hp):
    """Create a RandomForestClassifier configured from tuner hyperparameters.

    Three integer hyperparameters are requested from ``hp`` through
    ``hp.Int(name, low, high)``: ``n_estimators`` (50, 500),
    ``random_state`` (0, 1000) and ``max_depth`` (2, 20).

    NOTE(review): ``random_state`` is part of the search space, so the tuner
    also searches over the forest's seed - confirm this is intended.

    Args:
        hp: Object exposing ``Int(name, min, max)``, as used by the tuner.

    Returns:
        An unfitted ``ensemble.RandomForestClassifier``.
    """
    search_space = {
        'n_estimators': (50, 500),
        'random_state': (0, 1000),
        'max_depth': (2, 20),
    }
    params = {
        name: hp.Int(name, low, high)
        for name, (low, high) in search_space.items()
    }
    return ensemble.RandomForestClassifier(**params)

# Bayesian-optimisation search over the random-forest hyperparameters,
# limited to MAX_TRIALS trials and optimising MODEL_METRIC in the
# MODEL_METRIC_DIRECTION direction.
search_objective = kt.Objective(MODEL_METRIC, MODEL_METRIC_DIRECTION)
search_oracle = kt.oracles.BayesianOptimization(
    objective=search_objective,
    max_trials=MAX_TRIALS,
)
tuner = kt.tuners.Sklearn(
    oracle=search_oracle,
    hypermodel=build_random_forest,
    directory='.',
    project_name='random_forest',
)

# The tuner is given plain arrays rather than the DataFrame.
tuner.search(X.values, y)

# The returned list is indexed at 0 to take the top-ranked configuration.
best_hp = tuner.get_best_hyperparameters(num_trials=MAX_TRIALS)[0]

# Rebuild a fresh model from the chosen hyperparameters and fit it on the
# whole training set.
model = tuner.hypermodel.build(best_hp)
model.fit(X.values, y)

# Eval

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the selected model, scored by accuracy. The
# call is the last expression of the cell so its result is displayed.
cross_validate(estimator=model, X=X, y=y, scoring="accuracy", cv=5)

# After 

In [None]:
# The model was fitted on X.values (a plain array without column names), so
# predict on the same kind of input; passing the DataFrame makes
# scikit-learn warn about a feature-name mismatch.
pred_test = model.predict(X_test.values)

# Fill the sample submission with the predictions, converted from 0/1 back
# to booleans. The predictions are assigned positionally.
# NOTE(review): this assumes the sample submission rows are in the same
# order as the test file - confirm.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sub[TARGET] = pred_test.astype(bool)
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()