In [None]:
import numpy as np
import pandas as pd 
import random,os

from sklearn.preprocessing import StandardScaler, MinMaxScaler,MaxAbsScaler,RobustScaler

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# Notebook-wide configuration.
# ---------------------------------------------------------------------------
# Input CSVs: TRAIN_PATH and TEST_PATH are loaded with pd.read_csv before
# preprocessing; SAMPLE_SUBMISSION_PATH is loaded as the template for the
# final submission frame.
TRAIN_PATH = "../input/spaceship-titanic/train.csv"
TEST_PATH = "../input/spaceship-titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/spaceship-titanic/sample_submission.csv"
# Output file the submission frame is written to (relative to the working dir).
SUBMISSION_PATH = "submission.csv"

# ID is the identifier column: it is listed in DROP_COLS and is also excluded
# by name when choosing columns to standardise.
ID = "PassengerId"
# TARGET is the label column: cast to int after preprocessing, split off as
# `y` for training, and overwritten with predictions in the submission.
TARGET = "Transported"

SEED = 2022


def seed_everything(seed=SEED):
    """Seed every random source this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, optional
        Seed value; defaults to the notebook-level ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the same integer, so apply them in one pass.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Seed once at import/cell-execution time so every later cell is reproducible.
seed_everything()

# NOTE(review): none of the four constants below is referenced in the visible
# cells of this notebook — presumably leftovers from an earlier
# experiment-tracking / cross-validation setup. Confirm they are unused
# elsewhere before removing them.
SESSION_ID = 2021
FEATURE = "feature"
EXPERIMENT_NAME = 'titanic_prediction'
# NOTE(review): the cross-validation call later in the notebook is given
# CROSS_VAL_CV (5), not N_SPLITS — verify which value is intended.
N_SPLITS = 10

In [None]:
def getOutStdColumnList(df,ID,TARGET,threshold=3.3):
    """Return the numeric columns of ``df`` whose standard deviation exceeds ``threshold``.

    The result is used to pick the columns that are worth standardising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Non-numeric columns are ignored.
    ID : str
        Name of the identifier column; never returned, even if numeric.
    TARGET : str
        Name of the label column; never returned, even if numeric.
    threshold : float, optional
        Minimum sample standard deviation (exclusive) a column must have to
        be selected. Defaults to 3.3, the value previously hard-coded here.

    Returns
    -------
    list
        Column names in their original frame order. Empty when ``df`` has no
        numeric column or none exceeds the threshold.
    """
    # Select numeric dtypes explicitly instead of going through describe():
    # on a frame without numeric columns describe() falls back to the object
    # summary, which has no "std" row and made the old lookup raise KeyError.
    # It also avoids computing quantiles that were never used.
    spread = df.select_dtypes(include="number").std()
    skipped = (ID, TARGET)

    # A NaN spread (e.g. a single-row frame) compares False and is left out.
    return [col for col, std in spread.items() if col not in skipped and std > threshold]

def _is_categorical_column(series):
    """Return True when ``series`` has an object, string or categorical dtype."""
    dtype = series.dtype
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def _fill_missing_values(train, test, BOOL_COLS, TARGET):
    """Fill nulls in ``train`` and ``test`` in place using values learned from ``train``.

    Fill value per column: the train mode for columns in ``BOOL_COLS``, the
    train median for other (non-bool) numeric columns, and the literal
    ``"Missing"`` for everything else.
    """
    for col in train.columns:
        # The label (and any other train-only column) has no counterpart in
        # test; indexing test with it would raise KeyError.
        if col == TARGET or col not in test.columns:
            continue

        train_missing = train[col].isnull()
        test_missing = test[col].isnull()
        # Look at both frames: a column that is complete in train can still
        # have gaps in test.
        if not (train_missing.any() or test_missing.any()):
            continue

        dtype = train[col].dtype
        if col in BOOL_COLS:
            fill_value = train[col].mode()[0]
        elif pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            fill_value = train[col].median()
        else:
            fill_value = "Missing"

        # Only assign where something is actually missing so a complete
        # column keeps its dtype.
        if train_missing.any():
            train.loc[train_missing, col] = fill_value
        if test_missing.any():
            test.loc[test_missing, col] = fill_value


def autoPreProcess(train,test,DROP_COLS,BOOL_COLS,TARGET):
    """Impute, scale and one-hot encode the train/test frames together.

    Steps, in order:

    1. Drop ``DROP_COLS`` from both frames.
    2. Fill missing values with statistics computed on ``train`` only.
    3. Cast ``BOOL_COLS`` to ``int``.
    4. Standardise the high-variance numeric columns chosen by
       ``getOutStdColumnList`` (scaler fitted on ``train``, applied to both).
    5. One-hot encode object/string/categorical columns on the concatenated
       frame so train and test end up with the same dummy columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data, including the ``TARGET`` column.
    test : pandas.DataFrame
        Test data, without the ``TARGET`` column.
    DROP_COLS : list
        Columns removed from both frames before any other step.
    BOOL_COLS : list
        Boolean-valued columns to impute with the mode and cast to ``int``.
    TARGET : str
        Name of the label column; it is never imputed, scaled or encoded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` (still containing ``TARGET``) and ``test`` (``TARGET``
        removed). The test frame's index continues after the train frame's,
        i.e. it starts at ``len(train)``.

    Notes
    -----
    Relies on the notebook-level ``ID`` constant, ``getOutStdColumnList`` and
    ``StandardScaler`` being defined/imported before it is called.
    """
    # drop() returns new frames, so the caller's DataFrames are not mutated
    # by the in-place edits below.
    train = train.drop(DROP_COLS, axis = 1)
    test = test.drop(DROP_COLS, axis = 1)

    _fill_missing_values(train, test, BOOL_COLS, TARGET)

    for col in BOOL_COLS:
        train[col] = train[col].astype(int)
        test[col] = test[col].astype(int)

    # Decide what to one-hot encode from the column dtype rather than from the
    # Python type of one sampled row. items() replaces iteritems(), which was
    # removed in pandas 2.0.
    str_list = [
        colname
        for colname, colvalue in train.items()
        if colname != TARGET and _is_categorical_column(colvalue)
    ]

    std_num_col_list = getOutStdColumnList(train,ID,TARGET)
    # StandardScaler cannot be fitted on zero features, so skip scaling when
    # no column qualifies.
    if std_num_col_list:
        scaler = StandardScaler()
        train[std_num_col_list] = scaler.fit_transform(train[std_num_col_list])
        test[std_num_col_list] = scaler.transform(test[std_num_col_list])

    train_len = len(train)

    # Encode on the stacked frame so a category present in only one of the
    # two sets still produces a column in both.
    train_test = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
    train_test = pd.get_dummies(train_test, columns=str_list)

    # Take explicit copies / non-inplace drops so the returned frames are not
    # views of train_test (avoids SettingWithCopyWarning downstream).
    train = train_test.iloc[:train_len].copy()
    test = train_test.iloc[train_len:].drop(labels=[TARGET], axis=1)

    return train,test

# Columns handed to autoPreProcess: DROP_COLS are removed outright, BOOL_COLS
# are the flag columns it casts to int.
DROP_COLS = [ID,"Name","Cabin"]
BOOL_COLS = ["CryoSleep","VIP"]

# Read both raw CSVs and run them through the shared preprocessing pipeline.
train,test = autoPreProcess(
    pd.read_csv(TRAIN_PATH),
    pd.read_csv(TEST_PATH),
    DROP_COLS,
    BOOL_COLS,
    TARGET,
)

# The label comes back from preprocessing un-cast; store it as 0/1 integers.
train[TARGET] = train[TARGET].astype(int)
train.head()

In [None]:
# Hyper-parameters forwarded to the CatBoostClassifier constructor below.
# Passed as `max_depth`.
MODEL_MAX_DEPTH = 15
# Passed as `task_type`. NOTE(review): 'GPU' presumably requires an
# accelerator-enabled kernel — confirm the runtime has one attached.
MODEL_TASK_TYPE = 'GPU'
# Passed as `learning_rate` (the name reads "RL" but it is the learning rate).
MODEL_RL = 0.002
# Passed as `eval_metric`.
MODEL_EVAL_METRIC ='Accuracy'
# Passed as `loss_function`.
MODEL_LOSS_FUNCTION = 'Logloss'
# Passed as `early_stopping_rounds`.
MODEL_ESR = 10
# Passed as `verbose`.
MODEL_VERBOSE = 10
# Passed as `iterations`.
MODEL_ITERATIONS = 200

# First argument of stratifiedShuffleSplitOOF — presumably the number of
# cross-validation splits; verify against that helper's signature.
CROSS_VAL_CV = 5

# The out-of-fold helper ships as a .py file inside an input dataset. Copy it
# into the working directory first so the import below can find it.
from shutil import copyfile

copyfile(
    src="../input/stratified-shuffle-split-oof-py/function_stratifiedshufflesplit_oof.py",
    dst="../working/function_stratifiedshufflesplit_oof.py",
)

# Must run after the copy above; provides stratifiedShuffleSplitOOF.
from function_stratifiedshufflesplit_oof import *

# Split the preprocessed training frame into features and label.
y = train[TARGET]
X = train.drop(columns=[TARGET])
X_test = test

# Collect the constructor arguments in one place, then build the classifier.
catboost_params = dict(
    verbose=MODEL_VERBOSE,
    early_stopping_rounds=MODEL_ESR,
    random_seed=SEED,
    max_depth=MODEL_MAX_DEPTH,
    task_type=MODEL_TASK_TYPE,
    learning_rate=MODEL_RL,
    iterations=MODEL_ITERATIONS,
    loss_function=MODEL_LOSS_FUNCTION,
    eval_metric=MODEL_EVAL_METRIC,
)
model = CatBoostClassifier(**catboost_params)

# Run the helper with accuracy as the scoring function and keep its
# test-set predictions for the submission cell.
preds = stratifiedShuffleSplitOOF(CROSS_VAL_CV,X,y,X_test,model,accuracy_score)

In [None]:
# Use the sample submission as the template and overwrite its label column
# with the boolean-cast predictions.
# NOTE(review): astype(bool) maps every non-zero value to True — confirm that
# `preds` holds hard 0/1 labels rather than averaged probabilities.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH).assign(**{TARGET: preds.astype(bool)})
submission.to_csv(SUBMISSION_PATH, index=False)
submission.head()