In [None]:
# imports 
import numpy as np
import pandas as pd 
import os,random
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler,MaxAbsScaler,RobustScaler

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from catboost import CatBoostClassifier

# variables
# Column roles: the row identifier and the label to predict.
ID, TARGET = "PassengerId", "Transported"

# Columns that are cast with .astype(int) during preprocessing. The ID column
# is listed next to the two flag columns, so it is converted to an integer too.
BOOL_COL = [ID, "CryoSleep", "VIP"]

# Columns removed from both frames immediately after loading.
DELETE_COL = ["Name", "Cabin"]

# Input files and the output file written by the last cell.
TRAIN_PATH = "../input/spaceship-titanic/train.csv"
TEST_PATH = "../input/spaceship-titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/spaceship-titanic/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"

SEED = 2022


def seed_everything(seed=SEED):
    """Seed the random number sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, default SEED
        Value used for every seed.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so this presumably only influences processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()

In [None]:
# load
# Read both CSV files and discard the DELETE_COL columns from each of them.
train, test = (
    pd.read_csv(csv_path).drop(columns=DELETE_COL)
    for csv_path in (TRAIN_PATH, TEST_PATH)
)

# check null
def checkNull_fillData(df):
    """Fill the missing values of ``df`` in place.

    For every column that contains at least one null:

    * numeric columns (any integer or float width, excluding booleans) are
      filled with the column median;
    * all other columns are filled with the most frequent value (the first
      entry of ``Series.mode()``).

    Columns without nulls are left untouched. A non-numeric column that has
    no non-null value at all has no mode to fill with and is skipped instead
    of raising ``IndexError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    None
    """
    for col in df.columns:
        series = df[col]
        missing = series.isnull()
        if not missing.any():
            continue

        # Decide by dtype *kind* rather than by the exact names
        # "float64"/"int64", so float32/int32 columns also get the median.
        is_number = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_number:
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # Every value is null: nothing sensible to fill with.
                continue
            fill_value = modes[0]

        df.loc[missing, col] = fill_value
                
# Fill the missing values of each frame (the helper works in place and is
# called on the two frames independently).
for frame in (train, test):
    checkNull_fillData(frame)

# object -> int
# The target column exists only in the training frame.
train[TARGET] = train[TARGET].astype(int)
# Every BOOL_COL column is cast to int in both frames.
for flag in BOOL_COL:
    for frame in (train, test):
        frame[flag] = frame[flag].astype(int)

# check duplicated data
# Rows are compared on every column except the ID and the target; the first
# row of each duplicate group is kept and the later ones are dropped.
feature = [col for col in train.columns if col not in (ID, TARGET)]
# .copy() turns the filtered result into an independent frame, so the
# column assignments performed on `train` afterwards write to a real frame
# rather than to a slice of the unfiltered one (SettingWithCopyWarning).
train = train.loc[~train[feature].duplicated()].copy()

#standard scaler
# Every non-object column other than the target and the ID is standardised.
num_col = [
    name
    for name in train.columns
    if name not in (TARGET, ID) and train[name].dtypes != "object"
]

# The scaler is fitted on the training frame only; the fitted statistics are
# then applied to both frames.
scaler = StandardScaler().fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])
test[num_col] = scaler.transform(test[num_col])

# label encoding
# Object-typed columns (target and ID excluded) are mapped to integer codes.
str_col = [
    name
    for name in train.columns
    if name not in (TARGET, ID) and train[name].dtypes == "object"
]

for col in str_col:
    # One encoder per column, learned from the training values.
    encoder = LabelEncoder()
    train[col] = encoder.fit_transform(train[col])

    # Labels that occur only in the test frame are appended to the known
    # classes so that transform() does not reject them.
    for label in np.unique(test[col]):
        if label in encoder.classes_:
            continue
        encoder.classes_ = np.append(encoder.classes_, label)
    test[col] = encoder.transform(test[col])

In [None]:
# Settings of the randomized hyper-parameter search.
RS_CV = 3              # number of cross-validation folds
RS_N_ITER = 10         # number of sampled parameter combinations
RS_N_JOBS = -1         # parallelism passed to RandomizedSearchCV
RS_SCORING = 'accuracy'

# NOTE(review): only TARGET is dropped, so X still contains the ID column and
# the model sees it as a feature (the prediction cell passes the same
# columns) — confirm this is intended.
X = train.drop([TARGET],axis=1)
y = train[TARGET]

model = CatBoostClassifier()
# Distributions the search samples from: two integer ranges and a float
# distribution created with scipy's default arguments.
parameters = {'depth'         : sp_randInt(1, 10),
              'learning_rate' : sp_randFloat(),
              'iterations'    : sp_randInt(10, 1000)
             }

# random_state ties the parameter sampling to SEED explicitly. Without it the
# sampling draws from NumPy's global state, so re-running this cell on its
# own produced different candidates each time.
rs = RandomizedSearchCV(estimator=model,
                        param_distributions=parameters,
                        scoring=RS_SCORING,
                        cv=RS_CV,
                        n_iter=RS_N_ITER,
                        n_jobs=RS_N_JOBS,
                        random_state=SEED)
rs.fit(X, y)

# best model
print(rs.best_params_)
bestModel = rs.best_estimator_

In [None]:
#predict
X_test = test
pred_test = bestModel.predict(X_test)
#submit
# NOTE(review): predictions are attached by position, so this relies on the
# sample submission listing its rows in the same order as the test file —
# confirm.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# The target is written as booleans. The former intermediate
# `.astype(int)` assignment was overwritten immediately and has been removed.
sub[TARGET] = pred_test.astype(bool)
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()