In [None]:
import numpy as np 
import pandas as pd
import random,os
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler,MaxAbsScaler,RobustScaler
from sklearn import ensemble

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate,GridSearchCV
# import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.calibration import CalibrationDisplay
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Input CSV locations, given relative to the notebook's working directory.
TRAIN_PATH = "../input/spaceship-titanic/train.csv"
TEST_PATH = "../input/spaceship-titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/spaceship-titanic/sample_submission.csv"
# Output file the final predictions are written to.
SUBMISSION_PATH = "submission.csv"

# Column names. TARGET is the label column that is trained on and written to the
# submission; ID is not referenced by the visible cells (presumably the row id — verify).
ID = "PassengerId"
TARGET = "Transported"

SEED = 2022


def seed_everything(seed=SEED):
    """Seed the global Python and NumPy random number generators.

    The seed is also stored, as a string, in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Integer seed applied to every generator (defaults to ``SEED``).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Seed once at import/run time so later cells start from a known RNG state.
seed_everything()

In [None]:
def _fill_missing(df, bool_cols, target):
    """Impute missing values of ``df`` in place, one column at a time.

    * columns listed in ``bool_cols`` -> most frequent value (mode)
    * other numeric columns           -> column mean
    * everything else                 -> the literal string "Missing"

    The ``target`` column is skipped: its NaNs are the unlabeled test rows and
    imputing labels would be meaningless.
    """
    for col in df.columns:
        if col == target:
            continue
        missing = df[col].isna()
        if not missing.any():
            continue
        if col in bool_cols:
            fill_value = df[col].mode()[0]
        elif pd.api.types.is_numeric_dtype(df[col]):
            fill_value = df[col].mean()
        else:
            fill_value = "Missing"
        # .loc assignment keeps the column's dtype (no fillna downcasting).
        df.loc[missing, col] = fill_value


def autoPreProcess(train,test,DROP_COLS,BOOL_COLS,TARGET):
    """Jointly impute, encode and scale the train and test frames.

    The two frames are stacked so that imputation statistics, one-hot
    categories and the scaling range are shared between them (note: this
    means test rows influence the statistics applied to train rows).

    Args:
        train: Training frame; must contain the ``TARGET`` column.
        test: Test frame with the same feature columns and no ``TARGET``.
        DROP_COLS: Column names removed before any processing.
        BOOL_COLS: Boolean-like columns; mode-imputed and cast to ``int``.
        TARGET: Name of the label column. It is never imputed, encoded or
            scaled, and it is removed from the returned test frame.

    Returns:
        ``(train, test)``: independent processed frames. Non-numeric feature
        columns are one-hot encoded; numeric feature columns are min-max
        scaled to the range [-3, 3]. Neither input frame is modified.
    """
    train_len = len(train)

    train_test = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
    train_test = train_test.drop(columns=DROP_COLS)

    _fill_missing(train_test, BOOL_COLS, TARGET)

    for col in BOOL_COLS:
        train_test[col] = train_test[col].astype(int)

    # Classify features by dtype rather than by peeking at a single row's
    # value, so the result does not depend on what happens to sit at index 1.
    feature_cols = [col for col in train_test.columns if col != TARGET]
    num_list = [
        col for col in feature_cols
        if pd.api.types.is_numeric_dtype(train_test[col])
    ]
    str_list = [col for col in feature_cols if col not in num_list]

    if str_list:
        train_test = pd.get_dummies(train_test, columns=str_list)

    if num_list:
        scaler = MinMaxScaler(feature_range=(-3,3))
        train_test[num_list] = scaler.fit_transform(train_test[num_list])

    # Return independent frames instead of mutating a slice of train_test.
    train_out = train_test.iloc[:train_len].copy()
    test_out = train_test.iloc[train_len:].drop(columns=[TARGET])

    return train_out, test_out

# Columns dropped outright, and columns autoPreProcess mode-fills and casts to int.
DROP_COLS = ['PassengerId', 'Name', 'Cabin']
BOOL_COLS = ['CryoSleep','VIP']

# Load the raw CSVs; the training label is cast to int on load.
test = pd.read_csv(TEST_PATH)
train = pd.read_csv(TRAIN_PATH).astype({TARGET: int})

# From here on `train` / `test` refer to the processed frames.
train, test = autoPreProcess(train, test, DROP_COLS, BOOL_COLS, TARGET)
train.head()

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

# --- Hold-out split and randomized-search configuration ---------------------
VAL_SIZE = 0.2        # fraction of the training rows held out for validation

RS_CV = 5             # cross-validation folds inside the search
RS_N_ITER = 10        # sampled candidates (same as the sklearn default)
RS_N_JOBS = -1        # use all available cores
RS_SCORING = 'accuracy'

# Split input data and target data.
X = train.drop([TARGET],axis=1)
y = train[TARGET]

# random_state is passed explicitly so the split does not depend on the global
# NumPy RNG state (and therefore on which cells were run beforehand).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, stratify=y, random_state=SEED
)

# Search for the best hyper-parameters.
model = LGBMClassifier(random_state=SEED)
# NOTE(review): 'num_iterations' and 'min_samples_leaf' are not sklearn-API
# names; they are presumably resolved by LightGBM as parameter aliases
# (n_estimators / min_child_samples) — confirm against the LightGBM docs.
parameters = {'max_depth'         : sp_randInt(1, 20),
              'learning_rate' : sp_randFloat(),          # uniform on [0, 1)
              'num_iterations'    : sp_randInt(1, 5000),
              'min_samples_leaf':sp_randInt(10, 30)
             }

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameters,
    scoring=RS_SCORING,
    cv=RS_CV,
    n_iter=RS_N_ITER,
    n_jobs=RS_N_JOBS,
    random_state=SEED,   # reproducible parameter sampling
    verbose=3
)
rs.fit(X_train, y_train)

# Best model (refit by the search on all of X_train).
print(rs.best_params_)
bestModel = rs.best_estimator_
model = bestModel

# Report performance on the hold-out split, which the search never saw.
val_accuracy = accuracy_score(y_val, model.predict(X_val))
print(f"hold-out accuracy: {val_accuracy:.4f}")

In [None]:
# Predict on the processed test features, then write the predictions (as
# booleans) into the TARGET column of the sample-submission frame and save it.
pred_test = model.predict(test)

submission = pd.read_csv(SAMPLE_SUBMISSION_PATH).assign(
    **{TARGET: pred_test.astype(bool)}
)
submission.to_csv(SUBMISSION_PATH, index=False)
submission.head()