In [None]:
import os
import glob
import pandas as pd
import xgboost as xgb
import warnings
import mlflow
import json
from sklearn.metrics import accuracy_score
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Silence every library warning for the rest of the session.
warnings.filterwarnings('ignore')

# Read the three competition CSV files from the working directory.
sample = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#ProfileReport(train, title="Profiling Report")

# Remove the columns named in drop_list, then keep only the rows that have
# no missing value in any of the remaining columns.
drop_list = ['PassengerId', 'Name', 'Cabin']
train = train.drop(columns=drop_list).dropna(how='any')

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Categorical columns that are expanded into 0/1 indicator columns.
onehot_list = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP', 'Transported']

warnings.filterwarnings('ignore')

# Seed for the train/test split so that a fresh "Restart & Run All"
# reproduces the same partition.
SEED = 42

# Use the encoder's default (sparse) output and densify it explicitly.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2
# and removed in 1.4, so either spelling fails on some installed version;
# the default + `.toarray()` works on all of them.
enc = OneHotEncoder()

# Encode each column separately and collect the resulting frames, so the
# table is concatenated once instead of being regrown on every iteration.
encoded_frames = []
for column in onehot_list:
    transformed = enc.fit_transform(train[[column]]).toarray()
    encoded_frames.append(
        pd.DataFrame(
            transformed,
            columns=[f"{column}_{cat}" for cat in enc.categories_[0]],
            index=train.index,  # keep row alignment with `train`
        )
    )

# Replace the raw categorical columns with their indicator columns
# (same column order as before: untouched columns first, then the
# indicators in onehot_list order).
train = pd.concat([train.drop(columns=onehot_list)] + encoded_frames, axis=1)

# Drop one indicator per encoded column so the remaining ones are not
# perfectly collinear.
train = train.drop(
    columns=[
        'HomePlanet_Mars',
        'Destination_TRAPPIST-1e',
        'CryoSleep_False',
        'VIP_False',
        'Transported_False',
    ]
)

# Split into explanatory variables (x) and the target (y).
x = train.drop(columns=['Transported_True'])
y = train['Transported_True']

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=SEED
)

# Training table with the target as the first column.
train_all = pd.concat([y_train, X_train], axis=1)


# Cross-validation to find the number of boosting rounds.
dtrain = xgb.DMatrix(X_train, label=y_train)
# The target is a 0/1 label, so use the binary classification objective
# rather than XGBoost's default squared-error regression objective.
params = {'max_depth': 3, 'eta': 0.1, 'objective': 'binary:logistic'}
cross_val = xgb.cv(
    params, dtrain, num_boost_round=1000, early_stopping_rounds=50
)
# With early stopping the returned history has one row per retained round.
best_n_boost_round = cross_val.shape[0]

### SDVによるCTGAN実装

In [None]:
# CT-GAN data augmentation with SDV.

# Number of synthetic rows appended to the real training data.
N_SYNTHETIC_ROWS = 10000

# Rebuild the real training table from the split instead of reading
# `train_all`: this cell overwrites `train_all` with the augmented table
# below, so reading it here would make a re-run fit CTGAN on data that
# already contains synthetic rows.
train_real = pd.concat([y_train, X_train], axis=1)

# Build the table metadata by auto-detecting it from the real data.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=train_real)

# Create the CTGAN synthesizer for that metadata.
synthesizer = CTGANSynthesizer(metadata)

# Train the synthesizer (the original author notes this takes about 10 minutes).
synthesizer.fit(train_real)

# Generate the synthetic rows.
synthetic_data = synthesizer.sample(num_rows=N_SYNTHETIC_ROWS)

# Append the synthetic rows to the real ones and split the result back into
# target / features, selecting the target by name rather than by position.
train_all = pd.concat([train_real, synthetic_data], axis=0)
label_column = y_train.name
y_train_extended = train_all[label_column]
X_train_extended = train_all.drop(columns=[label_column])

### ベイズ最適化による最適パラメータ選択

In [None]:
from bayes_opt import BayesianOptimization
import xgboost as xgb
import mlflow

# Fixed (non-searched) XGBoost settings.  They are forwarded to the model
# and logged to MLflow by xgboost_eval.
try_grow_policy = 'depthwise'
# The target is a 0/1 label, so the classifier needs a binary objective and
# a classification metric (a regression objective/metric does not fit here).
try_objective = 'binary:logistic'
try_booster = 'gbtree'
try_tree_method = 'auto'
try_sampling_method = 'uniform'
try_importance_type = 'gain'
# NOTE(review): `device` and `multi_strategy` appear in the XGBoost 2.x
# parameter list; confirm the installed version supports them.
try_device = 'cpu'
# 'one_output_per_tree' is the documented default; 'diagonal' is not a
# valid value for this parameter.
try_multi_strategy = 'one_output_per_tree'
try_eval_metric = 'logloss'


def xgboost_eval(try_max_depth, try_learning_rate, try_n_estimators, try_gamma, try_min_child_weight, try_subsample, try_colsample_bytree, try_reg_alpha, try_reg_lambda):
    """Train one XGBoost classifier and return its accuracy on the test split.

    Objective function for the Bayesian optimiser: each call fits a model on
    ``X_train_extended`` / ``y_train_extended`` with the proposed
    hyperparameters, scores it on ``X_test`` / ``y_test`` and records the
    parameters, the score and the fitted model in a nested MLflow run.

    Parameters
    ----------
    try_max_depth, try_n_estimators : float
        Integer-valued hyperparameters; truncated with ``int()`` before use.
    try_learning_rate, try_gamma, try_min_child_weight, try_subsample,
    try_colsample_bytree, try_reg_alpha, try_reg_lambda : float
        Forwarded unchanged to ``xgb.XGBClassifier``.

    Returns
    -------
    float
        ``model.score(X_test, y_test)`` for the fitted model.

    Notes
    -----
    Reads the module-level names ``X_train_extended``, ``y_train_extended``,
    ``X_test``, ``y_test``, ``experiment`` and the ``try_*`` settings at call
    time, so those must be defined before the optimiser runs.
    """
    # These two are integer-valued but searched over a continuous range.
    try_max_depth = int(try_max_depth)
    try_n_estimators = int(try_n_estimators)

    # Hyperparameters proposed by the optimiser for this trial.
    searched_params = {
        'max_depth': try_max_depth,
        'learning_rate': try_learning_rate,
        'n_estimators': try_n_estimators,
        'gamma': try_gamma,
        'min_child_weight': try_min_child_weight,
        'subsample': try_subsample,
        'colsample_bytree': try_colsample_bytree,
        'reg_alpha': try_reg_alpha,
        'reg_lambda': try_reg_lambda,
    }
    # Fixed settings, keyed by the real XGBoost parameter names.  Passing
    # them under `try_*` names (as was done before) makes XGBoost ignore
    # them while MLflow still logs them as if they had been applied.
    fixed_params = {
        'grow_policy': try_grow_policy,
        'objective': try_objective,
        'booster': try_booster,
        'tree_method': try_tree_method,
        'sampling_method': try_sampling_method,
        'importance_type': try_importance_type,
        'device': try_device,
        'multi_strategy': try_multi_strategy,
        'eval_metric': try_eval_metric,
    }

    model = xgb.XGBClassifier(**searched_params, **fixed_params)

    # Fit on the augmented training data, score on the held-out split.
    model.fit(X_train_extended, y_train_extended)
    score = model.score(X_test, y_test)

    # Record this trial as a child of the enclosing optimisation run.
    with mlflow.start_run(run_name='XGBoost',
                          experiment_id=experiment,
                          nested=True):
        mlflow.log_params({**searched_params, **fixed_params})
        mlflow.log_metric('score', score)
        mlflow.xgboost.log_model(model, 'model')

    return score


# Search space handed to the optimiser: every key is the name of an
# xgboost_eval argument and every value is that argument's pair of bounds.
pbounds = dict(
    try_max_depth=(3, 50),
    try_learning_rate=(0.01, 0.5),
    try_n_estimators=(100, 1000),
    try_gamma=(0, 5),
    try_min_child_weight=(1, 10),
    try_subsample=(0.5, 1.0),
    try_colsample_bytree=(0.5, 1.0),
    try_reg_alpha=(0, 1),
    try_reg_lambda=(0, 1),
)


# Look the experiment up by name and create it only when it does not exist
# yet: mlflow.create_experiment raises if the name is already taken, which
# made this cell fail on every run after the first.
EXPERIMENT_NAME = 'spaceship_titanic_bayes_opt_extended'
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    experiment = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment = existing_experiment.experiment_id

# Parent run; xgboost_eval opens one nested run under it per trial.
with mlflow.start_run(run_name='XGboost',
                      experiment_id=experiment):

    # Optimiser that maximises the score returned by xgboost_eval over the
    # search space in pbounds (fixed random_state for repeatable proposals).
    optimizer = BayesianOptimization(
        f=xgboost_eval,
        pbounds=pbounds,
        random_state=1
    )

    # 5 initial points followed by 95 optimisation iterations.
    optimizer.maximize(init_points=5, n_iter=95)
