In [None]:
import pandas as pd
import xgboost as xgb
import warnings
import mlflow
import boto3
import datetime
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization


# Silence library warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Read the sample-submission, training and test CSV files.
sample = pd.read_csv('../data/sample_submission.csv')
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

#ProfileReport(train, title="Profiling Report")

# Remove the columns that are not used as features, then discard every
# training row that still contains at least one missing value.
drop_list = ['PassengerId', 'Name', 'Cabin']
train = train.drop(columns=drop_list).dropna()

# Report the sizes: cleaned training frame first, untouched test frame second.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Initialise the S3 client.
# NOTE(review): the bucket name below looks like a placeholder — set the real
# bucket before running.
BUCKET_NAME = 'your-buket-name'
s3_client = boto3.client('s3')

# Download the object into the working directory.
local_csv_path = 'train.csv'
s3_client.download_file(Bucket=BUCKET_NAME, Key='data/train.csv', Filename=local_csv_path)

# Read the downloaded CSV file with pandas.
df = pd.read_csv(local_csv_path)

# Print the first rows of the data frame.
print(df.head())


In [None]:
# Fetch the listing of objects stored in the bucket.
# NOTE(review): this is a single list_objects_v2 call with no continuation
# handling — a large bucket may be listed only partially; confirm.
objects = s3_client.list_objects_v2(Bucket=BUCKET_NAME)

# Print every object key, or a notice when the response carries no 'Contents'.
if 'Contents' not in objects:
    print("バケット内にオブジェクトが存在しません。")
else:
    for entry in objects['Contents']:
        print(entry['Key'])

In [None]:
from sklearn.preprocessing import OneHotEncoder
onehot_list = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP', 'Transported']

warnings.filterwarnings('ignore')

# Build an encoder that returns a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later releases
# removed the old keyword, so try the current spelling first and fall back to
# the legacy one only when the installed version does not know it.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Encode each categorical column on its own so the generated column names can
# be prefixed with the source column ("<column>_<category>").
encoded_frames = []
for column in onehot_list:
    # Apply one-hot encoding to the single column.
    transformed = enc.fit_transform(train[[column]])

    # Wrap the encoded array in a DataFrame; reusing train.index keeps the rows
    # aligned with `train`, whose index is no longer contiguous after dropna.
    transformed_df = pd.DataFrame(
        transformed,
        columns=[f"{column}_{cat}" for cat in enc.categories_[0]],
        index=train.index,
    )
    encoded_frames.append(transformed_df)

# Replace the source columns with their indicator columns in one concat
# (same resulting column order as dropping/concatenating per iteration).
# NOTE: `train` is overwritten here, so re-running this cell without reloading
# the data fails because the source columns no longer exist.
train = pd.concat([train.drop(columns=onehot_list), *encoded_frames], axis=1)

# Drop one reference level per encoded variable to avoid redundant indicators.
train = train.drop(['HomePlanet_Mars', 'Destination_TRAPPIST-1e', 'CryoSleep_False', 'VIP_False', 'Transported_False'], axis=1)
# Only rendered by the notebook when this is the last statement of the cell.
train.head()

# Split into explanatory variables (x) and the target variable (y).
x = train.drop(['Transported_True'], axis=1)
y = train['Transported_True']

# Fix the shuffle seed: without it every kernel restart produces a different
# train/test partition, so cross-validation curves and all later scores are
# not reproducible.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=SPLIT_SEED
)

# Target and features of the training partition side by side.
train_all = pd.concat([y_train, X_train], axis=1)


# Cross-validation on the training partition.
# NOTE(review): `params` sets no objective, so xgboost's default applies; the
# plotting cell reads the `*-rmse-mean` columns of the result — confirm this is
# the metric you want for a binary target.
dtrain = xgb.DMatrix(X_train, label=y_train)
params = {'max_depth':3, 'eta':0.1}
cross_val = xgb.cv(
    params, dtrain, num_boost_round=1000, early_stopping_rounds=50
)
# Number of rows in the CV result table, used as the boosting-round count.
best_n_boost_round = cross_val.shape[0]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Learning curves taken from the cross-validation table: mean RMSE on the
# held-out folds and on the training folds, one row per boosting round.
rmse_columns = ['test-rmse-mean', 'train-rmse-mean']
rmse_curves = cross_val[rmse_columns]

plt.figure(figsize=(8, 6))
plt.plot(rmse_curves)
plt.grid()
plt.xlabel('num_boost_round')
plt.ylabel('RMSE')

In [None]:
# Reference catalogue of hyperparameter names (order is significant and kept
# as-is). The names appear to mirror the constructor arguments of xgboost's
# scikit-learn wrapper — verify against the installed version.
parameters = [
    "n_estimators",  # how many boosted trees to build (= boosting rounds)
    "max_depth",  # depth cap for each base learner
    "max_leaves",  # leaf-count cap; 0 means unlimited
    "max_bin",  # bins per feature when a histogram-based algorithm is used
    "grow_policy",  # strategy used to grow trees
    "learning_rate",  # boosting step size ("eta" in native xgboost)
    "verbosity",  # how chatty the library is
    "objective",  # learning task / loss, or a custom objective function
    "booster",  # gbtree, gblinear or dart
    "tree_method",  # tree construction algorithm
    "n_jobs",  # parallel threads used by xgboost
    "gamma",  # minimum loss reduction needed to split a leaf further
    "min_child_weight",  # minimum summed instance weight (hessian) in a child
    "max_delta_step",  # cap on each tree's weight-estimation step
    "subsample",  # fraction of training rows sampled
    "sampling_method",  # row sampling scheme (GPU "hist" tree method only)
    "colsample_bytree",  # fraction of columns sampled per tree
    "colsample_bylevel",  # fraction of columns sampled per level
    "colsample_bynode",  # fraction of columns sampled per split
    "reg_alpha",  # L1 penalty on weights ("alpha" in native xgboost), LASSO
    "reg_lambda",  # L2 penalty on weights ("lambda" in native xgboost), Ridge
    "scale_pos_weight",  # balance between positive and negative weights
    "base_score",  # initial prediction for all instances (global bias)
    "random_state",  # random seed
    "missing",  # data value to be treated as missing
    "num_parallel_tree",  # used for boosted random forests
    "monotone_constraints",  # monotonicity constraints on variables
    "interaction_constraints",  # which feature interactions are permitted
    "importance_type",  # importance flavour behind feature_importances_
    "device",  # device ordinal: "cpu", "cuda" or "gpu"
    "validate_parameters",  # warn about unknown parameters
    "enable_categorical",  # experimental categorical-data support
    "feature_types",  # feature types given without building a DataFrame
    "max_cat_to_onehot",  # threshold for one-hot based splits on categoricals
    "max_cat_threshold",  # maximum categories considered per split
    "multi_strategy",  # training strategy for multi-target models
    "eval_metric",  # metric for monitoring training and early stopping
    "early_stopping_rounds",  # switches early stopping on
    "callbacks",  # callbacks invoked at the end of each iteration
    "kwargs",  # extra keyword arguments for the XGBoost Booster object
]


### ベイズ最適化による最適ハイパーパラメータ探索

In [None]:
# Public host of the MLflow tracking server running in the cloud.
# NOTE(review): the value below looks like a placeholder — set the real host
# before running.
TRACKING_SERVER_HOST = "your-mlflow-tracking-server-host"

# Point the MLflow client at that server.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
print(f"Tracking Server URI: '{mlflow.get_tracking_uri()}'")

# Name the experiment after today's date.
dt = datetime.datetime.today()
exp_name = f'exp_{dt.date()}'

# Get-or-create the experiment. create_experiment raises when the name is
# already taken, which made a cell re-run (or a second session on the same
# day) fail; reuse the existing experiment's id in that case.
existing_experiment = mlflow.get_experiment_by_name(exp_name)
if existing_experiment is None:
    experiment = mlflow.create_experiment(exp_name)
else:
    experiment = existing_experiment.experiment_id

In [None]:
# Fixed (non-searched) categorical settings shared by every trial.
# The objective/metric pair is set for binary classification because the model
# below is an XGBClassifier; the regression values that used to be here never
# reached the model (they were passed under misspelt keyword names), so the
# classifier was already training with its default objective.
try_grow_policy = 'depthwise'
try_objective = 'binary:logistic'
try_booster = 'gbtree'
try_tree_method = 'auto'
try_sampling_method = 'uniform'
try_importance_type = 'gain'
try_device = 'cpu'
# 'diagonal' is not a value xgboost accepts for multi_strategy; use the
# one-model-per-target strategy instead.
try_multi_strategy = 'one_output_per_tree'
try_eval_metric = 'logloss'

#evaluation function
def xgboost_eval(try_max_depth,
                 try_learning_rate,
                 try_n_estimators,
                 try_gamma,
                 try_min_child_weight,
                 try_subsample,
                 try_colsample_bytree,
                 try_reg_alpha,
                 try_reg_lambda):
    """Train one XGBClassifier trial, log it to MLflow and return its score.

    This is the objective handed to the Bayesian optimiser: it receives one
    candidate value per searched hyperparameter (all as floats), fits a model
    on the module-level ``X_train``/``y_train`` and scores it on
    ``X_test``/``y_test``.

    Parameters
    ----------
    try_max_depth, try_n_estimators : float
        Truncated to ``int`` before use because they are discrete settings.
    try_learning_rate, try_gamma, try_min_child_weight, try_subsample,
    try_colsample_bytree, try_reg_alpha, try_reg_lambda : float
        Passed to the classifier unchanged.

    Returns
    -------
    float
        ``model.score(X_test, y_test)`` for the fitted classifier.

    Notes
    -----
    Each call opens a nested MLflow run under ``experiment`` and records the
    hyperparameters, the score (as a metric) and the fitted model.
    NOTE(review): the score is computed on ``X_test``, so the optimiser tunes
    against the same hold-out set that would be used for final evaluation.
    NOTE(review): ``device`` and ``multi_strategy`` are presumably only
    recognised by recent xgboost releases — confirm the installed version.
    """
    # convert to int since these are not continuous variables
    try_max_depth = int(try_max_depth)
    try_n_estimators = int(try_n_estimators)

    # Parameter settings. Limit-type parameters and those tied to dataset
    # preprocessing are intentionally left out.
    # The fixed categorical settings must be passed under xgboost's real
    # keyword names; with any other name they are ignored by the booster.
    model = xgb.XGBClassifier(
        max_depth=try_max_depth,
        learning_rate=try_learning_rate,
        n_estimators=try_n_estimators,
        gamma=try_gamma,
        min_child_weight=try_min_child_weight,
        subsample=try_subsample,
        colsample_bytree=try_colsample_bytree,
        reg_alpha=try_reg_alpha,
        reg_lambda=try_reg_lambda,
        grow_policy=try_grow_policy,
        objective=try_objective,
        booster=try_booster,
        tree_method=try_tree_method,
        sampling_method=try_sampling_method,
        importance_type=try_importance_type,
        device=try_device,
        multi_strategy=try_multi_strategy,
        eval_metric=try_eval_metric,
    )

    # model training
    model.fit(X_train, y_train)
    # calculate model score
    score = model.score(X_test, y_test)
    #start logging (nested)
    with mlflow.start_run(run_name = 'XGBoost',
                          experiment_id= experiment,
                          nested = True):
        #logging settings
        mlflow.log_params({
            'max_depth': try_max_depth,
            'learning_rate': try_learning_rate,
            'n_estimators': try_n_estimators,
            'gamma': try_gamma,
            'min_child_weight': try_min_child_weight,
            'subsample': try_subsample,
            'colsample_bytree': try_colsample_bytree,
            'reg_alpha': try_reg_alpha,
            'reg_lambda': try_reg_lambda,
            'grow_policy': try_grow_policy,
            'objective': try_objective,
            'booster': try_booster,
            'tree_method': try_tree_method,
            'sampling_method': try_sampling_method,
            'importance_type': try_importance_type,
            'device': try_device,
            'multi_strategy': try_multi_strategy,
            'eval_metric': try_eval_metric,
            })
        # The score is a result, not a setting: log it as a metric so runs can
        # be sorted and compared on it.
        mlflow.log_metric('score', score)
        mlflow.xgboost.log_model(model, 'model')

    return score

# Search bounds of each parameter.
# The optimiser explores a continuous box, so every entry must be a
# (lower, upper) pair. Entries that previously listed several candidate values
# are collapsed to the smallest and largest of those candidates.
pbounds = {
    'try_max_depth': (3, 50),
    'try_learning_rate': (0.01, 0.5),
    'try_n_estimators': (100, 1000),
    'try_gamma': (0, 5),
    'try_min_child_weight': (1, 10),
    'try_subsample': (0.5, 1.0),
    'try_colsample_bytree': (0.5, 1.0),
    'try_reg_alpha': (0, 1),
    'try_reg_lambda': (0, 1)
}

# Open the parent MLflow run; xgboost_eval starts its runs with nested=True,
# so each evaluated trial is recorded underneath this one.
parent_run = mlflow.start_run(run_name='XGboost', experiment_id=experiment)
with parent_run:

    # Instantiate the optimiser over the declared search bounds.
    optimizer = BayesianOptimization(f=xgboost_eval, pbounds=pbounds, random_state=1)

    # Run the search: init_points=5 initial probes, then n_iter=20 iterations.
    search_budget = {'init_points': 5, 'n_iter': 20}
    optimizer.maximize(**search_budget)