In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime
import jpholiday
import statsmodels.api as sm
from statsmodels.graphics import tsaplots
# from optuna.integration import lightgbm as lgb
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import  log_loss
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from datetime import date, timedelta
import sklearn.metrics
import optuna 
from dateutil.relativedelta import relativedelta
import pandas_profiling as pdp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


%matplotlib inline

# 基準モデルの作成

In [2]:
FILE_PATH = '/Users/atsushisato/git_clone/ai_quest_season2/data/'

train_data = pd.read_csv(FILE_PATH + 'external/train.csv').rename(columns={'顧客ID':'uuid', 'クーポンID':'coupon_id', 'クーポン利用':'coupon_use_result'})
test_data = pd.read_csv(FILE_PATH + 'external/test.csv').rename(columns={'顧客ID':'uuid', 'クーポンID':'coupon_id'})
customer_df = pd.read_csv(FILE_PATH + 'interim/customer_df.csv')

# 特徴量作成

In [3]:
#特徴量作成関数（trainとtestで異なる処理が可能）
def create_feature_separate(dataframe):
    # issue 9/10 カテゴリ変数の数値変換 
    dataframe = category_to_int_conversion(dataframe)
    
    # issue 11 カテゴリ別商品単価
    dataframe['unit_price_category'] = dataframe.apply(lambda x: 
                                unit_price_category(x['purchase_product_num'], x['coupon_use'], x['purchase_price']),axis=1)
    # issue 16 総購入商品の平均
    dataframe['purchase_price_average'] = dataframe.apply(lambda x: 
                                purchase_price_average(x['all_purchase_price'], x['visits_frequency']),axis=1)   
    # issue 20 平均来店周期
    dataframe['visit_cycle'] = dataframe.apply(lambda x:
                                visit_cycle(x['visits_frequency'], x['pass_days']),axis=1)
    # issue 21 経過日数と来店頻度の比較
    dataframe['visit_cycle_flg'] = dataframe.apply(lambda x: 
                                visit_cycle_flg(x['visit_cycle'], x['pass_days']),axis=1) 
    # issue 22 総購入金額に占めるカテゴリ別の購入金額比率
    dataframe['ratio_category'] = dataframe.apply(lambda x:
                                ratio_category(x['all_purchase_price'], x['purchase_price']),axis=1)
    # issue 23 大人の数
    dataframe['adult_num'] = dataframe['fammily_num'] - dataframe['child_num']
    
    # issue 24 カテゴリ別平均購入金額
    dataframe['purchase_price_category_average'] = dataframe.apply(lambda x:
                                purchase_price_category_average(x['purchase_price'], x['purchase_product_num'],x['coupon_use']),axis=1)
    # issue 28 家族1人あたりの総購入金額
    dataframe['purchase_price_1person'] = dataframe.apply(lambda x:
                                purchase_price_1person(x['all_purchase_price'], x['fammily_num']),axis=1)
    # issue 29 1日あたりの購入金額
    dataframe['purchase_price_1day'] = dataframe.apply(lambda x :
                                purchase_price_1day(x['all_purchase_price'], x['pass_days']),axis=1)
    
    
    dataframe = dataframe.drop(['category_id', 'house_flg', 'marry_flg', 'age', 'visit_cycle_flg', 'category_id'],axis=1)
    return dataframe

In [4]:
#===================================================================================
#特徴量作成関数
# issue 9/10 カテゴリ変数の数値変換    
def category_to_int_conversion(dataframe):
    marry_dict = {'無回答':None, '独身':0, '既婚':1}
    dataframe['marry_flg'] = dataframe['marry'].apply(lambda x: marry_dict.get(x))
    
    dataframe['age'] = dataframe['age_range'].apply(lambda x: int(x[:2]))
    dataframe['category_id'] = dataframe['category_id'].apply(lambda x: int(x[1:]))
    
    dataframe = dataframe.drop(['age_range','marry','category'],axis=1)
    return dataframe

# issue 11 カテゴリ別商品単価
def unit_price_category(product_num, coupon_use, price):
    if coupon_use != 0:
        price = price + abs(coupon_use)
    
    if price != 0:
        return int(price / product_num)
    else:
        return 0
    
# issue 16 総購入商品の平均
def purchase_price_average(all_purchase_price, visits_frequency):
    if (all_purchase_price != 0) & (visits_frequency != 0):
        one_purchase_price =  int(all_purchase_price / visits_frequency)            
    else:
        one_purchase_price =  0
    return one_purchase_price

# issue 20 平均来店周期
def visit_cycle(visits_frequency, pass_days):
    all_days = 60
    if visits_frequency != 0:
        cycle = int((all_days - pass_days) / visits_frequency)
    else:
        cycle = 0
    return cycle

# issue 21 経過日数と来店頻度の比較
def visit_cycle_flg(visit_cycle, pass_days):
    if visit_cycle <= pass_days:
        return 1
    else:
        return 0
    
# issue 22 総購入金額に占めるカテゴリ別の購入金額比率
def ratio_category(all_purchase_price, purchase_price):
    if purchase_price != 0:
        ratio = purchase_price / all_purchase_price
    else:
        ratio = 0
    return ratio

# issue 24 カテゴリ別平均購入金額
def purchase_price_category_average(purchase_price, purchase_product_num, coupon_use):
    if purchase_product_num != 0:
        purchase_price = purchase_price + abs(coupon_use)
        average_purchase = int(purchase_price / purchase_product_num)
    else:
        average_purchase = 0
    return average_purchase
    
# issue 28 家族1人あたりの総購入金額
def purchase_price_1person(all_purchase_price, fammily_num):
    if all_purchase_price != 0:
        return int(all_purchase_price / fammily_num)
    else:
        return 0

# issue 29 1日あたりの購入金額
def purchase_price_1day(all_purchase_price, pass_days):
    if all_purchase_price != 0:
        all_days = 60
        use_days = all_days - pass_days + 1
        return int(all_purchase_price / use_days)
    else:
        return 0
    
# issue 19 欠損値削除（購入履歴はあるが来店履歴がないuuid）
def drop_missing_value(dataframe):
    drop_uuid = dataframe.query("(visits_frequency ==0 )& (all_purchase_price != 0)")['uuid'].unique()
    dataframe = dataframe.query("uuid not in @drop_uuid") 
    return dataframe

In [5]:
#train_dataとtest_dataの特徴量作成
train_data = pd.merge(train_data, customer_df, on=['uuid', 'coupon_id'],how='inner')
test_data = pd.merge(test_data, customer_df, on=['uuid', 'coupon_id'],how='inner')

#特徴量作成（train_dataとtest_dataで異なる処理が可能）
train_data = create_feature_separate(train_data)
test_data = create_feature_separate(test_data)
train_data = drop_missing_value(train_data)

In [6]:
# pdp.ProfileReport(train_data.query("coupon_use_result == 1"))

In [9]:
sorted(train_data.columns)

['adult_num',
 'all_purchase_price',
 'child_num',
 'coupon_id',
 'coupon_use',
 'coupon_use_result',
 'fammily_num',
 'income',
 'pass_days',
 'purchase_num',
 'purchase_price',
 'purchase_price_1day',
 'purchase_price_1person',
 'purchase_price_average',
 'purchase_price_category_average',
 'purchase_product_num',
 'ratio_category',
 'unit_price_category',
 'uuid',
 'visit_cycle',
 'visits_frequency']

## XGBoost

In [10]:
train_x = train_data.drop(['coupon_use_result','uuid'], axis=1)
train_y = train_data[['coupon_use_result']]

val_size = 4
uuid_num = train_data['uuid'].nunique()
train_uuid = int((uuid_num/val_size)*(val_size-1))
filter_idx = (train_uuid*11)-1
tr_x, va_x = train_x.iloc[:filter_idx], train_x.iloc[filter_idx:]
tr_y, va_y = train_y.iloc[:filter_idx], train_y.iloc[filter_idx:]


In [83]:
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_data.drop('uuid',axis=1))

num_round = 1000
# ハイパーパラメータの設定
score_dict = {}
# for i in range(100):
params = {'objective': 'binary:logistic', 
          'eval_metric':'logloss',
          'eta': 0.01,
          'gamma': 0.0,
          'alpha': 0.0,
          'lambda': 1.0,
          'min_child_weight': 8,
          'max_depth': 6,
          'subsample': 1.0,
          'colsample_bytree': 0.8,
          'random_state': 27,
        }


watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, 
                  dtrain, 
                  num_round, 
                  evals=watchlist,
                  early_stopping_rounds=100,
                  verbose_eval=False
                 )

va_pred = model.predict(dvalid)
score = log_loss(va_y, va_pred)
#     score_dict[i] = score
    
pred = model.predict(dtest)

In [84]:
score_dict

{}

In [85]:
score

0.04931323879549301

In [86]:
def round_score(pred):
    if pred <= 0.0056:
        pred = 0
    elif pred >= 0.9944:
        pred = 1
    return pred

In [87]:
pred = model.predict(dtest)
pred = pd.Series(pred, name='pred')
pred = pred.apply(lambda x: round_score(x))
print(pred.min(), pred.max())

result = test_data.join(pred)[['uuid', 'coupon_id', 'pred']]
result = result.pivot(index='uuid', columns='coupon_id', values='pred')
result

0.0 0.3816811740398407


coupon_id,1,2,3,4,5,6,7,8,9,10,11
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
357,0.000000,0.000000,0.000000,0.000000,0.013146,0.000000,0.000000,0.064750,0.006409,0.000000,0.000000
358,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018686,0.012669,0.000000,0.000000
359,0.026600,0.026600,0.026029,0.027812,0.043240,0.030989,0.030804,0.381681,0.259802,0.027671,0.029766
360,0.000000,0.013437,0.000000,0.000000,0.000000,0.000000,0.000000,0.014264,0.007093,0.000000,0.000000
361,0.022680,0.022931,0.018737,0.023165,0.025431,0.017372,0.023424,0.138719,0.020328,0.024063,0.024063
...,...,...,...,...,...,...,...,...,...,...,...
708,0.000000,0.000000,0.000000,0.005995,0.000000,0.000000,0.000000,0.026817,0.005874,0.000000,0.008146
709,0.010199,0.006210,0.006055,0.000000,0.000000,0.000000,0.019989,0.030289,0.008368,0.006287,0.005846
710,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
711,0.013801,0.016852,0.014602,0.015778,0.038051,0.014353,0.013637,0.290274,0.082430,0.016855,0.015604


# LightGBM

In [91]:
#===============================================================
#LightGBMによるモデル作成

# score_dict = dict()
# for i in range(100):
params = {'objective': 'binary',
         'metric': 'binary_logloss',
         'seed': 54,
         'lambda_l1': 0.059894240326589916,
         'lambda_l2': 0.000010302168677111394,
         'num_leaves': 382,
         'feature_fraction': 0.9760627682340816,
         'bagging_fraction': 0.957495412018543,
         'bagging_freq': 7,
         'min_child_samples': 14
        }

num_round =1000
lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

model = lgb.train(params, 
              lgb_train,
              verbose_eval=False,  # 50イテレーション毎に学習結果出力
              num_boost_round=num_round, 
              early_stopping_rounds=100,
              valid_names=['train', 'valid'], 
              valid_sets=[lgb_train, lgb_eval]
             )

va_pred = model.predict(va_x)
score = log_loss(va_y, va_pred)
#     score_dict[i] = score

In [90]:
score_dict

{0: 0.05011535679327886,
 1: 0.05162192857016291,
 2: 0.05083632669257753,
 3: 0.05242817167773983,
 4: 0.05135279743825156,
 5: 0.05277221671814219,
 6: 0.05378655496831985,
 7: 0.05387897511294902,
 8: 0.053047650936858204,
 9: 0.050457607604724314,
 10: 0.04995294142955738,
 11: 0.053573450378349305,
 12: 0.05361779806410574,
 13: 0.05217438555128535,
 14: 0.052495208752044914,
 15: 0.0523280409395143,
 16: 0.05243528317854127,
 17: 0.051263542649004054,
 18: 0.052049282520127645,
 19: 0.05236701829825393,
 20: 0.05218791965766804,
 21: 0.05382136721234172,
 22: 0.05467355395026898,
 23: 0.05270543342676882,
 24: 0.05233456246716357,
 25: 0.05535735358949069,
 26: 0.050166247461798145,
 27: 0.05371339751552648,
 28: 0.050101566450748314,
 29: 0.05173472661414455,
 30: 0.05128763280115434,
 31: 0.052207745615865,
 32: 0.05128713632334726,
 33: 0.0493571882989601,
 34: 0.051992100678358404,
 35: 0.04950157072566798,
 36: 0.05233519688890929,
 37: 0.05267518596972037,
 38: 0.0519919432

In [92]:
score

0.049856944094398795

In [93]:
def round_score(pred):
    if pred <= 0.0056:
        pred = 0
    elif pred >= 0.9944:
        pred = 1
    return pred

In [94]:
#テストデータの予測
pred = model.predict(test_data.drop('uuid',axis=1))
pred = pd.Series(pred, name='pred')
pred = pred.apply(lambda x: round_score(x))
print(pred.min(), pred.max())

result = test_data.join(pred)[['uuid', 'coupon_id', 'pred']]
result = result.pivot(index='uuid', columns='coupon_id', values='pred')
result

0.008839051063222482 0.36092409673862275


coupon_id,1,2,3,4,5,6,7,8,9,10,11
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
357,0.009040,0.009013,0.009013,0.009013,0.037037,0.009013,0.009013,0.036575,0.009043,0.009013,0.009101
358,0.009040,0.009013,0.009013,0.009013,0.009013,0.009013,0.009013,0.008977,0.008977,0.009013,0.009101
359,0.017314,0.017262,0.074247,0.040174,0.012876,0.009044,0.009044,0.353265,0.360924,0.009044,0.040174
360,0.016511,0.009175,0.009013,0.009013,0.009013,0.009013,0.009013,0.008889,0.009013,0.009013,0.009101
361,0.009131,0.009104,0.025483,0.009102,0.009044,0.009102,0.009044,0.179353,0.012870,0.009044,0.009076
...,...,...,...,...,...,...,...,...,...,...,...
708,0.009040,0.009013,0.009013,0.009013,0.009013,0.009013,0.009013,0.055373,0.009013,0.009013,0.009108
709,0.009153,0.009024,0.009024,0.009183,0.009024,0.012972,0.027857,0.008846,0.009153,0.009078,0.009108
710,0.009040,0.009013,0.009013,0.009038,0.009013,0.009013,0.009013,0.012692,0.009038,0.009013,0.009101
711,0.020340,0.009133,0.023598,0.009133,0.019887,0.051629,0.020340,0.340421,0.032950,0.009446,0.009336


# アンサンブル学習（XGBoost/LightGBM）

In [63]:
#===============================================================
#XGBoostによるモデル作成
def xgb_train(tr_x, tr_y, va_x, va_y):
    dtrain = xgb.DMatrix(tr_x, label=tr_y)
    dvalid = xgb.DMatrix(va_x, label=va_y)
    dtest = xgb.DMatrix(test_data.drop('uuid',axis=1))

    # ハイパーパラメータの設定
    params = {'objective': 'binary:logistic', 
              'eval_metric':'logloss',
              'eta': 0.01,
              'gamma': 0.0,
              'alpha': 0.0,
              'lambda': 1.0,
              'min_child_weight': 8,
              'max_depth': 6,
              'subsample': 1.0,
              'colsample_bytree': 0.8,
              'random_state': 27,
            }
    num_round = 1000

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    model = xgb.train(params, 
                      dtrain, 
                      num_round, 
                      evals=watchlist,
                      early_stopping_rounds=100,
                      verbose_eval=50)

    va_pred = model.predict(dvalid)
    score = log_loss(va_y, va_pred)
    
    pred = model.predict(dtest)
    pred = pd.Series(pred, name='pred')

    return model, score, pred

#===============================================================
#LightGBMによるモデル作成
def lgb_train(tr_x, tr_y, va_x, va_y):
    params = {'objective': 'binary',
             'metric': 'binary_logloss',
             'seed': 118,
             'lambda_l1': 0.00416566836241902,
             'lambda_l2': 0.10371698288325823,
             'num_leaves': 333,
             'feature_fraction': 0.6873251818776734,
             'bagging_fraction': 0.6363448284692337,
             'bagging_freq': 10,
             'min_child_samples': 17
            }

    num_round =1000
    lgb_train = lgb.Dataset(tr_x, tr_y)
    lgb_eval = lgb.Dataset(va_x, va_y)

    model = lgb.train(params, 
                  lgb_train, 
                  verbose_eval=50,  # 50イテレーション毎に学習結果出力
                  num_boost_round=num_round, 
                  early_stopping_rounds=100,
                  valid_names=['train', 'valid'], 
                  valid_sets=[lgb_train, lgb_eval]
                 )

    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    
    #テストデータの予測
    pred = model.predict(test_data.drop('uuid',axis=1))
    pred = pd.Series(pred, name='pred')
    
    return model, score, pred

In [64]:
def round_score(pred):
    if pred <= 0.0056:
        pred = 0
    elif pred >= 0.9944:
        pred = 1
    return pred

def pred_to_result(pred):
    result = test_data.join(pred)[['uuid', 'coupon_id', 'pred']]
    result = result.pivot(index='uuid', columns='coupon_id', values='pred')
    return result

In [95]:
train_x = train_data.drop(['coupon_use_result','uuid'], axis=1)
train_y = train_data[['coupon_use_result']]

val_size = 4
uuid_num = train_data['uuid'].nunique()
train_uuid = int((uuid_num/val_size)*(val_size-1))
filter_idx = (train_uuid*11)-1
tr_x, va_x = train_x.iloc[:filter_idx], train_x.iloc[filter_idx:]
tr_y, va_y = train_y.iloc[:filter_idx], train_y.iloc[filter_idx:]

xgb_models = []
lgb_models = []
xgb_scores = []
lgb_scores = []
xgb_pred = []
lgb_pred = []

#アンサンブル学習
xgb_models, xgb_scores, xgb_pred = xgb_train(tr_x, tr_y ,va_x, va_y)
lgb_models, lgb_scores, lgb_pred = lgb_train(tr_x, tr_y ,va_x, va_y)
weight = 0.5
pred = (xgb_pred*weight + lgb_pred*(1-weight))
pred = pred = pred.apply(lambda x: round_score(x))
result = pred_to_result(pred)

[0]	train-logloss:0.68372	eval-logloss:0.68362
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 100 rounds.
[50]	train-logloss:0.37434	eval-logloss:0.37121
[100]	train-logloss:0.22835	eval-logloss:0.22349
[150]	train-logloss:0.15124	eval-logloss:0.14551
[200]	train-logloss:0.10852	eval-logloss:0.10247
[250]	train-logloss:0.08456	eval-logloss:0.07830
[300]	train-logloss:0.07062	eval-logloss:0.06482
[350]	train-logloss:0.06238	eval-logloss:0.05736
[400]	train-logloss:0.05763	eval-logloss:0.05336
[450]	train-logloss:0.05460	eval-logloss:0.05131
[500]	train-logloss:0.05262	eval-logloss:0.04980
[550]	train-logloss:0.05130	eval-logloss:0.04929
[600]	train-logloss:0.05039	eval-logloss:0.04915
[650]	train-logloss:0.04968	eval-logloss:0.04917
[700]	train-logloss:0.04913	eval-logloss:0.04930
Stopping. Best iteration:
[624]	train-logloss:0.05004	eval-logloss:0.04907

Training until validation scores don't imp

In [96]:
pred

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.011583
          ...   
3911    0.006289
3912    0.016488
3913    0.007353
3914    0.007579
3915    0.011415
Name: pred, Length: 3916, dtype: float64

In [97]:
result

coupon_id,1,2,3,4,5,6,7,8,9,10,11
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
357,0.000000,0.000000,0.000000,0.000000,0.011583,0.000000,0.000000,0.041989,0.000000,0.000000,0.000000
358,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.035996,0.025341,0.011235,0.011550
359,0.024849,0.018850,0.014964,0.017866,0.025920,0.018280,0.018263,0.392844,0.137140,0.015821,0.018025
360,0.000000,0.010092,0.000000,0.000000,0.000000,0.000000,0.000000,0.010517,0.000000,0.000000,0.000000
361,0.013202,0.013327,0.011232,0.013517,0.014577,0.010570,0.013574,0.075725,0.012556,0.013928,0.013928
...,...,...,...,...,...,...,...,...,...,...,...
708,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027986,0.000000,0.000000,0.007908
709,0.007926,0.000000,0.000000,0.006235,0.000000,0.000000,0.013888,0.029290,0.012569,0.008953,0.009179
710,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
711,0.048702,0.012643,0.015624,0.023723,0.029219,0.012699,0.024976,0.287124,0.052808,0.012818,0.014882


In [98]:
print(xgb_scores, lgb_scores)
print(pred.min(), pred.max())

0.04931323879549301 0.04868782829132765
0.0 0.3928439470306875


In [99]:
xgb_pred.max()

0.38168117

In [100]:
lgb_pred.max()

0.4040067200215342

In [95]:
result.to_csv('/Users/atsushisato/git_clone/ai_quest_season2/models/model.csv',header=False)

In [15]:
pred.nunique()

1619

# optunaによるパラメータ調整

In [16]:
model.params

{'objective': 'binary',
 'metric': 'binary_logloss',
 'seed': 71,
 'feature_pre_filter': False,
 'lambda_l1': 1.0075794784727784e-08,
 'lambda_l2': 0.023033986719849988,
 'num_leaves': 317,
 'feature_fraction': 0.7,
 'bagging_fraction': 0.4983467582046316,
 'bagging_freq': 4,
 'min_data_in_leaf': 22,
 'min_child_samples': 20}

In [209]:
# パラメータ調整用

# prediction = np.rint(model.predict(va_x, num_iteration=model.best_iteration))
# accuracy = accuracy_score(va_y, prediction)
# best_params = model.params
# print("Best params:", display(best_params)
# print("  Accuracy = {}".format(accuracy))
# print("  Params: ")
# for key, value in best_params.items():
#     print("    {}: {}".format(key, value))

In [35]:
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.05,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'seed': 0,
        'verbosity': -1,
    }
    gbm = lgb.train(param, 
                    lgb_train, 
#                     valid_sets=lgb_eval,
                    verbose_eval=False, 
                    num_boost_round=num_round, 
#                     early_stopping_rounds=50
                   )
    y_prob = gbm.predict(va_x)
    y_pred = np.round(y_prob)
    return roc_auc_score(
        np.round(va_y.values),
        np.round(y_pred)
    )

In [36]:
train_x = train_data.drop(['coupon_use_result','uuid'], axis=1)
train_y = train_data[['coupon_use_result']]

val_size = 4
uuid_num = train_data['uuid'].nunique()
train_uuid = int((uuid_num/val_size)*(val_size-1))
filter_idx = (train_uuid*11)-1
tr_x, va_x = train_x.iloc[:filter_idx], train_x.iloc[filter_idx:]
tr_y, va_y = train_y.iloc[:filter_idx], train_y.iloc[filter_idx:]

In [37]:
# study = optuna.create_study()
# study.optimize(xgb_objective, n_trials=100)

In [38]:
# print('Best trial:', display(study.best_trial.params))

In [39]:
lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2021-01-10 17:16:49,336][0m A new study created in memory with name: no-name-4428f7ca-bb90-49ec-8906-3167c7ec4418[0m
[32m[I 2021-01-10 17:16:49,822][0m Trial 0 finished with value: 0.4989572471324296 and parameters: {'lambda_l1': 2.230483819354161e-06, 'lambda_l2': 0.7466146658073363, 'num_leaves': 466, 'feature_fraction': 0.6891702640405284, 'bagging_fraction': 0.7251771207195571, 'bagging_freq': 8, 'min_child_samples': 93}. Best is trial 0 with value: 0.4989572471324296.[0m
[32m[I 2021-01-10 17:16:49,937][0m Trial 1 finished with value: 0.5 and parameters: {'lambda_l1': 8.25214730452868, 'lambda_l2': 0.32057059962204465, 'num_leaves': 317, 'feature_fraction': 0.7038675542158336, 'bagging_fraction': 0.5790332692302901, 'bagging_freq': 2, 'min_child_samples': 54}. Best is trial 1 with value: 0.5.[0m
[32m[I 2021-01-10 17:16:50,511][0m Trial 2 finished with value: 0.4994786235662148 and parameters: {'lambda_l1': 3.791259876969217e-05, 'lambda_l2': 2.0725431546822375e-08

In [40]:
#ベストパラメータ
print('Number of finished trials:', len(study.trials))
#Number of finished trials: 50


Number of finished trials: 50


In [41]:
print('Best trial:', display(study.best_trial.params))

#初回
# {'lambda_l1': 5.204866804465978e-08,
#  'lambda_l2': 3.133667635701077e-05,
#  'num_leaves': 317,
#  'feature_fraction': 0.848640913268004,
#  'bagging_fraction': 0.4983467582046316,
#  'bagging_freq': 4,
#  'min_child_samples': 22}
# Best trial: None

#2回目
# {'lambda_l1': 0.00416566836241902,
#  'lambda_l2': 0.10371698288325823,
#  'num_leaves': 333,
#  'feature_fraction': 0.6873251818776734,
#  'bagging_fraction': 0.6363448284692337,
#  'bagging_freq': 10,
#  'min_child_samples': 17}
# Best trial: None

{'lambda_l1': 0.059894240326589916,
 'lambda_l2': 1.0302168677111394e-05,
 'num_leaves': 382,
 'feature_fraction': 0.9760627682340816,
 'bagging_fraction': 0.957495412018543,
 'bagging_freq': 7,
 'min_child_samples': 14}

Best trial: None


# グリッドサーチによるパラメータ調整

In [70]:
# ベースラインのパラメータ
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.1,
    'gamma': 0.0,
    'alpha': 0.0,
    'lambda': 1.0,
    'min_child_weight': 1,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 71,
}

def optimize(trials):
    # パラメータの探索範囲
    param_space = {
        'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)),
        'max_depth': hp.quniform('max_depth', 3, 9, 1),
        'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05),
        'colsample_bytree': hp.quniform('subsample', 0.6, 0.95, 0.05),
        'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
        # 余裕があればalpha, lambdaも調整する
        # 'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
        # 'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0)),
    }
    
    best = fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=0)
    
def score(params):
    dtrain = xgb.DMatrix(tr_x, label=tr_y)
    dvalid = xgb.DMatrix(va_x, label=va_y)
    dtest = xgb.DMatrix(test_data.drop('uuid',axis=1))

    num_round = 1000

    scores= []
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    model = xgb.train(params, 
                      dtrain, 
                      num_round, 
                      evals=watchlist,
                      early_stopping_rounds=50,
                      verbose_eval=50
                     )

    va_pred = model.predict(dvalid)
    score = log_loss(va_y, va_pred)
    scores.append(score)
    return scores

In [71]:
trials = Trials()
optimize(trials)

[0]	train-rmse:0.35728	eval-rmse:0.35756                               

Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.


Will train until eval-rmse hasn't improved in 50 rounds.               

[50]	train-rmse:0.03417	eval-rmse:0.10486                              

Stopping. Best iteration:                                              
[13]	train-rmse:0.07617	eval-rmse:0.10017


  0%|          | 0/9223372036854775807 [00:00<?, ?trial/s, best loss=?]

job exception: cannot convert dictionary update sequence element #0 to a sequence



  0%|          | 0/9223372036854775807 [00:00<?, ?trial/s, best loss=?]


TypeError: cannot convert dictionary update sequence element #0 to a sequence

In [None]:
# def create_feature_together(train, test, num_cols):
#     train, test = transform_box_cox(train, test, num_cols)
#     return train, test

# # issue 14 Box-Cox変換
# def transform_box_cox(train, test, column):
#     pt = PowerTransformer(method='box-cox')
#     pt.fit(train[column])

#     # 変換後のデータで各列を置換
#     train[column] = pt.transform(train[column])
#     test[column] = pt.transform(test[column])

#特徴量作成（train_dataとtest_dataで同一の処理が必要）
#変換するカラム
# num_cols = ['all_purchase_price']
# train_data, test_data = create_feature_together(train_data, test_data, num_cols)

In [None]:
# tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y,
#                                           test_size=0.25, random_state=71, shuffle=True)
# kf = KFold(n_splits=4, shuffle=True, random_state=71)
# for tr_idx, va_idx in kf.split(train_x):
#     tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
#     tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]