In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime
import jpholiday
import statsmodels.api as sm
from statsmodels.graphics import tsaplots
# from optuna.integration import lightgbm as lgb
import lightgbm as lgb
from sklearn.metrics import  log_loss
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from datetime import date, timedelta
import sklearn.metrics
import optuna 
from dateutil.relativedelta import relativedelta
import pandas_profiling as pdp


%matplotlib inline

# 基準モデルの作成

In [2]:
FILE_PATH = '/Users/atsushisato/git_clone/ai_quest_season2/data/'

train_data = pd.read_csv(FILE_PATH + 'external/train.csv').rename(columns={'顧客ID':'uuid', 'クーポンID':'coupon_id', 'クーポン利用':'coupon_use_result'})
test_data = pd.read_csv(FILE_PATH + 'external/test.csv').rename(columns={'顧客ID':'uuid', 'クーポンID':'coupon_id'})
customer_df = pd.read_csv(FILE_PATH + 'interim/customer_df.csv')

# 特徴量作成

In [3]:
#特徴量作成関数（trainとtestで異なる処理が可能）
def create_feature_separate(dataframe):
    # issue 9/10 カテゴリ変数の数値変換 
    dataframe = category_to_int_conversion(dataframe)
    
    # issue 11 カテゴリ別商品単価
    dataframe['unit_price_category'] = dataframe.apply(lambda x: 
                                unit_price_category(x['purchase_product_num'], x['coupon_use'], x['purchase_price']),axis=1)
    # issue 16 総購入商品の平均
    dataframe['purchase_price_average'] = dataframe.apply(lambda x: 
                                purchase_price_average(x['all_purchase_price'], x['visits_frequency']),axis=1)   
    # issue 20 平均来店周期
    dataframe['visit_cycle'] = dataframe.apply(lambda x:
                                visit_cycle(x['visits_frequency'], x['pass_days']),axis=1)
    # issue 21 経過日数と来店頻度の比較
    dataframe['visit_cycle_flg'] = dataframe.apply(lambda x: 
                                visit_cycle_flg(x['visit_cycle'], x['pass_days']),axis=1) 
    # issue 22 総購入金額に占めるカテゴリ別の購入金額比率
    dataframe['ratio_category'] = dataframe.apply(lambda x:
                                ratio_category(x['all_purchase_price'], x['purchase_price']),axis=1)
    # issue 23 大人の数
    dataframe['adult_num'] = dataframe['fammily_num'] - dataframe['child_num']
    
    # issue 24 カテゴリ別平均購入金額
    dataframe['purchase_price_category_average'] = dataframe.apply(lambda x:
                                purchase_price_category_average(x['purchase_price'], x['purchase_product_num'],x['coupon_use']),axis=1)
    # issue 28 家族1人あたりの総購入金額
    dataframe['purchase_price_1person'] = dataframe.apply(lambda x:
                                purchase_price_1person(x['all_purchase_price'], x['fammily_num']),axis=1)
    # issue 29 1日あたりの購入金額
    dataframe['purchase_price_1day'] = dataframe.apply(lambda x :
                                purchase_price_1day(x['all_purchase_price'], x['pass_days']),axis=1)
    
    
    dataframe = dataframe.drop(['category_id', 'house_flg', 'marry_flg', 'age', 'visit_cycle_flg', 'category_id'],axis=1)
    return dataframe

In [4]:
#===================================================================================
#特徴量作成関数
# issue 9/10 カテゴリ変数の数値変換    
def category_to_int_conversion(dataframe):
    marry_dict = {'無回答':None, '独身':0, '既婚':1}
    dataframe['marry_flg'] = dataframe['marry'].apply(lambda x: marry_dict.get(x))
    
    dataframe['age'] = dataframe['age_range'].apply(lambda x: int(x[:2]))
    dataframe['category_id'] = dataframe['category_id'].apply(lambda x: int(x[1:]))
    
    dataframe = dataframe.drop(['age_range','marry','category'],axis=1)
    return dataframe

# issue 11 カテゴリ別商品単価
def unit_price_category(product_num, coupon_use, price):
    if coupon_use != 0:
        price = price + abs(coupon_use)
    
    if price != 0:
        return int(price / product_num)
    else:
        return 0
    
# issue 16 総購入商品の平均
def purchase_price_average(all_purchase_price, visits_frequency):
    if (all_purchase_price != 0) & (visits_frequency != 0):
        one_purchase_price =  int(all_purchase_price / visits_frequency)            
    else:
        one_purchase_price =  0
    return one_purchase_price

# issue 20 平均来店周期
def visit_cycle(visits_frequency, pass_days):
    all_days = 60
    if visits_frequency != 0:
        cycle = int((all_days - pass_days) / visits_frequency)
    else:
        cycle = 0
    return cycle

# issue 21 経過日数と来店頻度の比較
def visit_cycle_flg(visit_cycle, pass_days):
    if visit_cycle <= pass_days:
        return 1
    else:
        return 0
    
# issue 22 総購入金額に占めるカテゴリ別の購入金額比率
def ratio_category(all_purchase_price, purchase_price):
    if purchase_price != 0:
        ratio = purchase_price / all_purchase_price
    else:
        ratio = 0
    return ratio

# issue 24 カテゴリ別平均購入金額
def purchase_price_category_average(purchase_price, purchase_product_num, coupon_use):
    if purchase_product_num != 0:
        purchase_price = purchase_price + abs(coupon_use)
        average_purchase = int(purchase_price / purchase_product_num)
    else:
        average_purchase = 0
    return average_purchase
    
# issue 28 家族1人あたりの総購入金額
def purchase_price_1person(all_purchase_price, fammily_num):
    if all_purchase_price != 0:
        return int(all_purchase_price / fammily_num)
    else:
        return 0

# issue 29 1日あたりの購入金額
def purchase_price_1day(all_purchase_price, pass_days):
    if all_purchase_price != 0:
        all_days = 60
        use_days = all_days - pass_days + 1
        return int(all_purchase_price / use_days)
    else:
        return 0
    
# issue 19 欠損値削除（購入履歴はあるが来店履歴がないuuid）
def drop_missing_value(dataframe):
    drop_uuid = dataframe.query("(visits_frequency ==0 )& (all_purchase_price != 0)")['uuid'].unique()
    dataframe = dataframe.query("uuid not in @drop_uuid") 
    return dataframe

In [5]:
#train_dataとtest_dataの特徴量作成
train_data = pd.merge(train_data, customer_df, on=['uuid', 'coupon_id'],how='inner')
test_data = pd.merge(test_data, customer_df, on=['uuid', 'coupon_id'],how='inner')

#特徴量作成（train_dataとtest_dataで異なる処理が可能）
train_data = create_feature_separate(train_data)
test_data = create_feature_separate(test_data)
train_data = drop_missing_value(train_data)

In [6]:
# pdp.ProfileReport(train_data.query("coupon_use_result == 1"))

In [7]:
sorted(train_data.columns)

['adult_num',
 'all_purchase_price',
 'child_num',
 'coupon_id',
 'coupon_use',
 'coupon_use_result',
 'fammily_num',
 'income',
 'pass_days',
 'purchase_num',
 'purchase_price',
 'purchase_price_1day',
 'purchase_price_1person',
 'purchase_price_average',
 'purchase_price_category_average',
 'purchase_product_num',
 'ratio_category',
 'unit_price_category',
 'uuid',
 'visit_cycle',
 'visits_frequency']

In [8]:
train_data.head(11)

Unnamed: 0,uuid,coupon_id,coupon_use_result,fammily_num,child_num,income,visits_frequency,pass_days,all_purchase_price,purchase_num,...,coupon_use,purchase_price,unit_price_category,purchase_price_average,visit_cycle,ratio_category,adult_num,purchase_price_category_average,purchase_price_1person,purchase_price_1day
0,1,1,0,1,0,5,9,19,56562,1,...,0,534,267,6284,4,0.009441,1,267,56562,1346
1,1,2,0,1,0,5,9,19,56562,1,...,0,2803,1401,6284,4,0.049556,1,1401,56562,1346
2,1,3,0,1,0,5,9,19,56562,0,...,0,0,0,6284,4,0.0,1,0,56562,1346
3,1,4,0,1,0,5,9,19,56562,1,...,0,764,764,6284,4,0.013507,1,764,56562,1346
4,1,5,0,1,0,5,9,19,56562,9,...,0,4981,293,6284,4,0.088063,1,293,56562,1346
5,1,6,0,1,0,5,9,19,56562,0,...,0,0,0,6284,4,0.0,1,0,56562,1346
6,1,7,0,1,0,5,9,19,56562,0,...,0,0,0,6284,4,0.0,1,0,56562,1346
7,1,8,0,1,0,5,9,19,56562,9,...,0,22620,293,6284,4,0.399915,1,293,56562,1346
8,1,9,0,1,0,5,9,19,56562,9,...,0,24860,2486,6284,4,0.439518,1,2486,56562,1346
9,1,10,0,1,0,5,9,19,56562,0,...,0,0,0,6284,4,0.0,1,0,56562,1346


In [24]:
#LightGBMによるモデル作成
train_x = train_data.drop(['coupon_use_result','uuid'], axis=1)
train_y = train_data[['coupon_use_result']]
val_size = 4
uuid_num = train_data['uuid'].nunique()
train_uuid = int((uuid_num/val_size)*(val_size-1))
filter_idx = (train_uuid*11)-1
tr_x, va_x = train_x.iloc[:filter_idx], train_x.iloc[filter_idx:]
tr_y, va_y = train_y.iloc[:filter_idx], train_y.iloc[filter_idx:]

score_dict = {}
# params = {
# 'objective': 'binary',
#  'metric': 'binary_logloss',
#  'seed': 118,
#  'lambda_l1': 0.00004456120176148555,
#  'lambda_l2': 0.0000004244567009620525,
#  'num_leaves': 371,
#  'feature_fraction': 0.6045840940131413,
#  'bagging_fraction': 0.7154034241314151,
#  'bagging_freq': 6,
#  'min_child_samples': 58
#         }

num_round =1000
# categorical_feature = ['marry_flg','age']
scores = []
lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

best_params, tuning_history = dict(), list()
# for i in range(100):
params = {
'objective': 'binary',
 'metric': 'binary_logloss',
 'seed':70,
'lambda_l1': 0.003158141941182639,
 'lambda_l2': 0.0008853206993091085,
 'num_leaves': 380,
 'feature_fraction': 0.5686541926682908,
 'bagging_fraction': 0.6569913398412971,
 'bagging_freq': 0,
 'min_child_samples': 5
 }
model = lgb.train(params, 
                  lgb_train, 
                  verbose_eval=50,  # 50イテレーション毎に学習結果出力
#                   categorical_feature = categorical_feature,
                  num_boost_round=num_round, 
                  early_stopping_rounds=100,
                  valid_names=['train', 'valid'], 
                  valid_sets=[lgb_train, lgb_eval]
                 )

va_pred = model.predict(va_x)
score = log_loss(va_y, va_pred)
scores.append(score)
#     score_dict[i] = score
#     print("="*80)

print("平均logloss：",np.mean(scores))

Training until validation scores don't improve for 100 rounds
[50]	train's binary_logloss: 0.000522087	valid's binary_logloss: 0.0763184
[100]	train's binary_logloss: 6.71439e-05	valid's binary_logloss: 0.0984091
Early stopping, best iteration is:
[10]	train's binary_logloss: 0.0142828	valid's binary_logloss: 0.0497867
平均logloss： 0.04978668763053791


In [25]:
score_dict

{}

In [26]:
scores

[0.04978668763053791]

In [27]:
#テストデータの予測
pred = model.predict(test_data.drop('uuid',axis=1))
pred = pd.Series(pred, name='pred')

In [28]:
# 特徴量重要度の算出 (データフレームで取得)
cols = list(train_x.columns)         # 特徴量名のリスト
f_importance = np.array(model.feature_importance()) # 特徴量重要度の算出
f_importance = f_importance / np.sum(f_importance)  # 正規化(必要ない場合はコメントアウト)
df_importance = pd.DataFrame({'feature':cols, 'importance':f_importance})
df_importance = df_importance.sort_values('importance', ascending=False) # 降順ソート
display(df_importance)

Unnamed: 0,feature,importance
11,unit_price_category,0.107547
10,purchase_price,0.092453
18,purchase_price_1day,0.081132
4,visits_frequency,0.079245
0,coupon_id,0.075472
14,ratio_category,0.069811
6,all_purchase_price,0.067925
12,purchase_price_average,0.066038
8,purchase_product_num,0.060377
3,income,0.060377


In [29]:
pred

0       0.004930
1       0.004930
2       0.004930
3       0.004930
4       0.006408
          ...   
3911    0.004934
3912    0.012246
3913    0.004934
3914    0.004936
3915    0.013915
Name: pred, Length: 3916, dtype: float64

In [30]:
def round_score(pred):
    if pred <= 0.0056:
        pred = 0
    elif pred >= 0.9944:
        pred = 1
    return pred

In [31]:
pred = pred.apply(lambda x: round_score(x))
result = test_data.join(pred)[['uuid', 'coupon_id', 'pred']]
result = result.pivot(index='uuid', columns='coupon_id', values='pred')

In [32]:
pred.min()

0.0

In [33]:
pred.max()

0.8808158900658125

In [34]:
result

coupon_id,1,2,3,4,5,6,7,8,9,10,11
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
357,0.000000,0.000000,0.000000,0.000000,0.006408,0.000000,0.000000,0.131686,0.006337,0.000000,0.000000
358,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.055225,0.006616,0.000000,0.000000
359,0.000000,0.000000,0.021344,0.021220,0.006498,0.021299,0.021299,0.638803,0.480607,0.006289,0.006574
360,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
361,0.000000,0.072179,0.028101,0.000000,0.028659,0.000000,0.000000,0.007444,0.006187,0.007633,0.008241
...,...,...,...,...,...,...,...,...,...,...,...
708,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.006525,0.000000,0.000000,0.000000
709,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.011134,0.005928,0.000000,0.000000,0.000000
710,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
711,0.018502,0.000000,0.006397,0.006761,0.010398,0.000000,0.005993,0.413493,0.020336,0.000000,0.006269


In [35]:
result.to_csv('/Users/atsushisato/git_clone/ai_quest_season2/models/model.csv',header=False)

In [36]:
pred.nunique()

857

# optunaによるパラメータ調整

In [10]:
model.params

NameError: name 'model' is not defined

In [11]:
# パラメータ調整用

# prediction = np.rint(model.predict(va_x, num_iteration=model.best_iteration))
# accuracy = accuracy_score(va_y, prediction)

# best_params = model.params
# print("Best params:", display(best_params)
# print("  Accuracy = {}".format(accuracy))
# print("  Params: ")
# for key, value in best_params.items():
#     print("    {}: {}".format(key, value))

In [12]:
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.05,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'seed': 0,
        'verbosity': -1,
    }
    gbm = lgb.train(param, 
                    lgb_train, 
                    valid_sets=lgb_eval,
                    verbose_eval=False, 
                    num_boost_round=num_round, 
                    early_stopping_rounds=100
                   )
    y_prob = gbm.predict(va_x)
    y_pred = np.round(y_prob)
    return roc_auc_score(
        np.round(va_y.values),
        np.round(y_pred)
    )

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2021-01-10 18:28:27,629][0m A new study created in memory with name: no-name-3f58daa7-0c06-4c53-bf0c-584c7220e26c[0m
[32m[I 2021-01-10 18:28:27,632][0m A new study created in memory with name: no-name-1a9ca4b6-894e-4afd-bfda-9e73bc89c0e8[0m

  0%|          | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.053962:   0%|          | 0/7 [00:00<?, ?it/s][A
feature_fraction, val_score: 0.053962:  14%|#4        | 1/7 [00:00<00:01,  5.20it/s][A[32m[I 2021-01-10 18:28:27,831][0m Trial 0 finished with value: 0.0539624985980035 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.0539624985980035.[0m

feature_fraction, val_score: 0.053962:  14%|#4        | 1/7 [00:00<00:01,  5.20it/s][A
feature_fraction, val_score: 0.053102:  14%|#4        | 1/7 [00:00<00:01,  5.20it/s][A
feature_fraction, val_score: 0.053102:  29%|##8       | 2/7 [00:00<00:00,  5.13it/s][A[32m[I 2021-01-10

In [14]:
#ベストパラメータ
print('Number of finished trials:', len(study.trials))
#Number of finished trials: 50


Number of finished trials: 50


In [15]:
print('Best trial:', display(study.best_trial.params))

#初回
# {'lambda_l1': 5.204866804465978e-08,
#  'lambda_l2': 3.133667635701077e-05,
#  'num_leaves': 317,
#  'feature_fraction': 0.848640913268004,
#  'bagging_fraction': 0.4983467582046316,
#  'bagging_freq': 4,
#  'min_child_samples': 22}
# Best trial: None

#2回目
# {'lambda_l1': 0.00416566836241902,
#  'lambda_l2': 0.10371698288325823,
#  'num_leaves': 333,
#  'feature_fraction': 0.6873251818776734,
#  'bagging_fraction': 0.6363448284692337,
#  'bagging_freq': 10,
#  'min_child_samples': 17}
# Best trial: None

{'lambda_l1': 0.003158141941182639,
 'lambda_l2': 0.0008853206993091085,
 'num_leaves': 380,
 'feature_fraction': 0.5686541926682908,
 'bagging_fraction': 0.6569913398412971,
 'bagging_freq': 0,
 'min_child_samples': 5}

Best trial: None


In [None]:
# def create_feature_together(train, test, num_cols):
#     train, test = transform_box_cox(train, test, num_cols)
#     return train, test

# # issue 14 Box-Cox変換
# def transform_box_cox(train, test, column):
#     pt = PowerTransformer(method='box-cox')
#     pt.fit(train[column])

#     # 変換後のデータで各列を置換
#     train[column] = pt.transform(train[column])
#     test[column] = pt.transform(test[column])

#特徴量作成（train_dataとtest_dataで同一の処理が必要）
#変換するカラム
# num_cols = ['all_purchase_price']
# train_data, test_data = create_feature_together(train_data, test_data, num_cols)

In [None]:
# tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y,
#                                           test_size=0.25, random_state=71, shuffle=True)
# kf = KFold(n_splits=4, shuffle=True, random_state=71)
# for tr_idx, va_idx in kf.split(train_x):
#     tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
#     tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]