In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import datetime
import jpholiday
import statsmodels.api as sm
from statsmodels.graphics import tsaplots
from optuna.integration import lightgbm as lgb
# import lightgbm as lgb
from sklearn.metrics import  log_loss
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from datetime import date, timedelta
import sklearn.metrics
import optuna 
from dateutil.relativedelta import relativedelta
import pandas_profiling as pdp


%matplotlib inline

# 基準モデルの作成

In [30]:
FILE_PATH = '/Users/atsushisato/Google_drive/05_PBL/second_section/practice_4/ai_modeling/data/'

train_data = pd.read_csv(FILE_PATH + 'external/train.csv').rename(columns={'顧客ID':'uuid', 'クーポンID':'coupon_id', 'クーポン利用':'coupon_use_result'})
test_data = pd.read_csv(FILE_PATH + 'external/test.csv').rename(columns={'顧客ID':'uuid', 'クーポンID':'coupon_id'})
customer_df = pd.read_csv(FILE_PATH + 'interim/customer_df.csv')

# 特徴量作成

In [31]:
#特徴量作成関数（trainとtestで異なる処理が可能）
def create_feature_separate(dataframe):
    dataframe = category_to_int_conversion(dataframe)
    dataframe['category_average_price'] = dataframe.apply(lambda x: 
                                category_average_price(x['purchase_product_num'], x['coupon_use'], x['purchase_price']),axis=1)
    dataframe['one_purchase_price'] = dataframe.apply(lambda x: 
                                one_purchase_price(x['all_purchase_price'], x['visits_frequency']),axis=1)   
    dataframe['visit_cycle'] = dataframe.apply(lambda x:
                                visit_cycle(x['visits_frequency'], x['pass_days']),axis=1)
    dataframe['visit_cycle_flg'] = dataframe.apply(lambda x: 
                                visit_cycle_flg(x['visit_cycle'], x['pass_days']),axis=1) 
    dataframe['ratio_category_price'] = dataframe.apply(lambda x:
                                ratio_category_price(x['all_purchase_price'], x['purchase_price']),axis=1)
    dataframe['adult_num'] = dataframe['fammily_num'] - dataframe['child_num']
    
    dataframe['average_purchase_price'] = dataframe.apply(lambda x:
                                average_purchase(x['purchase_price'], x['purchase_product_num'],x['coupon_use']),axis=1)
    
#     dataframe = dataframe.drop(['category_id', 'house_flg'],axis=1)
    return dataframe

#===================================================================================
#特徴量作成関数
# issue 9/10 カテゴリ変数の数値変換    
def category_to_int_conversion(dataframe):
    marry_dict = {'無回答':None, '独身':0, '既婚':1}
    dataframe['marry_flg'] = dataframe['marry'].apply(lambda x: marry_dict.get(x))
    
    dataframe['age'] = dataframe['age_range'].apply(lambda x: int(x[:2]))
    dataframe['category_id'] = dataframe['category_id'].apply(lambda x: int(x[1:]))
    
    dataframe = dataframe.drop(['age_range', 'marry', 'category'],axis=1)
    return dataframe

# issue 11 カテゴリ別商品単価
def category_average_price(product_num, coupon_use, price):
    if coupon_use != 0:
        price = price + abs(coupon_use)
    
    if price != 0:
        return int(price / product_num)
    else:
        return 0
    
# issue 16 総購入商品の平均
def one_purchase_price(all_purchase_price, visits_frequency):
    if (all_purchase_price != 0) & (visits_frequency != 0):
        one_purchase_price =  int(all_purchase_price / visits_frequency)            
    else:
        one_purchase_price =  0
    return one_purchase_price

# issue 20 平均来店周期
def visit_cycle(visits_frequency, pass_days):
    all_days = 60
    if visits_frequency != 0:
        cycle = int((all_days - pass_days) / visits_frequency)
    else:
        cycle = 0
    return cycle

# issue 21 経過日数と来店頻度の比較
def visit_cycle_flg(visit_cycle, pass_days):
    if visit_cycle <= pass_days:
        return 1
    else:
        return 0
    
# issue 22 総購入金額に占めるカテゴリ別の購入金額比率
def ratio_category_price(all_purchase_price, purchase_price):
    if purchase_price != 0:
        ratio = purchase_price / all_purchase_price
    else:
        ratio = 0
    return ratio

# issue 23 大人の数
def adult_num(family, child):
    return family-child

# issue 24 カテゴリ別平均購入金額
def average_purchase(purchase_price, purchase_product_num, coupon_use):
    if purchase_product_num != 0:
        purchase_price = purchase_price + abs(coupon_use)
        average_purchase = int(purchase_price / purchase_product_num)
    else:
        average_purchase = 0
    return average_purchase
    

# issue 19 欠損値削除（購入履歴はあるが来店履歴がないuuid）
def drop_missing_value(dataframe):
    drop_uuid = dataframe.query("(visits_frequency ==0 )& (all_purchase_price != 0)")['uuid'].unique()
    dataframe = dataframe.query("uuid not in @drop_uuid") 
    return dataframe

In [32]:
#train_dataとtest_dataの特徴量作成
train_data = pd.merge(train_data, customer_df, on=['uuid', 'coupon_id'],how='inner')
test_data = pd.merge(test_data, customer_df, on=['uuid', 'coupon_id'],how='inner')

#特徴量作成（train_dataとtest_dataで異なる処理が可能）
train_data = create_feature_separate(train_data)
test_data = create_feature_separate(test_data)
train_data = drop_missing_value(train_data)

In [33]:
sorted(train_data.columns)

['adult_num',
 'age',
 'all_purchase_price',
 'average_purchase_price',
 'category_average_price',
 'category_id',
 'child_num',
 'coupon_id',
 'coupon_use',
 'coupon_use_result',
 'fammily_num',
 'house_flg',
 'income',
 'marry_flg',
 'one_purchase_price',
 'pass_days',
 'purchase_num',
 'purchase_price',
 'purchase_product_num',
 'ratio_category_price',
 'uuid',
 'visit_cycle',
 'visit_cycle_flg',
 'visits_frequency']

In [34]:
train_data.head(11)

Unnamed: 0,uuid,coupon_id,coupon_use_result,fammily_num,child_num,house_flg,income,visits_frequency,pass_days,all_purchase_price,...,category_id,marry_flg,age,category_average_price,one_purchase_price,visit_cycle,visit_cycle_flg,ratio_category_price,adult_num,average_purchase_price
0,1,1,0,1,0,0,5,9,19,56562,...,1,,40,267,6284,4,1,0.009441,1,267
1,1,2,0,1,0,0,5,9,19,56562,...,2,,40,1401,6284,4,1,0.049556,1,1401
2,1,3,0,1,0,0,5,9,19,56562,...,3,,40,0,6284,4,1,0.0,1,0
3,1,4,0,1,0,0,5,9,19,56562,...,4,,40,764,6284,4,1,0.013507,1,764
4,1,5,0,1,0,0,5,9,19,56562,...,5,,40,293,6284,4,1,0.088063,1,293
5,1,6,0,1,0,0,5,9,19,56562,...,6,,40,0,6284,4,1,0.0,1,0
6,1,7,0,1,0,0,5,9,19,56562,...,7,,40,0,6284,4,1,0.0,1,0
7,1,8,0,1,0,0,5,9,19,56562,...,8,,40,293,6284,4,1,0.399915,1,293
8,1,9,0,1,0,0,5,9,19,56562,...,9,,40,2486,6284,4,1,0.439518,1,2486
9,1,10,0,1,0,0,5,9,19,56562,...,10,,40,0,6284,4,1,0.0,1,0


In [54]:
#LightGBMによるモデル作成
train_x = train_data.drop(['coupon_use_result','uuid'], axis=1)
train_y = train_data[['coupon_use_result']]

# params = {'objective': 'binary',
#          'metric': 'binary_logloss',
#          'seed': 70,
#          'feature_pre_filter': False,
#          'lambda_l1': 0.7363266301419099,
#          'lambda_l2': 0.0012895602337976008,
#          'num_leaves': 5,
#          'feature_fraction': 0.552,
#          'bagging_fraction': 0.5443702579911031,
#          'bagging_freq': 5,
#          'min_data_in_leaf': 20,
#          'min_child_samples': 20}
params = {'objective': 'binary',
         'metric': 'binary_logloss',
         'seed': 70,
         'feature_pre_filter': False,
         'lambda_l1': 0.00000005204866804465978,
         'lambda_l2': 0.00003133667635701077,
         'num_leaves': 317,
         'feature_fraction': 0.848640913268004,
         'bagging_fraction': 0.4983467582046316,
         'bagging_freq': 4,
         'min_child_samples': 22}

num_round =1000
categorical_feature = ['marry_flg','age']
scores = []
    
val_size = 4
uuid_num = train_data['uuid'].nunique()
train_uuid = int((uuid_num/val_size)*(val_size-1))
filter_idx = (train_uuid*11)-1
tr_x, va_x = train_x.iloc[:filter_idx], train_x.iloc[filter_idx:]
tr_y, va_y = train_y.iloc[:filter_idx], train_y.iloc[filter_idx:]

lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

best_params, tuning_history = dict(), list()
model = lgb.train(params, 
                  lgb_train, 
                  verbose_eval=0,  # 50イテレーション毎に学習結果出力
                  categorical_feature = categorical_feature,
                  num_boost_round=num_round, 
                  early_stopping_rounds=100,
                  valid_names=['train', 'valid'], 
                  valid_sets=[lgb_train, lgb_eval]
                 )

va_pred = model.predict(va_x)
score = log_loss(va_y, va_pred)
scores.append(score)
print("="*80)

print("平均logloss：",np.mean(scores))

[32m[I 2021-01-04 18:52:33,286][0m A new study created in memory with name: no-name-22faf1a0-5a5f-4c2c-92c3-952811ddc2fd[0m
New categorical_feature is ['age', 'marry_flg']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
feature_fraction, val_score: 0.049844:  14%|#4        | 1/7 [00:00<00:01,  5.17it/s][32m[I 2021-01-04 18:52:33,484][0m Trial 0 finished with value: 0.0498439689240646 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.0498439689240646.[0m
feature_fraction, val_score: 0.049844:  29%|##8       | 2/7 [00:00<00:00,  5.18it/s][32m[I 2021-01-04 18:52:33,676][0m Trial 1 finished with value: 0.05352220965179254 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.0498439689240646.[0m
feature_fraction, val_score: 0.049844:  43%|####2     | 3/7 [00:00<00:00,  5.59it/s][32m[I 2021-01-04 18:52:33,822][0m Trial 2 finished with value: 0.05356432879176431 and parameters: {'feature_fraction': 1.0}. Best i

平均logloss： 0.046495672924284026





In [55]:
#パラメータ調整用

# prediction = np.rint(model.predict(va_x, num_iteration=model.best_iteration))
# accuracy = accuracy_score(va_y, prediction)

# best_params = model.params
# print("Best params:", display(best_params)
# print("  Accuracy = {}".format(accuracy))
# print("  Params: ")
# for key, value in best_params.items():
#     print("    {}: {}".format(key, value))
# model.params

In [56]:
scores

[0.046495672924284026]

In [57]:
#テストデータの予測
pred = model.predict(test_data.drop('uuid',axis=1))
pred = pd.Series(pred, name='pred')

In [58]:
# 特徴量重要度の算出 (データフレームで取得)
cols = list(train_x.columns)         # 特徴量名のリスト
f_importance = np.array(model.feature_importance()) # 特徴量重要度の算出
f_importance = f_importance / np.sum(f_importance)  # 正規化(必要ない場合はコメントアウト)
df_importance = pd.DataFrame({'feature':cols, 'importance':f_importance})
df_importance = df_importance.sort_values('importance', ascending=False) # 降順ソート
display(df_importance)

Unnamed: 0,feature,importance
11,purchase_price,0.163265
7,all_purchase_price,0.163265
5,visits_frequency,0.142857
1,fammily_num,0.081633
4,income,0.081633
6,pass_days,0.061224
19,ratio_category_price,0.040816
16,one_purchase_price,0.040816
15,category_average_price,0.040816
13,marry_flg,0.040816


In [59]:
pred

0       0.003185
1       0.003185
2       0.003185
3       0.003185
4       0.007471
          ...   
3911    0.002932
3912    0.006879
3913    0.002932
3914    0.002932
3915    0.002932
Name: pred, Length: 3916, dtype: float64

In [60]:
def round_score(pred):
    if pred <= 0.0056:
        pred = 0
    elif pred >= 0.9944:
        pred = 1
    return pred

In [61]:
pred = pred.apply(lambda x: round_score(x))
result = test_data.join(pred)[['uuid', 'coupon_id', 'pred']]
result = result.pivot(index='uuid', columns='coupon_id', values='pred')

In [62]:
pred.min()

0.0

In [63]:
pred.max()

0.4231791357911925

In [64]:
result

coupon_id,1,2,3,4,5,6,7,8,9,10,11
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
357,0.000000,0.000000,0.000000,0.000000,0.007471,0.000000,0.000000,0.066787,0.006067,0.000000,0.000000
358,0.000000,0.000000,0.000000,0.005903,0.000000,0.000000,0.000000,0.018743,0.008340,0.000000,0.000000
359,0.020235,0.020235,0.012962,0.012962,0.033212,0.020235,0.020235,0.294455,0.227271,0.021376,0.012962
360,0.000000,0.006684,0.000000,0.000000,0.008509,0.000000,0.000000,0.026864,0.006684,0.000000,0.000000
361,0.007074,0.011080,0.011711,0.018295,0.007074,0.018295,0.011080,0.200599,0.018295,0.011080,0.011080
...,...,...,...,...,...,...,...,...,...,...,...
708,0.000000,0.000000,0.000000,0.005954,0.000000,0.000000,0.000000,0.018904,0.005954,0.000000,0.000000
709,0.011736,0.000000,0.007089,0.013456,0.000000,0.007494,0.016541,0.036795,0.011736,0.007089,0.007494
710,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
711,0.044000,0.026926,0.044000,0.044000,0.057089,0.044000,0.050209,0.390397,0.144982,0.017290,0.028432


In [65]:
result.to_csv('/Users/atsushisato/git_clone/ai_quest_season2/models/model.csv',header=False)

In [66]:
pred.nunique()

829

# optunaによるパラメータ調整

In [43]:
def objective(trial):
    param = {
        'objective': 'cross_entropy',
        'metric': 'auc',
        'boosting': 'gbdt',
        'learning_rate': 0.05,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'seed': 0,
        'verbosity': -1,
    }
    gbm = lgb.train(param, 
                    lgb_train, 
                    valid_sets=lgb_eval,
                    verbose_eval=False, 
                    num_boost_round=num_round, 
                    early_stopping_rounds=100
                   )
    y_prob = gbm.predict(va_x)
    y_pred = np.round(y_prob)
    return roc_auc_score(
        np.round(va_y.values),
        np.round(y_pred)
    )

In [44]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2021-01-04 18:19:57,523][0m A new study created in memory with name: no-name-ecdea0ad-b4da-43d6-8411-db68074a812e[0m
[32m[I 2021-01-04 18:19:57,525][0m A new study created in memory with name: no-name-b980a7e9-4c77-4fd0-ba7f-b2bc6bb7fd57[0m
feature_fraction, val_score: 0.851199:  14%|#4        | 1/7 [00:00<00:00,  6.45it/s][32m[I 2021-01-04 18:19:57,685][0m Trial 0 finished with value: 0.8511991657977059 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.8511991657977059.[0m
feature_fraction, val_score: 0.851199:  14%|#4        | 1/7 [00:00<00:00,  6.45it/s][32m[I 2021-01-04 18:19:57,783][0m Trial 1 finished with value: 0.8218456725755996 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.8511991657977059.[0m
feature_fraction, val_score: 0.852659:  43%|####2     | 3/7 [00:00<00:00,  6.96it/s][32m[I 2021-01-04 18:19:57,920][0m Trial 2 finished with value: 0.8526590198123045 and parameters: {'feature_fraction': 0.4}. Best i

Number of finished trials: 50
Best trial: {'lambda_l1': 5.204866804465978e-08, 'lambda_l2': 3.133667635701077e-05, 'num_leaves': 317, 'feature_fraction': 0.848640913268004, 'bagging_fraction': 0.4983467582046316, 'bagging_freq': 4, 'min_child_samples': 22}


In [49]:
#ベストパラメータ
print('Number of finished trials:', len(study.trials))
#Number of finished trials: 50


Number of finished trials: 50


In [50]:
print('Best trial:', display(study.best_trial.params))

{'lambda_l1': 5.204866804465978e-08,
 'lambda_l2': 3.133667635701077e-05,
 'num_leaves': 317,
 'feature_fraction': 0.848640913268004,
 'bagging_fraction': 0.4983467582046316,
 'bagging_freq': 4,
 'min_child_samples': 22}
Best trial: None

{'lambda_l1': 5.204866804465978e-08,
 'lambda_l2': 3.133667635701077e-05,
 'num_leaves': 317,
 'feature_fraction': 0.848640913268004,
 'bagging_fraction': 0.4983467582046316,
 'bagging_freq': 4,
 'min_child_samples': 22}

Best trial: None


In [None]:
# def create_feature_together(train, test, num_cols):
#     train, test = transform_box_cox(train, test, num_cols)
#     return train, test

# # issue 14 Box-Cox変換
# def transform_box_cox(train, test, column):
#     pt = PowerTransformer(method='box-cox')
#     pt.fit(train[column])

#     # 変換後のデータで各列を置換
#     train[column] = pt.transform(train[column])
#     test[column] = pt.transform(test[column])

#特徴量作成（train_dataとtest_dataで同一の処理が必要）
#変換するカラム
# num_cols = ['all_purchase_price']
# train_data, test_data = create_feature_together(train_data, test_data, num_cols)

In [None]:
# tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y,
#                                           test_size=0.25, random_state=71, shuffle=True)
# kf = KFold(n_splits=4, shuffle=True, random_state=71)
# for tr_idx, va_idx in kf.split(train_x):
#     tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
#     tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]