In [1]:
import pandas as pd
import numpy as np

import datetime
import jpholiday
import statsmodels.api as sm
from statsmodels.graphics import tsaplots
import lightgbm as lgb
from sklearn.metrics import  log_loss
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta

%matplotlib inline

# 基準モデルの作成

In [75]:
# Root of the project's data directory. This is a machine-specific absolute
# path; the trailing slash matters because the file paths below are built by
# plain string concatenation.
FILE_PATH = '/Users/atsushisato/Google_drive/05_PBL/second_section/practice_4/ai_modeling/data/'

# The raw CSVs use Japanese headers (customer ID, coupon ID, coupon use);
# map them to the English column names used in the rest of the notebook.
train_columns = {'顧客ID':'uuid', 'クーポンID':'coupon_id', 'クーポン利用':'coupon_use_result'}
test_columns = {'顧客ID':'uuid', 'クーポンID':'coupon_id'}

train_data = pd.read_csv(FILE_PATH + 'external/train.csv')
train_data = train_data.rename(columns=train_columns)

test_data = pd.read_csv(FILE_PATH + 'external/test.csv')
test_data = test_data.rename(columns=test_columns)

# Per-(customer, coupon) features prepared in an earlier step and stored under interim/.
customer_df = pd.read_csv(FILE_PATH + 'interim/customer_df.csv')

In [76]:
def category_to_int_conversion(dataframe):
    """Convert the string-coded customer columns into integer features.

    The input frame is NOT modified: all work happens on a copy, so the
    function can safely be called again on the same frame (e.g. when a
    notebook cell is re-run).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``age_range``, ``marry``, ``category_id``
        and ``category``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * ``age`` holds the first two characters of ``age_range`` as an int,
        * ``marry_flg`` holds ``marry`` mapped through
          ``{'無回答': -1, '独身': 0, '既婚': 1}`` (values outside the mapping
          become missing),
        * ``category_id`` holds its original value without the first
          character, as an int,
        * ``age_range``, ``marry`` and ``category`` are removed.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    marry_dict = {'無回答':-1, '独身':0, '既婚':1}

    # Copy first: assigning the new columns directly on `dataframe` would leak
    # them (and the already-converted category_id) into the caller's frame.
    converted = dataframe.copy()

    converted['age'] = converted['age_range'].str[:2].astype(int)
    converted['marry_flg'] = converted['marry'].map(marry_dict)
    converted['category_id'] = converted['category_id'].str[1:].astype(int)

    return converted.drop(['age_range', 'marry', 'category'], axis=1)

In [77]:
# Attach the customer features to the train and test rows.
# Inner join: rows whose (uuid, coupon_id) pair is missing from customer_df are dropped.
merge_keys = ['uuid', 'coupon_id']
train_data = train_data.merge(customer_df, on=merge_keys, how='inner')
test_data = test_data.merge(customer_df, on=merge_keys, how='inner')

# Convert the categorical columns to numeric values.
train_data = category_to_int_conversion(train_data)
test_data = category_to_int_conversion(test_data)

In [78]:
# Preview the first rows of the merged training frame after the numeric conversion.
train_data.head()

Unnamed: 0,uuid,coupon_id,coupon_use_result,fammily_num,child_num,house_flg,income,visits_frequency,pass_days,all_purchase_price,purchase_num,purchase_product_num,coupon_use,purchase_price,category_id,age,marry_flg
0,1,1,0,1,0,0,5,9,19,56562,1,2,0,534,1,40,-1
1,1,2,0,1,0,0,5,9,19,56562,1,2,0,2803,2,40,-1
2,1,3,0,1,0,0,5,9,19,56562,0,0,0,0,3,40,-1
3,1,4,0,1,0,0,5,9,19,56562,1,1,0,764,4,40,-1
4,1,5,0,1,0,0,5,9,19,56562,9,17,0,4981,5,40,-1


In [79]:
# Hold-out split by customer: the first half of the customer ids (in sorted
# order) is used for training and the remaining ids for validation, so no
# customer contributes rows to both sides.
uuid_num = train_data['uuid'].nunique()

# Split on the ids that are actually present. Comparing the id values against
# the threshold uuid_num/2 only halves the customers when the ids are exactly
# 1..uuid_num with no gaps; for such ids this split is identical to it.
sorted_uuids = np.sort(train_data['uuid'].unique())
train_uuids = sorted_uuids[:uuid_num // 2]
in_train = train_data['uuid'].isin(train_uuids)

train = train_data[in_train]
train_x = train.drop('coupon_use_result', axis=1)
train_y = train[['coupon_use_result']]

va = train_data[~in_train]
va_x = va.drop('coupon_use_result', axis=1)
va_y = va[['coupon_use_result']]

In [101]:
# Train the LightGBM model.
params = dict(
    boosting='dart',            # DART boosting: existing trees are randomly dropped while fitting
    application='binary',       # binary classification objective
    learning_rate=0.05,         # shrinkage applied to each boosting step
    min_data_in_leaf=20,        # minimum number of rows a leaf must contain
    feature_fraction=0.7,       # share of the features sampled for each tree
    num_leaves=41,              # upper bound on leaves per tree (trees grow leaf-wise)
    metric='binary_logloss',    # evaluation metric: binary log loss
    drop_rate=0.15,             # DART: fraction of trees dropped per iteration
)
num_round = 100  # number of boosting iterations

# NOTE: no categorical_feature is passed to the datasets; 'age', 'marry_flg'
# and 'category_id' were previously listed as candidates but left disabled.
lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_eval = lgb.Dataset(va_x, label=va_y)

# Both sets are monitored, so the log reports the loss on 'train' and 'valid'.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=num_round,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'valid'],
)

[1]	train's binary_logloss: 0.0776435	valid's binary_logloss: 0.0510303
[2]	train's binary_logloss: 0.0724943	valid's binary_logloss: 0.0497625
[3]	train's binary_logloss: 0.068638	valid's binary_logloss: 0.0487397
[4]	train's binary_logloss: 0.0648071	valid's binary_logloss: 0.0484645
[5]	train's binary_logloss: 0.0622425	valid's binary_logloss: 0.0483827
[6]	train's binary_logloss: 0.0598444	valid's binary_logloss: 0.0481127
[7]	train's binary_logloss: 0.057654	valid's binary_logloss: 0.0476082
[8]	train's binary_logloss: 0.0590959	valid's binary_logloss: 0.0478408
[9]	train's binary_logloss: 0.0569898	valid's binary_logloss: 0.0474458
[10]	train's binary_logloss: 0.0547527	valid's binary_logloss: 0.0471251
[11]	train's binary_logloss: 0.0544864	valid's binary_logloss: 0.0470502
[12]	train's binary_logloss: 0.05521	valid's binary_logloss: 0.0470362
[13]	train's binary_logloss: 0.0530732	valid's binary_logloss: 0.0468061
[14]	train's binary_logloss: 0.0539171	valid's binary_logloss: 0

In [107]:
# Score the test rows. The feature columns are selected by name, in the order
# used for training (train_x), so the model receives the same features in the
# same positions even if test_data carries extra or reordered columns.
pred = model.predict(test_data[train_x.columns])

# Keep test_data's index on the predictions: they are attached to test_data
# later with an index-aligned join, which would mis-assign or drop values if
# the Series had a fresh 0..n-1 index while test_data did not.
pred = pd.Series(pred, index=test_data.index, name='pred')

In [113]:
# Display the prediction Series (pandas abbreviates long Series in the output).
pred

0       0.014780
1       0.016337
2       0.016046
3       0.014904
4       0.043826
          ...   
3911    0.013810
3912    0.035418
3913    0.013796
3914    0.014941
3915    0.017317
Name: pred, Length: 3916, dtype: float64

In [117]:
# Attach the predictions to the test rows (aligned on the index), keep only the
# identifiers and the score, then reshape to one row per customer with one
# column per coupon.
result = (
    test_data
    .join(pred)
    .loc[:, ['uuid', 'coupon_id', 'pred']]
    .pivot(index='uuid', columns='coupon_id', values='pred')
)

In [123]:
# Write the customer x coupon prediction table. The header row (coupon ids) is
# omitted; the uuid index is still written as the first column.
first_model_csv = '/Users/atsushisato/Google_drive/05_PBL/second_section/practice_4/ai_modeling/models/first_model.csv'
result.to_csv(first_model_csv, header=False)