In [436]:
import datetime
import os
from datetime import date, timedelta

import jpholiday
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pdp
import statsmodels.api as sm
from dateutil.relativedelta import relativedelta
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from statsmodels.graphics import tsaplots

%matplotlib inline

# 基準モデルの作成

In [410]:
# Root directory of the project data. The default is the original author's
# local path; set the AI_MODELING_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, so any code that
# still builds paths with `FILE_PATH + 'sub/file.csv'` keeps working.
FILE_PATH = os.path.join(
    os.environ.get(
        'AI_MODELING_DATA_DIR',
        '/Users/atsushisato/Google_drive/05_PBL/second_section/practice_4/ai_modeling/data/',
    ),
    '',
)

# Load the raw train/test tables and rename their Japanese headers to the
# ASCII identifiers used by the rest of the notebook.
train_data = pd.read_csv(os.path.join(FILE_PATH, 'external', 'train.csv')).rename(
    columns={'顧客ID': 'uuid', 'クーポンID': 'coupon_id', 'クーポン利用': 'coupon_use_result'}
)
test_data = pd.read_csv(os.path.join(FILE_PATH, 'external', 'test.csv')).rename(
    columns={'顧客ID': 'uuid', 'クーポンID': 'coupon_id'}
)

# Pre-built customer table; it is merged on (uuid, coupon_id) further down.
customer_df = pd.read_csv(os.path.join(FILE_PATH, 'interim', 'customer_df.csv'))

# 特徴量作成

In [411]:
# Feature builder for steps that may be applied to train and test independently
def create_feature_separate(dataframe):
    """Add the row-wise engineered features to one data split.

    The frame is first passed through ``category_to_int_conversion``; then two
    columns are appended, each computed row by row:

    * ``category_average_price`` from ``purchase_product_num``, ``coupon_use``
      and ``purchase_price``
    * ``one_purchase_price`` from ``all_purchase_price`` and
      ``visits_frequency``

    Returns the converted frame with both columns added.
    """
    converted = category_to_int_conversion(dataframe)

    def _category_price_of(row):
        return category_average_price(
            row['purchase_product_num'], row['coupon_use'], row['purchase_price']
        )

    def _visit_price_of(row):
        return one_purchase_price(row['all_purchase_price'], row['visits_frequency'])

    converted['category_average_price'] = converted.apply(_category_price_of, axis=1)
    converted['one_purchase_price'] = converted.apply(_visit_price_of, axis=1)
    return converted

# Issue 9/10: convert categorical variables to numeric values
def category_to_int_conversion(dataframe):
    """Return a copy of ``dataframe`` with its categorical columns encoded.

    * ``marry``       -> ``marry_flg``: '独身' -> 0, '既婚' -> 1; '無回答' and any
      other value become missing.
    * ``age_range``   -> ``age``: integer made from the first two characters.
    * ``category_id`` -> same column, integer made from everything after the
      first character.

    The source columns ``age_range``, ``marry`` and ``category`` are absent
    from the result. The input frame itself is left untouched: the previous
    version wrote the derived columns into the caller's frame before copying,
    which silently changed it and made a second call on it fail.
    """
    marry_dict = {'無回答': None, '独身': 0, '既婚': 1}

    # drop() returns a new frame, so every assignment below lands on the copy.
    # New columns are appended in the same order as before (marry_flg, age) and
    # category_id is overwritten in place, so the column order is unchanged.
    converted = dataframe.drop(['age_range', 'marry', 'category'], axis=1)

    converted['marry_flg'] = dataframe['marry'].apply(lambda x: marry_dict.get(x))
    converted['age'] = dataframe['age_range'].apply(lambda x: int(x[:2]))
    converted['category_id'] = dataframe['category_id'].apply(lambda x: int(x[1:]))
    return converted

# Issue 11: average unit price within the category
def category_average_price(product_num, coupon_use, price):
    """Return the truncated per-item price ``int(total / product_num)``.

    ``total`` is ``price`` plus the absolute value of ``coupon_use``
    (presumably restoring the pre-discount amount - confirm against the data
    definition).

    Returns 0 when the total is 0 or when ``product_num`` is 0. The second
    guard is new: previously a non-zero total with zero items raised
    ``ZeroDivisionError``. It matches how ``one_purchase_price`` treats a zero
    denominator.
    """
    total = price + abs(coupon_use)
    if total == 0 or product_num == 0:
        return 0
    return int(total / product_num)
    
# Issue 16: average spend per visit over all purchases
def one_purchase_price(all_purchase_price, visits_frequency):
    """Return ``int(all_purchase_price / visits_frequency)``.

    Returns 0 when either argument is 0, which also avoids dividing by zero.
    """
    if all_purchase_price == 0 or visits_frequency == 0:
        return 0
    return int(all_purchase_price / visits_frequency)

In [412]:
# NOTE(review): disabled experiment, kept for reference only. If it is
# re-enabled, transform_box_cox must end with `return train, test`; as written
# it returns nothing, so the tuple unpacking in create_feature_together fails.
# def create_feature_together(train, test, num_cols):
#     train, test = transform_box_cox(train, test, num_cols)
#     return train, test

# # Issue 14: Box-Cox transform
# def transform_box_cox(train, test, column):
#     pt = PowerTransformer(method='box-cox')
#     pt.fit(train[column])

#     # Replace each column with its transformed values
#     train[column] = pt.transform(train[column])
#     test[column] = pt.transform(test[column])

In [413]:
# Build the features for train_data and test_data.
# Each split is inner-joined with the customer table on (uuid, coupon_id) and
# then run through the feature functions that can be applied to train and test
# independently of each other.
train_data = (
    train_data
    .merge(customer_df, on=['uuid', 'coupon_id'], how='inner')
    .pipe(create_feature_separate)
)
test_data = (
    test_data
    .merge(customer_df, on=['uuid', 'coupon_id'], how='inner')
    .pipe(create_feature_separate)
)

# Features that need the same fitted transform on train_data and test_data
# (currently disabled).
# Columns to transform:
# num_cols = ['all_purchase_price']
# train_data, test_data = create_feature_together(train_data, test_data, num_cols)

In [448]:
# Train a LightGBM binary classifier on a single hold-out split.
# The features are every column of train_data except the target and the
# customer id. This cell previously read from `train`, a name no cell in the
# notebook defines, so it failed with NameError on a fresh kernel.
train_x = train_data.drop(['coupon_use_result', 'uuid'], axis=1)
train_y = train_data[['coupon_use_result']]

params = {
    'objective': 'binary',       # binary classification
    'metric': 'binary_logloss',  # evaluate with binary log loss (not AUC)
    'seed': 70
}
num_round = 1000
categorical_feature = ['marry_flg', 'category_id', 'age']

# Kept as a list so that more folds can be appended later; only one split is
# evaluated here.
scores = []

# 75% / 25% shuffled hold-out split with a fixed seed.
tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y,
                                          test_size=0.25, random_state=71, shuffle=True)
lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

model = lgb.train(params,
                  lgb_train,
                  verbose_eval=50,  # log the metrics every 50 iterations
                  num_boost_round=num_round,
                  early_stopping_rounds=100,  # stop after 100 rounds without validation improvement
                  categorical_feature=categorical_feature,
                  valid_names=['train', 'valid'],
                  valid_sets=[lgb_train, lgb_eval])

# Score the hold-out rows and record the log loss.
va_pred = model.predict(va_x)
score = log_loss(va_y, va_pred)
scores.append(score)

print("バリデーション結果：", np.mean(scores))

New categorical_feature is ['age', 'category_id', 'marry_flg']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds
[50]	train's binary_logloss: 0.00206541	valid's binary_logloss: 0.0630854
[100]	train's binary_logloss: 0.000149706	valid's binary_logloss: 0.0864307
Early stopping, best iteration is:
[16]	train's binary_logloss: 0.0150067	valid's binary_logloss: 0.0485642
バリデーション結果： 0.04856415838457367


In [449]:
# Show the validation log-loss values collected by the training cell.
scores

[0.04856415838457367]

In [450]:
#テストデータの予測
pred = model.predict(test_data.drop('uuid',axis=1))
pred = pd.Series(pred, name='pred')

In [451]:
# 特徴量重要度の算出 (データフレームで取得)
cols = list(train_x.columns)         # 特徴量名のリスト
f_importance = np.array(model.feature_importance()) # 特徴量重要度の算出
f_importance = f_importance / np.sum(f_importance)  # 正規化(必要ない場合はコメントアウト)
df_importance = pd.DataFrame({'feature':cols, 'importance':f_importance})
df_importance = df_importance.sort_values('importance', ascending=False) # 降順ソート
display(df_importance)

Unnamed: 0,feature,importance
7,all_purchase_price,0.156904
11,purchase_price,0.121339
15,category_average_price,0.117155
0,coupon_id,0.104603
16,one_purchase_price,0.104603
9,purchase_product_num,0.087866
4,income,0.07113
5,visits_frequency,0.069038
6,pass_days,0.060669
13,marry_flg,0.037657


In [452]:
# Inspect the predicted values for the test rows.
pred

0       0.002097
1       0.002098
2       0.002099
3       0.002099
4       0.002850
          ...   
3911    0.003417
3912    0.003329
3913    0.003417
3914    0.003417
3915    0.004572
Name: pred, Length: 3916, dtype: float64

In [453]:
def round_score(pred, lower=0.0056, upper=0.9944):
    """Snap a near-certain prediction to exactly 0 or 1.

    Parameters
    ----------
    pred : float
        A single predicted value.
    lower : float, optional
        Values less than or equal to this become 0. Defaults to 0.0056, the
        threshold that used to be hard-coded.
    upper : float, optional
        Values greater than or equal to this become 1. Defaults to 0.9944, the
        threshold that used to be hard-coded.

    Returns
    -------
    0, 1, or ``pred`` unchanged when it lies strictly between the thresholds.
    The lower threshold is checked first, as before.
    """
    if pred <= lower:
        return 0
    if pred >= upper:
        return 1
    return pred

In [454]:
# Apply round_score to every prediction, attach the predictions to the test
# rows by index, and reshape to one row per uuid and one column per coupon_id.
pred = pred.apply(round_score)
result = (
    test_data.join(pred)
    .loc[:, ['uuid', 'coupon_id', 'pred']]
    .pivot(index='uuid', columns='coupon_id', values='pred')
)

In [455]:
# Smallest prediction after round_score has been applied.
pred.min()

0.0

In [456]:
# Largest prediction after round_score has been applied.
pred.max()

0.4722730819319187

In [457]:
# Display the uuid x coupon_id matrix of predictions.
result

coupon_id,1,2,3,4,5,6,7,8,9,10,11
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
357,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.056035,0.000000,0.000000,0.000000
358,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.013040,0.000000,0.000000,0.000000
359,0.000000,0.000000,0.000000,0.000000,0.000000,0.005808,0.005808,0.442978,0.035278,0.014818,0.018069
360,0.000000,0.005673,0.000000,0.000000,0.000000,0.000000,0.000000,0.007503,0.000000,0.000000,0.006021
361,0.006605,0.007645,0.006050,0.000000,0.018098,0.018739,0.020983,0.031510,0.023569,0.037241,0.089923
...,...,...,...,...,...,...,...,...,...,...,...
708,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005737
709,0.000000,0.006528,0.000000,0.000000,0.000000,0.000000,0.000000,0.010281,0.000000,0.000000,0.000000
710,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
711,0.104299,0.000000,0.007141,0.031974,0.016277,0.017463,0.187609,0.348862,0.078174,0.012413,0.020951


In [458]:
# Write the prediction matrix to CSV without a header row (the uuid index is kept).
# NOTE(review): absolute path on the author's machine - adjust before running elsewhere.
result.to_csv(
    path_or_buf='/Users/atsushisato/Google_drive/05_PBL/second_section/practice_4/ai_modeling/models/model.csv',
    header=False,
)