In [None]:
# Mount Google Drive so that files under /content/drive can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Sun Dec 12 13:40:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install deepctr-torch

Collecting deepctr-torch
  Downloading deepctr_torch-0.2.7-py3-none-any.whl (70 kB)
[?25l[K     |████▊                           | 10 kB 23.9 MB/s eta 0:00:01[K     |█████████▍                      | 20 kB 28.2 MB/s eta 0:00:01[K     |██████████████                  | 30 kB 26.1 MB/s eta 0:00:01[K     |██████████████████▊             | 40 kB 19.7 MB/s eta 0:00:01[K     |███████████████████████▍        | 51 kB 15.3 MB/s eta 0:00:01[K     |████████████████████████████    | 61 kB 11.4 MB/s eta 0:00:01[K     |████████████████████████████████| 70 kB 4.8 MB/s 
Installing collected packages: deepctr-torch
Successfully installed deepctr-torch-0.2.7


In [None]:
# lib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# training
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split

# deepfm
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM


from tensorflow.python.keras.models import  save_model,load_model
import joblib # model save & load
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three CSV tables from the mounted Drive folder, in the order
# dataset.csv, SW_data.csv, ad.csv.
data_path = '/content/drive/MyDrive/Colab Notebooks/data/ad.zip (Unzipped Files)/data/'

dataset, SW_dataset, ad = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('dataset.csv', 'SW_data.csv', 'ad.csv')
)

In [None]:
# reduce memory
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are cast to the narrowest of int8/int16/int32 whose
    range contains the column's min and max (bounds inclusive). Float
    columns are cast to float16 or float32 only when every value survives
    the round trip unchanged, so prices, counts and category ids stored as
    floats are never rounded by the downcast.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory footprint and the
            percentage saved; when false, print nothing.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        if str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            # Narrowest first; an empty column yields NaN bounds, which fail
            # every comparison and leave the column untouched.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            values = df[col].to_numpy()
            for candidate in (np.float16, np.float32):
                # Never widen: stop once the candidate is not smaller than the current dtype.
                if np.dtype(candidate).itemsize >= values.dtype.itemsize:
                    break
                # Out-of-range values become inf here and are then rejected by the
                # equality check, so the overflow warning carries no information.
                with np.errstate(over='ignore', invalid='ignore'):
                    narrowed = values.astype(candidate)
                restored = narrowed.astype(values.dtype)
                lossless = (restored == values) | (np.isnan(restored) & np.isnan(values))
                if lossless.all():
                    df[col] = narrowed
                    break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the ratio for a frame that reports zero bytes.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Shrink the three frames in the order ad, dataset, SW_dataset and rebind each name
# to the returned frame.
ad, dataset, SW_dataset = (
    reduce_mem_usage(frame) for frame in (ad, dataset, SW_dataset)
)

Memory usage after optimization is: 0.07 MB
Decreased by 50.9%
Memory usage after optimization is: 77.36 MB
Decreased by 66.1%
Memory usage after optimization is: 82.25 MB
Decreased by 65.9%


In [None]:
# Count missing values per column.
SW_dataset.isna().sum()

Unnamed: 0                      0
label                           0
day                             0
hour                            0
c_user_gender                   0
c_user_age                      0
user_following_count            0
user_pay_count                  0
user_parcel_post_count          0
user_transfer_count             0
user_chat_count                 0
advertiser_grade                0
advertiser_item_count           0
advertiser_interest_count       0
advertiser_follower_count       0
advertiser_pay_count            0
advertiser_review_count         0
advertiser_parcel_post_count    0
advertiser_transfer_count       0
advertiser_chat_count           0
advertiser_favorite_count       0
advertiser_comment_count        0
content_bid_price               0
content_price                   0
c_content_flag_used             0
c_content_category_id_1         0
c_content_category_id_2         0
c_content_category_id_3         0
content_emergency_count         0
content_commen

In [None]:
# Distinct values present in the content_count column.
SW_dataset['content_count'].unique()

array([-0.2222,  0.1111,  2.334 ,  0.    ,  0.3333,  0.6665, -0.1111,
        0.2222,  4.11  ,  3.223 ,  1.556 ,  1.    ,  1.223 ,  0.8887,
        4.777 ,  5.89  ,  3.    , -0.3333,  2.445 ,  0.5557,  2.    ,
        3.334 ,  1.667 ,  2.223 , 11.22  ,  0.778 ,  2.555 ,  3.889 ,
        0.4443,  5.    ,  2.666 ,  3.111 ,  2.111 ,  2.889 ,  1.444 ,
        1.889 ,  1.111 ,  5.332 ,  1.333 ,  2.777 ,  6.332 ,  6.555 ,
        3.445 ,  1.777 ,  4.89  , 19.89  ,  6.445 ,  4.332 ,  4.555 ,
        7.11  ,  6.    ,  6.11  ,  9.11  , 10.336 ,  3.777 , 10.445 ,
        4.223 , 10.11  ,  7.    ,  5.777 ,  7.332 ,  7.555 ,  7.223 ,
        7.668 ,  5.11  ,  4.445 ,  7.777 ,  4.    ,  8.89  ,  4.668 ,
        8.22  ,  3.666 , 12.22  ,  9.    ,  6.777 ,  6.223 ,  8.664 ,
        9.22  , 20.11  ,  9.445 ,  7.89  ,  6.89  ,  5.668 ,  6.668 ,
       11.89  ,  3.555 ,  9.336 ,  5.555 , 11.    , 14.336 , 10.    ,
       14.    ,  7.445 ,  8.11  ,  8.    ,  5.445 , 22.89  ,  8.445 ,
       14.664 , 14.8

In [None]:
# Drop the saved CSV index column plus the timestamp and image-URL columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError for the already-removed columns.
SW_dataset = SW_dataset.drop(
    columns=['Unnamed: 0', 'server_time_kst', 'm_time', 'content_img_url'],
    errors='ignore',
)
SW_dataset

Unnamed: 0,label,day,hour,c_user_gender,c_user_age,user_following_count,user_pay_count,user_parcel_post_count,user_transfer_count,user_chat_count,advertiser_grade,advertiser_item_count,advertiser_interest_count,advertiser_follower_count,advertiser_pay_count,advertiser_review_count,advertiser_parcel_post_count,advertiser_transfer_count,advertiser_chat_count,advertiser_favorite_count,advertiser_comment_count,content_bid_price,content_price,c_content_flag_used,c_content_category_id_1,c_content_category_id_2,c_content_category_id_3,content_emergency_count,content_comment_count,content_interest_count,content_favorite_count,content_id,content_count
0,0,20210831,5,0,0,28,0,0,0,0,5285,246,4521,0,496,542,0,0,11,2131,738,70,310000.0,1.0,600.0,600700.0,600700032.0,1.0,6.0,28595.0,1067.0,137083455,-0.222168
1,0,20210831,5,1,46,0,0,0,0,0,0,3940,84,0,0,0,0,0,0,34,0,50,200000.0,1.0,750.0,750610.0,750610112.0,0.0,0.0,179.0,7.0,162622860,0.111084
2,0,20210831,5,1,26,9,0,0,0,0,0,3940,84,0,0,0,0,0,0,34,0,50,200000.0,1.0,750.0,750610.0,750610112.0,0.0,0.0,179.0,7.0,162622860,-0.222168
3,0,20210831,5,2,45,3,0,0,0,0,323,557,185,0,20,33,0,0,0,177,1,55,32000.0,2.0,810.0,810200.0,810200320.0,0.0,0.0,1653.0,122.0,149111357,2.333984
4,0,20210831,5,1,34,1,0,0,0,0,7673,431,31681,0,575,775,0,35,130,1900,529,80,699000.0,1.0,600.0,600100.0,600100032.0,8.0,2.0,40601.0,689.0,100010275,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853863,0,20210831,9,1,18,0,1,0,1,1,7673,431,31681,0,575,775,0,35,130,1900,529,75,250000.0,1.0,600.0,600100.0,600100032.0,9.0,3.0,58750.0,863.0,104223605,-0.222168
853864,0,20210831,9,2,40,11,0,0,0,1,59,2,3,0,5,6,0,0,3,2,0,55,70000.0,1.0,430.0,430300.0,430300992.0,1.0,2.0,65.0,9.0,163090225,5.445312
853865,0,20210831,9,2,36,1,0,0,0,0,511,9,320,0,0,52,0,0,5,29,2,75,1000000.0,2.0,320.0,320080.0,320080992.0,6.0,0.0,24520.0,133.0,146485304,0.666504
853866,0,20210831,9,1,54,0,0,0,0,0,20847,702,472261,0,438,2172,0,19,21,13291,1908,50,50000.0,2.0,420.0,420200.0,420200992.0,3.0,1.0,6229.0,71.0,135412901,0.111084


* day도 드랍할 예정
    * 모두 20210831이기 때문에 학습에 필요 없는 feature일 것이다.

In [None]:
# Show the column names that remain after the drop.
SW_dataset.columns

Index(['label', 'day', 'hour', 'c_user_gender', 'c_user_age',
       'user_following_count', 'user_pay_count', 'user_parcel_post_count',
       'user_transfer_count', 'user_chat_count', 'advertiser_grade',
       'advertiser_item_count', 'advertiser_interest_count',
       'advertiser_follower_count', 'advertiser_pay_count',
       'advertiser_review_count', 'advertiser_parcel_post_count',
       'advertiser_transfer_count', 'advertiser_chat_count',
       'advertiser_favorite_count', 'advertiser_comment_count',
       'content_bid_price', 'content_price', 'c_content_flag_used',
       'c_content_category_id_1', 'c_content_category_id_2',
       'c_content_category_id_3', 'content_emergency_count',
       'content_comment_count', 'content_interest_count',
       'content_favorite_count', 'content_id', 'content_count'],
      dtype='object')

In [None]:
# Every column of SW_dataset except the target 'label', in the frame's column order.
features = [
    # request context
    'day',
    'hour',
    # user
    'c_user_gender',
    'c_user_age',
    'user_following_count',
    'user_pay_count',
    'user_parcel_post_count',
    'user_transfer_count',
    'user_chat_count',
    # advertiser
    'advertiser_grade',
    'advertiser_item_count',
    'advertiser_interest_count',
    'advertiser_follower_count',
    'advertiser_pay_count',
    'advertiser_review_count',
    'advertiser_parcel_post_count',
    'advertiser_transfer_count',
    'advertiser_chat_count',
    'advertiser_favorite_count',
    'advertiser_comment_count',
    # content
    'content_bid_price',
    'content_price',
    'c_content_flag_used',
    'c_content_category_id_1',
    'c_content_category_id_2',
    'c_content_category_id_3',
    'content_emergency_count',
    'content_comment_count',
    'content_interest_count',
    'content_favorite_count',
    'content_id',
    'content_count',
]

In [None]:
# Label-encode every feature column of SW_dataset in place, using a fresh encoder
# per column; `lbe` is left bound to the encoder of the last column.
for feat in features:
    lbe = LabelEncoder().fit(SW_dataset[feat])
    SW_dataset[feat] = lbe.transform(SW_dataset[feat])

# NOTE(review): this displays `dataset`, not the frame that was just encoded — confirm that is intended.
dataset.head()

Unnamed: 0,label,day,hour,c_user_gender,c_user_age,user_following_count,user_pay_count,user_parcel_post_count,user_transfer_count,user_chat_count,advertiser_grade,advertiser_item_count,advertiser_interest_count,advertiser_follower_count,advertiser_pay_count,advertiser_review_count,advertiser_parcel_post_count,advertiser_transfer_count,advertiser_chat_count,advertiser_favorite_count,advertiser_comment_count,content_bid_price,content_price,c_content_flag_used,c_content_category_id_1,c_content_category_id_2,c_content_category_id_3,content_emergency_count,content_comment_count,content_interest_count,content_favorite_count,content_id,server_time_kst,m_time,content_img_url
0,0,20210831,5,0,0,28,0,0,0,0,5285,246,4521,0,496,542,0,0,11,2131,738,70,310000.0,1.0,600.0,600700.0,600700032.0,1.0,6.0,28595.0,1067.0,137083455,2021-08-31T05:59:24.222+09:00,1629687449,https://media.bunjang.co.kr/product/137083455_...
1,0,20210831,5,1,46,0,0,0,0,0,0,3940,84,0,0,0,0,0,0,34,0,50,200000.0,1.0,750.0,750610.0,750610112.0,0.0,0.0,179.0,7.0,162622860,2021-08-31T05:05:20.388+09:00,1629786400,https://media.bunjang.co.kr/product/162622860_...
2,0,20210831,5,1,26,9,0,0,0,0,0,3940,84,0,0,0,0,0,0,34,0,50,200000.0,1.0,750.0,750610.0,750610112.0,0.0,0.0,179.0,7.0,162622860,2021-08-31T05:29:27.712+09:00,1629786400,https://media.bunjang.co.kr/product/162622860_...
3,0,20210831,5,2,45,3,0,0,0,0,323,557,185,0,20,33,0,0,0,177,1,55,32000.0,2.0,810.0,810200.0,810200320.0,0.0,0.0,1653.0,122.0,149111357,2021-08-31T05:30:35.883+09:00,1630373220,https://media.bunjang.co.kr/product/149111357_...
4,0,20210831,5,1,34,1,0,0,0,0,7673,431,31681,0,575,775,0,35,130,1900,529,80,699000.0,1.0,600.0,600100.0,600100032.0,8.0,2.0,40601.0,689.0,100010275,2021-08-31T05:15:15.906+09:00,1630518932,https://media.bunjang.co.kr/product/100010275_...


* 우리가 학습에 사용할 데이터는 대부분이 numerical 한데 왜 레이블 인코딩을 했는가?
    * 레이블 인코딩은 학습을 돌릴 때 문자열이 들어올 수 없기 때문에 문자열을 숫자로 바꿔주는 기법이다.
    * 근데, 오픈 소스 구성 상 레이블 인코딩을 하지 않으면 에러가 난다. 그래서 레이블 인코딩을 했다.  
    (오픈 소스 예제 코드에 사용된 데이터의 컬럼은 대부분이 이산형 변수였기 때문에 레이블 인코딩이 진행되었다.)
* 우리 데이터에 레이블 인코딩을 하면 데이터의 numerical한 성질이 사라지지 않나?
    * 그럴 것이라 생각해서 다른 방안을 모색해야하나 싶었는데, 성능이 준수하게 나왔다.  
        (아래에서 확인하겠지만, 동일 데이터셋으로 xgboost에서 0.083이 나왔고, deepfm에서 0.088이 나왔다. deepfm 등의 SOTA 모델보다 gradient boosting 모델이 더 좋은 성능을 냈다는 기술블로그의 연구 결과와도 일치한다.)

In [None]:
# One SparseFeat per feature column; the vocabulary size is the number of distinct
# values in that column.
fixlen_feature_columns = [
    SparseFeat(name, vocabulary_size=SW_dataset[name].nunique())
    for name in features
]

# The linear (FM) part and the DNN part are given the very same list object.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

* deepfm은 fm과 dnn 이 서로 같은 인풋 데이터를 공유한다.
* 이 cell은 input 데이터를 만들기 위한 과정 중 하나다.

In [None]:
# Inspect the generated SparseFeat definitions (name, vocabulary_size, embedding_dim, ...).
fixlen_feature_columns

[SparseFeat(name='day', vocabulary_size=1, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='day', group_name='default_group'),
 SparseFeat(name='hour', vocabulary_size=24, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='hour', group_name='default_group'),
 SparseFeat(name='c_user_gender', vocabulary_size=3, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='c_user_gender', group_name='default_group'),
 SparseFeat(name='c_user_age', vocabulary_size=93, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='c_user_age', group_name='default_group'),
 SparseFeat(name='user_following_count', vocabulary_size=392, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_following_count', group_name='default_group'),
 SparseFeat(name='user_pay_count', vocabulary_size=162, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='user_pay_count', group_name='default_group'),
 SparseFeat(name='user_parcel_post_count', vocabul

In [None]:
# Collect the feature names from the column definitions; these names are later used
# as the keys of the model-input dicts.
feature_names = get_feature_names(dnn_feature_columns + linear_feature_columns)
feature_names

['day',
 'hour',
 'c_user_gender',
 'c_user_age',
 'user_following_count',
 'user_pay_count',
 'user_parcel_post_count',
 'user_transfer_count',
 'user_chat_count',
 'advertiser_grade',
 'advertiser_item_count',
 'advertiser_interest_count',
 'advertiser_follower_count',
 'advertiser_pay_count',
 'advertiser_review_count',
 'advertiser_parcel_post_count',
 'advertiser_transfer_count',
 'advertiser_chat_count',
 'advertiser_favorite_count',
 'advertiser_comment_count',
 'content_bid_price',
 'content_price',
 'c_content_flag_used',
 'c_content_category_id_1',
 'c_content_category_id_2',
 'c_content_category_id_3',
 'content_emergency_count',
 'content_comment_count',
 'content_interest_count',
 'content_favorite_count',
 'content_id',
 'content_count']

In [None]:
# Hold out 20% of the rows for testing (fixed seed), then report both split shapes.
target = ['label']
train, test = train_test_split(SW_dataset, test_size=0.2, random_state=42)
print(train.shape, test.shape, sep='\n')

(683094, 33)
(170774, 33)


In [None]:
# Inspect the training split (feature values are the label-encoded ids).
train

Unnamed: 0,label,day,hour,c_user_gender,c_user_age,user_following_count,user_pay_count,user_parcel_post_count,user_transfer_count,user_chat_count,advertiser_grade,advertiser_item_count,advertiser_interest_count,advertiser_follower_count,advertiser_pay_count,advertiser_review_count,advertiser_parcel_post_count,advertiser_transfer_count,advertiser_chat_count,advertiser_favorite_count,advertiser_comment_count,content_bid_price,content_price,c_content_flag_used,c_content_category_id_1,c_content_category_id_2,c_content_category_id_3,content_emergency_count,content_comment_count,content_interest_count,content_favorite_count,content_id,content_count
504937,0,0,11,1,8,0,0,0,0,0,250,40,284,0,85,172,0,1,17,219,176,1,0,0,0,0,0,0,0,0,0,845,0
653240,0,0,18,1,14,8,0,0,0,1,125,130,99,0,0,61,0,0,0,124,1,1,328,2,10,67,127,0,0,631,31,954,5
712366,0,0,18,1,4,0,1,1,0,2,160,91,157,0,0,89,0,0,2,172,0,0,141,2,10,67,127,2,0,731,90,1044,1
288467,0,0,0,2,8,43,1,1,0,0,253,161,210,0,130,175,0,11,53,209,161,0,301,1,10,67,127,0,0,121,3,1299,2
470793,0,0,22,2,32,83,0,1,0,0,205,122,248,0,65,129,19,4,4,156,147,1,10,2,10,67,127,0,0,554,39,343,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,0,0,18,1,5,1,1,0,0,0,250,40,284,0,85,172,0,1,17,219,176,10,321,1,10,67,127,0,0,752,99,493,2
365838,0,0,1,1,28,7,0,0,0,0,216,108,187,0,116,139,0,0,11,187,149,0,293,1,10,67,127,0,0,196,7,1374,1
131932,0,0,12,1,16,0,0,0,1,0,140,102,104,0,86,73,0,0,9,84,138,0,281,1,10,61,119,1,0,822,117,464,4
671155,0,0,14,1,25,1,1,0,0,0,253,161,210,0,130,175,0,11,53,209,161,0,305,1,10,67,127,0,0,142,11,1297,1


In [None]:
# Build the model inputs from the names collected above: one dict per split,
# mapping each feature name to that split's column.
train_model_input, test_model_input = (
    {name: split[name] for name in feature_names}
    for split in (train, test)
)

* 위의 난해한 과정을 거쳐 해당 cell에서 모델에 사용될 input을 train과 test별로 나누게 된다.

In [None]:
# Use the first CUDA device when requested and available; otherwise stay on the CPU.
use_cuda = True

if use_cuda and torch.cuda.is_available():
    print('GPU')
    device = 'cuda:0'
else:
    device = 'cpu'

GPU


In [None]:
# Build DeepFM from the feature-column definitions created earlier in the notebook.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    device=device,
    # l2_reg_embedding=1e-5,
    # dnn_dropout=0.6,  # author's note: 0.6~0.9 gave the best results in the DeepFM paper
)

# Optimise binary cross-entropy with Adagrad; track cross-entropy and AUC each epoch.
model.compile(
    optimizer='adagrad',
    loss='binary_crossentropy',
    metrics=['binary_crossentropy', 'auc'],
)

# Train for 30 epochs, holding out 20% of the training rows for validation.
history = model.fit(
    x=train_model_input,
    y=train[target].values,
    batch_size=512,  # author's note: the default is 256
    epochs=30,
    verbose=1,
    validation_split=0.2,
)

cuda:0
Train on 546475 samples, validate on 136619 samples, 1068 steps per epoch


1068it [00:40, 26.60it/s]


Epoch 1/30
42s - loss:  0.0922 - binary_crossentropy:  0.0922 - auc:  0.8603 - val_binary_crossentropy:  0.0887 - val_auc:  0.8732


1068it [00:39, 27.16it/s]


Epoch 2/30
41s - loss:  0.0868 - binary_crossentropy:  0.0868 - auc:  0.8818 - val_binary_crossentropy:  0.0883 - val_auc:  0.8757


1068it [00:39, 26.95it/s]


Epoch 3/30
41s - loss:  0.0859 - binary_crossentropy:  0.0858 - auc:  0.8872 - val_binary_crossentropy:  0.0883 - val_auc:  0.8763


1068it [00:39, 26.72it/s]


Epoch 4/30
42s - loss:  0.0853 - binary_crossentropy:  0.0853 - auc:  0.8894 - val_binary_crossentropy:  0.0881 - val_auc:  0.8768


1068it [00:39, 26.76it/s]


Epoch 5/30
42s - loss:  0.0849 - binary_crossentropy:  0.0849 - auc:  0.8916 - val_binary_crossentropy:  0.0882 - val_auc:  0.8767


1068it [00:39, 26.77it/s]


Epoch 6/30
42s - loss:  0.0845 - binary_crossentropy:  0.0845 - auc:  0.8935 - val_binary_crossentropy:  0.0883 - val_auc:  0.8768


1068it [00:39, 26.77it/s]


Epoch 7/30
42s - loss:  0.0842 - binary_crossentropy:  0.0842 - auc:  0.8942 - val_binary_crossentropy:  0.0883 - val_auc:  0.8774


1068it [00:39, 26.77it/s]


Epoch 8/30
42s - loss:  0.0840 - binary_crossentropy:  0.0839 - auc:  0.8952 - val_binary_crossentropy:  0.0884 - val_auc:  0.8774


1068it [00:40, 26.48it/s]


Epoch 9/30
42s - loss:  0.0837 - binary_crossentropy:  0.0837 - auc:  0.8973 - val_binary_crossentropy:  0.0883 - val_auc:  0.8774


1068it [00:40, 26.65it/s]


Epoch 10/30
42s - loss:  0.0834 - binary_crossentropy:  0.0834 - auc:  0.8982 - val_binary_crossentropy:  0.0884 - val_auc:  0.8777


1068it [00:39, 26.88it/s]


Epoch 11/30
41s - loss:  0.0832 - binary_crossentropy:  0.0832 - auc:  0.8992 - val_binary_crossentropy:  0.0884 - val_auc:  0.8777


1068it [00:39, 26.73it/s]


Epoch 12/30
42s - loss:  0.0830 - binary_crossentropy:  0.0830 - auc:  0.9001 - val_binary_crossentropy:  0.0885 - val_auc:  0.8777


1068it [00:40, 26.63it/s]


Epoch 13/30
42s - loss:  0.0828 - binary_crossentropy:  0.0828 - auc:  0.9013 - val_binary_crossentropy:  0.0886 - val_auc:  0.8779


1068it [00:39, 26.73it/s]


Epoch 14/30
42s - loss:  0.0826 - binary_crossentropy:  0.0826 - auc:  0.9017 - val_binary_crossentropy:  0.0886 - val_auc:  0.8782


1068it [00:39, 26.73it/s]


Epoch 15/30
42s - loss:  0.0824 - binary_crossentropy:  0.0824 - auc:  0.9024 - val_binary_crossentropy:  0.0886 - val_auc:  0.8781


1068it [00:40, 26.58it/s]


Epoch 16/30
42s - loss:  0.0822 - binary_crossentropy:  0.0822 - auc:  0.9028 - val_binary_crossentropy:  0.0888 - val_auc:  0.8779


1068it [00:40, 26.49it/s]


Epoch 17/30
42s - loss:  0.0821 - binary_crossentropy:  0.0821 - auc:  0.9033 - val_binary_crossentropy:  0.0891 - val_auc:  0.8779


1068it [00:39, 26.72it/s]


Epoch 18/30
42s - loss:  0.0820 - binary_crossentropy:  0.0820 - auc:  0.9041 - val_binary_crossentropy:  0.0890 - val_auc:  0.8780


1068it [00:40, 26.40it/s]


Epoch 19/30
42s - loss:  0.0819 - binary_crossentropy:  0.0819 - auc:  0.9045 - val_binary_crossentropy:  0.0891 - val_auc:  0.8781


1068it [00:40, 26.30it/s]


Epoch 20/30
42s - loss:  0.0817 - binary_crossentropy:  0.0817 - auc:  0.9049 - val_binary_crossentropy:  0.0892 - val_auc:  0.8780


1068it [00:40, 26.47it/s]


Epoch 21/30
42s - loss:  0.0816 - binary_crossentropy:  0.0816 - auc:  0.9057 - val_binary_crossentropy:  0.0894 - val_auc:  0.8779


1068it [00:40, 26.44it/s]


Epoch 22/30
42s - loss:  0.0815 - binary_crossentropy:  0.0815 - auc:  0.9055 - val_binary_crossentropy:  0.0894 - val_auc:  0.8778


1068it [00:40, 26.36it/s]


Epoch 23/30
42s - loss:  0.0814 - binary_crossentropy:  0.0814 - auc:  0.9064 - val_binary_crossentropy:  0.0895 - val_auc:  0.8776


129it [00:04, 28.06it/s]

* 모델이 짜여지고 학습이 이루어진다.
* 현재는 하이퍼파라미터 튜닝을 안 한 상태이며 하이퍼파라미터 종류를 확인하고 시간이 남으면 진행해볼 예정이다.

In [None]:
# Score the held-out test split with the trained model.
pred = model.predict(x = test_model_input,
                     batch_size = 512)

print("")
# Report log loss and ROC AUC against the true test labels, rounded to 4 decimals.
print('test log loss : ', round(log_loss(test[target].values, pred), 4))
print('test AUC : ', round(roc_auc_score(test[target].values, pred), 4))


test log loss :  0.0889
test AUC :  0.8748


* test 데이터셋에도 정상적으로 돌아가며 준수한 loss가 찍힌다.
* 다만 문제점이 하나 있다.
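    * 추천을 해주려면 각 클래스(클릭한다/안 한다)에 속할 확률을 리턴해줘서 확률 값을 내림차순으로 top-k를 뽑을 수 있어야 하는데, deepfm에는 확률값 리턴 메소드인 predict_proba가 없다. 다만 task = 'binary'로 학습한 경우 predict가 반환하는 값(위의 pred)이 0~1 사이의 클릭 확률이므로, 이 값을 내림차순으로 정렬하면 top-k를 뽑을 수 있다.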

In [None]:
# Persist the whole trained model object to Drive.
# NOTE(review): torch.save on a full model object presumably writes a pickle, so the file
# would not be HDF5 despite the .h5 suffix — confirm before loading it with other tools.
torch.save(model, '/content/drive/MyDrive/deepfm.h5')
# To restore later: model = torch.load(PATH)

In [None]:
# View the test-set predictions as a one-column table.
pd.DataFrame(pred)

Unnamed: 0,0
0,6.898328e-04
1,7.020306e-03
2,1.626103e-09
3,2.318446e-02
4,6.927109e-02
...,...
170769,1.730059e-02
170770,6.885687e-08
170771,1.158793e-06
170772,7.382467e-10
