In [1]:
import torch
import pandas as pd

from inputs.preprocessing import PreProcessing
from models.linear import Linear
from models.dnn import DNN
from models.fm import FM
from models.deepfm import DeepFM
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [2]:
data = pd.read_csv('./examples/bunjang_sampling.csv')
# data = df.sample(frac=0.25)

In [3]:
len(data)

1626983

In [4]:
print('click proportion of original data:', len(df[df['click'] == 1])/len(df))
print('click proportion of sampling data:', len(data[data['click'] == 1])/len(data))

In [5]:
data.columns

In [6]:
sparse_feature_names1 = ['ad_type', 'u_bizlicense', 'u_sex', 'u_age', 'u_married', 'p_taekpo', 'p_exchg', 'p_category_id']
dense_feature_names = ['u_favorite_count', 'u_comment_count', 'u_review_count', 'u_grade',
                       'u_item_count', 'u_interest', 'u_following_cnt', 'u_parcel_post_count',
                       'u_bunpay_count', 'u_transfer_count', 'u_bunp_account_count', 'u_bunp_meet_count',
                       'p_price', 'p_qty', 'p_image_count', 'p_emergency_cnt', 'p_comment_cnt',
                       'p_interest', 'p_pfavcnt']

preproc = PreProcessing()
dataset1 = preproc.do_preprocessing(data, sparse_feature_names1, dense_feature_names)

train1, test1 = train_test_split(dataset1, test_size=0.2)

feature_names1 = sparse_feature_names1 + dense_feature_names

train_model_input1 = {name: train1[name] for name in feature_names1}
test_model_input1 = {name: test1[name] for name in feature_names1}

In [7]:
sparse_feature_names2 = ['ad_type', 'u_bizlicense', 'p_taekpo', 'p_exchg', 'p_category_id']
# dense_feature_names = ['u_favorite_count', 'u_comment_count', 'u_review_count', 'u_grade',
#                        'u_item_count', 'u_interest', 'u_following_cnt', 'u_parcel_post_count',
#                        'u_bunpay_count', 'u_transfer_count', 'u_bunp_account_count', 'u_bunp_meet_count',
#                        'p_price', 'p_qty', 'p_image_count', 'p_emergency_cnt', 'p_comment_cnt',
#                        'p_interest', 'p_pfavcnt']

preproc = PreProcessing()
dataset2 = preproc.do_preprocessing(data, sparse_feature_names2, dense_feature_names)

train2, test2 = train_test_split(dataset2, test_size=0.2)

feature_names2 = sparse_feature_names2 + dense_feature_names

train_model_input2 = {name: train2[name] for name in feature_names2}
test_model_input2 = {name: test2[name] for name in feature_names2}

In [8]:
linear_model1 = Linear(dataset=data,
                       device=device,
                       sparse_feature_names=sparse_feature_names1,
                       dense_feature_names=dense_feature_names)
linear_model1.fit(train_model_input1, train1['click'].values)

pred = linear_model1.predict(test_model_input1)

print("test LogLoss", round(log_loss(test1['click'].values, pred), 4))
print("test AUC", round(roc_auc_score(test1['click'].values, pred), 4))


1it [00:00,  7.73it/s]

learning device : cpu


5085it [00:36, 140.47it/s]
1it [00:00,  9.94it/s]

epoch : 0 -> loss : 1.0051


5085it [00:35, 142.15it/s]
4it [00:00, 38.06it/s]

epoch : 1 -> loss : 0.9808


5085it [00:34, 147.69it/s]
4it [00:00, 37.77it/s]

epoch : 2 -> loss : 0.9884


5085it [00:35, 144.30it/s]
3it [00:00, 27.80it/s]

epoch : 3 -> loss : 2.8741


5085it [00:36, 140.70it/s]
3it [00:00, 29.22it/s]

epoch : 4 -> loss : 1.8894


5085it [00:36, 140.79it/s]
3it [00:00, 28.34it/s]

epoch : 5 -> loss : 1.9601


5085it [00:36, 139.04it/s]
4it [00:00, 38.11it/s]

epoch : 6 -> loss : 0.9935


5085it [00:38, 131.74it/s]
1it [00:00,  8.22it/s]

epoch : 7 -> loss : 0.0111


5085it [00:38, 133.75it/s]
2it [00:00, 19.41it/s]

epoch : 8 -> loss : 0.0276


5085it [00:35, 144.26it/s]


epoch : 9 -> loss : 0.0133
test LogLoss 0.0848
test AUC 0.5931


In [9]:
linear_model2 = Linear(dataset=data,
                       device=device,
                       sparse_feature_names=sparse_feature_names2,
                       dense_feature_names=dense_feature_names)
linear_model2.fit(train_model_input2, train2['click'].values)

pred = linear_model2.predict(test_model_input2)

print("test LogLoss", round(log_loss(test2['click'].values, pred), 4))
print("test AUC", round(roc_auc_score(test2['click'].values, pred), 4))

2it [00:00, 19.96it/s]

learning device : cpu


5085it [00:30, 168.46it/s]
4it [00:00, 36.44it/s]

epoch : 0 -> loss : 3.8613


5085it [00:29, 174.16it/s]
4it [00:00, 38.55it/s]

epoch : 1 -> loss : 1.9678


5085it [00:28, 175.45it/s]
4it [00:00, 39.36it/s]

epoch : 2 -> loss : 1.9691


5085it [00:29, 170.33it/s]
4it [00:00, 38.24it/s]

epoch : 3 -> loss : 1.9573


5085it [00:29, 172.69it/s]
4it [00:00, 37.88it/s]

epoch : 4 -> loss : 0.985


5085it [00:29, 174.67it/s]
4it [00:00, 39.54it/s]

epoch : 5 -> loss : 4.9095


5085it [00:28, 175.53it/s]
4it [00:00, 39.09it/s]

epoch : 6 -> loss : 1.0107


5085it [00:29, 170.52it/s]
3it [00:00, 27.77it/s]

epoch : 7 -> loss : 1.951


5085it [00:30, 165.88it/s]
4it [00:00, 36.34it/s]

epoch : 8 -> loss : 0.0192


5085it [00:29, 174.46it/s]


epoch : 9 -> loss : 0.9827
test LogLoss 0.0693
test AUC 0.5899


In [10]:
fm_model1 = FM(dataset=data,
                       device=device,
                       sparse_feature_names=sparse_feature_names1,
                       dense_feature_names=dense_feature_names)
fm_model1.fit(train_model_input1, train1['click'].values)

pred = fm_model1.predict(test_model_input1)

print("test LogLoss", round(log_loss(test1['click'].values, pred), 4))
print("test AUC", round(roc_auc_score(test1['click'].values, pred), 4))

1it [00:00,  9.97it/s]

learning device : cpu


5085it [00:37, 137.07it/s]
3it [00:00, 28.56it/s]

epoch : 0 -> loss : 0.9808


5085it [00:37, 136.84it/s]
2it [00:00, 18.87it/s]

epoch : 1 -> loss : 0.0176


5085it [00:38, 133.50it/s]
1it [00:00,  9.84it/s]

epoch : 2 -> loss : 0.9906


5085it [00:39, 130.08it/s]
1it [00:00,  9.55it/s]

epoch : 3 -> loss : 0.9149


5085it [00:37, 134.29it/s]
2it [00:00, 19.93it/s]

epoch : 4 -> loss : 0.0202


5085it [00:38, 132.02it/s]
3it [00:00, 28.04it/s]

epoch : 5 -> loss : 0.997


5085it [00:40, 126.87it/s]
3it [00:00, 29.22it/s]

epoch : 6 -> loss : 0.0143


5085it [00:38, 131.19it/s]
3it [00:00, 28.26it/s]

epoch : 7 -> loss : 0.992


5085it [00:38, 130.54it/s]
2it [00:00, 18.70it/s]

epoch : 8 -> loss : 1.0049


5085it [00:37, 136.87it/s]


epoch : 9 -> loss : 0.9859
test LogLoss 0.0685
test AUC 0.6306


In [11]:
fm_model2 = FM(dataset=data,
                       device=device,
                       sparse_feature_names=sparse_feature_names2,
                       dense_feature_names=dense_feature_names)
fm_model2.fit(train_model_input2, train2['click'].values)

pred = fm_model2.predict(test_model_input2)

print("test LogLoss", round(log_loss(test2['click'].values, pred), 4))
print("test AUC", round(roc_auc_score(test2['click'].values, pred), 4))

1it [00:00,  9.90it/s]

learning device : cpu


5085it [00:31, 161.73it/s]
3it [00:00, 28.64it/s]

epoch : 0 -> loss : 0.9924


5085it [00:31, 160.52it/s]
3it [00:00, 29.17it/s]

epoch : 1 -> loss : 0.9802


5085it [00:31, 160.63it/s]
4it [00:00, 37.96it/s]

epoch : 2 -> loss : 0.015


5085it [00:31, 163.07it/s]
3it [00:00, 29.98it/s]

epoch : 3 -> loss : 0.9774


5085it [00:32, 156.53it/s]
3it [00:00, 29.83it/s]

epoch : 4 -> loss : 0.987


5085it [00:32, 154.57it/s]
2it [00:00, 19.36it/s]

epoch : 5 -> loss : 0.0154


5085it [00:30, 164.51it/s]
4it [00:00, 39.85it/s]

epoch : 6 -> loss : 1.0104


5085it [00:32, 158.54it/s]
4it [00:00, 38.36it/s]

epoch : 7 -> loss : 1.9229


5085it [00:31, 163.32it/s]
2it [00:00, 18.44it/s]

epoch : 8 -> loss : 0.9965


5085it [00:31, 160.14it/s]


epoch : 9 -> loss : 1.9641
test LogLoss 0.0686
test AUC 0.6239


In [12]:
dnn_model1 = DNN(dataset=data,
                       device=device,
                       sparse_feature_names=sparse_feature_names1,
                       dense_feature_names=dense_feature_names)
dnn_model1.fit(train_model_input1, train1['click'].values)

pred = dnn_model1.predict(test_model_input1)

print("test LogLoss", round(log_loss(test1['click'].values, pred), 4))
print("test AUC", round(roc_auc_score(test1['click'].values, pred), 4))

1it [00:00,  9.46it/s]

learning device : cpu


5085it [01:02, 80.92it/s]
1it [00:00,  6.49it/s]

epoch : 0 -> loss : 0.9804


5085it [00:59, 84.97it/s]
1it [00:00,  6.80it/s]

epoch : 1 -> loss : 0.0183


5085it [01:00, 84.63it/s]
1it [00:00,  6.83it/s]

epoch : 2 -> loss : 0.9821


5085it [01:02, 81.67it/s]
1it [00:00,  5.64it/s]

epoch : 3 -> loss : 0.0156


5085it [01:00, 84.19it/s]
1it [00:00,  6.29it/s]

epoch : 4 -> loss : 1.9035


5085it [01:00, 84.19it/s]
1it [00:00,  7.23it/s]

epoch : 5 -> loss : 0.977


5085it [00:59, 84.77it/s]
1it [00:00,  6.34it/s]

epoch : 6 -> loss : 1.9648


5085it [01:00, 84.25it/s]
1it [00:00,  6.16it/s]

epoch : 7 -> loss : 0.0179


5085it [00:59, 84.98it/s]
1it [00:00,  6.69it/s]

epoch : 8 -> loss : 0.9809


5085it [01:01, 83.28it/s]


epoch : 9 -> loss : 1.945
test LogLoss 0.0667
test AUC 0.6371


In [13]:
dnn_model2 = DNN(dataset=data,
                       device=device,
                       sparse_feature_names=sparse_feature_names2,
                       dense_feature_names=dense_feature_names)
dnn_model2.fit(train_model_input2, train2['click'].values)

pred = dnn_model2.predict(test_model_input2)

print("test LogLoss", round(log_loss(test2['click'].values, pred), 4))
print("test AUC", round(roc_auc_score(test2['click'].values, pred), 4))

1it [00:00,  8.56it/s]

learning device : cpu


5085it [00:52, 97.46it/s] 
1it [00:00,  7.06it/s]

epoch : 0 -> loss : 0.0216


5085it [00:51, 99.27it/s] 
1it [00:00,  5.64it/s]

epoch : 1 -> loss : 0.0182


5085it [00:50, 100.42it/s]
1it [00:00,  6.68it/s]

epoch : 2 -> loss : 0.9765


5085it [00:50, 100.97it/s]
1it [00:00,  7.43it/s]

epoch : 3 -> loss : 0.0169


5085it [00:50, 101.01it/s]
1it [00:00,  7.41it/s]

epoch : 4 -> loss : 1.9649


5085it [00:50, 101.66it/s]
1it [00:00,  7.26it/s]

epoch : 5 -> loss : 0.0145


5085it [00:50, 100.53it/s]
1it [00:00,  6.21it/s]

epoch : 6 -> loss : 0.0159


5085it [00:51, 98.70it/s] 
1it [00:00,  7.20it/s]

epoch : 7 -> loss : 0.9567


5085it [00:52, 97.07it/s] 
1it [00:00,  6.50it/s]

epoch : 8 -> loss : 1.9508


5085it [00:53, 94.53it/s] 


epoch : 9 -> loss : 1.0009
test LogLoss 0.0677
test AUC 0.6332


In [14]:
deepfm_model1 = DeepFM(dataset=data,
                       device=device,
                       sparse_feature_names=sparse_feature_names1,
                       dense_feature_names=dense_feature_names)
deepfm_model1.fit(train_model_input1, train1['click'].values)

pred = deepfm_model1.predict(test_model_input1)

print("test LogLoss", round(log_loss(test1['click'].values, pred), 4))
print("test AUC", round(roc_auc_score(test1['click'].values, pred), 4))

1it [00:00,  8.95it/s]

learning device : cpu


5085it [02:14, 37.93it/s]
1it [00:00,  6.18it/s]

epoch : 0 -> loss : 1.0016


5085it [02:08, 39.48it/s]
1it [00:00,  6.36it/s]

epoch : 1 -> loss : 0.9813


5085it [02:05, 40.62it/s]
1it [00:00,  6.54it/s]

epoch : 2 -> loss : 0.9999


5085it [02:05, 40.63it/s]
1it [00:00,  6.48it/s]

epoch : 3 -> loss : 0.9895


5085it [02:08, 39.58it/s]
1it [00:00,  6.38it/s]

epoch : 4 -> loss : 1.9212


5085it [02:07, 39.86it/s]
1it [00:00,  6.43it/s]

epoch : 5 -> loss : 2.9149


5085it [02:14, 37.87it/s]
1it [00:00,  6.72it/s]

epoch : 6 -> loss : 0.0132


5085it [02:09, 39.27it/s]
1it [00:00,  6.30it/s]

epoch : 7 -> loss : 1.9058


5085it [02:05, 40.66it/s]
1it [00:00,  6.17it/s]

epoch : 8 -> loss : 2.8768


5085it [02:08, 39.56it/s]


epoch : 9 -> loss : 0.0308
test LogLoss 0.0716
test AUC 0.6219
