In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
import pickle
from sklearn import metrics
from tabulate import tabulate

In [None]:
# Seed the random number generators of the libraries imported above so that
# repeated runs are comparable: Python's `random`, NumPy and TensorFlow.
# Seeding only `random` would leave NumPy/TensorFlow randomness unseeded.
seed_value = 42  # seed for reproducibility
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [None]:
# Mount Google Drive at /content/drive; the dataset, vocabulary pickles and
# model code used below are all read from paths under /content/drive/MyDrive/ctr.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Put the project's model directory on the import path
# (presumably where the `dcn` and `run_models` modules imported next live — confirm).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
MODEL_CODE_DIR = '/content/drive/MyDrive/ctr/code/model'
if MODEL_CODE_DIR not in sys.path:
    sys.path.append(MODEL_CODE_DIR)

In [None]:
import dcn
import run_models

In [None]:
# ---- Input pipeline settings (consumed by make_csv_dataset below) ----
BATCH_SIZE = 100_000
SHUFFLE_BUFFER_SIZE = 10_000
SHUFFLE_SEED = 42
DATASET_EPOCHS = 1

# ---- Training settings (forwarded to run_models.run_models below) ----
FITTING_EPOCHS = 50
LEARNING_RATE = 1e-4
DCN_PARALLEL = True

# Columns read from the CSV as strings (note that the 'click' column is included here).
STR_COLUMNS = [
    'click',
    'banner_pos',
    'site_id',
    'site_domain',
    'site_category',
    'app_id',
    'app_domain',
    'app_category',
    'device_id',
    'device_ip',
    'device_model',
    'device_type',
    'device_conn_type',
    'C1',
    'C14',
    'C15',
    'C16',
    'C17',
    'C18',
    'C19',
    'C20',
    'C21',
    'day_of_week',
]

# Columns read from the CSV as numbers.
INT_COLUMNS = ['hour_of_day', 'rare count']

# One dtype per selected column: all string columns but one, then a float,
# one more string, and a final float.
# NOTE(review): this layout presumably mirrors the on-disk column order
# (..., 'hour_of_day', 'day_of_week', 'rare count') — confirm against the CSV header.
COLUMN_DEFAULTS = (
    [tf.string] * (len(STR_COLUMNS) - 1)
    + [tf.float32, tf.string, tf.float32]
)

In [None]:
# Note: column_defaults is matched to columns in the order they appear in the original CSV file.

In [None]:
# Training input pipeline: read the training CSV as shuffled batches of
# BATCH_SIZE rows, keeping only the configured string and numeric columns.
TRAIN_FILE = '/content/drive/MyDrive/ctr/avazu/processed/train/train_rare_count.csv'
train_batches = tf.data.experimental.make_csv_dataset(
    file_pattern=TRAIN_FILE,
    batch_size=BATCH_SIZE,
    num_epochs=DATASET_EPOCHS,
    select_columns=STR_COLUMNS + INT_COLUMNS,
    column_defaults=COLUMN_DEFAULTS,
    shuffle=True,
    shuffle_buffer_size=SHUFFLE_BUFFER_SIZE,
    shuffle_seed=SHUFFLE_SEED,
)

In [None]:
# Evaluation input pipeline: same columns and batch size as the training set.
# make_csv_dataset shuffles by default; shuffling is switched off here because
# evaluation gains nothing from it and an unseeded shuffle makes the batches
# differ from run to run.
# NOTE(review): the test CSV is read from the 'processed/train/' directory —
# confirm this is the intended location.
TEST_FILE = '/content/drive/MyDrive/ctr/avazu/processed/train/test_rare_count.csv'
test = tf.data.experimental.make_csv_dataset(
    TEST_FILE,
    batch_size=BATCH_SIZE,
    select_columns=STR_COLUMNS + INT_COLUMNS,
    column_defaults=COLUMN_DEFAULTS,
    shuffle=False,
    num_epochs=DATASET_EPOCHS
)

In [None]:
import pickle

# Load the pre-built vocabulary collection. Below it is indexed first by a
# tuple key and then by feature name.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('/content/drive/MyDrive/ctr/data/total_voca.p', 'rb') as voca_file:
    total_voca = pickle.load(voca_file)

In [None]:
# Load the per-feature vocabularies used for the one-hot encoded features.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('/content/drive/MyDrive/ctr/data/one_hot_encoding_voca.p', 'rb') as encoding_file:
    one_hot_encoding_voca = pickle.load(encoding_file)

In [None]:
# Set the 'day_of_week' vocabulary by hand (replacing any entry loaded from the pickle).
one_hot_encoding_voca['day_of_week'] = [
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
    'Monday',
]

In [None]:
# Maps each feature-engineering method to the categorical features it is applied to.
FEATURE_ENGINEERING = {
    'one-hot encoding': [
        'site_category', 'app_category', 'device_type',
        'C1', 'C15', 'C16', 'C18', 'C21',
        'day_of_week',
    ],
    'threshold + embedding': [
        'device_model', 'C14', 'C17',
        'site_id', 'site_domain', 'app_id',
    ],
    'top-n + one-hot encoding': [
        'app_domain', 'C19', 'C20',
    ],
}

In [None]:
# Pick two vocabulary sets out of total_voca by their tuple keys: one for the
# 'threshold + embedding' features and one for the 'top-n + one-hot encoding' features.
# NOTE(review): the meaning of the key tuples (10, 1000) and (25, 500) is not visible
# in this notebook — presumably the parameters the vocabularies were built with; confirm.
THRESHOLD_VOCA = total_voca[(10,1000)]
TOP_N_VOCA = total_voca[(25, 500)]

In [None]:
# Show which features the selected vocabulary set contains.
THRESHOLD_VOCA.keys()

dict_keys(['site_id', 'site_domain', 'device_model', 'app_id', 'app_domain', 'C14', 'C17', 'C19', 'C20'])

In [None]:
# Collect, per feature-engineering method, the vocabulary of every feature
# assigned to that method in FEATURE_ENGINEERING:
#   - one-hot encoding:         the vocabulary from one_hot_encoding_voca
#   - threshold + embedding:    the 'over_threshold' entry of THRESHOLD_VOCA
#   - top-n + one-hot encoding: the 'top_num' entry of TOP_N_VOCA
vocabularies = {
    'one-hot encoding': {
        name: one_hot_encoding_voca[name]
        for name in FEATURE_ENGINEERING['one-hot encoding']
    },
    'threshold + embedding': {
        name: THRESHOLD_VOCA[name]['over_threshold']
        for name in FEATURE_ENGINEERING['threshold + embedding']
    },
    'top-n + one-hot encoding': {
        name: TOP_N_VOCA[name]['top_num']
        for name in FEATURE_ENGINEERING['top-n + one-hot encoding']
    },
}

In [None]:
# Categorical feature names, concatenated method by method in a fixed order.
str_features = [
    feature_name
    for method in ('one-hot encoding', 'threshold + embedding', 'top-n + one-hot encoding')
    for feature_name in vocabularies[method]
]

# Numeric feature names.
# NOTE(review): 'rare count' is read from the CSV (see INT_COLUMNS) but is not
# listed here — confirm that leaving it out is intentional.
int_features = ['hour_of_day']

In [None]:
# Candidate hyper-parameter values and a container for results.
nodes = [2 ** exponent for exponent in range(5, 11)]  # 32, 64, ..., 1024
deep_layers = list(range(1, 6))                       # 1 .. 5
cross_layers = list(range(1, 7))                      # 1 .. 6
result_dict = dict()

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Convert every vocabulary entry to str, in place, for each method's feature set.
for voca_set in vocabularies.values():
    voca_set.update({
        feature: [str(token) for token in voca]
        for feature, voca in voca_set.items()
    })

In [None]:
# Directory and base file name for the training history; both are passed to
# run_models.run_models below as history_file_dir / history_file_name.
HISTORY_FILE_DIR = '/content/drive/MyDrive/ctr/data/feature_engineering.rare_count/'
HISTORY_FILE_NAME = 'fitting_history.V1.2'

In [None]:
# Train and evaluate one hand-picked configuration.
node = 1024       # repeated `deep_layer` times to form deep_layer_sizes
deep_layer = 4    # number of entries in deep_layer_sizes
cross_layer = 1   # forwarded as cross_layer_size
print("Node: {}, Deep Layer : {}, Cross Layer : {}".format(node, deep_layer, cross_layer))

fitting_history = run_models.run_models(
    # data
    train=train_batches,
    test=test,
    # features and their vocabularies
    vocabularies=vocabularies,
    str_features=str_features,
    int_features=int_features,
    # architecture
    dcn_parallel=DCN_PARALLEL,
    cross_layer_size=cross_layer,
    deep_layer_sizes=[node] * deep_layer,
    # optimisation
    learning_rate=LEARNING_RATE,
    epochs=FITTING_EPOCHS,
    # history output location
    history_file_dir=HISTORY_FILE_DIR,
    history_file_name=HISTORY_FILE_NAME,
)

Node: 1024, Deep Layer : 4, Cross Layer : 1
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1174s[0m 3s/step - AUC: 0.6106 - LogLoss: 0.6034 - loss: 0.5516 - regularization_loss: 0.0000e+00 - total_loss: 0.5516
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 3s/step - AUC: 0.5415 - LogLoss: 2.0947 - loss: 2.5719 - regularization_loss: 0.0000e+00 - total_loss: 2.5719
1th Epoch
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1148s[0m 3s/step - AUC: 0.6312 - LogLoss: 0.6067 - loss: 0.5058 - regularization_loss: 0.0000e+00 - total_loss: 0.5058
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 3s/step - AUC: 0.6818 - LogLoss: 0.4284 - loss: 0.4721 - regularization_loss: 0.0000e+00 - total_loss: 0.4721
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1159s[0m 3s/step - AUC: 0.6981 - LogLoss: 0.4340 - loss: 0.4348 - regularization_loss: 0.0000e+00 - total_loss: 0.4348
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [None]:
import os
import pickle

# Persist the history object returned by run_models.run_models as a pickle.
# HISTORY_FILE_PATH is built here because only HISTORY_FILE_DIR and
# HISTORY_FILE_NAME are defined earlier in the notebook; without this
# assignment the cell raises NameError on a fresh kernel.
# NOTE(review): the '.p' suffix follows the other pickle files read in this
# notebook (total_voca.p, one_hot_encoding_voca.p) — confirm it matches what
# downstream code expects and does not collide with files run_models writes itself.
HISTORY_FILE_PATH = os.path.join(HISTORY_FILE_DIR, HISTORY_FILE_NAME + '.p')
with open(HISTORY_FILE_PATH, 'wb') as f:
    pickle.dump(fitting_history, f)