In [2]:
# Install the extra packages this notebook needs on a fresh Colab runtime.
# NOTE(review): the versions are not pinned, so a rerun may pull different
# releases; consider `%pip install pkg==x.y.z` so the install is reproducible
# and targets the running kernel's environment.
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m92.2/96.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [4]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
import pickle
from sklearn import metrics
from tabulate import tabulate

In [5]:
seed_value = 42  # seed for reproducibility
# Seed every random number generator the notebook relies on, not only
# Python's: NumPy and TensorFlow keep their own RNG state, and TensorFlow's
# is what drives weight initialisation and dataset shuffling.
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [6]:
# Mount Google Drive at /content/drive; every data, code and output path used
# by the later cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Add the Drive folder holding the project's model code to the import path.
# NOTE(review): presumably this is where the project modules imported in the
# next cell (dcn, run_models, ...) live — confirm.
sys.path.append('/content/drive/MyDrive/ctr/code/model')

In [8]:
import dcn
import run_models
import run_models_save_model
import check_cross_layer

In [67]:
# Run configuration: input-pipeline settings, model/optimizer hyperparameters
# and the CSV column schema used by the dataset cells below.
BATCH_SIZE = 100000
SHUFFLE_BUFFER_SIZE = 10000
SHUFFLE_SEED = 42
DATASET_EPOCHS = 1  # passed as num_epochs to make_csv_dataset
FITTING_EPOCHS = 1  # NOTE(review): not referenced by any cell visible here
LEARNING_RATE = 0.0001
DCN_PARALLEL = True
EMBEDDING_DIMENSION = 32
# Columns parsed as strings (includes the 'click' column).
STR_COLUMNS = [
    'click', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
    'day_of_week'
    ]
# Columns parsed as numbers (read as tf.float32 via COLUMN_DEFAULTS below).
INT_COLUMNS = [
    'hour_of_day'
]
# One dtype per selected column: tf.string for the first len(STR_COLUMNS) - 1
# entries, tf.float32 for each INT_COLUMNS entry, then one trailing tf.string.
# NOTE(review): this only lines up with the data if the CSV stores
# 'hour_of_day' immediately before its last selected column (presumably
# 'day_of_week') — confirm against the CSV header.
COLUMN_DEFAULTS = [tf.string] * (len(STR_COLUMNS) -1) + [tf.float32] * len(INT_COLUMNS) + [tf.string] * 1

In [10]:
# column_defaults is interpreted in the column order of the original CSV file

In [11]:
# Training input pipeline: stream the processed CSV in shuffled batches,
# reading only the configured columns with the configured dtypes.
TRAIN_FILE = '/content/drive/MyDrive/ctr/avazu/processed/train/train.csv'
train_batches = tf.data.experimental.make_csv_dataset(
    file_pattern=TRAIN_FILE,
    batch_size=BATCH_SIZE,
    column_defaults=COLUMN_DEFAULTS,
    select_columns=STR_COLUMNS + INT_COLUMNS,
    num_epochs=DATASET_EPOCHS,
    shuffle=True,
    shuffle_buffer_size=SHUFFLE_BUFFER_SIZE,
    shuffle_seed=SHUFFLE_SEED,
)

In [12]:
# Evaluation input pipeline: same columns and dtypes as the training set.
# NOTE(review): the file sits under .../processed/train/ — confirm this is the
# intended held-out split.
TEST_FILE = '/content/drive/MyDrive/ctr/avazu/processed/train/test.csv'
test = tf.data.experimental.make_csv_dataset(
    TEST_FILE,
    batch_size=BATCH_SIZE,
    select_columns= STR_COLUMNS + INT_COLUMNS,
    column_defaults=COLUMN_DEFAULTS,
    # make_csv_dataset shuffles by default (with no seed here); evaluation
    # gains nothing from that, so read the file in order to keep every
    # evaluation pass deterministic and skip the shuffle buffer.
    shuffle=False,
    num_epochs=DATASET_EPOCHS
)

In [13]:
# Load the pickled per-feature vocabularies from Drive.
# pickle executes code on load, so this file must come from a trusted source.
vocabularies_path = '/content/drive/MyDrive/ctr/avazu/processed/train/vocabularies.p'
with open(vocabularies_path, 'rb') as vocabularies_file:
    total_voca = pickle.load(vocabularies_file)

In [14]:
# Maps a feature-engineering method name to the features it applies to.
# Only 'embedding' is defined; its list selects which vocabularies are passed
# to the model and also labels the weight-analysis heatmap at the end.
# Note that 'banner_pos', 'device_id', 'device_ip' and 'device_conn_type'
# from STR_COLUMNS are not listed here.
FEATURE_ENGINEERING = {
    'embedding' : [
        'site_id', 'site_domain', 'site_category',
        'app_id', 'app_domain', 'app_category',
        'device_model', 'device_type',
        'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
        'day_of_week'
    ]
}

In [15]:
# Keep only the vocabularies of the features selected for embedding, nested
# under the engineering-method name and in the order the features are listed.
vocabularies = {
    'embedding': {
        name: total_voca[name]
        for name in FEATURE_ENGINEERING['embedding']
    }
}

In [16]:
# Feature-name lists handed to the model: the string features are exactly the
# ones that have an embedding vocabulary; the only numeric one is the hour.
int_features = ['hour_of_day']
str_features = [name for name in vocabularies['embedding']]

In [17]:
# Candidate layer widths and deep/cross layer counts, plus an empty results
# holder.
# NOTE(review): none of these four names is referenced by the cells visible
# here (the model below is built from hard-coded values) — presumably left
# over from a hyperparameter sweep; confirm before removing.
nodes = [32, 64, 128, 256, 512, 1024]
deep_layers = [1, 2, 3, 4, 5]
cross_layers = [1, 2, 3, 4, 5, 6]
result_dict = {}

In [18]:
# Silence every Python warning from this point on.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings from the libraries in use — consider filtering by category or
# module instead.
import warnings
warnings.filterwarnings(action='ignore')

In [19]:
# Convert every vocabulary entry to str, updating the nested dicts in place
# (keys are only reassigned, never added or removed, while iterating).
for voca_set in vocabularies.values():
    for feature in voca_set:
        voca_set[feature] = [str(token) for token in voca_set[feature]]

In [20]:
# Where the per-epoch metric history is pickled by the training loop
# (written to HISTORY_FILE_DIR + HISTORY_FILE_NAME + '.p').
HISTORY_FILE_DIR = '/content/drive/MyDrive/ctr/data/baseline/'
HISTORY_FILE_NAME = 'fitting_history.V1.3'

In [21]:
# Where model weights are saved by the training loop
# (written to CHECKPOINTS_DIR + CHECKPOINTS_NAME + '.weights.h5').
CHECKPOINTS_DIR = '/content/drive/MyDrive/ctr/data/baseline/'
CHECKPOINTS_NAME = 'checkpoints.V1.3'

In [68]:
# Build the DCN for one fixed configuration (cross_layer_size=1 and four deep
# layer sizes of 1024) and compile it with an Adam optimizer.
node, deep_layer, cross_layer = 1024, 4, 1
deep_layer_sizes = [node] * deep_layer

model = dcn.DCN(
    dcn_parallel=DCN_PARALLEL,
    cross_layer_size=cross_layer,
    deep_layer_sizes=deep_layer_sizes,
    vocabularies=vocabularies,
    str_features=str_features,
    int_features=int_features,
    embedding_dimension=EMBEDDING_DIMENSION,
)

adam_optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
model.compile(optimizer=adam_optimizer)

In [None]:
# Train one epoch at a time so the test set can be evaluated after every
# epoch; the per-epoch train/test LogLoss and AUC are collected in
# `fitting_data`.
fitting_data = {
    'Train LogLoss' : [],
    'Train AUC' : [],
    'Test LogLoss' : [],
    'Test AUC' : []
}
EPOCHS = 50
for epoch in range(EPOCHS):
    history = model.fit(train_batches, epochs=1, verbose=True)
    # Named `test_metrics` rather than `metrics` so the `sklearn.metrics`
    # module imported at the top of the notebook is not shadowed.
    test_metrics = model.evaluate(test, return_dict=True)
    fitting_data['Train LogLoss'].append(history.history["LogLoss"][0])
    fitting_data['Train AUC'].append(history.history["AUC"][0])
    fitting_data['Test LogLoss'].append(test_metrics["LogLoss"])
    fitting_data['Test AUC'].append(test_metrics["AUC"])
    # Persist on every 10th epoch (1st, 11th, 21st, ...) and also after the
    # final epoch; without the second condition the saved history and weights
    # would stop at the 41st epoch and the last nine epochs would be lost.
    if (epoch % 10) == 0 or epoch == EPOCHS - 1:
        print("{}th Epoch".format(epoch+1))
        with open(HISTORY_FILE_DIR + HISTORY_FILE_NAME + '.p', 'wb') as f:
            pickle.dump(fitting_data, f)
        model.save_weights(CHECKPOINTS_DIR + CHECKPOINTS_NAME + '.weights.h5')

     25/Unknown [1m76s[0m 3s/step - AUC: 0.5467 - LogLoss: 0.4739 - loss: 0.4626 - regularization_loss: 0.0000e+00 - total_loss: 0.4626

In [None]:
# First weight tensor of the model's first cross layer.
# NOTE(review): presumably the cross layer's kernel matrix, with one
# embedding-sized block of rows/columns per feature — confirm against dcn.DCN.
mat = model._cross_layers[0].weights[0]
# Feature names used to index and label the blocks of `mat`.
# NOTE(review): the numeric feature 'hour_of_day' is not in this list — confirm
# how it is laid out in `mat` so the block slicing below stays aligned.
features = FEATURE_ENGINEERING['embedding']

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# Heatmap of feature-pair interaction strength: slice `mat` into
# (dim x dim) blocks, one per (feature i, feature j) pair, and plot the
# Frobenius norm of every block.
block_norm = np.ones([len(features), len(features)])
ticks = list(range(len(features)))
dim = model.embedding_dimension

# Compute the norms of the blocks.
for i in range(len(features)):
  for j in range(len(features)):
    block = mat[i * dim:(i + 1) * dim,
                j * dim:(j + 1) * dim]
    block_norm[i,j] = np.linalg.norm(block, ord="fro")

# Draw on one explicit figure/axes pair. The previous
# `plt.figure(figsize=(200,200))` followed by `plt.matshow(...)` did not size
# the plot: plt.matshow opens its own figure, so the requested size was
# ignored and an empty 200x200-inch figure was left behind.
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.matshow(block_norm, cmap=plt.cm.Blues)

# Colorbar in a narrow axes attached to the right of the heatmap.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.05)
fig.colorbar(im, cax=cax)
cax.tick_params(labelsize=10)

ax.set_xticks(ticks)
ax.set_yticks(ticks)

_ = ax.set_xticklabels(features, rotation=45, ha="left", fontsize=10)
_ = ax.set_yticklabels(features, fontsize=10)

# Save the figure that was actually drawn on; the tight bounding box keeps
# the rotated tick labels from being clipped.
fig.savefig('/content/drive/MyDrive/ctr/data/baseline/weight_analysis.png',
            bbox_inches="tight")