In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
from tqdm import tqdm
from sklearn import metrics
from tabulate import tabulate

In [None]:
# Fix the seed of Python's built-in RNG so random draws are repeatable.
seed_value = 42
random.seed(a=seed_value)

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Colab-only: mount Google Drive at /content/drive. Every data and code path
# used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Add the Drive folder holding the project's model code to the import path.
# NOTE(review): presumably this is where the `dcn` / `run_models` modules
# imported in the next cell live — confirm.
MODEL_DIR = '/content/drive/MyDrive/ctr/code/model'
if MODEL_DIR not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(MODEL_DIR)

In [None]:
import dcn
import run_models

In [None]:
# ---- Notebook configuration -------------------------------------------------
FIGSIZE = (6, 3)  # (width, height) tuple; presumably a matplotlib figsize — confirm

seed_value = 42  # seed for reproducibility
random.seed(seed_value)
subsample_ratio = 0.05  # fraction of records to keep when subsampling

n = 40428967  # total number of records in the clickstream data
sample_size = int(n * subsample_ratio)  # expected number of rows after subsampling


def parse_date(val):
    """Parse a 'YYMMDDHH' string (e.g. '14102100') into a ``datetime.datetime``.

    Raises:
        ValueError: if ``val`` does not match the ``'%y%m%d%H'`` format.
    """
    return datetime.datetime.strptime(val, '%y%m%d%H')


def skip_values(i):
    """Return True when row index ``i`` should be skipped during subsampling.

    Index 0 is never skipped; any other index is skipped when a fresh random
    draw exceeds ``subsample_ratio``, i.e. it is kept with probability
    ``subsample_ratio``.
    NOTE(review): the shape matches a pandas ``read_csv(skiprows=...)``
    callable with index 0 being the header row — confirm against the caller.
    """
    return i > 0 and random.random() > subsample_ratio

In [None]:
# indicate the datatypes to use in the reading process
types_train = {
    'id': np.dtype(int),
    'click': np.dtype(int),
    'hour': np.dtype(int),
    'C1': np.dtype(int),
    'banner_pos': np.dtype(int),
    'site_id': np.dtype(str),
    'site_domain': np.dtype(str),
    'site_category': np.dtype(str),
    'app_id': np.dtype(str),
    'app_domain': np.dtype(str),
    'app_category': np.dtype(str),
    'device_id': np.dtype(str),
    'device_ip': np.dtype(str),
    'device_model': np.dtype(str),
    'device_type': np.dtype(int),
    'device_conn_type': np.dtype(int),
    'C14': np.dtype(int),
    'C15': np.dtype(int),
    'C16': np.dtype(int),
    'C17': np.dtype(int),
    'C18': np.dtype(int),
    'C19': np.dtype(int),
    'C20': np.dtype(int),
    'C21':np.dtype(int)
}

In [None]:
# Load the processed training split from Drive into `train`.
# NOTE(review): no `dtype=` is passed here, so pandas infers the column types;
# the `types_train` mapping defined above is not applied — confirm that is intended.
FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/train.csv'
train = pd.read_csv(FILE_PATH)

In [None]:
# Load the processed test split into `test`. FILE_PATH is rebound, so it no
# longer points at the training file after this cell.
# NOTE(review): the test file is read from the `processed/train/` directory —
# confirm that location is intended.
FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/test.csv'
test = pd.read_csv(FILE_PATH)

In [None]:
# Point the generic name `data` at the training frame. This is an alias, not a
# copy: `data` and `train` are the same DataFrame object.
data = train

In [None]:
# Build one user key per row of `data`:
#   * the device_id itself when it differs from NULL_ID,
#   * otherwise "<device_ip>-<device_model>" as a substitute key.
# NOTE(review): NULL_ID is presumably the dataset's placeholder for a missing
# device id — confirm.
NULL_ID = 'a99f214a'

# Vectorised replacement for a row-by-row Python loop: Series.where keeps
# device_id where the condition holds and takes the fallback key elsewhere.
fallback_key = data['device_ip'] + '-' + data['device_model']
user_id = data['device_id'].where(data['device_id'] != NULL_ID, fallback_key).tolist()

In [None]:
# Store the derived user key as a new column; expects `user_id` to hold one
# entry per row of `train`, in row order.
train['user_id'] = user_id

In [None]:
# Walk the rows of `data` in order and record, for every row, the click and
# impression totals its user had accumulated BEFORE that row.
#   user_data  : user key -> {'click': running clicks, 'imp': running impressions}
#   click_list : per-row historical click count
#   imp_list   : per-row historical impression count
# The first row seen for a user is the exception: it records its own click and
# an impression count of 1 (so a later click/imp division never hits zero).
# NOTE(review): that first-row value contains the row's own label — confirm
# this is acceptable wherever rows with 'history imp' == 1 are used.
user_data = {}
click_list, imp_list = [], []
# The loop variable is deliberately NOT called `user_id`: that name holds the
# per-row key list built earlier and must not be overwritten with a scalar.
for uid, click in tqdm(data[['user_id', 'click']].values):
    stats = user_data.get(uid)
    if stats is None:
        # First impression for this user.
        user_data[uid] = {
            'click' : click,
            'imp' : 1
        }
        click_list.append(click)
        imp_list.append(1)
    else:
        # Emit the totals as they stood before this row, then fold the row in.
        click_list.append(stats['click'])
        imp_list.append(stats['imp'])

        stats['imp'] += 1
        stats['click'] += click

In [None]:
# Attach the per-row history counters collected above to the training frame.
train['history click'] = click_list
train['history imp'] = imp_list

# Historical click-through rate as a percentage, rounded to two decimals.
# `round` returns a new frame, so `train` is rebound to the rounded copy.
train['CTR'] = train['history click'].div(train['history imp']).mul(100)
train = train.round(decimals={'CTR': 2})

In [None]:
# Export the training rows whose 'history imp' is at least 3, 5, 7 and 10,
# one CSV per threshold (train_imp_<threshold>.csv), without the index column.
# A single loop replaces four copy-pasted export stanzas; HISTORY_FILE_PATH
# ends up holding the last (threshold 10) path.
HISTORY_DIR = '/content/drive/MyDrive/ctr/avazu/processed/train'
for min_imp in (3, 5, 7, 10):
    HISTORY_FILE_PATH = f'{HISTORY_DIR}/train_imp_{min_imp}.csv'
    train.loc[train['history imp'] >= min_imp].to_csv(HISTORY_FILE_PATH, index=False)

In [None]:
# Re-point the generic name `data` at the test frame (alias, not a copy) so
# the cells below operate on `test`.
data = test

In [None]:
# Build one user key per row of `data` and store it on `test`:
#   * the device_id itself when it differs from NULL_ID,
#   * otherwise "<device_ip>-<device_model>" as a substitute key.
# NOTE(review): NULL_ID is presumably the dataset's placeholder for a missing
# device id — confirm.
NULL_ID = 'a99f214a'

# Vectorised replacement for a row-by-row Python loop: Series.where keeps
# device_id where the condition holds and takes the fallback key elsewhere.
fallback_key = data['device_ip'] + '-' + data['device_model']
user_id = data['device_id'].where(data['device_id'] != NULL_ID, fallback_key).tolist()
test['user_id'] = user_id

In [None]:
# Walk the rows of `data` in order and record, for every row, the click and
# impression totals its user had accumulated BEFORE that row.
# `user_data` is NOT reset in this cell: whatever totals it already holds from
# earlier cells carry over, and the rows processed here are added to it.
# A user with no entry yet records the row's own click and an impression
# count of 1.
# NOTE(review): the running totals are updated with the `click` value of the
# rows processed here — confirm that using these labels is intended.
click_list, imp_list = [], []
# The loop variable is deliberately NOT called `user_id`: that name holds the
# per-row key list built earlier and must not be overwritten with a scalar.
for uid, click in tqdm(data[['user_id', 'click']].values):
    stats = user_data.get(uid)
    if stats is None:
        # First impression for this user.
        user_data[uid] = {
            'click' : click,
            'imp' : 1
        }
        click_list.append(click)
        imp_list.append(1)
    else:
        # Emit the totals as they stood before this row, then fold the row in.
        click_list.append(stats['click'])
        imp_list.append(stats['imp'])

        stats['imp'] += 1
        stats['click'] += click

In [None]:
# Attach the per-row history counters collected above to the test frame.
test['history click'] = click_list
test['history imp'] = imp_list

# Historical click-through rate as a percentage, rounded to two decimals.
# `round` returns a new frame, so `test` is rebound to the rounded copy.
test['CTR'] = test['history click'].div(test['history imp']).mul(100)
test = test.round(decimals={'CTR': 2})

In [None]:
# Export the test rows whose 'history imp' is at least 3, 5, 7 and 10,
# one CSV per threshold (test_imp_<threshold>.csv), without the index column.
# A single loop replaces four copy-pasted export stanzas; HISTORY_FILE_PATH
# ends up holding the last (threshold 10) path.
# NOTE(review): the files are written under the `processed/train/` directory —
# confirm that location is intended for test output.
HISTORY_DIR = '/content/drive/MyDrive/ctr/avazu/processed/train'
for min_imp in (3, 5, 7, 10):
    HISTORY_FILE_PATH = f'{HISTORY_DIR}/test_imp_{min_imp}.csv'
    test.loc[test['history imp'] >= min_imp].to_csv(HISTORY_FILE_PATH, index=False)