In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
from tqdm import tqdm
from sklearn import metrics
from tabulate import tabulate

In [None]:
# Seed Python's built-in `random` generator so that `random`-based sampling
# repeats across runs. Only the stdlib generator is seeded in this cell.
# NOTE(review): the same two statements are repeated in the configuration
# cell further down — confirm whether both are needed.
seed_value = 42  # seed for reproducibility
random.seed(seed_value)

In [None]:
import warnings
# Ignore every warning for the rest of the session (no category or module
# restriction is given, so this applies to all warnings).
warnings.filterwarnings(action='ignore')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the paths used later in this
# notebook all live under that mount point.
drive.mount('/content/drive')

In [None]:
# Put the Drive folder on the import path so that modules stored in it can be
# imported by the cells below.
MODEL_CODE_DIR = '/content/drive/MyDrive/ctr/code/model'
if MODEL_CODE_DIR not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(MODEL_CODE_DIR)

In [None]:
import dcn
import run_models

In [None]:
# --- Plotting / subsampling configuration -------------------------------
FIGSIZE = (6, 3)

# Seed the stdlib RNG so that the random subsampling predicate is repeatable.
seed_value = 42
random.seed(seed_value)
subsample_ratio = 0.05

# Total number of records in the clickstream data.
n = 40428967
sample_size = int(n * subsample_ratio)


def parse_date(val):
    """Parse a ``'%y%m%d%H'`` timestamp string into a ``datetime.datetime``."""
    return datetime.datetime.strptime(val, '%y%m%d%H')


def skip_values(i):
    """Tell whether row ``i`` should be skipped when subsampling.

    Rows with ``i <= 0`` are never skipped (and consume no random draw); any
    later row is skipped when a fresh uniform draw exceeds ``subsample_ratio``.
    """
    return i > 0 and random.random() > subsample_ratio

In [None]:
# Column dtypes intended for the CSV reading step: every column is an integer
# except the ones listed in `_STRING_COLUMNS`.
_STRING_COLUMNS = frozenset([
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
])
_TRAIN_COLUMNS = [
    'id', 'click', 'hour', 'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
types_train = {
    column: np.dtype(str) if column in _STRING_COLUMNS else np.dtype(int)
    for column in _TRAIN_COLUMNS
}

In [None]:
# Load the training CSV from the mounted Drive into a DataFrame.
# NOTE(review): `types_train` is defined above but is not passed to
# `read_csv` here, so column dtypes are inferred by pandas — confirm whether
# `dtype=types_train` was intended.
FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/train.csv'
train = pd.read_csv(FILE_PATH)

In [None]:
def ret_user_hotness(train):
    """Aggregate per-user click / impression statistics and bin them.

    A user is identified by ``device_id``. Rows whose ``device_id`` equals the
    placeholder ``'a99f214a'`` are keyed by ``device_ip + '-' + device_model``
    instead.

    Columns are looked up by name, so the result does not depend on the
    column order of ``train``, and ``train`` itself is left unmodified.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``click``, ``device_id``, ``device_ip`` and
        ``device_model``. ``click`` must be convertible to an integer. Any
        other column is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per user, in order of first appearance, with the columns
        ``user_id``, ``click`` (total clicks), ``imp`` (number of rows),
        ``ratio`` (``click / imp``), ``imp_binned`` and ``ratio_binned``
        (``pd.cut`` intervals, closed on the right).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    NULL_ID = 'a99f214a'
    IMP_BINS = [0, 1, 5, 10, 25, 50, 100, 200, 500, 1000, 2000, 3000, 999999999]
    RATIO_BINS = [-1, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    device_id = train['device_id']
    # Replacement key for rows carrying the placeholder device id. The cast
    # to str keeps the concatenation working even if a column was parsed as
    # a non-string dtype.
    fallback_id = (
        train['device_ip'].astype(str) + '-' + train['device_model'].astype(str)
    )
    user_id = device_id.where(device_id != NULL_ID, fallback_id)

    keyed = pd.DataFrame({
        'user_id': user_id,
        'click': train['click'].astype('int64'),
    })

    # sort=False keeps users in order of first appearance; dropna=False makes
    # sure rows with a missing id are not silently discarded.
    total_data_df = (
        keyed.groupby('user_id', sort=False, dropna=False)['click']
        .agg(click='sum', imp='count')
        .reset_index()
    )

    total_data_df['ratio'] = total_data_df['click'] / total_data_df['imp']
    total_data_df['imp_binned'] = pd.cut(total_data_df['imp'], bins=IMP_BINS)
    total_data_df['ratio_binned'] = pd.cut(total_data_df['ratio'], bins=RATIO_BINS)

    return total_data_df

In [None]:
# Build the per-user click / impression table from the loaded training data.
user_hotness = ret_user_hotness(train)

In [None]:
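# Compute statistics for users whose impression count (Imp) is at or above the threshold.
# Working assumption: labeling is only meaningful if at least 10% of users have Imp at or above the threshold.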

In [None]:
# Minimum number of impressions a user needs to be part of the statistics.
THRESHOLD = 5
MASK_IMP = user_hotness['imp'].ge(THRESHOLD)
imp_over_threshold = user_hotness.loc[MASK_IMP]

In [None]:
# Share (in percent) of all users that reach the impression threshold.
share_over_threshold = len(imp_over_threshold) / len(user_hotness) * 100
print("Imp가 {} 이상인, 유저의 비율 : {:.2f}%".format(THRESHOLD, share_over_threshold))

In [None]:
print("Imp가 {} 이상인 유저 내에서의 통계".format(THRESHOLD))

# Split the above-threshold users into three groups by click ratio ...
# 1. ratio = 0, click = 0
MASK1 = imp_over_threshold['ratio_binned'] == pd.Interval(left=-1.0, right=0.0)
# 2. 0 < Ratio <= 0.5
MASK2 = imp_over_threshold['ratio'].gt(0) & imp_over_threshold['ratio'].le(0.5)
# 3. Ratio > 0.5
MASK3 = imp_over_threshold['ratio'].gt(0.5)

click_zero = imp_over_threshold[MASK1]
ratio_mid = imp_over_threshold[MASK2]
ratio_high = imp_over_threshold[MASK3]

# ... and report each group's share of the above-threshold users.
for message, subset in (
    ("Click = 0, Ratio = 0 인 유저의 비율 : {:.2f}%", click_zero),
    ("0 < Ratio <= 0.5 인 유저의 비율 : {:.2f}%", ratio_mid),
    ("Ratio > 0.5 인 유저의 비율 : {:.2f}%", ratio_high),
):
    print(message.format(len(subset) / len(imp_over_threshold) * 100))

In [None]:
# Start with an all-None 'Hotness' column; the label values are assigned in
# the cells that follow.
user_hotness['Hotness'] = [None] * len(user_hotness)

In [None]:
# Row masks over the full user table; all three require imp >= THRESHOLD.
_reaches_threshold = user_hotness['imp'].ge(THRESHOLD)
_click_ratio = user_hotness['ratio']

MASK1 = _reaches_threshold & user_hotness['click'].eq(0)           # no clicks
MASK2 = _reaches_threshold & _click_ratio.gt(0) & _click_ratio.le(0.5)  # 0 < ratio <= 0.5
MASK3 = _reaches_threshold & _click_ratio.gt(0.5)                  # ratio > 0.5

In [None]:
# Label every user: below-threshold users become 'New', the others are
# labelled according to their click-ratio mask.
for hotness_mask, hotness_label in (
    (~MASK_IMP, 'New'),
    (MASK1, 'Zero'),
    (MASK2, 'Mid'),
    (MASK3, 'High'),
):
    user_hotness.loc[hotness_mask, 'Hotness'] = hotness_label

In [None]:
# Share of users per hotness label, in percent, formatted with two decimals.
user_hotness.value_counts('Hotness', normalize=True).mul(100).map('{:.2f}'.format)

In [None]:
# Persist the labelled per-user table next to the training data on Drive;
# the DataFrame index is not written to the file.
USER_HOTNESS_FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/user_hotness.csv'
user_hotness.to_csv(USER_HOTNESS_FILE_PATH, index=False)