In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
from tqdm import tqdm
from sklearn import metrics
from tabulate import tabulate

In [None]:
seed_value = 42  # seed for reproducibility
# Seed every random number generator this notebook imports, not just the
# standard library's: NumPy and TensorFlow each keep their own RNG state.
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Colab-only: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used in the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory on the mounted Drive that is added to the import search path
# (presumably where the project modules imported next live -- TODO confirm).
MODEL_DIR = '/content/drive/MyDrive/ctr/code/model'
# Append only once so that re-running this cell does not pile up duplicate
# entries on sys.path.
if MODEL_DIR not in sys.path:
    sys.path.append(MODEL_DIR)

In [None]:
import dcn
import run_models

In [None]:
# Location of the training CSV on the mounted Drive.
FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/train.csv'
# Read the whole file with pandas defaults; later cells use its 'click' column.
train = pd.read_csv(FILE_PATH)

In [None]:
# Sum of the 'click' column (the number of clicks, assuming 0/1 labels -- TODO confirm).
TOTAL_CLICK = train['click'].sum()
# Overall click-through rate of the training rows, expressed in percent.
CTR = TOTAL_CLICK / train.shape[0] * 100

In [None]:
# Rare-count table loaded from Drive. The aggregation below filters it on the
# 'Top Num' / 'Threshold' columns and reads the matching '<mode> Rare Count' column.
rare_count_df = pd.read_csv('/content/drive/MyDrive/ctr/data/total_rare_count_df.csv')

In [None]:
# Settings to sweep: selection mode -> values evaluated for that mode.
mode_dict = {
    'Top Num' : [10, 25, 50, 75, 100, 200],
    'Threshold' : [10, 500, 1000, 2500, 5000, 10000]
}

# Column-oriented result table. Every (selection mode, value) setting appends
# FEATURE_NUM + 1 entries to each list: one per rare count 0..FEATURE_NUM.
rare_count_group = {
    'selection mode' : [],
    'value' : [],
    'rare count' : [],
    'imp' : [],
    'imp frequency ratio' : [],
    'click' : [],
    'average CTR' : []
}

# Rare counts are bucketed over 0..FEATURE_NUM
# (presumably the number of features that can be rare -- TODO confirm).
FEATURE_NUM = 9
# Each training row is counted as one impression.
TOTAL_IMP = len(train)
click_list = list(train['click'])

for selection_mode, value_list in mode_dict.items():
    for value in value_list:
        print(selection_mode, value)
        # Per-row rare counts for this setting, in rare_count_df row order.
        rare_count_list = list(rare_count_df[rare_count_df[selection_mode] == value][selection_mode + ' Rare Count'])

        # Rare counts are paired with the training clicks purely by position.
        # zip() would silently stop at the shorter sequence and understate
        # every total and ratio, so fail loudly if the two do not line up.
        if len(rare_count_list) != TOTAL_IMP:
            raise ValueError(
                "{}={}: rare_count_df has {} rows for this setting but train has {}; "
                "they must match one-to-one".format(
                    selection_mode, value, len(rare_count_list), TOTAL_IMP
                )
            )

        # One impression/click tally per possible rare count, so buckets
        # that never occur still show up with zeros.
        rare_count_dict = {
            rare_count: {'imp': 0, 'click': 0}
            for rare_count in range(FEATURE_NUM + 1)
        }

        for rare, click in zip(rare_count_list, click_list):
            rare_count_dict[rare]['imp'] += 1
            rare_count_dict[rare]['click'] += click

        # Key columns: the same setting repeated once per bucket.
        rare_count_group['selection mode'] += [selection_mode] * (FEATURE_NUM + 1)
        rare_count_group['value'] += [value] * (FEATURE_NUM + 1)
        rare_count_group['rare count'] += list(range(FEATURE_NUM + 1))

        for rare_count in range(FEATURE_NUM + 1):
            imp, click = rare_count_dict[rare_count]['imp'], rare_count_dict[rare_count]['click']
            # Share of all impressions that fall into this bucket, in percent.
            imp_frequency_ratio = imp / TOTAL_IMP * 100
            # CTR of the bucket in percent; empty buckets are recorded as 0.
            average_CTR = click / imp * 100 if imp != 0 else 0

            rare_count_group['imp'].append(imp)
            rare_count_group['imp frequency ratio'].append(imp_frequency_ratio)
            rare_count_group['click'].append(click)
            rare_count_group['average CTR'].append(average_CTR)

In [None]:
# Convert the dict of equally long lists into a DataFrame, one column per key.
rare_count_group_df = pd.DataFrame(rare_count_group)

In [None]:
# Save the summary table to Drive; index=False keeps the row index out of the CSV.
rare_count_group_df.to_csv('/content/drive/MyDrive/ctr/data/rare_count_group_df.csv', index=False)

In [None]:
# Display only the rows that belong to the 'Top Num' selection mode.
rare_count_group_df.loc[rare_count_group_df['selection mode'].eq('Top Num')]

In [None]:
# For every (selection mode, value) setting, draw a bar chart per rare count of
# the impression share and of the bucket CTR minus the overall CTR (both in percent).
for selection_mode, value_list in mode_dict.items():
    for value in value_list:
        is_setting = (rare_count_group_df['selection mode'] == selection_mode) & (rare_count_group_df['value'] == value)
        # Work on an independent copy: adding a column to a filtered slice
        # raises SettingWithCopyWarning and is not guaranteed to stick.
        scores = rare_count_group_df[is_setting].copy()
        diff = scores['average CTR'] - CTR
        # Buckets without impressions have no CTR, so plot them as 0. Mask on
        # imp == 0 rather than diff == -CTR, which would also wipe out real
        # buckets that have impressions but not a single click.
        diff[scores['imp'] == 0] = 0
        scores['diff'] = diff
        scores.plot(kind='bar',x='rare count', y=['imp frequency ratio', 'diff'], title=selection_mode + '=' + str(value))