In [None]:
# Colab setup: quietly (-q) install TensorFlow Recommenders and upgrade
# TensorFlow Datasets before the imports in the next cell.
# NOTE(review): no versions are pinned, so results may drift between runs.
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
from tqdm import tqdm
from sklearn import metrics
from tabulate import tabulate

In [None]:
seed_value = 42  # seed for reproducibility
# Seed every random number generator this notebook imports. The stdlib
# `random` module, NumPy and TensorFlow each keep independent generator state,
# so seeding only `random` leaves the other two non-deterministic.
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and dtype warnings from pandas and
# TensorFlow; consider narrowing the filter by category while debugging.
warnings.filterwarnings(action='ignore')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data and code path used below
# lives under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# Add the project's model directory to the import path.
# Presumably this is where the `dcn` and `run_models` modules imported in the
# next cell live — verify against the Drive layout.
sys.path.append('/content/drive/MyDrive/ctr/code/model')

In [None]:
import dcn
import run_models

In [None]:
# Load the training CSV into a DataFrame. The analysis below reads its 'click'
# column and the feature columns named in `total_voca`.
FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/train.csv'
train = pd.read_csv(FILE_PATH)

In [None]:
# Feature columns grouped by the entity they describe; each group name maps to
# the list of column names belonging to it.
multi_dimensional_features = dict(
    site=['site_id', 'site_domain'],
    device=['device_model'],
    app=['app_id', 'app_domain'],
    categorical=['C14', 'C17', 'C19', 'C20'],
)

In [None]:
import pickle
# Load the pre-computed vocabularies. As consumed further down, `total_voca`
# maps (top_num, threshold) tuples to {feature: {'top_num': ..., 'over_threshold': ...}}.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if its origin is trusted.
with open('/content/drive/MyDrive/ctr/data/total_voca.p', 'rb') as f:
    total_voca = pickle.load(f)
# Two further CSV tables produced elsewhere; loaded here as DataFrames.
FEATURE_ANALYSIS = '/content/drive/MyDrive/ctr/data/feature_analysis.csv'
feature_analysis = pd.read_csv(FEATURE_ANALYSIS)
RARE_COUNT_DF = '/content/drive/MyDrive/ctr/data/rare_count_df.csv'
rare_count_df = pd.read_csv(RARE_COUNT_DF)

<h2>Category Impression Ratio &amp; Average CTR Analysis</h2>

In [None]:
def get_cate_imp_click_cnt(data, feature, voca):
    """Count impressions and clicks for every category of ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'click' column and the ``feature`` column. Each row
        counts as one impression and its 'click' value is added to the
        click total.
    feature : str
        Name of the column whose values are aggregated.
    voca : iterable
        Hashable category values to track individually. Any value of
        ``feature`` that is not in ``voca`` is pooled under the key 'Rare'.

    Returns
    -------
    dict
        ``{category: {'imp': impressions, 'click': clicks}}`` with one entry
        per item of ``voca`` plus one entry for 'Rare'. Categories that never
        occur in ``data`` keep zero counts.
    """
    cate_imp_click_cnt = {cate: {'imp': 0, 'click': 0} for cate in voca}
    # Create the pooled bucket unconditionally. It used to be created inside
    # the loop over `voca`, so an empty vocabulary left it missing and the
    # first out-of-vocabulary row raised KeyError.
    cate_imp_click_cnt['Rare'] = {'imp': 0, 'click': 0}

    # Set membership is O(1); `voca` itself may be a list.
    known = set(voca)

    # Aggregate once per distinct value instead of once per row.
    # dropna=False keeps missing values as a group so they are still counted
    # (they fall into 'Rare', as they did with the row-wise version).
    per_value = (
        data.groupby(feature, sort=False, dropna=False)['click']
        .agg(['size', 'sum'])
    )
    for cate, imp, click in per_value.itertuples(name=None):
        bucket = cate_imp_click_cnt[cate if cate in known else 'Rare']
        bucket['imp'] += int(imp)
        bucket['click'] += click
    return cate_imp_click_cnt

In [None]:
def _append_cate_stats(cate_analysis_ctr, cate_value_idx, cate, counts, total_imp):
    """Append one category's index, label, counts and percentages to the accumulator."""
    imp, click = counts['imp'], counts['click']
    cate_analysis_ctr['cate value idx'].append(cate_value_idx)
    cate_analysis_ctr['cate value'].append(cate)
    cate_analysis_ctr['imp'].append(imp)
    # Both ratios are percentages; report 0 rather than dividing by zero.
    cate_analysis_ctr['imp frequency ratio'].append(
        0 if total_imp == 0 else imp / total_imp * 100
    )
    cate_analysis_ctr['click'].append(click)
    cate_analysis_ctr['average CTR'].append(0 if imp == 0 else click / imp * 100)


def get_cate_analysis_ctr(cate_analysis_ctr, data, feature, param):
    """Append per-category impression/CTR rows for one feature and vocabulary.

    Parameters
    ----------
    cate_analysis_ctr : dict[str, list]
        Column-oriented accumulator with the keys 'selection mode', 'value',
        'feature', 'cate num', 'cate value idx', 'cate value', 'imp',
        'imp frequency ratio', 'click' and 'average CTR'. Extended in place.
    data : pandas.DataFrame
        Rows to aggregate; passed on to ``get_cate_imp_click_cnt``.
    feature : str
        Column of ``data`` being analysed.
    param : tuple
        ``(voca, selection_mode, value)``: the vocabulary, a label for how it
        was selected, and the setting that produced it.

    Returns
    -------
    dict[str, list]
        The same ``cate_analysis_ctr`` object, with ``len(voca) + 1`` rows
        appended: one per vocabulary entry (1-based index) followed by the
        pooled 'Rare' row.
    """
    voca, selection_mode, value = param
    cate_num = len(voca) + 1  # vocabulary entries plus the 'Rare' bucket
    # Use the size of the frame actually passed in rather than a notebook
    # global that is defined in a later cell.
    total_imp = len(data)
    cate_imp_click_cnt = get_cate_imp_click_cnt(data, feature, voca)

    # Columns that hold the same value for every row of this batch.
    for column, constant in (
        ('selection mode', selection_mode),
        ('value', value),
        ('feature', feature),
        ('cate num', cate_num),
    ):
        cate_analysis_ctr[column].extend([constant] * cate_num)

    for cate_value_idx, cate in enumerate(voca, start=1):
        _append_cate_stats(
            cate_analysis_ctr, cate_value_idx, cate, cate_imp_click_cnt[cate], total_imp
        )
    # The 'Rare' bucket always goes last and takes the final index.
    _append_cate_stats(
        cate_analysis_ctr, cate_num, 'Rare', cate_imp_click_cnt['Rare'], total_imp
    )
    return cate_analysis_ctr

In [None]:
# Build the category-level impression / CTR table for every vocabulary
# configuration stored in `total_voca`.
data = train

# Column-oriented accumulator: one list per output column, filled row by row
# by get_cate_analysis_ctr. The key order fixes the DataFrame column order.
cate_analysis_ctr = {
    column: []
    for column in (
        'selection mode',
        'value',
        'feature',
        'cate num',
        'cate value idx',
        'cate value',
        'imp',
        'imp frequency ratio',
        'click',
        'average CTR',
    )
}

TOTAL_IMP = len(data)
for (top_num_val, threshold_val), voca in total_voca.items():
    print(top_num_val, threshold_val)
    for feature, voca_dict in voca.items():
        print(feature)
        top_num_voca = voca_dict['top_num']
        over_threshold_voca = voca_dict['over_threshold']

        # Analyse the same feature under both vocabulary-selection modes,
        # 'Top Num' first and 'Threshold' second.
        for param in (
            (top_num_voca, 'Top Num', top_num_val),
            (over_threshold_voca, 'Threshold', threshold_val),
        ):
            cate_analysis_ctr = get_cate_analysis_ctr(cate_analysis_ctr, data, feature, param)

In [None]:
# Turn the column-oriented accumulator into a DataFrame: each key becomes a
# column and each appended category becomes a row.
cate_analysis_ctr_df = pd.DataFrame(cate_analysis_ctr)

In [None]:
# Rows for the pooled 'Rare' bucket under the 'Top Num' selection mode.
cate_analysis_ctr_df.loc[
    cate_analysis_ctr_df['cate value'].eq('Rare')
    & cate_analysis_ctr_df['selection mode'].eq('Top Num')
]

In [None]:
# Rows for the pooled 'Rare' bucket under the 'Threshold' selection mode.
cate_analysis_ctr_df.loc[
    cate_analysis_ctr_df['cate value'].eq('Rare')
    & cate_analysis_ctr_df['selection mode'].eq('Threshold')
]

In [None]:
# Save the full table to Drive; index=False leaves the row index out of the CSV.
cate_analysis_ctr_df.to_csv('/content/drive/MyDrive/ctr/data/cate_analysis_ctr_df.csv', index=False)