In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
from tqdm import tqdm
from sklearn import metrics
from tabulate import tabulate

In [None]:
seed_value = 42  # seed for reproducibility
# Seed both the stdlib RNG and NumPy's global RNG; seeding `random` alone leaves
# NumPy-based sampling non-deterministic.
# NOTE(review): if TensorFlow ops are run from this notebook they presumably need
# tf.random.set_seed(seed_value) as well - confirm.
random.seed(seed_value)
np.random.seed(seed_value)

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
from google.colab import drive
# Mount Google Drive so the /content/drive/MyDrive/... paths used below resolve.
drive.mount('/content/drive')

In [None]:
# Make the project's model code importable (presumably where `dcn` and
# `run_models` live - confirm). Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
MODEL_DIR = '/content/drive/MyDrive/ctr/code/model'
if MODEL_DIR not in sys.path:
    sys.path.append(MODEL_DIR)

In [None]:
import dcn
import run_models

In [None]:
# Processed Avazu training CSV on the mounted Drive.
FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/train.csv'
# Reads the whole file into memory as a single DataFrame.
train = pd.read_csv(FILE_PATH)

In [None]:
# First SAMPLE_SIZE rows of the training frame, for quick experiments.
SAMPLE_SIZE = 1000
sample = train.iloc[:SAMPLE_SIZE]

In [None]:
# Feature group name -> list of column names analysed under that group.
multi_dimensional_features = dict(
    site=['site_id', 'site_domain'],
    device=['device_model'],
    app=['app_id', 'app_domain'],
    categorical=['C14', 'C17', 'C19', 'C20'],
)

<h2>Rare Analysis</h2>
1. Frequency Top N개의 Category에 대한 분석 = Top Num <br>
2. Frequency가 기준 이상인 Category에 대한 분석 = Over Threshold

In [None]:
# Using only Top-10 Features

In [None]:
def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def get_feature_analysis(data, multi_dimensional_features, verbose=False, key_pair_list=None):
    """Summarise how much of ``data`` is covered by frequent categories.

    For every (Top-Num, Threshold) setting and every feature, two vocabularies
    are built from the category frequencies of that feature:

    * ``top_num``        - the ``Top-Num`` most frequent categories;
    * ``over_threshold`` - categories occurring strictly more than ``Threshold``
      times.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``multi_dimensional_features``.
    multi_dimensional_features : dict
        Maps a feature-group name to the list of column names in that group.
    verbose : bool, optional
        Currently unused; kept so existing calls keep working. The progress
        lines for each setting are always printed.
    key_pair_list : list of (int, int), optional
        (Top-Num, Threshold) settings to evaluate. Defaults to
        ``[(10, 1000), (25, 500), (50, 10)]``.

    Returns
    -------
    feature_analysis : pandas.DataFrame
        One row per (setting, feature) with category and impression
        counts/ratios (ratios are percentages).
    voca : dict
        ``voca[(top_num, threshold)][feature]`` is a dict with the keys
        ``'top_num'`` and ``'over_threshold'``, each a list of categories.
    """
    if key_pair_list is None:
        key_pair_list = [(10, 1000), (25, 500), (50, 10)]

    columns = [
        'Feature Cate',
        'Feature Name',
        'Top-Num',
        'Threshold',
        '#Cate',
        'Top-Num #Cate Ratio',
        'Over-Thr. #Cate Ratio',
        'Over-Thr. #Cate Count',
        'Top-Num #Imp Ratio',
        'Top-Num #Imp Count',
        'Over-Thr. #Imp Ratio',
        'Over-Thr. #Imp Count'
    ]
    TOTAL_IMP = len(data)

    # Category frequencies do not depend on the setting, so count each feature once.
    counts_by_feature = {
        feature: data.value_counts(feature, ascending=False)
        for f_list in multi_dimensional_features.values()
        for feature in f_list
    }

    voca = {}
    records = []
    for TOP_NUM, THRESHOLD in key_pair_list:
        print("Top Num: {}".format(TOP_NUM))
        print("Threshold : {}".format(THRESHOLD))
        key_pair = (TOP_NUM, THRESHOLD)
        voca[key_pair] = {}
        for f_cate, f_list in multi_dimensional_features.items():
            for feature in f_list:
                counts = counts_by_feature[feature]
                over_threshold = counts[counts > THRESHOLD]

                num_cate, over_cate = len(counts), len(over_threshold)

                top_num_cate = list(counts.head(TOP_NUM).index)
                over_threshold_cate = list(over_threshold.index)

                voca[key_pair][feature] = {
                    'top_num' : top_num_cate,
                    'over_threshold' : over_threshold_cate
                }

                # Vectorised membership test instead of a per-row `x in list`.
                top_num_imp_count = data[feature].isin(top_num_cate).sum()
                over_threshold_imp_count = data[feature].isin(over_threshold_cate).sum()

                records.append({
                    'Feature Cate' : f_cate,
                    'Feature Name' : feature,
                    'Top-Num' : TOP_NUM,
                    'Threshold' : THRESHOLD,
                    '#Cate' : num_cate,
                    # Use the number of categories actually kept, so the ratio
                    # cannot exceed 100% when a feature has fewer than TOP_NUM.
                    'Top-Num #Cate Ratio' : _percent(len(top_num_cate), num_cate),
                    'Over-Thr. #Cate Ratio' : _percent(over_cate, num_cate),
                    'Over-Thr. #Cate Count' : over_cate,
                    'Top-Num #Imp Ratio' : _percent(top_num_imp_count, TOTAL_IMP),
                    'Top-Num #Imp Count' : top_num_imp_count,
                    'Over-Thr. #Imp Ratio' : _percent(over_threshold_imp_count, TOTAL_IMP),
                    'Over-Thr. #Imp Count' : over_threshold_imp_count
                })

    # Build the table once instead of growing it row by row.
    feature_analysis = pd.DataFrame(records, columns=columns)
    return feature_analysis, voca

In [None]:
# Run the analysis on the full training frame (not the 1000-row `sample`).
data = train
# feature_analysis: summary table; total_voca: vocabularies keyed by (Top-Num, Threshold).
feature_analysis, total_voca = get_feature_analysis(data, multi_dimensional_features)

In [None]:
# Slice the analysis table by setting. The (Top-Num, Threshold) pairs have to be
# among those evaluated by get_feature_analysis - (10, 1000), (25, 500), (50, 10) -
# because any other pair selects no rows.
low_threshold = feature_analysis[(feature_analysis['Top-Num'] == 50) & (feature_analysis['Threshold'] == 10)]
mid_threshold = feature_analysis[(feature_analysis['Top-Num'] == 25) & (feature_analysis['Threshold'] == 500)]
high_threshold = feature_analysis[(feature_analysis['Top-Num'] == 10) & (feature_analysis['Threshold'] == 1000)]

<h2>Rare Count 집계</h2>

In [None]:
# (Top-Num, Threshold) settings to aggregate rare counts for.
key_pair_list = [(10, 1000), (25, 500), (50, 10)]

# Rare counts are computed on the first SAMPLE_SIZE rows only.
SAMPLE_SIZE = 10000
data = train.iloc[:SAMPLE_SIZE]

In [None]:
# (Top-Num, Threshold) -> per-row rare-feature counts; filled by the loop in the next cell.
rare_count_dict = {}

In [None]:
# For each (Top-Num, Threshold) setting, count per row how many of the analysed
# features hold a "rare" value, i.e. a value outside the matching vocabulary.
for key_pair in key_pair_list:
    voca_dict = total_voca[key_pair]
    # Explicit integer accumulators, one slot per row of `data`.
    top_num_rare_count = np.zeros(len(data), dtype=int)
    over_threshold_rare_count = np.zeros(len(data), dtype=int)
    total_f_num = 0
    for f_cate, f_list in multi_dimensional_features.items():
        for feature in f_list:
            total_f_num += 1
            top_num_voca, over_threshold_voca = voca_dict[feature]['top_num'], voca_dict[feature]['over_threshold']
            # Vectorised membership test (1 = value not in the vocabulary) instead
            # of a Python-level `x in list` for every row.
            top_num_rare = (~data[feature].isin(top_num_voca)).astype(int)
            over_threshold_rare = (~data[feature].isin(over_threshold_voca)).astype(int)
            top_num_rare_count += top_num_rare.to_numpy()
            over_threshold_rare_count += over_threshold_rare.to_numpy()
    rare_count_dict[key_pair] = {
        'Top Num Rare Count': top_num_rare_count,
        'Over Threshold Rare Count': over_threshold_rare_count,
    }

In [None]:
# Flatten rare_count_dict into one long table: one row per (setting, data row).
user_list = list(data.index)

user_idx_list_total = []
TOP_NUM_LIST_TOTAL = []
THRSHOLD_LIST_TOTAL = []
TOP_NUM_RARE_COUNT_TOTAL = []
OVER_THRESHOLD_COUNT_TOTAL = []

for (TOP_NUM, THRESHOLD), count_dict in rare_count_dict.items():
    # Row identifiers are repeated once per setting.
    user_idx_list_total.extend(user_list)

    # The setting itself is broadcast over every row of `data`.
    TOP_NUM_LIST_TOTAL.extend([TOP_NUM] * len(data))
    THRSHOLD_LIST_TOTAL.extend([THRESHOLD] * len(data))

    TOP_NUM_RARE_COUNT_TOTAL.extend(count_dict['Top Num Rare Count'])
    OVER_THRESHOLD_COUNT_TOTAL.extend(count_dict['Over Threshold Rare Count'])

rare_count_df = pd.DataFrame({
    'user_idx' : user_idx_list_total,
    'Top Num' : TOP_NUM_LIST_TOTAL,
    'Threshold' : THRSHOLD_LIST_TOTAL,
    'Top Num Rare Count' : TOP_NUM_RARE_COUNT_TOTAL,
    'Over Threshold Rare Count' : OVER_THRESHOLD_COUNT_TOTAL,
})

In [None]:
# One slice of the long table per (Top Num, Threshold) setting.
rare_cnt_low_threshold = rare_count_df.loc[rare_count_df['Top Num'].eq(50) & rare_count_df['Threshold'].eq(10)]
rare_cnt_mid_threshold = rare_count_df.loc[rare_count_df['Top Num'].eq(25) & rare_count_df['Threshold'].eq(500)]
rare_cnt_high_threshold = rare_count_df.loc[rare_count_df['Top Num'].eq(10) & rare_count_df['Threshold'].eq(1000)]

In [None]:
# Persist the long rare-count table to Drive (row index is not written).
rare_count_df.to_csv('/content/drive/MyDrive/ctr/data/rare_count_df.csv', index=False)

In [None]:
# (Top Num 50, Threshold 10): share of rows per Top-Num rare count, as "xx.xx%" strings.
rare_cnt_low_threshold.value_counts('Top Num Rare Count', ascending=False, normalize=True).mul(100).map("{:.2f}%".format)

In [None]:
# Same distribution kept numeric (percent) and ordered by rare count.
value_cnt = rare_cnt_low_threshold.value_counts('Top Num Rare Count', ascending=False, normalize=True).mul(100).sort_index()

In [None]:
# Bar chart of the (Top Num 50, Threshold 10) distribution; title and axis labels
# are set so the figure can be read on its own.
ax = value_cnt.plot(kind='bar')
ax.set(
    title='Rows by Top Num Rare Count (Top Num = 50, Threshold = 10)',
    xlabel='Top Num Rare Count',
    ylabel='Share of rows (%)',
);

In [None]:
# (Top Num 50, Threshold 10): share of rows per Over-Threshold rare count.
rare_cnt_low_threshold.value_counts('Over Threshold Rare Count', ascending=False, normalize=True).mul(100).map("{:.2f}%".format)

In [None]:
# (Top Num 25, Threshold 500): share of rows per Top-Num rare count.
rare_cnt_mid_threshold.value_counts('Top Num Rare Count', ascending=False, normalize=True).mul(100).map("{:.2f}%".format)

In [None]:
# (Top Num 25, Threshold 500): share of rows per Over-Threshold rare count.
rare_cnt_mid_threshold.value_counts('Over Threshold Rare Count', ascending=False, normalize=True).mul(100).map("{:.2f}%".format)

In [None]:
# (Top Num 10, Threshold 1000): share of rows per Top-Num rare count.
rare_cnt_high_threshold.value_counts('Top Num Rare Count', ascending=False, normalize=True).mul(100).map("{:.2f}%".format)

In [None]:
# (Top Num 10, Threshold 1000): share of rows per Over-Threshold rare count.
rare_cnt_high_threshold.value_counts('Over Threshold Rare Count', ascending=False, normalize=True).mul(100).map("{:.2f}%".format)

In [None]:
# Scratch: a zero vector with one entry per row of `data`.
# NOTE(review): `d1` is not referenced anywhere else in the visible notebook.
d1 = np.array([0] * len(data))

In [None]:
# NOTE(review): scratch cell relying on hidden state - `feature` and `top_num_voca`
# are whatever the last iteration of the rare-count loop left behind, so this
# raises NameError on a fresh kernel unless that loop has been run first.
# 1 marks a row whose value is not in `top_num_voca`, 0 a row whose value is.
top_num_rare = data[feature].apply(lambda x: 0 if x in top_num_voca else 1)

In [None]:
# Number of features visited by the rare-count loop (left over from that loop).
total_f_num

In [None]:
# NOTE(review): no visible cell adds a 'top num rare count' column to `data`;
# presumably a leftover from an earlier version - confirm the column exists in
# train.csv or remove this cell (it raises KeyError otherwise).
(data.value_counts('top num rare count', ascending = False, normalize=True) * 100).apply('{:.2f}%'.format)

In [None]:
# NOTE(review): no visible cell adds an 'over threshold rare count' column to
# `data`; presumably a leftover from an earlier version - confirm or remove.
(data.value_counts('over threshold rare count', ascending = False, normalize=True) * 100).apply('{:.2f}%'.format)

In [None]:
# NOTE(review): no visible cell adds a 'rare count' column to `data`;
# presumably a leftover from an earlier version - confirm or remove.
np.sum(data['rare count'])

In [None]:
def get_count_analysis(TOP_NUM, feature, data, verbose=False):
    """Return the ``TOP_NUM`` most frequent categories of ``feature``.

    Parameters
    ----------
    TOP_NUM : int
        Number of most frequent categories to keep.
    feature : str
        Column of ``data`` to analyse.
    data : pandas.DataFrame
        Frame containing ``feature``.
    verbose : bool, optional
        When True, print the total number of categories, the share of
        categories that is kept, and the proportion / count of the least
        frequent category that is kept.

    Returns
    -------
    list
        Category values ordered from most to least frequent; shorter than
        ``TOP_NUM`` when the column has fewer distinct values.
    """
    # Count once; the proportions printed below are derived from these counts.
    total_counts = data.value_counts(feature, ascending=False)
    total_f_num = len(total_counts)
    counts = total_counts.head(TOP_NUM)

    if verbose:
        print("총 Category 개수 : {}".format(total_f_num))
        # Guards keep verbose mode from crashing on an empty column or TOP_NUM == 0.
        if total_f_num:
            # Share of categories actually kept (this used a hard-coded 10).
            print("Top Feture의 비율 : {:.2f}%".format(len(counts) / total_f_num * 100))
        if len(counts):
            last_proportion = counts.iloc[-1] / total_counts.sum() * 100
            print("마지막 Category의 비율 : {:.2f}".format(last_proportion))
            print("마지막 Category의 Count : {}".format(counts.iloc[-1]))
        print()

    return list(counts.index)

In [None]:
def get_voca_top(TOP_NUM, data, verbose=False):
    """Build the Top-N vocabulary for every feature in ``multi_dimensional_features``.

    Parameters
    ----------
    TOP_NUM : int
        Number of most frequent categories to keep per feature.
    data : pandas.DataFrame
        Frame containing every listed feature column.
    verbose : bool, optional
        Forwarded to ``get_count_analysis`` so per-feature statistics are
        printed when requested (it used to be accepted but ignored).

    Returns
    -------
    dict
        Maps each feature name to the list returned by ``get_count_analysis``.
    """
    voca_top = {}
    for f_list in multi_dimensional_features.values():
        for feature in f_list:
            voca_top[feature] = get_count_analysis(TOP_NUM, feature, data, verbose=verbose)
    return voca_top

In [None]:
# Top-10 vocabulary per feature, computed on the full training frame.
TOP_NUM = 10
data = train
voca_top = get_voca_top(TOP_NUM, data, verbose=False)

In [None]:
def get_count_analysis_threshold(THRESHOLD, feature, data, verbose=False):
    """Return the categories of ``feature`` occurring more than ``THRESHOLD`` times.

    The comparison is strict: a category occurring exactly ``THRESHOLD`` times
    is not kept.

    Parameters
    ----------
    THRESHOLD : int
        Minimum (exclusive) number of occurrences for a category to be kept.
    feature : str
        Column of ``data`` to analyse.
    data : pandas.DataFrame
        Frame containing ``feature``.
    verbose : bool, optional
        When True, print how many categories exist, how many are kept and
        dropped, and the percentage that is kept.

    Returns
    -------
    list
        Kept category values, most frequent first.
    """
    counts = data.value_counts(feature, ascending=False)
    over_threshold = counts[counts > THRESHOLD]

    num_cate, over_cate = len(counts), len(over_threshold)

    if verbose:
        # An empty column has no categories; report 0% instead of dividing by zero.
        kept_percent = over_cate / num_cate * 100 if num_cate else 0.0
        print("총 Category 개수 : {}개".format(num_cate))
        print("기준 이상의 Category 개수 : {}개".format(over_cate))
        print("생략 되는 Category 개수 : {}개".format(num_cate - over_cate))
        print("기준 이상의 Category 퍼센트 : {:.2f}%".format(kept_percent))
    return list(over_threshold.index)

In [None]:
def get_voca(THRESHOLD, data, verbose=False):
    """Build the over-threshold vocabulary for every feature in ``multi_dimensional_features``.

    Parameters
    ----------
    THRESHOLD : int
        Occurrence threshold passed to ``get_count_analysis_threshold``.
    data : pandas.DataFrame
        Frame containing every listed feature column.
    verbose : bool, optional
        Forwarded to ``get_count_analysis_threshold`` so per-feature statistics
        are printed when requested (it used to be accepted but ignored).

    Returns
    -------
    dict
        Maps each feature name to the list returned by
        ``get_count_analysis_threshold``.
    """
    voca = {}
    for f_list in multi_dimensional_features.values():
        for feature in f_list:
            voca[feature] = get_count_analysis_threshold(THRESHOLD, feature, data, verbose=verbose)
    return voca

In [None]:
# Vocabulary of categories occurring more than 100 times, on the full training frame.
THRESHOLD = 100
data = train
voca = get_voca(THRESHOLD, data)

In [None]:
# import pickle
# with open('/content/drive/MyDrive/ctr/data/voca_top.p', 'wb') as f:
#     pickle.dump(voca_top, f)
# with open('/content/drive/MyDrive/ctr/data/voca.p', 'wb') as f:
#     pickle.dump(voca, f)