In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import gzip
import datetime
from tabulate import tabulate

In [2]:
FIGSIZE = (6, 3)

# Seed Python's `random` module so the random draws made by `skip_values`
# are repeatable from a fresh kernel.
seed_value = 42
random.seed(seed_value)
subsample_ratio = 0.05

# Total number of records in the clickstream data, and the row count that
# corresponds to `subsample_ratio` of it.
n = 40428967
sample_size = int(n * subsample_ratio)


def parse_date(val):
    """Parse a two-digit-year 'YYMMDDHH' string into a `datetime.datetime`."""
    return datetime.datetime.strptime(val, '%y%m%d%H')


def skip_values(i):
    """Return True when index `i` is positive and a fresh random draw exceeds `subsample_ratio`.

    Index 0 always yields False without consuming a random number.
    NOTE(review): presumably intended as a `skiprows` callable that keeps the
    header row; it is not used by any cell visible here.
    """
    return i > 0 and random.random() > subsample_ratio

In [3]:
# Column -> dtype mapping handed to the CSV reader. Each entry pairs a column
# name with the Python type that is wrapped in `np.dtype`; the order of the
# entries is the order of the keys in the resulting dict.
types_train = {
    column: np.dtype(python_type)
    for column, python_type in [
        ('id', int),
        ('click', int),
        ('hour', int),
        ('C1', int),
        ('banner_pos', int),
        ('site_id', str),
        ('site_domain', str),
        ('site_category', str),
        ('app_id', str),
        ('app_domain', str),
        ('app_category', str),
        ('device_id', str),
        ('device_ip', str),
        ('device_model', str),
        ('device_type', int),
        ('device_conn_type', int),
        ('C14', int),
        ('C15', int),
        ('C16', int),
        ('C17', int),
        ('C18', int),
        ('C19', int),
        ('C20', int),
        ('C21', int),
    ]
}

In [4]:
# Mount Google Drive at /content/drive so the dataset and output paths used
# by later cells resolve. This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Suppress every warning from this point on.
# NOTE(review): this also hides pandas/numpy deprecation and dtype warnings
# raised by the cells below.
import warnings
warnings.filterwarnings(action='ignore')
# To restore the default warning behaviour instead, run:
# warnings.filterwarnings(action='default')

In [6]:
# Read the gzip-compressed training CSV straight from Drive, forcing the
# column dtypes declared in `types_train`. The whole file is loaded; the
# subsampling settings defined earlier are not applied here.
with gzip.open('/content/drive/MyDrive/ctr/avazu-gzip/train.gz') as f:
    train = pd.read_csv(f, dtype=types_train)

In [7]:
# Draw a reproducible 1% random sample of the loaded rows (fixed random_state).
# NOTE(review): `sample` is not referenced by any later cell visible here.
sample = train.sample(frac=0.01, random_state=42)

In [8]:
# Run the analysis on the full training frame. `data` is a second name for
# the same object as `train`; no copy is made.
data = train

In [11]:
# Total clicks for each observed (C17, site_category) pair.
click = data.groupby(by=['C17', 'site_category'])['click'].sum()

In [12]:
# Impressions for each (C17, site_category) pair, counted as the number of
# non-null `id` values in the group.
imp = data.groupby(by=['C17', 'site_category'])['id'].count()

In [45]:
# Put clicks and impressions side by side (one row per (C17, site_category)
# pair), then express the pair's click-through rate as a percentage.
corr_check = pd.concat({'click':click, 'impression':imp}, axis=1)
corr_check['CTR'] = click / imp * 100

In [34]:
# Map every C17 value to the list of site_category values it co-occurs with,
# in the order the grouped index yields the pairs.
comb_dict = {}
for c17, cate in data.groupby(by=['C17', 'site_category']).count().index:
    comb_dict.setdefault(c17, []).append(cate)

In [41]:
# C17 values in `comb_dict` order, and for each one the number of
# site_category values it was seen with.
c17_list = list(comb_dict.keys())
cate_num_list = [len(categories) for categories in comb_dict.values()]

In [20]:
# Per-C17 click-through rate (percent), flattened to a plain list.
# NOTE(review): `ctr_list` is not referenced by any later cell visible here.
ctr_list = list(data.groupby(['C17'])['click'].sum() / data.groupby(['C17'])['id'].count() * 100)

In [49]:
# Repeat each count as many times as its own value, giving one entry per
# (C17, site_category) pair.
cate_num_full_list = []
for cate_num in cate_num_list:
    cate_num_full_list.extend([cate_num] * cate_num)

In [52]:
# Click-through rate (percent) of each C17 value on its own, indexed by C17.
c17_ctr = data.groupby(['C17'])['click'].sum() / data.groupby(['C17'])['id'].count() * 100

In [55]:
# Click-through rate (percent) of each site_category on its own, indexed by
# site_category.
cate_ctr = data.groupby(['site_category'])['click'].sum() / data.groupby(['site_category'])['id'].count() * 100

In [63]:
# For every (C17, site_category) row of `corr_check`, look up the stand-alone
# CTR of its C17 value and of its site_category value.
c17_ctr_list = [c17_ctr.loc[c17_value] for c17_value, _ in corr_check.index]
cate_ctr_list = [cate_ctr.loc[category] for _, category in corr_check.index]

In [64]:
# Attach the per-feature CTRs as columns; the lists are assigned row by row
# in `corr_check` order.
corr_check['C17 CTR'] = c17_ctr_list
corr_check['site_category CTR'] = cate_ctr_list

In [66]:
# Number of site_category values seen with the row's C17 value.
# The list is assigned positionally, so this relies on `corr_check` rows being
# in the same order as the grouped index that `comb_dict` was built from.
corr_check['cate num'] = cate_num_full_list

In [71]:
# How far the pair's CTR sits above (+) or below (-) each feature's own CTR,
# in percentage points, plus the sum of the two differences.
corr_check['C17 CTR Diff'] = corr_check['CTR'] - corr_check['C17 CTR']
corr_check['site_category CTR Diff'] = corr_check['CTR'] - corr_check['site_category CTR']
corr_check['CTR Diff sum'] = corr_check['C17 CTR Diff'] + corr_check['site_category CTR Diff']

In [73]:
# Keep only pairs with strictly more than THRESHOLD impressions.
THRESHOLD = 1000
corr_check_filtered = corr_check.loc[corr_check['impression'] > THRESHOLD]

In [87]:
# Output directory for the per-pair CSV files. The trailing slash matters:
# ret_corr_check builds file names by plain string concatenation.
DIR_PATH = '/content/drive/MyDrive/ctr/data/feature_correlation/corr_check/'

In [85]:
def ret_corr_check(data, f1, f2, dir_path, THRESHOLD=1000):
    """Summarise click-through rate for every observed (f1, f2) value pair.

    For each pair the result holds the click total, the impression count
    (non-null `id` values), the pair CTR in percent, the stand-alone CTR of
    the pair's f1 value and of its f2 value, the difference between the pair
    CTR and each stand-alone CTR, and the sum of those two differences.
    'cate num' is the number of distinct f2 values observed together with the
    row's f1 value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'click', 'id', `f1` and `f2`.
    f1, f2 : str
        Names of the two columns to cross.
    dir_path : str
        Prefix of the output file; the table is written to
        ``dir_path + f1 + '_' + f2 + '.csv'``, so a directory must be given
        with its trailing path separator.
    THRESHOLD : int, default 1000
        Only pairs with strictly more impressions than this are kept.
        (Previously this argument was overwritten with 1000 inside the
        function and therefore had no effect.)

    Returns
    -------
    pandas.DataFrame
        The filtered table, indexed by (f1, f2), with CTR columns rounded to
        two decimals. The same table is written to the CSV file.
    """
    # Group once and derive both aggregates from the same grouping.
    pair_groups = data.groupby(by=[f1, f2])
    corr_check = pd.concat(
        {'click': pair_groups['click'].sum(), 'impression': pair_groups['id'].count()},
        axis=1,
    )
    corr_check['CTR'] = corr_check['click'] / corr_check['impression'] * 100

    # Each row is one distinct (f1, f2) pair, so the number of rows sharing an
    # f1 value is the number of distinct f2 values seen with it. `impression`
    # is never null here, so 'count' equals the group size.
    corr_check['cate num'] = corr_check.groupby(level=0)['impression'].transform('count')

    # Stand-alone CTR of each feature, aligned to the rows through the
    # matching index level rather than through row order.
    for level, feature in enumerate((f1, f2)):
        feature_groups = data.groupby(feature)
        feature_ctr = feature_groups['click'].sum() / feature_groups['id'].count() * 100
        level_values = corr_check.index.get_level_values(level)
        corr_check[feature + ' CTR'] = feature_ctr.reindex(level_values).to_numpy()
        corr_check[feature + ' CTR Diff'] = corr_check['CTR'] - corr_check[feature + ' CTR']

    corr_check['CTR Diff Sum'] = corr_check[f1 + ' CTR Diff'] + corr_check[f2 + ' CTR Diff']

    # Honour the caller-supplied threshold.
    corr_check_filtered = corr_check.loc[corr_check['impression'] > THRESHOLD]

    output_columns = [
        'cate num', 'click', 'impression', 'CTR',
        f1 + ' CTR', f2 + ' CTR',
        f1 + ' CTR Diff', f2 + ' CTR Diff',
        'CTR Diff Sum',
    ]
    rounded_columns = [
        'CTR',
        f1 + ' CTR', f2 + ' CTR',
        f1 + ' CTR Diff', f2 + ' CTR Diff',
        'CTR Diff Sum',
    ]
    corr_check_filtered = corr_check_filtered[output_columns].round(
        {column: 2 for column in rounded_columns}
    )
    corr_check_filtered.to_csv(dir_path + f1 + '_' + f2 + '.csv')
    return corr_check_filtered

In [89]:
# Features to cross. Every ordered pair of two different features is
# processed, so each unordered pair is run in both directions.
f_list = ['site_category', 'C17', 'C21']
f_num = len(f_list)
for f1 in f_list:
    for f2 in f_list:
        if f1 != f2:
            print(f1, f2)
            # Only the table from the final pair remains bound to `corr_check`.
            corr_check = ret_corr_check(data, f1, f2, DIR_PATH)

site_category C17
site_category C21
C17 site_category
C17 C21
C21 site_category
C21 C17
