In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [None]:
# import packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import csv
import gzip
import copy
import datetime
from tqdm import tqdm
from sklearn import metrics
from tabulate import tabulate

In [None]:
seed_value = 42  # seed for reproducibility
# Only the standard-library `random` generator is seeded here; NumPy and
# TensorFlow generators are left unseeded by this cell.
random.seed(seed_value)

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
sys.path.append('/content/drive/MyDrive/ctr/code/model')

In [None]:
import dcn
import run_models

In [None]:
# Threshold compared against each random draw when deciding whether to skip a row.
subsample_ratio = 0.5


def skip_values(i):
    """Tell ``pd.read_csv`` whether the row at index ``i`` should be skipped.

    Row 0 is never skipped and consumes no random draw. Every other row is
    skipped when a fresh ``random.random()`` draw exceeds ``subsample_ratio``.
    """
    if not i > 0:
        return False
    draw = random.random()
    return draw > subsample_ratio

In [None]:
# Processed Avazu training CSV, read from the Google Drive mount under /content/drive.
FILE_PATH = '/content/drive/MyDrive/ctr/avazu/processed/train/train.csv'
# `skiprows` receives a callable: pandas calls it with each row index and
# drops the rows for which it returns True (row 0 is always kept by skip_values).
train = pd.read_csv(FILE_PATH, skiprows=skip_values)

In [None]:
# id_null = train[train['device_id'] == 'a99f214a'].astype(str)
# Keep the rows whose device_id is not 'a99f214a' and cast every column to str.
# NOTE(review): 'a99f214a' looks like the placeholder used for a missing
# device id (cf. the `id_null` name in the commented line above) — confirm.
id_filled = train[train['device_id'] != 'a99f214a'].astype(str)
# Release the reference to the unfiltered frame.
del train

In [None]:
len(id_filled)

In [None]:
# 1. What percentage of users have exactly one value for a given feature?
# 2. Within a feature, what fraction of its values is held by exactly one user?

In [None]:
def get_unique_value_ratio(set_dict):
    """Return the fraction of entries whose collection holds exactly one element.

    Args:
        set_dict: Mapping from a key to a sized collection (here a ``set``)
            of the values recorded for that key.

    Returns:
        float: count of single-element entries divided by the number of entries.

    Raises:
        ZeroDivisionError: if ``set_dict`` has no entries.
    """
    singleton_count = sum(1 for members in set_dict.values() if len(members) == 1)
    return singleton_count / len(set_dict)
def get_unique_value_ratio_data(data, id_feature, target_feature):
    """Fraction of ids that are associated with exactly one distinct target value.

    Args:
        data: DataFrame containing both ``id_feature`` and ``target_feature``.
        id_feature: Name of the column identifying the entity (e.g. a user/device id).
        target_feature: Name of the column whose distinct values are counted per id.

    Returns:
        float: (number of ids with exactly one distinct ``target_feature``
        value) / (number of distinct ids).

    Raises:
        ZeroDivisionError: if ``data`` has no rows.
    """
    # A single grouped pass replaces a per-row Python callback
    # (DataFrame.apply(axis=1)), which is prohibitively slow on large frames.
    # dropna=False keeps missing ids/values as their own group/value instead
    # of silently discarding them.
    values_per_id = (
        data.groupby(id_feature, sort=False, dropna=False)[target_feature]
        .nunique(dropna=False)
    )
    single_valued_ids = int((values_per_id == 1).sum())
    return single_valued_ids / len(values_per_id)

In [None]:
def get_unique_user_ratio(set_dict):
    """Return the share of keys in ``set_dict`` that map to a one-element collection.

    Args:
        set_dict: Mapping from a key to a sized collection (here a ``set``)
            of the ids recorded for that key.

    Returns:
        float: number of keys with exactly one member divided by the number of keys.

    Raises:
        ZeroDivisionError: if ``set_dict`` has no entries.
    """
    is_singleton = [len(members) == 1 for members in set_dict.values()]
    return sum(is_singleton) / len(is_singleton)

def get_unique_user_ratio_data(data, id_feature, target_feature):
    """Fraction of target values that are associated with exactly one distinct id.

    Args:
        data: DataFrame containing both ``id_feature`` and ``target_feature``.
        id_feature: Name of the column identifying the entity (e.g. a user/device id).
        target_feature: Name of the column whose values are the grouping keys.

    Returns:
        float: (number of distinct ``target_feature`` values seen with exactly
        one distinct id) / (number of distinct ``target_feature`` values).

    Raises:
        ZeroDivisionError: if ``data`` has no rows.
    """
    # A single grouped pass replaces a per-row Python callback
    # (DataFrame.apply(axis=1)), which is prohibitively slow on large frames.
    # dropna=False keeps missing values/ids as their own group/value instead
    # of silently discarding them.
    ids_per_value = (
        data.groupby(target_feature, sort=False, dropna=False)[id_feature]
        .nunique(dropna=False)
    )
    single_user_values = int((ids_per_value == 1).sum())
    return single_user_values / len(ids_per_value)

In [None]:
def ret_user_ratio_df(data, id_feature, columns=None):
    """Compute both uniqueness ratios for each single feature column.

    Args:
        data: DataFrame holding ``id_feature`` and every column in ``columns``.
        id_feature: Name of the id column passed to the ratio helpers.
        columns: Optional list of feature columns to analyse. Defaults to the
            notebook's 19 site/app/device/C* columns.

    Returns:
        pandas.DataFrame with one row per feature and the columns
        ``'target_feature'``, ``'unique value ratio'`` and ``'unique user ratio'``.
    """
    if columns is None:
        columns = [
            'site_id', 'site_domain', 'site_category',
            'app_id', 'app_domain', 'app_category',
            'device_ip', 'device_model', 'device_type', 'device_conn_type',
            'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'
        ]
    result_columns = ['target_feature', 'unique value ratio', 'unique user ratio']

    # Collect plain records and build the frame once: growing a DataFrame with
    # df.loc[len(df)] copies on every insert and leaves the ratio columns as
    # object dtype.
    records = []
    for target_feature in columns:
        print(target_feature)  # progress indicator, one line per feature
        records.append({
            'target_feature': target_feature,
            'unique value ratio': get_unique_value_ratio_data(data, id_feature, target_feature),
            'unique user ratio': get_unique_user_ratio_data(data, id_feature, target_feature),
        })
    return pd.DataFrame(records, columns=result_columns)

In [None]:
def ret_user_ratio_df_f2(data, id_feature, columns=None):
    """Compute both uniqueness ratios for every pair of feature columns.

    Each pair is turned into a crossed feature by joining the two column
    values with ``'-'``; the crossed feature is named ``'<col1> & <col2>'``.

    Args:
        data: DataFrame holding ``id_feature`` and every column in ``columns``.
            It is not modified.
        id_feature: Name of the id column passed to the ratio helpers.
        columns: Optional list of feature columns to combine. Defaults to the
            notebook's 19 site/app/device/C* columns.

    Returns:
        pandas.DataFrame with one row per pair (same order as nested index
        loops over ``columns``) and the columns ``'target_feature'``,
        ``'unique value ratio'`` and ``'unique user ratio'``.
    """
    from itertools import combinations

    if columns is None:
        columns = [
            'site_id', 'site_domain', 'site_category',
            'app_id', 'app_domain', 'app_category',
            'device_ip', 'device_model', 'device_type', 'device_conn_type',
            'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'
        ]
    result_columns = ['target_feature', 'unique value ratio', 'unique user ratio']

    records = []
    # The progress bar advances once per pair.
    for col1, col2 in tqdm(list(combinations(columns, 2))):
        target_feature = col1 + ' & ' + col2
        # Vectorised string concatenation instead of a row-wise '-'.join.
        crossed_values = data[col1].astype(str) + '-' + data[col2].astype(str)
        # Work on a two-column scratch frame so the caller's DataFrame is never
        # mutated (a temporary column added in place would leak if the loop
        # were interrupted).
        crossed = pd.DataFrame({
            id_feature: data[id_feature],
            target_feature: crossed_values,
        })
        records.append({
            'target_feature': target_feature,
            'unique value ratio': get_unique_value_ratio_data(crossed, id_feature, target_feature),
            'unique user ratio': get_unique_user_ratio_data(crossed, id_feature, target_feature),
        })
    return pd.DataFrame(records, columns=result_columns)

In [None]:
def ret_user_ratio_df_f3(data, id_feature, columns=None):
    """Compute both uniqueness ratios for every triple of feature columns.

    Each triple is turned into a crossed feature by joining the three column
    values with ``'-'``; the crossed feature is named
    ``'<col1> & <col2> & <col3>'``.

    Args:
        data: DataFrame holding ``id_feature`` and every column in ``columns``.
            It is not modified.
        id_feature: Name of the id column passed to the ratio helpers.
        columns: Optional list of feature columns to combine. Defaults to the
            notebook's 19 site/app/device/C* columns.

    Returns:
        pandas.DataFrame with one row per triple (same order as nested index
        loops over ``columns``) and the columns ``'target_feature'``,
        ``'unique value ratio'`` and ``'unique user ratio'``.
    """
    from itertools import combinations

    if columns is None:
        columns = [
            'site_id', 'site_domain', 'site_category',
            'app_id', 'app_domain', 'app_category',
            'device_ip', 'device_model', 'device_type', 'device_conn_type',
            'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'
        ]
    result_columns = ['target_feature', 'unique value ratio', 'unique user ratio']

    records = []
    # The progress bar advances once per triple.
    for col1, col2, col3 in tqdm(list(combinations(columns, 3))):
        target_feature = col1 + ' & ' + col2 + ' & ' + col3
        # Vectorised string concatenation instead of a row-wise '-'.join.
        crossed_values = (
            data[col1].astype(str) + '-'
            + data[col2].astype(str) + '-'
            + data[col3].astype(str)
        )
        # Work on a two-column scratch frame so the caller's DataFrame is never
        # mutated (a temporary column added in place would leak if the loop
        # were interrupted).
        crossed = pd.DataFrame({
            id_feature: data[id_feature],
            target_feature: crossed_values,
        })
        records.append({
            'target_feature': target_feature,
            'unique value ratio': get_unique_value_ratio_data(crossed, id_feature, target_feature),
            'unique user ratio': get_unique_user_ratio_data(crossed, id_feature, target_feature),
        })
    return pd.DataFrame(records, columns=result_columns)

In [None]:
# df = ret_user_ratio_df(id_filled, 'device_id')

In [None]:
# df_f2 = ret_user_ratio_df_f2(id_filled, 'device_id')
# df_f2['unique value ratio'] = (df_f2['unique value ratio'] * 100).apply('{:.2f}'.format)
# df_f2['unique user ratio'] = (df_f2['unique user ratio'] * 100).apply('{:.2f}'.format)
# df_f2

In [None]:
# import pickle
# with open('/content/drive/MyDrive/ctr/data/df_f2.p', 'wb') as f:
#     pickle.dump(df_f2, f)

In [None]:
# Compute both ratios for every 3-feature combination of the columns
# hard-coded in ret_user_ratio_df_f3, keyed by device_id.
df_f3 = ret_user_ratio_df_f3(id_filled, 'device_id')
# Turn the ratios into percentages rendered as two-decimal strings; both
# columns hold str values after these two lines.
df_f3['unique value ratio'] = (df_f3['unique value ratio'] * 100).apply('{:.2f}'.format)
df_f3['unique user ratio'] = (df_f3['unique user ratio'] * 100).apply('{:.2f}'.format)
df_f3

In [None]:
# `pickle` is imported in this cell because its only other import in the
# notebook sits in a commented-out cell; without it this cell raises NameError
# on a fresh kernel.
import pickle

# Persist the formatted 3-feature result table to Google Drive.
with open('/content/drive/MyDrive/ctr/data/df_f3.p', 'wb') as f:
    pickle.dump(df_f3, f)

In [None]:
# The cell that computes df_f2 is commented out, so on a fresh kernel the name
# does not exist. Restore it from the pickle that cell saved.
# NOTE(review): assumes df_f2.p exists at this path and is a trusted,
# self-written file (unpickling untrusted data can execute code) — confirm.
if 'df_f2' not in globals():
    df_f2 = pd.read_pickle('/content/drive/MyDrive/ctr/data/df_f2.p')

# Show the row for the device_ip x device_model feature pair.
df_f2[df_f2['target_feature'] == 'device_ip & device_model']

In [None]:
# Cast the two ratio columns to float so they can be compared numerically.
# NOTE(review): presumably they are stored as formatted percentage strings,
# as done for df_f3 — confirm for df_f2.
tmp_f2 = df_f2.astype({
    'unique value ratio' : float,
    'unique user ratio' : float
})

In [None]:
# Feature pairs whose 'unique value ratio' exceeds 93 and whose
# 'unique user ratio' exceeds 97 (values are percentages if the columns were
# scaled by 100 as for df_f3 — TODO confirm).
tmp_f2[(tmp_f2['unique value ratio'] > 93) & (tmp_f2['unique user ratio'] > 97)]

In [None]:
# df_f3's ratio columns were formatted as two-decimal strings above; cast them
# back to float so they can be compared numerically.
tmp_f3 = df_f3.astype({
    'unique value ratio' : float,
    'unique user ratio' : float
})

In [None]:
# Feature triples for which more than 93% of ids have a single value and more
# than 97% of values belong to a single id (same thresholds as the pair filter).
tmp_f3[(tmp_f3['unique value ratio'] > 93) & (tmp_f3['unique user ratio'] >97)]

In [None]:
def count_df():
    """Run ``ret_count_df`` on ``id_filled`` for several (group, counted) column pairs.

    The percentages in the comments are the author's recorded observations.

    NOTE(review): ``ret_count_df`` is not defined in the visible part of the
    notebook; each result is assigned to the same local name ``count_df`` and
    overwritten by the next call, and the function returns nothing, so no
    result is available to the caller — confirm whether this is intended.
    """
    # Number of device models per user - 99% have exactly one.
    count_df = ret_count_df(id_filled, 'device_id', 'device_model')
    # Number of users per device model - 17% have exactly one user.
    count_df = ret_count_df(id_filled, 'device_model', 'device_id')
    # Number of distinct IPs per user - 84% have exactly one.
    count_df = ret_count_df(id_filled, 'device_id', 'device_ip')
    # Number of users per IP - 83% have exactly one user.
    count_df = ret_count_df(id_filled, 'device_ip', 'device_id')
    # Number of distinct app ids per user - 96% have exactly one.
    count_df = ret_count_df(id_filled, 'device_id', 'app_id')
    # Number of users per app id - 31% (presumably with exactly one user; the original note gives only the figure).
    count_df = ret_count_df(id_filled, 'app_id', 'device_id')

In [None]:
def count_df2():
    # 한 IP + Model + App 값에 대한 User 수 통계 - 94.5%가 한명
    id_filled['IP + Model + App'] = id_filled[['device_ip', 'device_model', 'app_id']].agg('-'.join, axis=1)
    count_df = ret_count_df(id_filled, 'IP + Model + App','device_id')

    # 한 IP + Model 값에 대한 User 수 통계 - 93.1%가 한명
    id_filled['IP + Model'] = id_filled[['device_ip', 'device_model']].agg('-'.join, axis=1)
    count_df = ret_count_df(id_filled, 'IP + Model','device_id')