# Imports

In [2]:
import os
import sys
import gc
import random
import logging
import pickle
import typing as T
from logging import getLogger

import pandas as pd
import numpy as np
import pyarrow
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder


# Settings

In [6]:
# Project root relative to the notebook, and the data folder beneath it.
PROJ_PATH = ".."
DATA_PATH = os.path.join(PROJ_PATH, "data")

# Input folders (train clickstream, test target) and the output folder,
# all resolved against DATA_PATH.
TRAIN_DATA_PATH, TEST_DATA_PATH, SAVE_DATA_PATH = (
    os.path.join(DATA_PATH, sub_dir)
    for sub_dir in ("final/clickstream", "final/target", "preprocessed")
)

In [7]:
def seed_everything(seed=42):
    """Seed Python's `random` and NumPy's global RNG and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to every generator (default 42).

    Note:
        PYTHONHASHSEED is only written to ``os.environ`` here; the interpreter
        reads that variable at start-up, so the running kernel's string hashing
        is not changed by this call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [8]:
# Single seed for the notebook; applied right away so the cells below start
# from a known RNG state.
SEED = 42
seed_everything(seed=SEED)

In [9]:
# Root logging configuration: INFO level, with source file / line, padded level
# name and timestamp in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(filename)s[LINE:%(lineno)d] # [%(levelname)-8s] [%(asctime)s]  %(message)s',
)
# Logger used by the helper functions in this notebook.
logger = getLogger(__name__)


# Methods

In [10]:
def get_full_dataset(
    data_path: str,
    part_names: T.List[str],
    columns: T.List[str]
    ) -> pd.DataFrame:
    """Read several parquet partitions and stack them into one DataFrame.

    Args:
        data_path: Directory that contains the partition files.
        part_names: File names of the partitions, read in the given order.
        columns: Columns to load from every partition.

    Returns:
        A DataFrame with the rows of all partitions in reading order and a
        fresh 0..n-1 index. If ``part_names`` is empty, an empty DataFrame
        with the requested columns is returned.
    """
    frames = []
    for part in tqdm(part_names):
        frames.append(
            pd.read_parquet(
                os.path.join(data_path, part),
                columns=columns,
                engine='pyarrow'
            )
        )
        collected_gc = gc.collect()
        logger.info(f"Partition read: {part}; gc.collect: {collected_gc}")

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end: concatenating inside the loop re-copies all
    # previously read rows for every new partition.
    return pd.concat(frames, ignore_index=True)

In [11]:
def get_replace_rare_values_dict(
    df_col: pd.Series, thres: int = 10, verbose: bool = False
    ) -> T.Dict[str, str]:
    """Build a mapping that collapses rare values of a column into 'other'.

    Args:
        df_col: Column whose distinct values are counted (missing values are
            not counted, as with ``Series.value_counts`` defaults).
        thres: Minimum number of occurrences for a value to be kept as is.
        verbose: If True, print the number of distinct values and how many of
            them reach ``thres``.

    Returns:
        Dict with one key per distinct value, ordered as ``value_counts``
        returns them: values occurring at least ``thres`` times map to
        themselves, all others map to the string 'other'.
    """
    vc = df_col.value_counts()
    # Build the mapping straight from (value, count) pairs instead of writing
    # labels into the integer-typed counts Series, which depends on implicit
    # dtype upcasting.
    replacements = {
        value: (value if count >= thres else 'other')
        for value, count in vc.items()
    }
    if verbose:
        n_kept = int((vc >= thres).sum())
        print(f"Number of values = {len(vc)}\nNumber >= {thres} = {n_kept}")
    return replacements

In [12]:
def map_pop_vals(df: pd.DataFrame, mappers: T.Dict[str, str]) -> pd.DataFrame:
    """Apply per-column value mappings to ``df``.

    Args:
        df: Frame to transform; its columns are overwritten in place.
        mappers: Mapping from column name to the mapper passed to
            ``Series.map`` for that column. Entries whose column is not
            present in ``df`` are skipped.

    Returns:
        The same ``df`` object, with the mapped columns replaced.
    """
    progress = tqdm(mappers.items(), desc="Mapping pop vals")
    for column_name, value_map in progress:
        if column_name not in df.columns:
            continue
        df[column_name] = df[column_name].map(value_map)
    return df

In [13]:
def save_pickle(a, filepath):
    """Pickle object ``a`` into the file at ``filepath``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest available pickle protocol is used.
    """
    with open(filepath, mode='wb') as out_stream:
        pickler = pickle.Pickler(out_stream, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(a)

# Get data

## Get tables

In [None]:
# Ten consecutively numbered partitions: part-00000.parquet .. part-00009.parquet.
part_names = [f"part-{idx:05d}.parquet" for idx in range(10)]

# Columns loaded from every partition.
cols = [
    'event_type',
    'event_category',
    'event_name',
    'device_screen_name',
    'net_connection_type',
    'net_connection_tech',
]

In [None]:
# Load the selected columns of every listed partition from the train folder,
# then show the resulting (rows, columns).
df = get_full_dataset(TRAIN_DATA_PATH, part_names, cols)
df.shape

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

<ipython-input-10-430b0fece559>[LINE:16] # [INFO    ] [2020-12-14 18:22:35,000]  Partition read: part-00000.parquet; gc.collect: 11
<ipython-input-10-430b0fece559>[LINE:16] # [INFO    ] [2020-12-14 18:22:38,891]  Partition read: part-00001.parquet; gc.collect: 0
<ipython-input-10-430b0fece559>[LINE:16] # [INFO    ] [2020-12-14 18:22:43,376]  Partition read: part-00002.parquet; gc.collect: 0
<ipython-input-10-430b0fece559>[LINE:16] # [INFO    ] [2020-12-14 18:22:48,688]  Partition read: part-00003.parquet; gc.collect: 0





# Get pop vals

In [None]:
# One rare-value mapping per loaded column: values seen fewer than 50000 times
# are mapped to 'other'.
mappers = {}
for col in df.columns:
    print(f"\nCol = {col}")
    mappers[col] = get_replace_rare_values_dict(df[col], thres=50000, verbose=True)

utils.py[LINE:141] # [INFO    ] [2020-12-14 17:33:34,114]  NumExpr defaulting to 4 threads.



Col = event_type
Number of values = 5
Number >= 50000 = 4

Col = event_category
Number of values = 375
Number >= 50000 = 62

Col = event_name
Number of values = 83162
Number >= 50000 = 120

Col = device_screen_name
Number of values = 606
Number >= 50000 = 127

Col = timezone
Number of values = 302
Number >= 50000 = 42

Col = net_connection_type
Number of values = 3
Number >= 50000 = 3

Col = net_connection_tech
Number of values = 17
Number >= 50000 = 7


# Get combined pop vals

In [None]:
# Apply the per-column mappings, then replace every remaining missing value
# with the literal string 'null'.
df = map_pop_vals(df=df, mappers=mappers).fillna('null')

HBox(children=(FloatProgress(value=0.0, description='Mapping pop vals', max=7.0, style=ProgressStyle(descripti…




In [None]:
# Component columns of the combined 'event' key.
event_cols = [
    'event_type',
    'event_category',
    'event_name',
    'device_screen_name',
]
# Component columns of the combined 'net' key.
net_cols = [
    'net_connection_type',
    'net_connection_tech',
]

In [None]:
# Composite event key: the four event-related columns joined with " @ ".
df['event'] = df['event_type'] + " @ " + df['event_category'] + " @ " + df['event_name'] + " @ " + df['device_screen_name']
# Composite network key: connection type and technology joined with " @ ".
df['net'] = df['net_connection_type'] + " @ " + df['net_connection_tech']
# Display the number of distinct composite values in each new column.
df['event'].nunique(), df['net'].nunique()


(408, 17)

In [None]:
# Only the combined 'event' and 'net' columns are used from here on, so drop
# every component column (event and network groups alike) to free memory.
df = df.drop(columns=event_cols + net_cols)

In [None]:
# Show the frame's (rows, columns) after the drop in the previous cell.
df.shape

(48418352, 2)

In [None]:
# Rare-value mappings for the two combined columns: values seen fewer than
# 25000 times are mapped to 'other'.
combined_mappers = {}
for col in ('event', 'net'):
    print(f"\nCol = {col}")
    combined_mappers[col] = get_replace_rare_values_dict(df[col], thres=25000, verbose=True)


Col = event
Number of values = 408
Number >= 25000 = 206

Col = net
Number of values = 17
Number >= 25000 = 10


# Save

In [None]:
# Create the output folder if needed, so the results of the long computation
# above are not lost to a missing directory.
os.makedirs(SAVE_DATA_PATH, exist_ok=True)
# Per-column mappers (threshold 50000) and combined event/net mappers (threshold 25000).
save_pickle(mappers, os.path.join(SAVE_DATA_PATH, "pop_vals_mappers_50k.pickle"))
save_pickle(combined_mappers, os.path.join(SAVE_DATA_PATH, "event_net_pop_vals_mappers_25k.pickle"))
