# Imports

In [2]:
import os
import sys
import gc
import random
import logging
import pickle
import typing as T
from logging import getLogger

import pandas as pd
import numpy as np
import pyarrow
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder


# Settings

In [6]:
# Project root, relative to the directory the notebook is run from.
PROJ_PATH = ".."
# Every dataset location below is derived from this data directory.
DATA_PATH = os.path.join(PROJ_PATH, "data")

# Directory holding the clickstream parquet partitions read by get_full_dataset.
TRAIN_DATA_PATH = os.path.join(DATA_PATH, "final/clickstream")
# Directory holding the target CSV (abattle_train_target.csv).
TEST_DATA_PATH = os.path.join(DATA_PATH, "final/target")
# Directory the mapper pickles are loaded from and the label encoders are saved to.
SAVE_DATA_PATH = os.path.join(DATA_PATH, "preprocessed")

In [7]:
def seed_everything(seed=42):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed to both seeders and written to the environment.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [8]:
# Single seed for the whole notebook; applied once here, before any other work.
SEED = 42
seed_everything(seed=SEED)

In [9]:
# Root logging configuration: INFO level, each record prefixed with the source
# file, line number, padded level name and timestamp.
logging.basicConfig(
    level=logging.INFO,
    format=u'%(filename)s[LINE:%(lineno)d] # [%(levelname)-8s] [%(asctime)s]  %(message)s',
)
# Notebook-level logger used by the helper functions below.
logger = getLogger(__name__)


# Methods

In [10]:
def get_full_dataset(
    data_path: str,
    part_names: T.List[str],
    columns: T.List[str]
    ) -> pd.DataFrame:
    """Read the selected columns of several parquet partitions into one frame.

    Each partition is read on its own and all of them are concatenated in a
    single ``pd.concat`` call at the end. Concatenating inside the loop would
    re-copy every previously read row on each iteration, and starting from an
    empty object-dtype frame can distort the dtypes coming from parquet.

    Args:
        data_path: Directory that contains the partition files.
        part_names: File names of the partitions, read in the given order.
        columns: Columns to load from every partition.

    Returns:
        A DataFrame with the rows of all partitions in order and a fresh
        0..n-1 index. When ``part_names`` is empty, an empty DataFrame that
        has the requested ``columns``.
    """
    frames = []
    for part in tqdm(part_names):
        frames.append(
            pd.read_parquet(
                os.path.join(data_path, part),
                columns=columns,
                engine='pyarrow'
            )
        )
        collected_gc = gc.collect()
        logger.info(f"Partition read: {part}; gc.collect: {collected_gc}")

    if not frames:
        # Nothing to read: keep returning an empty frame with the expected columns.
        return pd.DataFrame(columns=columns)

    # ignore_index=True gives the same fresh RangeIndex as reset_index(drop=True).
    return pd.concat(frames, ignore_index=True)

In [11]:
def map_pop_vals(df: pd.DataFrame, mappers: T.Dict[str, T.Mapping[T.Any, T.Any]]) -> pd.DataFrame:
    """Replace column values through per-column lookup mappings.

    Args:
        df: Frame whose columns are remapped. It is modified in place.
        mappers: Column name -> mapping handed to ``Series.map`` for that
            column. Entries whose column is not present in ``df`` are skipped.

    Returns:
        The same ``df`` object that was passed in, after remapping.
    """
    for col, mapper in tqdm(mappers.items(), desc="Mapping pop vals"):
        # Tolerate mappers for columns that were not loaded into this frame.
        if col in df.columns:
            # NOTE(review): with a plain dict, values missing from the mapping
            # presumably become NaN after .map -- confirm this is intended.
            df[col] = df[col].map(mapper)
    return df

In [12]:
def save_pickle(a, filepath):
    """Serialize ``a`` to ``filepath`` using the highest pickle protocol.

    Args:
        a: Object to pickle.
        filepath: Destination file; it is opened in binary write mode, so an
            existing file is overwritten.
    """
    with open(filepath, 'wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(a)


def load_pickle(filepath):
    """Read and return the object pickled in ``filepath``.

    Unpickling can execute arbitrary code, so this should only be pointed at
    files produced by a trusted source.

    Args:
        filepath: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The unpickled object.
    """
    with open(filepath, 'rb') as source:
        return pickle.load(source)

# Get data

## Get tables

In [16]:
# The ten clickstream partitions: part-00000.parquet ... part-00009.parquet.
part_names = [f"part-{idx:05d}.parquet" for idx in range(10)]

# Columns loaded from every partition.
cols = [
    'event_type',
    'event_category',
    'event_name',
    'device_screen_name',
    'timezone',
    'net_connection_type',
    'net_connection_tech',
]

In [17]:
# Load the selected columns of every clickstream partition into one frame.
df = get_full_dataset(
    data_path=TRAIN_DATA_PATH,
    part_names=part_names,
    columns=cols
    )
# Last expression of the cell: show (rows, columns) of the loaded frame.
df.shape

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:16:16,887]  Partition read: part-00000.parquet; gc.collect: 111
<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:16:23,190]  Partition read: part-00001.parquet; gc.collect: 0
<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:16:36,223]  Partition read: part-00002.parquet; gc.collect: 0
<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:16:54,989]  Partition read: part-00003.parquet; gc.collect: 0
<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:17:09,070]  Partition read: part-00004.parquet; gc.collect: 0
<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:17:24,370]  Partition read: part-00005.parquet; gc.collect: 0
<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:17:41,955]  Partition read: part-00006.parquet; gc.collect: 0
<ipython-input-11-430b0fece559>[LINE:16] # [INFO    ] [2020-12-15 18:17:56,882]  




# Get pop val mappers

In [37]:
# Load the previously saved value mappers from SAVE_DATA_PATH. Both objects are
# later passed to get_label_encoders, which treats them as
# {column name: mapping}. These are pickles, so only load trusted files.
mappers = load_pickle(os.path.join(SAVE_DATA_PATH, "pop_vals_mappers_50k.pickle"))
combined_mappers = load_pickle(os.path.join(SAVE_DATA_PATH, "event_net_pop_vals_mappers_25k.pickle"))


# Map pop vals

In [None]:
# Remap the loaded columns through `mappers`; map_pop_vals modifies df in place
# and returns it, so re-running this cell maps already-mapped values again.
# NOTE(review): this cell has no execution count in the saved notebook.
df = map_pop_vals(df=df, mappers=mappers)


# Get label encoders

In [42]:
def get_label_encoders(
    mappers: T.Dict[str, T.Mapping[T.Any, T.Any]],
    null_label: str = 'null'
    ) -> dict:
    """Fit one ``LabelEncoder`` per mapped column.

    For every column, the encoder is fitted on the distinct target values of
    that column's mapping plus one extra sentinel label.

    Args:
        mappers: Column name -> mapping whose ``.values()`` are the labels the
            column can take after mapping.
        null_label: Extra label added to every encoder. Defaults to ``'null'``,
            which is the label the original implementation hard-coded.

    Returns:
        Dict of column name -> fitted ``LabelEncoder``.
    """
    label_encoders = {}
    for col, mapper in tqdm(mappers.items(), desc="fitting label encoders"):
        # Deduplicate mapped values and make sure the sentinel is always known.
        labels = np.unique(list(mapper.values()) + [null_label])
        le = LabelEncoder()
        le.fit(labels)
        label_encoders[col] = le
    return label_encoders

In [43]:
# One fitted encoder per column present in `mappers`.
label_encoders = get_label_encoders(mappers=mappers)

HBox(children=(FloatProgress(value=0.0, description='fitting label encoders', max=7.0, style=ProgressStyle(des…




In [45]:
# Same fitting procedure, applied to the mappers loaded into `combined_mappers`.
combined_label_encoders = get_label_encoders(mappers=combined_mappers)


HBox(children=(FloatProgress(value=0.0, description='fitting label encoders', max=2.0, style=ProgressStyle(des…




# Get label encoder for target

In [15]:
# Read the target table; the cell's output is its (rows, columns) shape.
df_target = pd.read_csv(os.path.join(TEST_DATA_PATH, "abattle_train_target.csv"))
df_target.shape

(5065350, 4)

In [23]:
# Fit the target encoder on every distinct target value plus the 'null'
# sentinel, and store it next to the per-column encoders.
labels = [*df_target['multi_class_target'].unique(), 'null']
le = LabelEncoder().fit(labels)
label_encoders['multi_class_target'] = le

# Save

In [25]:
# Persist both encoder dictionaries to SAVE_DATA_PATH. By this point
# `label_encoders` also holds the 'multi_class_target' encoder added above.
save_pickle(label_encoders, os.path.join(SAVE_DATA_PATH, "label_encoders_from_pop_vals_50k.pickle"))
save_pickle(combined_label_encoders, os.path.join(SAVE_DATA_PATH, "label_encoders_from_combi_pop_vals_mappers_25k.pickle"))
