# Imports

In [2]:
import os
import sys
import gc
import random
import logging
import pickle
import typing as T
from datetime import datetime
from dateutil.relativedelta import relativedelta
from logging import getLogger

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow

# Settings

In [9]:
# Project root, expressed relative to the current working directory.
PROJ_PATH = ".."
# Root folder under which all input and preprocessed data paths below are built.
DATA_PATH = os.path.join(PROJ_PATH, "data")

In [21]:
# Folder holding the raw clickstream parquet partitions; run_events_to_sessions reads from here.
TRAIN_DATA_PATH = os.path.join(DATA_PATH, "final/clickstream")
# Folder for the target data (not read anywhere in the code visible in this notebook).
TEST_DATA_PATH = os.path.join(DATA_PATH, "final/target")
# Root for derived artifacts: the session pickles are written to a sub-folder of this path.
PREPROCESSED_DATA_PATH = os.path.join(DATA_PATH, "preprocessed")
# Folder with the pickled popular-value mappers loaded by the run cell.
MAPPERS_PATH = os.path.join(PREPROCESSED_DATA_PATH, "mappers")

In [12]:
def seed_everything(seed=42):
    """Seed every random source this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    # NOTE(review): PYTHONHASHSEED set at runtime presumably only affects
    # interpreters started after this point, not the running kernel - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [13]:
# Single seed used for the whole notebook; applied once here so later cells are reproducible.
SEED = 42
seed_everything(seed=SEED)

In [14]:
# Notebook-wide logger used by the processing functions below.
logger = getLogger(name=__name__)
# Configure logging at INFO level; each record shows the source file name, line number,
# level name (left-aligned, padded to 8 chars), timestamp and the message.
logging.basicConfig(format = u'%(filename)s[LINE:%(lineno)d] # [%(levelname)-8s] [%(asctime)s]  %(message)s', level = logging.INFO)


# Helper methods

In [15]:
def save_pickle(a, filepath):
    """Serialize ``a`` to ``filepath`` using pickle's highest available protocol.

    Args:
        a: Any picklable Python object.
        filepath: Destination file; it is opened in binary write mode, so an
            existing file at that location is overwritten.
    """
    with open(filepath, 'wb') as sink:
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(a)
        
        
def load_pickle(filepath):
    """Read and return the object pickled in ``filepath``.

    Args:
        filepath: Path of a pickle file, opened in binary read mode.

    Returns:
        The deserialized Python object.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project or another trusted source.
    with open(filepath, 'rb') as source:
        return pickle.Unpickler(source).load()

In [16]:
def map_pop_vals(
    df: pd.DataFrame,
    mappers: T.Dict[str, T.Mapping[T.Any, T.Any]]
    ) -> pd.DataFrame:
    """Replace column values through per-column lookup tables.

    For every ``(column, mapper)`` pair whose column exists in ``df``, each
    value found in ``mapper`` is replaced by its mapped value and every other
    value (including missing ones) becomes the string ``'other'``.
    Columns named in ``mappers`` but absent from ``df`` are skipped.

    Args:
        df: Frame to transform. It is modified IN PLACE; pass a copy if the
            original values must be preserved.
        mappers: Column name -> lookup table (raw value -> replacement).

    Returns:
        The same ``df`` object, after mapping.
    """
    for col, mapper in tqdm(mappers.items(), desc="Mapping pop vals"):
        if col not in df.columns:
            continue
        # Single lookup with a default instead of a membership test + indexing.
        lookup = mapper.get
        df[col] = df[col].map(lambda x: lookup(x, 'other'))
    return df

# Code

## Methods

In [17]:
def join_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values and add the combined ``net`` column.

    Every missing value in the frame is replaced by the string ``'null'``;
    then ``net`` is built as ``"<net_connection_type> @ <net_connection_tech>"``.
    The number of distinct ``net`` values is logged.

    Args:
        df: Frame containing ``net_connection_type`` and ``net_connection_tech``.

    Returns:
        A new frame (the result of ``fillna``) with the extra ``net`` column.
    """
    filled = df.fillna('null')
    conn_type = filled['net_connection_type']
    conn_tech = filled['net_connection_tech']
    filled['net'] = conn_type + " @ " + conn_tech
    n_unique_net = filled['net'].nunique()
    logger.info(f"Before COMBINED pop val mapping: Unique col net = {n_unique_net}")
    return filled

In [18]:
def get_session_dataframe_from_partition(
    df: pd.DataFrame,
    pop_val_mappers: T.Dict[str, T.Mapping[T.Any, T.Any]],
    combined_pop_val_mappers: T.Dict[str, T.Mapping[T.Any, T.Any]]
    ) -> pd.DataFrame:
    """Collapse an event-level partition into one row per session.

    Steps:
      1. map raw column values through ``pop_val_mappers``;
      2. fill missing values and build the combined ``net`` column (``join_cols``);
      3. keep only the columns needed for sessions and map the combined
         columns through ``combined_pop_val_mappers``;
      4. group by ``session_id`` and aggregate.

    Args:
        df: Event-level frame. It must contain the columns ``timestamp``
            (datetime-like, since a time difference is taken), ``client``,
            ``session_id``, ``event_type``, ``event_category``,
            ``device_screen_name``, ``net_connection_type`` and
            ``net_connection_tech``. It is modified in place by the first
            mapping step.
        pop_val_mappers: Column name -> lookup table for the raw columns.
        combined_pop_val_mappers: Column name -> lookup table for combined
            columns such as ``net``.

    Returns:
        Frame with columns ``session_id``, ``timestamp`` (first event of the
        session), ``sess_length_seconds`` (whole seconds between first and
        last event), ``client``, ``net`` (both the per-session minimum) and
        ``event_type``, ``event_category``, ``device_screen_name`` (lists of
        the session's values in row order).
    """
    logger.info(f"Before GENERAL pop val  mapping: Unique event_type = {df['event_type'].nunique()}; event_category = {df['event_category'].nunique()}; device_screen_name = {df['device_screen_name'].nunique()}")

    # map popular values
    df = map_pop_vals(df=df, mappers=pop_val_mappers)
    logger.info(f"After GENERAL pop val  mapping: Unique event_type = {df['event_type'].nunique()}; event_category = {df['event_category'].nunique()}; device_screen_name = {df['device_screen_name'].nunique()}")

    df = join_cols(df=df)

    # Filter. The explicit copy makes the column subset an independent frame,
    # so the in-place assignments done by map_pop_vals below are not chained
    # assignments on a selection of another frame.
    cols = ['timestamp', 'client', 'session_id', 'event_type', 'event_category', 'device_screen_name', 'net']
    df = df[cols].copy()
    logger.info(f"df shape = {df.shape}")

    # map popular values
    df = map_pop_vals(df=df, mappers=combined_pop_val_mappers)
    logger.info(f"AFTER COMBINED pop val mapping: Unique col net = {df['net'].nunique()}")

    # Get session df. Built-in aggregations are requested by name so pandas
    # can use its optimized group-by kernels; the session span is derived
    # from min/max afterwards instead of a per-group Python lambda.
    df_sess = df.groupby("session_id").agg(
        timestamp=('timestamp', 'min'),
        timestamp_max=('timestamp', 'max'),
        client=('client', 'min'),
        net=('net', 'min'),
        event_type=('event_type', list),
        event_category=('event_category', list),
        device_screen_name=('device_screen_name', list),
    ).reset_index()

    # total_seconds() counts whole days too; Timedelta.seconds would only
    # return the sub-day part and wrap for sessions longer than 24 hours.
    session_span = df_sess.pop('timestamp_max') - df_sess['timestamp']
    df_sess.insert(
        2,
        'sess_length_seconds',
        session_span.dt.total_seconds().astype('int64')
    )
    return df_sess


In [19]:
def run_events_to_sessions(
    file_names: T.List[str],
    cols: T.List[str],
    pop_val_mappers_path: str,
    combined_pop_val_mappers_path: str,
    save_path: str,
    data_path: T.Optional[str] = None
    ) -> None:
    """Convert event-level parquet partitions into pickled session frames.

    For each partition: read the requested columns, keep only events with
    ``application_id == 'mobile'`` and ``event_type`` in ``{'se', 'sv'}``,
    build the session frame and pickle it as
    ``<save_path>/df_sess_<partition name without extension>.pickle``.

    Args:
        file_names: Parquet file names, relative to ``data_path``.
        cols: Columns to read from each parquet file.
        pop_val_mappers_path: Pickle with the per-column value mappers.
        combined_pop_val_mappers_path: Pickle with the mappers for combined
            columns.
        save_path: Output directory; created if it does not exist.
        data_path: Directory holding the parquet files. Defaults to the
            notebook-level ``TRAIN_DATA_PATH``.
    """
    logger.info("***** run_events_to_sessions *****")

    if data_path is None:
        data_path = TRAIN_DATA_PATH

    # The mappers do not depend on the partition, so load them once up front
    # rather than on every loop iteration.
    pop_val_mappers = load_pickle(pop_val_mappers_path)
    combined_pop_val_mappers = load_pickle(combined_pop_val_mappers_path)

    # Make sure the first save does not fail on a missing directory.
    os.makedirs(save_path, exist_ok=True)

    for part in tqdm(file_names):
        # get dataframe
        df = pd.read_parquet(
            os.path.join(data_path, part),
            columns=cols,
            engine='pyarrow'
            )
        logger.info(f"Events df shape = {df.shape}")

        # Filter
        df = df[
                (df['application_id'] == 'mobile')
                & (df['event_type'].isin(['se', 'sv']))
            ].reset_index(drop=True)
        logger.info(f"Events df shape after filter = {df.shape}")

        # Get session dataframe
        df_sess = get_session_dataframe_from_partition(
            df=df,
            pop_val_mappers=pop_val_mappers,
            combined_pop_val_mappers=combined_pop_val_mappers
            )
        logger.info(f"Session df shape = {df_sess.shape}")

        # Save. splitext strips only the final extension, so file names that
        # contain additional dots still map to distinct output files.
        part_stem = os.path.splitext(part)[0]
        save_pickle(df_sess, os.path.join(save_path, f"df_sess_{part_stem}.pickle"))

        # Release the large frames before the next partition is read.
        del df, df_sess
        collected = gc.collect()
        logger.info(f"Memory cleaned = {collected}")

## Run

In [35]:
# Parquet partitions to process; names are joined onto TRAIN_DATA_PATH inside
# run_events_to_sessions.
part_names = [
 'part-00000.parquet',
 'part-00001.parquet',
 'part-00002.parquet',
 'part-00003.parquet',
 'part-00004.parquet',
 'part-00005.parquet',
 'part-00006.parquet',
 'part-00007.parquet',
 'part-00008.parquet',
 'part-00009.parquet'
]
# Columns read from each partition. 'application_id' is used only for filtering;
# 'event_name' is read but is not among the columns kept for the session frame.
cols = ['timestamp', 'application_id', 'client', 'session_id', 'event_type',
       'event_category', 'event_name', 'device_screen_name',
        'net_connection_type', 'net_connection_tech']

# Pickled mappers applied to the raw columns.
pop_val_mappers_path = os.path.join(
    MAPPERS_PATH,
    "pop_vals_mappers_50k.pickle"
    )
# Pickled mappers applied to the combined columns (e.g. 'net').
combined_pop_val_mappers_path = os.path.join(
    MAPPERS_PATH,
    "event_net_pop_vals_mappers_25k.pickle"
    )
# Output folder for the per-partition session pickles.
save_path = os.path.join(
    PREPROCESSED_DATA_PATH,
    "ap-mobile_et-sv-se_cols-event-net"
    )

In [36]:
# Run the events -> sessions conversion for every partition configured above.
# NOTE(review): the recorded output below shows a progress bar with max=3.0,
# so it appears to come from a run with three partitions - re-run to refresh.
run_events_to_sessions(
    file_names=part_names,
    cols=cols,
    pop_val_mappers_path=pop_val_mappers_path,
    combined_pop_val_mappers_path=combined_pop_val_mappers_path,
    save_path=save_path
    ) 

<ipython-input-19-6ba7f77b3c6d>[LINE:8] # [INFO    ] [2021-01-28 07:58:23,859]  ***** run_events_to_sessions *****


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

<ipython-input-19-6ba7f77b3c6d>[LINE:18] # [INFO    ] [2021-01-28 07:58:36,475]  Events df shape = (12300571, 10)
<ipython-input-19-6ba7f77b3c6d>[LINE:25] # [INFO    ] [2021-01-28 07:58:39,909]  Events df shape after filter = (11850114, 10)
<ipython-input-18-1833ebf82e9b>[LINE:6] # [INFO    ] [2021-01-28 07:58:41,192]  Before GENERAL pop val  mapping: Unique event_type = 2; event_category = 136; device_screen_name = 536


HBox(children=(FloatProgress(value=0.0, description='Mapping pop vals', max=7.0, style=ProgressStyle(descripti…




<ipython-input-18-1833ebf82e9b>[LINE:10] # [INFO    ] [2021-01-28 07:58:56,037]  After GENERAL pop val  mapping: Unique event_type = 2; event_category = 47; device_screen_name = 128
<ipython-input-17-5e13e6b29a83>[LINE:7] # [INFO    ] [2021-01-28 07:59:04,344]  Before COMBINED pop val mapping: Unique col net = 14
<ipython-input-18-1833ebf82e9b>[LINE:17] # [INFO    ] [2021-01-28 07:59:07,415]  df shape = (11850114, 7)


HBox(children=(FloatProgress(value=0.0, description='Mapping pop vals', max=2.0, style=ProgressStyle(descripti…




<ipython-input-18-1833ebf82e9b>[LINE:21] # [INFO    ] [2021-01-28 07:59:10,670]  AFTER COMBINED pop val mapping: Unique col net = 8
<ipython-input-19-6ba7f77b3c6d>[LINE:37] # [INFO    ] [2021-01-28 08:07:55,118]  Session df shape = (983086, 8)
<ipython-input-19-6ba7f77b3c6d>[LINE:44] # [INFO    ] [2021-01-28 08:08:00,572]  Memory cleaned = 0
<ipython-input-19-6ba7f77b3c6d>[LINE:18] # [INFO    ] [2021-01-28 08:08:15,976]  Events df shape = (11681079, 10)
<ipython-input-19-6ba7f77b3c6d>[LINE:25] # [INFO    ] [2021-01-28 08:08:19,338]  Events df shape after filter = (11264314, 10)
<ipython-input-18-1833ebf82e9b>[LINE:6] # [INFO    ] [2021-01-28 08:08:20,585]  Before GENERAL pop val  mapping: Unique event_type = 2; event_category = 138; device_screen_name = 531


HBox(children=(FloatProgress(value=0.0, description='Mapping pop vals', max=7.0, style=ProgressStyle(descripti…




<ipython-input-18-1833ebf82e9b>[LINE:10] # [INFO    ] [2021-01-28 08:08:34,793]  After GENERAL pop val  mapping: Unique event_type = 2; event_category = 47; device_screen_name = 128
<ipython-input-17-5e13e6b29a83>[LINE:7] # [INFO    ] [2021-01-28 08:08:42,960]  Before COMBINED pop val mapping: Unique col net = 10
<ipython-input-18-1833ebf82e9b>[LINE:17] # [INFO    ] [2021-01-28 08:08:45,923]  df shape = (11264314, 7)


HBox(children=(FloatProgress(value=0.0, description='Mapping pop vals', max=2.0, style=ProgressStyle(descripti…

<ipython-input-18-1833ebf82e9b>[LINE:21] # [INFO    ] [2021-01-28 08:08:49,001]  AFTER COMBINED pop val mapping: Unique col net = 8





<ipython-input-19-6ba7f77b3c6d>[LINE:37] # [INFO    ] [2021-01-28 08:17:11,597]  Session df shape = (941769, 8)
<ipython-input-19-6ba7f77b3c6d>[LINE:44] # [INFO    ] [2021-01-28 08:17:16,881]  Memory cleaned = 0
<ipython-input-19-6ba7f77b3c6d>[LINE:18] # [INFO    ] [2021-01-28 08:17:34,194]  Events df shape = (11811781, 10)
<ipython-input-19-6ba7f77b3c6d>[LINE:25] # [INFO    ] [2021-01-28 08:17:37,968]  Events df shape after filter = (11357111, 10)
<ipython-input-18-1833ebf82e9b>[LINE:6] # [INFO    ] [2021-01-28 08:17:39,230]  Before GENERAL pop val  mapping: Unique event_type = 2; event_category = 135; device_screen_name = 523


HBox(children=(FloatProgress(value=0.0, description='Mapping pop vals', max=7.0, style=ProgressStyle(descripti…




<ipython-input-18-1833ebf82e9b>[LINE:10] # [INFO    ] [2021-01-28 08:17:53,555]  After GENERAL pop val  mapping: Unique event_type = 2; event_category = 47; device_screen_name = 128
<ipython-input-17-5e13e6b29a83>[LINE:7] # [INFO    ] [2021-01-28 08:18:02,497]  Before COMBINED pop val mapping: Unique col net = 11
<ipython-input-18-1833ebf82e9b>[LINE:17] # [INFO    ] [2021-01-28 08:18:06,933]  df shape = (11357111, 7)


HBox(children=(FloatProgress(value=0.0, description='Mapping pop vals', max=2.0, style=ProgressStyle(descripti…

<ipython-input-18-1833ebf82e9b>[LINE:21] # [INFO    ] [2021-01-28 08:18:10,078]  AFTER COMBINED pop val mapping: Unique col net = 8





<ipython-input-19-6ba7f77b3c6d>[LINE:37] # [INFO    ] [2021-01-28 08:26:44,231]  Session df shape = (954027, 8)
<ipython-input-19-6ba7f77b3c6d>[LINE:44] # [INFO    ] [2021-01-28 08:26:50,038]  Memory cleaned = 0



