# Imports

In [1]:
import os
import sys
import gc
import random
import logging
import pickle
import typing as T
from datetime import datetime
from dateutil.relativedelta import relativedelta
from logging import getLogger

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.preprocessing.sequence import pad_sequences

tqdm.pandas()

# Settings

In [2]:
# Project layout: every location is derived from the project root so that the
# notebook can be relocated by editing PROJ_PATH only.
PROJ_PATH = ".."
DATA_PATH = os.path.join(PROJ_PATH, "data")

# Pre-processed artefacts: label-encoder pickles and the session partitions.
PREPROCESSED_DATA_PATH = os.path.join(DATA_PATH, "preprocessed")
MAPPERS_PATH = os.path.join(PREPROCESSED_DATA_PATH, "mappers")
SESSION_DATA_PATH = os.path.join(PREPROCESSED_DATA_PATH, "ap-mobile_et-sv-se_cols-event-net")

# Target CSVs (both the train target and the test files are read from here).
TEST_DATA_PATH = os.path.join(DATA_PATH, "final/target")

In [3]:
def seed_everything(seed=42):
    """Seed the random sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [4]:
# Single seed value for the notebook; applied once, before any data is touched.
SEED = 42
seed_everything(seed=SEED)

In [5]:
# Root logging configuration: INFO level, with file name, line number, level and
# timestamp in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format=u'%(filename)s[LINE:%(lineno)d] # [%(levelname)-8s] [%(asctime)s]  %(message)s',
)
# Logger used by every helper defined in this notebook.
logger = getLogger(name=__name__)


# Methods

In [6]:
def save_pickle(a, filepath):
    """Serialize ``a`` to ``filepath`` using the highest pickle protocol.

    Args:
        a: Any picklable object.
        filepath: Destination path; the file is opened in binary write mode,
            so an existing file is overwritten.
    """
    with open(filepath, 'wb') as sink:
        # Stream straight into the file so large frames are not duplicated in RAM.
        pickle.dump(a, sink, pickle.HIGHEST_PROTOCOL)
        
        
def load_pickle(filepath):
    """Read and return the object pickled at ``filepath``.

    Args:
        filepath: Path of a pickle file, opened in binary read mode.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling executes code embedded in the file; only load trusted files.
    with open(filepath, 'rb') as source:
        return pickle.load(source)

In [7]:
def get_full_dataset(
    data_path: str,
    part_names: T.List[str],
    columns: T.Optional[T.List[str]] = None
    ) -> pd.DataFrame:
    """Load pickled partitions and stack them into one frame.

    Partitions are read one at a time, optionally narrowed to ``columns``,
    collected in a list and concatenated once at the end. A single concat
    avoids re-copying the accumulated frame for every partition and avoids
    concatenating onto an empty placeholder frame, which can distort dtypes.

    Args:
        data_path: Directory that contains the partition pickles.
        part_names: File names of the partitions, in the order to stack them.
        columns: Columns to keep from each partition; a falsy value keeps all.

    Returns:
        The stacked frame with a fresh 0..n-1 index. With no partitions, an
        empty frame carrying ``columns`` is returned.
    """
    parts = []
    for part in tqdm(part_names):
        dft = load_pickle(os.path.join(data_path, part))
        if columns:
            # Drop unwanted columns immediately so they can be collected below.
            dft = dft[columns]
        parts.append(dft)
        del dft
        collected_gc = gc.collect()
        logger.info(f"Partition read: {part}; gc.collect: {collected_gc}")

    if not parts:
        return pd.DataFrame(columns=columns)

    df = pd.concat(parts, ignore_index=True)
    # Release the per-partition frames as soon as the stacked copy exists.
    del parts
    gc.collect()
    return df

In [8]:
def show_df_memory_usage_gb(df: pd.DataFrame) -> float:
    """Print and return the deep memory footprint of ``df`` in GiB.

    The value is returned as well as printed because callers interpolate the
    result into log messages; without a return value they would log ``None``.
    When called as the last expression of a cell, the returned value is also
    echoed by the notebook.

    Args:
        df: Frame to measure (index included, object columns measured deeply).

    Returns:
        Memory usage in units of 1024**3 bytes, unrounded.
    """
    memory_usage = float(df.memory_usage(index=True, deep=True).sum() / 1024**3)
    print(f'RAM memory usage: {round(memory_usage, 4)} Gb')
    return memory_usage

In [9]:
def label_encode_arrs(series: pd.Series, le: LabelEncoder) -> pd.Series:
    """Apply ``le.transform`` to every element of ``series``.

    Args:
        series: Series whose elements are each passed to ``le.transform``
            (presumably array-likes of raw labels — confirm against the data).
        le: Encoder exposing ``transform``.

    Returns:
        A new Series holding the transformed value of each element.
    """
    def _encode(arr):
        return le.transform(arr)

    encoded = series.apply(_encode)
    # Drops only this function's reference; the caller's object is unaffected.
    del series
    gc.collect()
    return encoded

In [10]:
def get_text_array(df, grp_col: str, txt_col: str, length: int):
    """Build, per row, the stringified history of ``txt_col`` within its group.

    For the i-th row of each group the window holds up to ``length`` values
    that precede it inside the group (the row itself is excluded), each
    converted with ``str``.

    Args:
        df: Source frame.
        grp_col: Column to group by.
        txt_col: Column whose previous values are collected.
        length: Maximum window size.

    Returns:
        A list of lists of strings, ordered group by group (groupby key
        order) and, inside a group, by row order. It lines up with ``df``
        row-for-row only when ``df`` is already ordered by ``grp_col``.
    """
    windows = []
    groups = df.groupby(grp_col)
    for _, frame in groups:
        column = frame[txt_col]
        for pos in range(len(frame)):
            start = max(pos - length, 0)
            windows.append([str(item) for item in column[start:pos].values])
    del groups
    gc.collect()
    return windows

# Get data

## Get tables

### History (train) data

In [13]:
def get_train_data(part_names: T.List[str]) -> pd.DataFrame:
    """Load the session partitions and log basic facts about the result.

    Args:
        part_names: Partition file names located under ``SESSION_DATA_PATH``.

    Returns:
        The stacked session frame with all columns.
    """
    sessions = get_full_dataset(SESSION_DATA_PATH, part_names, None)

    size_report = show_df_memory_usage_gb(df=sessions)
    logger.info(f"df_size = {size_report}")
    logger.info(f"read df shape = {sessions.shape}")

    stamps = sessions['timestamp']
    logger.info(f"read df timestamps: min={stamps.min()}; max={stamps.max()}")
    return sessions

In [14]:
# Ten session partitions, numbered 00000 through 00009.
part_names = [f'df_sess_part-{part_idx:05d}.pickle' for part_idx in range(10)]

In [69]:
# Read every partition listed in part_names into one session frame.
df = get_train_data(part_names=part_names)

(Timestamp('2020-01-01 00:00:00.077000'),
 Timestamp('2020-09-30 22:41:48.848000'))

### Train target

In [15]:
def get_target_data() -> pd.DataFrame:
    """Read the train-target CSV and log its shape.

    The file ``abattle_train_target.csv`` is read from ``TEST_DATA_PATH``.

    Returns:
        The target frame exactly as parsed by ``pd.read_csv``.
    """
    target_csv = os.path.join(TEST_DATA_PATH, "abattle_train_target.csv")
    df_target = pd.read_csv(target_csv)
    logger.info(f"target df shape = {df_target.shape}")
    return df_target

In [None]:
# Load the train-target table.
df_target = get_target_data()

### Join

In [18]:
def prepare_train_df(df: pd.DataFrame, df_target: pd.DataFrame) -> pd.DataFrame:
    """Attach targets to sessions and order rows by client and time.

    Args:
        df: Session frame with ``session_id``, ``client`` and ``timestamp``.
        df_target: Frame with ``session_id`` and ``multi_class_target``.

    Returns:
        A new frame: ``df`` left-joined with the target column on
        ``session_id`` (sessions without a target keep a missing value),
        sorted by ``client`` then ``timestamp``, with a fresh index.
    """
    target_cols = df_target[['session_id', 'multi_class_target']]
    merged = df.merge(target_cols, how='left', on='session_id').reset_index(drop=True)
    merged = merged.sort_values(by=['client', 'timestamp']).reset_index(drop=True)
    logger.info(f"df shape after merge = {merged.shape}")
    return merged

In [None]:
# Left-join the targets onto the sessions and sort by client, then timestamp.
df = prepare_train_df(df, df_target)

In [77]:
# The target table has been merged into df; drop the notebook's reference to it.
del df_target
# gc.collect()

0

## Get label encoders

In [17]:
def get_label_encoders() -> T.Tuple[T.Any, T.Any]:
    """Load the two pickled encoder collections from ``MAPPERS_PATH``.

    Returns:
        ``(label_encoders, combined_label_encoders)``, loaded from
        ``label_encoders_from_pop_vals_50k.pickle`` and
        ``label_encoders_from_combi_pop_vals_mappers_25k.pickle`` respectively.
    """
    mapper_files = (
        "label_encoders_from_pop_vals_50k.pickle",
        "label_encoders_from_combi_pop_vals_mappers_25k.pickle",
    )
    label_encoders, combined_label_encoders = (
        load_pickle(os.path.join(MAPPERS_PATH, file_name))
        for file_name in mapper_files
    )
    return label_encoders, combined_label_encoders

In [78]:
# Notebook-level encoders; filter_df_with_target reads label_encoders from here.
label_encoders, combined_label_encoders = get_label_encoders()

# Preprocessing

## Label encode

In [20]:
def df_label_encode(
    df: pd.DataFrame, label_encoders: T.Any, combined_label_encoders: T.Any
    ) -> pd.DataFrame:
    """Replace raw categorical values with their encoder codes, in place.

    Missing targets are first filled with the string ``'null'``. ``net`` is
    encoded with ``combined_label_encoders``; the target and the three
    per-session array columns are encoded with ``label_encoders``.

    Args:
        df: Frame to encode; its columns are overwritten.
        label_encoders: Mapping from column name to a fitted encoder.
        combined_label_encoders: Mapping holding the encoder for ``net``.

    Returns:
        The same ``df`` object, with encoded columns.
    """
    target_col = 'multi_class_target'
    df[target_col] = df[target_col].fillna('null')

    df['net'] = combined_label_encoders['net'].transform(df['net'])
    df[target_col] = label_encoders[target_col].transform(df[target_col])

    # Columns whose cells are encoded element by element via label_encode_arrs.
    for array_col in ('event_type', 'event_category', 'device_screen_name'):
        df[array_col] = label_encode_arrs(
            series=df[array_col], le=label_encoders[array_col]
        )
    logger.info(f"df shape after label encode = {df.shape}")
    return df

In [None]:
%%time
# Encode net, the target and the per-session array columns (mutates df in place).
df = df_label_encode(df, label_encoders, combined_label_encoders)

## Stage 1

### Time features

In [21]:
def create_time_diff_seq(df: pd.DataFrame) -> pd.DataFrame:
    """Add the scaled gap to the previous session and its rolling history.

    Adds two columns to ``df`` in place:

    * ``time_diff_hours`` — whole hours since the same client's previous row,
      clipped to [0, 153], ``log1p``-transformed, min-max scaled with the
      constants below and shifted by 0.1. Rows without a previous row get 0.
    * ``prev_time_diffs`` — list of up to 60 ``time_diff_hours`` values
      ending with (and including) the current row, within the client.

    A histogram of ``time_diff_hours`` is also drawn.

    Whole hours are computed with ``floor(total_seconds / 3600)`` because
    ``astype('timedelta64[h]')`` is rejected by pandas 2.x. Values are placed
    by row position taken from the groupby, so the result does not depend on
    ``df`` being pre-sorted by ``client`` (rows inside a client are still
    taken in their current order, so they must be time-ordered).

    Args:
        df: Frame with ``client`` and a datetime ``timestamp`` column.

    Returns:
        The same ``df`` object with the two new columns.
    """
    min_td = 0.0
    max_td = 5.0369526024136295  # equals log1p(153.0), the upper clip bound
    window = 60

    # Gap to the previous row of the same client; NaT for a client's first row.
    gap = df.groupby('client')['timestamp'].diff()
    hours = np.floor(gap.dt.total_seconds() / 3600.0)

    hours = np.log1p(hours.clip(0.0, 153.0))
    hours = (hours - min_td)/(max_td - min_td)
    hours = hours + 0.1
    # Real gaps are >= 0.1 after the shift, so 0 uniquely marks "no previous row".
    df['time_diff_hours'] = hours.fillna(0)

    scaled = df['time_diff_hours'].to_numpy()
    history = np.empty(len(df), dtype=object)
    # .indices maps each client to the row positions of its sessions.
    for positions in df.groupby('client').indices.values():
        client_values = scaled[positions]
        for rank, pos in enumerate(positions):
            start = max(rank - (window - 1), 0)
            history[pos] = list(client_values[start:rank + 1])
    df['prev_time_diffs'] = history

    df['time_diff_hours'].hist(bins=100)
    plt.title("time_diff_hours")
    plt.show()
    logger.info(f"df shape after time diff seq creation = {df.shape}")
    return df

In [22]:
def create_sess_length_seq(df: pd.DataFrame) -> pd.DataFrame:
    """Scale session lengths and collect each client's previous lengths.

    ``sess_length_seconds`` is overwritten in place: clipped to [2, 781],
    ``log1p``-transformed, min-max scaled with the constants below, shifted
    by 0.1, and missing values set to 0. ``prev_sess_length_seconds`` then
    holds, per row, up to 60 scaled values of the rows preceding it within
    the client (the row itself is excluded). A histogram is also drawn.

    Windows are produced client by client in groupby order and assigned
    positionally, so ``df`` is expected to be ordered by ``client`` already.

    Args:
        df: Frame with ``client`` and ``sess_length_seconds`` columns.

    Returns:
        The same ``df`` object with the rescaled and the new column.
    """
    min_td = 1.0986122886681096  # equals log1p(2.0), the lower clip bound
    max_td = 6.661854740545311   # equals log1p(781.0), the upper clip bound

    scaled = np.log1p(df['sess_length_seconds'].clip(2.0, 781.0))
    scaled = (scaled - min_td)/(max_td-min_td)
    scaled = scaled + 0.1
    df['sess_length_seconds'] = scaled.fillna(0)

    by_client = df.groupby('client')
    history = []
    for _, frame in by_client:
        for pos in range(len(frame)):
            start = max(pos-60, 0)
            history.append(list(frame.sess_length_seconds[start:pos].values))
    df['prev_sess_length_seconds'] = history

    df['sess_length_seconds'].hist(bins=100)
    plt.title("sess_length_seconds")
    plt.show()
    logger.info(f"df shape after sess length seq creation = {df.shape}")
    del by_client
    return df

In [23]:
def get_time_seq(df: pd.DataFrame) -> pd.DataFrame:
    """Add the time-gap features, then the session-length features.

    Args:
        df: Frame accepted by both feature builders.

    Returns:
        The frame returned by ``create_sess_length_seq``.
    """
    return create_sess_length_seq(create_time_diff_seq(df))

In [24]:
%%time
# Add the time-gap and session-length sequences (requires df from the cells above).
df = get_time_seq(df)

NameError: name 'df' is not defined

### Text

In [25]:
def get_text_seq(df: pd.DataFrame, length: T.Optional[int] = 60) -> pd.DataFrame:
    """Add per-client histories of the target and of ``net`` as string lists.

    Args:
        df: Frame with ``client``, ``multi_class_target`` and ``net``.
        length: Maximum number of previous values kept per row.

    Returns:
        The same ``df`` object with ``prev_target`` and ``prev_net`` added.
    """
    # (source column, new column, label used in the log message)
    plan = (
        ('multi_class_target', 'prev_target', 'target'),
        ('net', 'prev_net', 'net'),
    )
    for source_col, history_col, label in plan:
        df[history_col] = get_text_array(
            df=df, grp_col='client', txt_col=source_col, length=length
        )
        logger.info(f"df shape after {label} seq creation = {df.shape}")
    return df

In [None]:
%%time
# Add prev_target and prev_net with a window of 60 previous sessions.
df = get_text_seq(df, length=60)

## List features

In [26]:
def get_list_seq(df: pd.DataFrame, length: T.Optional[int] = 60) -> pd.DataFrame:
    """Add per-client histories of the three per-session array columns.

    For every row, each new column holds a list with up to ``length`` values
    of the source column taken from the rows preceding it within the client
    (the row itself is excluded). Lists are produced client by client in
    groupby order and assigned positionally, so ``df`` is expected to be
    ordered by ``client`` already.

    Args:
        df: Frame with ``client``, ``event_type``, ``event_category`` and
            ``device_screen_name``.
        length: Maximum number of previous sessions kept per row.

    Returns:
        The same ``df`` object with the three ``prev_*_seq`` columns added.
    """
    # (source column, new column, label used in the log message)
    plan = (
        ('event_type', 'prev_event_type_seq', 'event_type'),
        ('event_category', 'prev_event_category_seq', 'event_category'),
        ('device_screen_name', 'prev_event_device_screen_name_seq', 'screens'),
    )
    by_client = df.groupby('client')
    for source_col, history_col, label in plan:
        history = []
        for _, frame in by_client:
            for pos in range(len(frame)):
                start = max(pos-length, 0)
                history.append(frame[source_col][start:pos].tolist())
        df[history_col] = history
        logger.info(f"df shape after {label} seq creation = {df.shape}")

    del by_client
    return df

In [None]:
%%time
# Add the three prev_*_seq columns with a window of 60 previous sessions.
df = get_list_seq(df, length=60)

## Filter

In [27]:
def filter_df_with_target(
    df: pd.DataFrame, encoders: T.Optional[T.Any] = None
    ) -> pd.DataFrame:
    """Keep only the rows whose target is not the encoded ``'null'`` label.

    Args:
        df: Frame whose ``multi_class_target`` column is already encoded.
        encoders: Mapping that holds the fitted ``multi_class_target``
            encoder. When omitted, the notebook-level ``label_encoders``
            variable is used, which is what this function always did before
            the parameter existed.

    Returns:
        A new frame with the null-target rows removed and a fresh index.

    Raises:
        NameError: If ``encoders`` is omitted and ``label_encoders`` has not
            been defined at notebook level.
    """
    if encoders is None:
        # Backward-compatible fallback to the notebook global.
        encoders = label_encoders
    null_label = encoders['multi_class_target'].transform(['null'])[0]
    logger.info(f"null_label = {null_label}")

    dfs = df[(df['multi_class_target'] != null_label)].reset_index(drop=True)
    logger.info(f"df shape after FILTER = {dfs.shape}")
    logger.info(f"df_size after FILTER = {show_df_memory_usage_gb(df=dfs)}")
    # The unfiltered frame is released once the caller rebinds its own name.
    return dfs

In [155]:
%%time
# Rebinding df drops the last reference to the unfiltered frame.
df = filter_df_with_target(df)

In [157]:
# Run a collection pass now that the unfiltered frame is unreferenced.
gc.collect()


RAM memory usage: 19.936 Gb


# Stage 2

## Numbers (times) padding

In [None]:
def time_padding(df: pd.DataFrame, length: T.Optional[int] = 60) -> pd.DataFrame:
    """Left-pad the two numeric history columns with zeros to ``length``.

    Each list keeps at most its last ``length`` values and is prefixed with
    ``0.`` entries until it has ``length`` items. The padded lists are stored
    in ``*_token`` columns and the source columns are dropped in place.

    Args:
        df: Frame with ``prev_time_diffs`` and ``prev_sess_length_seconds``
            holding Python lists.
        length: Target length of every padded list.

    Returns:
        The same ``df`` object.
    """
    def _left_pad(arr):
        return [0.]*(length-len(arr))+arr[-length:]

    source_cols = ['prev_time_diffs', 'prev_sess_length_seconds']
    for source_col in source_cols:
        df[source_col + '_token'] = df[source_col].progress_apply(_left_pad)
    df.drop(source_cols, axis=1, inplace=True)

    logger.info(f"df shape after time padding = {df.shape}")
    return df

In [None]:
%%time
# Zero-pad the numeric histories to 60 entries; the unpadded columns are dropped.
df = time_padding(df, length=60)

## Texts padding

In [None]:
def _pad_int_sequences_pre(sequences, length):
    """Left-pad / left-truncate sequences into an int32 matrix.

    NumPy stand-in for Keras ``pad_sequences(sequences, maxlen=length,
    padding='pre')`` with its defaults (``truncating='pre'``, ``value=0``,
    ``dtype='int32'``): each row keeps the last ``length`` items, is filled
    with zeros on the left, and empty sequences become all-zero rows.
    ``length=None`` uses the longest sequence, as Keras does.
    """
    if length is None:
        length = max((len(seq) for seq in sequences), default=0)
    padded = np.zeros((len(sequences), length), dtype='int32')
    for row_idx, seq in enumerate(sequences):
        if not len(seq):
            continue
        # Numeric strings are converted by the int32 cast, as in Keras.
        tail = np.asarray(seq[-length:], dtype='int32')
        padded[row_idx, -len(tail):] = tail
    return padded


def text_padding(df: pd.DataFrame, length: T.Optional[int] = 60) -> pd.DataFrame:
    """Turn the string histories into fixed-length int32 token arrays.

    ``prev_target`` and ``prev_net`` (lists of numeric strings) are padded on
    the left with zeros to ``length`` and stored as one int32 array per row
    in ``prev_target_token`` / ``prev_net_token``; the source columns are
    dropped in place.

    The padding is implemented with NumPy because ``pad_sequences`` is not
    imported in this notebook (its import is commented out), so calling it
    raises ``NameError`` on a fresh kernel.

    Args:
        df: Frame with ``prev_target`` and ``prev_net`` columns.
        length: Target length of every token array.

    Returns:
        The same ``df`` object.
    """
    source_cols = ['prev_target', 'prev_net']
    for source_col in source_cols:
        df[source_col + '_token'] = list(
            _pad_int_sequences_pre(df[source_col], length)
        )
    df.drop(source_cols, axis=1, inplace=True)

    logger.info(f"df shape after text padding = {df.shape}")
    return df

In [161]:
%%time
# Convert prev_target / prev_net into 60-long token arrays; sources are dropped.
df = text_padding(df, length=60)

CPU times: user 3min 35s, sys: 3min 2s, total: 6min 37s
Wall time: 6min 35s


## Arrays padding

In [167]:
def get_encoded_row(
    row: list,
    maxsess: int,
    maxlen: int,
    padding: str
):
    """Encode one row's session history as a ``(maxsess, maxlen)`` int32 matrix.

    The first ``maxsess`` sessions of ``row`` are used. When fewer are
    present, all-zero rows are placed in front of them. Every event code is
    shifted by +1 so that 0 is left for padding. Each session keeps its last
    ``maxlen`` events and is zero-padded on the side given by ``padding``.

    This reproduces what Keras ``pad_sequences(..., maxlen=maxlen,
    padding=padding)`` returned with its defaults (``truncating='pre'``,
    ``dtype='int32'``), implemented with NumPy because ``pad_sequences`` is
    not imported in this notebook. Sessions may be lists or arrays.

    Args:
        row: Sequence of per-session integer code sequences.
        maxsess: Number of session rows in the output.
        maxlen: Number of event slots per session.
        padding: ``'pre'`` or ``'post'`` — side on which zeros are added.

    Returns:
        ``np.ndarray`` of shape ``(maxsess, maxlen)`` and dtype int32.

    Raises:
        ValueError: If ``padding`` is neither ``'pre'`` nor ``'post'``.
    """
    if padding not in ('pre', 'post'):
        raise ValueError(f'Padding type "{padding}" not understood')

    kept = row[:maxsess]
    encoded = np.zeros((maxsess, maxlen), dtype='int32')
    # Missing sessions stay as zero rows at the top of the matrix.
    first_slot = maxsess - len(kept)
    for slot, sess in enumerate(kept, start=first_slot):
        tail = (np.asarray(sess) + 1)[-maxlen:]
        if not len(tail):
            continue
        if padding == 'post':
            encoded[slot, :len(tail)] = tail
        else:
            encoded[slot, -len(tail):] = tail
    return encoded

In [168]:
def encode_arrays(series: pd.Series):
    """Encode every session history in ``series`` with ``get_encoded_row``.

    Uses fixed settings: 60 sessions, 10 events per session, post padding.

    Args:
        series: Series whose elements are per-row session histories.

    Returns:
        Series with one encoded value per input element.
    """
    def _encode(row):
        return get_encoded_row(row=row, maxsess=60, maxlen=10, padding='post')

    return series.progress_apply(_encode)


In [None]:
def list_padding(df: pd.DataFrame, length: T.Optional[int] = 60) -> pd.DataFrame:
    """Encode the three session-history columns into fixed-size matrices.

    Each ``prev_*_seq`` column is converted row by row with
    ``get_encoded_row`` (10 events per session, post padding) into a
    ``*_token`` column, and the source columns are dropped in place.

    ``length`` is now honoured as the number of sessions per matrix; it used
    to be accepted but ignored in favour of a hard-coded 60. With the default
    of 60 the output is unchanged.

    Args:
        df: Frame with ``prev_event_type_seq``, ``prev_event_category_seq``
            and ``prev_event_device_screen_name_seq``.
        length: Number of sessions kept per row; ``None`` means 60.

    Returns:
        The same ``df`` object.
    """
    maxsess = 60 if length is None else length
    source_cols = [
        'prev_event_type_seq',
        'prev_event_category_seq',
        'prev_event_device_screen_name_seq',
    ]

    def _encode(row):
        # Keep the most recent sessions when the history is longer than maxsess.
        return get_encoded_row(
            row=row[-maxsess:], maxsess=maxsess, maxlen=10, padding='post'
        )

    for source_col in source_cols:
        df[source_col + '_token'] = df[source_col].progress_apply(_encode)
    df.drop(source_cols, axis=1, inplace=True)
    return df

In [None]:
%%time
# Encode the three session histories into token matrices; sources are dropped.
df = list_padding(df, length=60)

# Collect after the source columns were dropped.
gc.collect()

## Combined train-data pipeline

In [28]:
def get_train_prep_pipeline(
    part_names: T.List[str], length: T.Optional[int] = 60
    ) -> T.Tuple[pd.DataFrame, T.Any, T.Any]:
    """Run the whole training-data preparation from partitions to tokens.

    Steps: load sessions and targets, join them, label-encode, build the
    time / text / list histories, drop rows without a target, then pad all
    histories to fixed size.

    ``length`` is forwarded to every step that accepts it; it used to be
    ignored in favour of a hard-coded 60. The window sizes used inside
    ``get_time_seq`` are fixed in that code and are not controlled here.

    Note: ``filter_df_with_target`` is called without encoders and therefore
    reads the notebook-level ``label_encoders`` variable, not the local one
    loaded here; that variable must exist before this function is called.

    Args:
        part_names: Session partition file names.
        length: History length for the text, list and padding steps;
            ``None`` means 60.

    Returns:
        ``(df, label_encoders, combined_label_encoders)``.
    """
    if length is None:
        length = 60

    df = get_train_data(part_names=part_names)
    df_target = get_target_data()

    df = prepare_train_df(df, df_target)
    # The targets now live in df; release the standalone table.
    del df_target
    label_encoders, combined_label_encoders = get_label_encoders()
    df = df_label_encode(df, label_encoders, combined_label_encoders)

    df = get_time_seq(df)
    df = get_text_seq(df, length=length)
    df = get_list_seq(df, length=length)
    df = filter_df_with_target(df)
    logger.info(f"gc.collect() = {gc.collect()}")

    df = time_padding(df, length=length)
    df = text_padding(df, length=length)
    df = list_padding(df, length=length)
    logger.info(f"gc.collect() = {gc.collect()}")
    return df, label_encoders, combined_label_encoders

In [None]:
# %%time
# df, label_encoders, combined_label_encoders = get_train_prep_pipeline(part_names, length=60)

# Create test data

## Get test data

In [None]:
def get_test_data() -> T.Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the two test CSVs from ``TEST_DATA_PATH`` and normalise columns.

    Returns:
        ``(df_test, df_test_target)`` where ``df_test`` has columns
        ``client`` and ``current_timestamp`` (parsed to datetime) and
        ``df_test_target`` has columns ``client`` and ``target``.
    """
    def _read(file_name):
        return pd.read_csv(os.path.join(TEST_DATA_PATH, file_name))

    df_test = _read('prediction_session_timestamp.csv')
    df_test_target = _read('abattle_test_target_both_private_and_public.csv')

    # Both files are renamed positionally, so each must have exactly two columns.
    df_test.columns = ['client', 'current_timestamp']
    df_test_target.columns = ['client', 'target']
    df_test['current_timestamp'] = pd.to_datetime(df_test['current_timestamp'])

    logger.info(f"Test datasets' shapes: {df_test.shape}, {df_test_target.shape}")
    return df_test, df_test_target

In [None]:
# Load the prediction timestamps and the test targets.
df_test, df_test_target = get_test_data()

## Join

In [2]:
def join_dataframes(
    df: pd.DataFrame, df_test: pd.DataFrame, df_test_target: pd.DataFrame
    ) -> pd.DataFrame:
    """Attach each test client's latest prepared session and its target.

    The last row per client (by ``timestamp``) is taken from ``df`` and
    inner-joined onto ``df_test`` and then onto ``df_test_target`` by
    ``client``, so clients missing from any of the three frames are dropped.

    Args:
        df: Fully prepared training frame (token columns included).
        df_test: Frame with ``client`` and ``current_timestamp``.
        df_test_target: Frame with ``client`` and ``target``.

    Returns:
        One row per matched client with the carried-over session columns.
    """
    carried_cols = [
        'client', 'timestamp', 'sess_length_seconds',
        'event_type', 'event_category', 'net',
        'device_screen_name', 'multi_class_target',
        'prev_time_diffs_token', 'prev_sess_length_seconds_token',
        'prev_target_token', 'prev_net_token',
        'prev_event_type_seq_token', 'prev_event_category_seq_token',
        'prev_event_device_screen_name_seq_token',
    ]

    ordered = df.sort_values(by=['client', 'timestamp']).reset_index(drop=True)
    df_last = ordered.groupby('client').tail(1).reset_index(drop=True)

    logger.info(f"df_last shape before join: {df_last.shape}")
    unseen_clients = set(df_test['client']).difference(df_last['client'])
    logger.info(f"test clients - train clients = {len(unseen_clients)}")

    with_history = df_test.merge(
        df_last[carried_cols], on='client', how='inner'
    ).reset_index(drop=True)
    df_last = with_history.merge(
        df_test_target, on='client', how='inner'
    ).reset_index(drop=True)

    logger.info(f"df_last shape after join: {df_last.shape}")

    return df_last

In [None]:
# One row per test client: its latest prepared session plus the test target.
df_last = join_dataframes(df, df_test, df_test_target)

## Preprocess test rows

In [None]:
def get_new_seq(row, col, seq_col):
    """Slide the session-token matrix forward by one session.

    The oldest session row of ``row[seq_col]`` is dropped and the session in
    ``row[col]`` is appended as the newest one.

    The appended session is encoded the same way ``get_encoded_row`` encodes
    the training history: codes are shifted by +1 so that 0 stays reserved
    for padding, the last events are kept when the session is longer than
    the matrix width (Keras ``pad_sequences`` truncates from the front by
    default), and zeros are added at the end. Previously the raw codes were
    appended unshifted and the first events were kept, which made the newest
    session inconsistent with the rest of the matrix.

    Args:
        row: Mapping-like record (a DataFrame row) that is modified in place.
        col: Key of the current session's event codes.
        seq_col: Key of the 2-D ``(sessions, events)`` token matrix.

    Returns:
        The same ``row`` with ``row[seq_col]`` replaced.
    """
    history = np.asarray(row[seq_col])
    maxlen = history.shape[-1]

    newest = np.zeros(maxlen, dtype=history.dtype)
    events = np.asarray(list(row[col]))[-maxlen:]
    if len(events):
        newest[:len(events)] = events + 1  # post padding, +1 shift as in training

    row[seq_col] = np.vstack([history[1:], newest])
    return row

def get_new_row(row):
    """Advance every history in ``row`` by one step using its latest session.

    For each history the oldest entry is dropped and the value of the latest
    session is appended: ``time_diff`` and ``sess_length_seconds`` for the
    numeric lists, ``multi_class_target`` and ``net`` for the token arrays,
    and the three event arrays (via ``get_new_seq``) for the token matrices.

    Args:
        row: DataFrame row holding both the ``*_token`` histories and the
            latest session's values; it is modified in place.

    Returns:
        The updated ``row``.
    """
    # Numeric histories stay plain Python lists.
    for history_col, value_col in (
        ('prev_time_diffs_token', 'time_diff'),
        ('prev_sess_length_seconds_token', 'sess_length_seconds'),
    ):
        row[history_col] = list(row[history_col][1:]) + [row[value_col]]

    # Scalar-code histories are stored as arrays.
    for history_col, value_col in (
        ('prev_target_token', 'multi_class_target'),
        ('prev_net_token', 'net'),
    ):
        row[history_col] = np.array(list(row[history_col][1:]) + [row[value_col]])

    # Per-session event arrays go through the matrix helper.
    for value_col, history_col in (
        ('event_type', 'prev_event_type_seq_token'),
        ('event_category', 'prev_event_category_seq_token'),
        ('device_screen_name', 'prev_event_device_screen_name_seq_token'),
    ):
        row = get_new_seq(row=row, col=value_col, seq_col=history_col)

    return row

In [None]:
def prepare_test_data(df_last: pd.DataFrame) -> pd.DataFrame:
    """Build the model inputs for the test clients.

    Adds ``time_diff`` to ``df_last`` in place — whole hours between
    ``current_timestamp`` and the client's last ``timestamp``, clipped to
    [0, 153], ``log1p``-transformed, min-max scaled with the constants below
    and shifted by 0.1 (missing values become 0) — draws its histogram, and
    then advances every history by one step with ``get_new_row``.

    Whole hours are computed with ``floor(total_seconds / 3600)`` because
    ``astype('timedelta64[h]')`` is rejected by pandas 2.x.

    Args:
        df_last: Output of ``join_dataframes``; both timestamp columns must
            be datetime-typed.

    Returns:
        Frame with ``client``, ``current_timestamp`` and the seven updated
        ``*_token`` columns.
    """
    # add time diff col
    min_td = 0.0
    max_td = 5.0369526024136295  # equals log1p(153.0), the upper clip bound
    elapsed = df_last['current_timestamp'] - df_last['timestamp']
    df_last['time_diff'] = np.floor(elapsed.dt.total_seconds() / 3600.0)
    df_last['time_diff'] = np.log1p(df_last['time_diff'].clip(0.0, 153.0))
    df_last['time_diff'] = (df_last['time_diff'] - min_td)/(max_td - min_td)
    df_last['time_diff'] = df_last['time_diff'] + 0.1
    df_last['time_diff'] = df_last['time_diff'].fillna(0)

    df_last['time_diff'].hist(bins=100)
    plt.title("test time diff")
    plt.show()

    # get new sequence columns
    output_cols = [
        'client', 'current_timestamp',
        'prev_time_diffs_token', 'prev_sess_length_seconds_token',
        'prev_target_token', 'prev_net_token',
        'prev_event_type_seq_token', 'prev_event_category_seq_token',
        'prev_event_device_screen_name_seq_token',
    ]
    df_prep = df_last.apply(get_new_row, axis=1)[output_cols]
    logger.info(f"df_prep shape after preparing: {df_prep.shape}")
    return df_prep


In [None]:
%%time
# Advance every history by one session; also adds time_diff to df_last in place.
df_test_prep = prepare_test_data(df_last)

## Combine in pipeline

In [None]:
def get_test_prep_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Run the whole test-data preparation on a prepared training frame.

    Args:
        df: Fully prepared training frame, as expected by ``join_dataframes``.

    Returns:
        The frame produced by ``prepare_test_data``.
    """
    test_frames = get_test_data()
    latest_sessions = join_dataframes(df, *test_frames)
    return prepare_test_data(latest_sessions)

In [None]:
# %%time
# df_test_prep = get_test_prep_pipeline(df)

# Save

In [242]:
# Output locations for the prepared frames. save_pickle does not create
# directories, so these must already exist before the save cells run.
SAVE_DATA_PATH = os.path.join(PREPROCESSED_DATA_PATH, "sequence")
SAVE_TEST_DATA_PATH = os.path.join(SAVE_DATA_PATH, "test")
SAVE_TRAIN_DATA_PATH = os.path.join(SAVE_DATA_PATH, "train")

## Save test

In [243]:
# Persist the prepared test frame.
save_pickle(df_test_prep, os.path.join(SAVE_TEST_DATA_PATH, 'df_test.pickle'))

## Save train

In [247]:
# Columns written to the train pickles: identifiers, the target, and the
# seven fixed-size history columns.
cols = [
    'session_id',
    'timestamp',
    'client',
    'multi_class_target',
    'prev_time_diffs_token',
    'prev_sess_length_seconds_token',
    'prev_target_token',
    'prev_net_token',
    'prev_event_type_seq_token',
    'prev_event_category_seq_token',
    'prev_event_device_screen_name_seq_token',
]


In [250]:
# Time-based split into three consecutive periods. Each name refers to the
# period's exclusive upper bound: df_april holds rows before 2020-04-01,
# df_july rows from 2020-04-01 up to (not including) 2020-07-01, and df_sep
# everything from 2020-07-01 onwards.
df_april = df.loc[df['timestamp'] < '2020-04-01', cols].reset_index(drop=True)
df_july = df.loc[
    (df['timestamp'] < '2020-07-01') & (df['timestamp'] >= '2020-04-01'), cols
].reset_index(drop=True)
df_sep = df.loc[df['timestamp'] >= '2020-07-01', cols].reset_index(drop=True)


In [251]:
# Row and column counts of the three period frames.
df_april.shape, df_july.shape, df_sep.shape

((1751913, 11), (1868024, 11), (1445413, 11))

In [252]:
# Memory footprint of the middle period frame.
show_df_memory_usage_gb(df=df_july)



RAM memory usage: 15.6889 Gb


In [254]:
# Collect before the long-running pickle writes below.
gc.collect()

27675

In [256]:
%%time
# Persist the rows from 2020-07-01 onwards.
save_pickle(df_sep, os.path.join(SAVE_TRAIN_DATA_PATH, 'preprocessed_df_sep.pickle'))

CPU times: user 12min 40s, sys: 4min 32s, total: 17min 12s
Wall time: 17min 10s


In [257]:
%%time
# Persist the rows from 2020-04-01 up to (not including) 2020-07-01.
save_pickle(df_july, os.path.join(SAVE_TRAIN_DATA_PATH, 'preprocessed_df_july.pickle'))

CPU times: user 16min 35s, sys: 6min 11s, total: 22min 47s
Wall time: 22min 44s


In [258]:
%%time
# Persist the rows before 2020-04-01.
save_pickle(df_april, os.path.join(SAVE_TRAIN_DATA_PATH, 'preprocessed_df_april.pickle'))

CPU times: user 10min 27s, sys: 3min 15s, total: 13min 43s
Wall time: 13min 38s
