In [1]:
from matplotlib import pyplot as plt
import seaborn as sns

import pandas as pd
import datetime
import swifter
import numpy as np

from tqdm.notebook import tqdm

In [2]:
# Directory containing the input data files. The trailing slash matters:
# file paths in this notebook are built by plain string concatenation.
path_to_data = '../data/'

In [3]:
# Subset of clickstream columns to load from the parquet file.
columns_to_read = [
    'timestamp',
    'client',
    'session_id',
]

In [4]:
# Load one clickstream parquet part, restricted to the columns listed in
# `columns_to_read`.
clickstream_part_path = path_to_data + 'alfabattle2_abattle_clickstream/part-00005.parquet'
sample_click_stream = pd.read_parquet(clickstream_part_path, columns=columns_to_read)

In [5]:
# For every (client, session_id) pair compute the earliest and latest event
# timestamp; the resulting columns are named 'min' and 'max'.
session_timestamps = sample_click_stream.groupby(['client', 'session_id'])['timestamp']
users_time_df = session_timestamps.agg(['min', 'max']).reset_index()

In [13]:
# One row per client: the session with the smallest start ('min') timestamp.
users_first_sessions = (
    users_time_df
    .sort_values(by=['client', 'min'])
    .drop_duplicates(subset=['client'], keep='first')
)

In [14]:
# One row per client: the session with the largest end ('max') timestamp.
users_last_sessions = (
    users_time_df
    .sort_values(by=['client', 'max'])
    .drop_duplicates(subset=['client'], keep='last')
)

In [15]:
# Show one row to check the columns of the first-session frame.
users_first_sessions.head(1)

Unnamed: 0,client,session_id,min,max
24,00096befff35b861d19c6de26db86fd6,591204b097729826354e04b0b8f16a1f,2020-01-25 09:59:05.834,2020-01-25 09:59:49.681


In [16]:
# Keep only the client id and the start time of its first session, under a
# descriptive column name.
users_first_sessions = (
    users_first_sessions
    .drop(columns=['session_id', 'max'])
    .rename(columns={'min': 'first_session_min_time'})
)

In [17]:
# Show one row to check the columns of the last-session frame.
users_last_sessions.head(1)

Unnamed: 0,client,session_id,min,max
44,00096befff35b861d19c6de26db86fd6,a77cf1f8cd36d4c7a66f8230d247c94b,2020-08-22 14:18:35.107,2020-08-22 14:20:21.299


In [19]:
# Keep the client id plus the start and end time of its last session, under
# descriptive column names.
users_last_sessions = (
    users_last_sessions
    .drop(columns=['session_id'])
    .rename(columns={'min': 'last_session_min_time', 'max': 'last_session_max_time'})
)

In [21]:
# Combine each client's first- and last-session times into one row per client.
# The join key is spelled out instead of being inferred from the shared
# column names, and the merge is validated as one-to-one: both inputs were
# de-duplicated on 'client', so a duplicate key here means the kernel state
# is stale and the cell should fail loudly rather than silently fan out rows.
users_time_df = users_first_sessions.merge(
    users_last_sessions,
    on='client',
    how='inner',
    validate='one_to_one',
)

In [22]:
# Load the training targets, parsing the 'timestamp' column as datetimes.
train_target_path = path_to_data + 'alfabattle2_abattle_train_target.csv'
train_df = pd.read_csv(train_target_path, parse_dates=['timestamp'])

In [23]:
# Show one row to check the columns of the training-target table.
train_df.head(1)

Unnamed: 0,session_id,client_pin,timestamp,multi_class_target
0,0000029e72e5fcde6a9f29c3a3ed198f,7cf9221322a0e2fdefb1b998b8f2ab29,2020-06-15 14:01:12,main_screen


In [24]:
# Earliest and latest labeled timestamp per client_pin; the resulting columns
# are named 'min' and 'max'.
labeled_timestamps = train_df.groupby(['client_pin'])['timestamp']
users_train_time_df = labeled_timestamps.agg(['min', 'max']).reset_index()

In [25]:
# (rows, columns) of the per-client session-time frame.
users_time_df.shape

(8031, 4)

In [28]:
# Give the aggregated min/max columns descriptive names.
labeled_time_column_names = {
    'min': 'first_labeled_session_time',
    'max': 'last_labeled_session_time',
}
users_train_time_df = users_train_time_df.rename(columns=labeled_time_column_names)

In [29]:
# Inner-join clickstream session times with labeled-session times; the client
# id is called 'client' on the left and 'client_pin' on the right, so both
# key columns are present in the output.
result = pd.merge(
    users_time_df,
    users_train_time_df,
    how='inner',
    left_on='client',
    right_on='client_pin',
)

In [30]:
# Show one row of the joined frame to check its columns.
result.head(1)

Unnamed: 0,client,first_session_min_time,last_session_min_time,last_session_max_time,client_pin,first_labeled_session_time,last_labeled_session_time
0,00096befff35b861d19c6de26db86fd6,2020-01-25 09:59:05.834,2020-08-22 14:18:35.107,2020-08-22 14:20:21.299,00096befff35b861d19c6de26db86fd6,2020-02-06 18:07:13,2020-08-17 08:00:05


In [31]:
# Load the prediction-session timestamps, parsing 'timestamp' as datetimes.
prediction_timestamp_path = path_to_data + 'alfabattle2_prediction_session_timestamp.csv'
test_df = pd.read_csv(prediction_timestamp_path, parse_dates=['timestamp'])

In [32]:
# Drop the 'client' key column: after the inner join on client == client_pin
# it duplicates 'client_pin'. The frame is rebound rather than mutated in
# place, and errors='ignore' keeps the cell idempotent -- re-running it after
# the column is already gone is a no-op instead of a KeyError.
result = result.drop(columns=['client'], errors='ignore')

In [33]:
# Check column dtypes and non-null counts after dropping 'client'.
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7919 entries, 0 to 7918
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   first_session_min_time      7919 non-null   datetime64[ns]
 1   last_session_min_time       7919 non-null   datetime64[ns]
 2   last_session_max_time       7919 non-null   datetime64[ns]
 3   client_pin                  7919 non-null   object        
 4   first_labeled_session_time  7919 non-null   datetime64[ns]
 5   last_labeled_session_time   7919 non-null   datetime64[ns]
dtypes: datetime64[ns](5), object(1)
memory usage: 433.1+ KB


In [34]:
# Inner-join the test table onto the per-client frame. No key is given, so
# pandas joins on the column names the two frames have in common.
result = pd.merge(result, test_df, how='inner')

In [35]:
# Check row count, dtypes and non-null counts after joining the test table.
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7919 entries, 0 to 7918
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   first_session_min_time      7919 non-null   datetime64[ns]
 1   last_session_min_time       7919 non-null   datetime64[ns]
 2   last_session_max_time       7919 non-null   datetime64[ns]
 3   client_pin                  7919 non-null   object        
 4   first_labeled_session_time  7919 non-null   datetime64[ns]
 5   last_labeled_session_time   7919 non-null   datetime64[ns]
 6   timestamp                   7919 non-null   datetime64[ns]
dtypes: datetime64[ns](6), object(1)
memory usage: 494.9+ KB


In [36]:
# Whole days between a client's last labeled session and its test timestamp.
# Uses the vectorized timedelta accessor instead of a per-row Python lambda.
result['test_train_lag'] = (result['timestamp'] - result['last_labeled_session_time']).dt.days

In [37]:
# Summary statistics of the test-vs-last-labeled-session lag (in days).
result['test_train_lag'].describe()

count    7919.000000
mean       14.141179
std        28.897967
min         0.000000
25%         0.000000
50%         3.000000
75%        14.000000
max       248.000000
Name: test_train_lag, dtype: float64

In [48]:
# Seconds between the start of a client's last clickstream session and its
# test timestamp. Uses the vectorized timedelta accessor instead of a per-row
# Python lambda.
result['test_log_lag'] = (result['timestamp'] - result['last_session_min_time']).dt.total_seconds()

In [49]:
# Summary statistics of the test-vs-last-logged-session lag (in seconds).
result['test_log_lag'].describe()

count    7.919000e+03
mean     3.838789e+05
std      9.273569e+05
min      9.078030e+02
25%      2.153861e+04
50%      9.241595e+04
75%      3.399425e+05
max      1.702419e+07
Name: test_log_lag, dtype: float64