In [1]:
import csv
import tqdm
import datetime
import argparse
import numpy as np
import pandas as pd
import os
from collections import defaultdict
from sklearn import preprocessing
import time
from datetime import datetime, timezone, timedelta
from datetime import datetime as dt

import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# including deprecation warnings raised by the pandas/NumPy calls in later cells.
warnings.filterwarnings('ignore')

# Seed for NumPy's global RNG; also passed as random_state when the raw data is sampled.
SEED = 666
np.random.seed(SEED)

In [2]:
# Directory (relative to the notebook's working directory) that holds the raw CSV input.
dataset_path = './data/raw'

In [3]:
# filtering thresholds read by filter_data()
MIN_SESSION_LENGTH = 2  # sessions with fewer events than this are dropped
MIN_ITEM_SUPPORT = 2  # items with fewer events than this are dropped

# min date config
MIN_DATE = '2016-05-07'  # '%Y-%m-%d'; only consulted when filter_data(mindate=True)

# slicing default config
NUM_SLICES = 5  # number of slices iterated by slice_data()
DAYS_OFFSET = 45  # start of slice 0, in days after the first event
DAYS_SHIFT = 18  # extra start offset, in days, added per slice
DAYS_TRAIN = 25  # length of each slice's training window, in days
DAYS_TEST = 7  # length of the test window, in days (also read by split_data())

In [4]:
# Read the semicolon-separated item-view log and show its (rows, columns) shape.
raw_views_path = os.path.join(dataset_path, 'train-item-views.csv')
data = pd.read_csv(raw_views_path, sep=';')
data.shape

(1235380, 5)

In [5]:
# Preview the first ten raw rows to check the column layout before preprocessing.
data.head(10)

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate
0,1,,81766,526309,2016-05-09
1,1,,31331,1031018,2016-05-09
2,1,,32118,243569,2016-05-09
3,1,,9654,75848,2016-05-09
4,1,,32627,1112408,2016-05-09
5,1,,33043,173912,2016-05-09
6,1,,12352,329870,2016-05-09
7,1,,35077,390072,2016-05-09
8,1,,36118,487369,2016-05-09
9,1,,129055,991416,2016-05-09


In [6]:
def preprocess(data, logging=False):
    """Turn the raw view log into timestamped events ordered by session and time.

    Columns 0, 2, 3 and 4 of ``data`` are selected by position and renamed to
    ``SessionId``, ``ItemId``, ``Time`` and ``Date``. ``Date`` is parsed as
    ``%Y-%m-%d``. ``Time`` is divided by 1000 (presumably a millisecond offset
    within the day - confirm against the data source) and added to the epoch
    seconds of ``Date`` taken as midnight UTC, giving an absolute timestamp in
    seconds.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; only the positional column layout matters. It is not modified.
    logging : bool, default False
        When true, print event/session/item counts and the covered date span.

    Returns
    -------
    pandas.DataFrame
        Columns ``SessionId``, ``ItemId``, ``Time`` (float epoch seconds),
        ``Date`` (datetime), ``Datestamp`` (epoch seconds of ``Date``) and
        ``TimeO`` (timezone-aware UTC datetime of ``Time``). Rows are ordered
        by ``SessionId`` and then ``Time`` and keep their original index labels.
    """
    # Work on an explicit copy so the column assignments below cannot touch the caller's frame.
    data = data.iloc[:, [0, 2, 3, 4]].copy()
    data.columns = ['SessionId', 'ItemId', 'Time', 'Date']

    # Missing offsets have to be filled *before* the integer cast: casting a column that
    # still contains NaN to int64 raises, so a fillna placed after the cast never runs.
    data['Time'] = data['Time'].fillna(0)
    data = data.astype({'SessionId': 'int32', 'ItemId': 'int64', 'Time': 'int64', 'Date': 'str'})

    # Parse the date once for the whole column and express it as seconds since the epoch.
    # The naive date is read as UTC, which matches the UTC conversion used for TimeO below.
    data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
    data['Datestamp'] = (data['Date'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
    data['Time'] = data['Time'] / 1000 + data['Datestamp']
    data['TimeO'] = data['Time'].apply(lambda x: datetime.fromtimestamp(x, timezone.utc))

    if logging:
        data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
        data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

        print('Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
              format(len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.date().isoformat(),
                     data_end.date().isoformat()))

    # One two-key sort yields sessions in ascending id with their events in time order,
    # without a Python call per session and without rebuilding the index afterwards.
    data = data.sort_values(['SessionId', 'Time'])

    return data

In [7]:
def filter_data(data, mindate=False, logging=False, min_item_support=None,
                min_session_length=None, min_date=None):
    """Drop short sessions, rare items and (optionally) sessions that ended too early.

    The filters run in this order: sessions with a single event, items with
    fewer than ``min_item_support`` events, sessions with fewer than
    ``min_session_length`` remaining events, and - only when ``mindate`` is
    true - sessions whose last event is not after midnight UTC of ``min_date``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``SessionId``, ``ItemId`` and ``Time`` columns; ``Time`` is
        compared against epoch seconds.
    mindate : bool, default False
        Enable the minimum-date filter.
    logging : bool, default False
        When true, print counts and the date span of the filtered data.
    min_item_support : int, optional
        Defaults to the module constant ``MIN_ITEM_SUPPORT``.
    min_session_length : int, optional
        Defaults to the module constant ``MIN_SESSION_LENGTH``.
    min_date : str, optional
        ``'%Y-%m-%d'`` date; defaults to the module constant ``MIN_DATE``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass every filter, in their original order.
    """
    if min_item_support is None:
        min_item_support = MIN_ITEM_SUPPORT
    if min_session_length is None:
        min_session_length = MIN_SESSION_LENGTH

    #filter sessions of length 1
    session_lengths = data.groupby('SessionId').size()
    data = data[np.isin(data.SessionId, session_lengths[session_lengths > 1].index)]

    #filter item support
    item_supports = data.groupby('ItemId').size()
    data = data[np.isin(data.ItemId, item_supports[item_supports >= min_item_support].index)]

    #filter session length (removing items above may have shortened sessions again)
    session_lengths = data.groupby('SessionId').size()
    data = data[np.isin(data.SessionId, session_lengths[session_lengths >= min_session_length].index)]

    #filter min date
    if mindate:
        if min_date is None:
            min_date = MIN_DATE
        # Pin the cutoff to UTC. A naive datetime's .timestamp() is interpreted in the
        # machine's local timezone, which would shift the cutoff by the local UTC offset,
        # while the logging below interprets the Time column as UTC.
        min_datetime = datetime.strptime(min_date + ' 00:00:00', '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
        session_max_times = data.groupby('SessionId').Time.max()
        session_keep = session_max_times[session_max_times > min_datetime.timestamp()].index
        data = data[np.isin(data.SessionId, session_keep)]

    if logging:
        data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
        data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

        print('Filtered data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
              format(len(data), data.SessionId.nunique(), data.ItemId.nunique(),
                     data_start.date().isoformat(), data_end.date().isoformat()))
    return data

In [8]:
# Quick smoke run on a 10,000-row random sample of the raw events.
# NOTE(review): rows are sampled individually rather than by whole session, so few sessions
# keep two or more events and only a small number of rows survive filter_data().
# Alternative input kept for reference: preprocess(data[data.sessionId.isin([1,2,5])])
processed_data = preprocess(data.sample(10000, random_state=SEED))
filtered_data = filter_data(processed_data)
filtered_data.shape

(34, 6)

In [10]:
def _hold_out_recent_sessions(frame, window_seconds):
    """Split ``frame`` into (earlier, held-out) sessions by each session's last event time."""
    cutoff = frame.Time.max() - window_seconds
    last_event = frame.groupby('SessionId').Time.max()
    earlier_ids = last_event[last_event < cutoff].index
    recent_ids = last_event[last_event >= cutoff].index

    earlier = frame[np.isin(frame.SessionId, earlier_ids)]
    held_out = frame[np.isin(frame.SessionId, recent_ids)]
    # Held-out events may only reference items that occur in the earlier part ...
    held_out = held_out[np.isin(held_out.ItemId, earlier.ItemId)]
    # ... and a held-out session needs at least two remaining events.
    sizes = held_out.groupby('SessionId').size()
    held_out = held_out[np.isin(held_out.SessionId, sizes[sizes >= 2].index)]
    return earlier, held_out


def split_data_org(data, logging=False):
    """Hold out the last 86400 seconds as test, then the last 86400 seconds of the rest as validation.

    A session belongs to the held-out side when its last ``Time`` lies within
    86400 seconds of the latest ``Time`` of the frame being split. Held-out
    sets are restricted to items present in the corresponding training part,
    and held-out sessions left with fewer than two events are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``SessionId``, ``ItemId`` and ``Time`` (seconds) columns.
    logging : bool, default False
        When true, print event/session/item counts for each of the four sets.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, train_tr, valid)`` where ``train_tr`` and ``valid``
        are the second-level split of ``train``.
    """
    day_seconds = 86400

    # np.isin replaces np.in1d (deprecated since NumPy 2.0) throughout the helper.
    train, test = _hold_out_recent_sessions(data, day_seconds)
    train_tr, valid = _hold_out_recent_sessions(train, day_seconds)

    if logging:
        print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
        print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))
        print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), train_tr.SessionId.nunique(), train_tr.ItemId.nunique()))
        print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), valid.SessionId.nunique(), valid.ItemId.nunique()))

    return train, test, train_tr, valid

In [11]:
# Run the last-day split on a re-indexed copy of the filtered sample and preview each part.
train, test, train_tr, valid = split_data_org(filtered_data.reset_index(drop=True))
display(train.head(), test.head(), train_tr.head(), valid.head())

Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
0,17405,33836,1457137000.0,2016-03-05,1457136000.0,2016-03-05 00:08:36.427000+00:00
1,17405,33836,1457137000.0,2016-03-05,1457136000.0,2016-03-05 00:16:11.349000+00:00
2,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:01:10.629000+00:00
3,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:06:47.939000+00:00
4,77611,376745,1463789000.0,2016-05-21,1463789000.0,2016-05-21 00:02:39.537000+00:00


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
0,17405,33836,1457137000.0,2016-03-05,1457136000.0,2016-03-05 00:08:36.427000+00:00
1,17405,33836,1457137000.0,2016-03-05,1457136000.0,2016-03-05 00:16:11.349000+00:00
2,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:01:10.629000+00:00
3,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:06:47.939000+00:00
4,77611,376745,1463789000.0,2016-05-21,1463789000.0,2016-05-21 00:02:39.537000+00:00


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


In [12]:
def split_data(data, logging=False, days_test=None):
    """Split sessions into train and test by holding out the final ``days_test`` days.

    A session goes to the test set when its last ``Time`` is at or after
    ``days_test`` days before the latest ``Time`` in ``data``. Test events are
    restricted to items that occur in the training set, and test sessions left
    with fewer than two events are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``SessionId``, ``ItemId`` and ``Time`` (epoch seconds) columns.
    logging : bool, default False
        When true, print event/session/item counts for both sets.
    days_test : int or float, optional
        Length of the test window in days; defaults to the module constant
        ``DAYS_TEST``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    if days_test is None:
        days_test = DAYS_TEST

    data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)
    test_from = (data_end - timedelta(days_test)).timestamp()

    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    session_max_times = data.groupby('SessionId').Time.max()
    session_train = session_max_times[session_max_times < test_from].index
    session_test = session_max_times[session_max_times >= test_from].index
    train = data[np.isin(data.SessionId, session_train)]
    test = data[np.isin(data.SessionId, session_test)]
    # Keep only test events whose item is known from training, then drop sessions
    # that are left with fewer than two events.
    test = test[np.isin(test.ItemId, train.ItemId)]
    tslength = test.groupby('SessionId').size()
    test = test[np.isin(test.SessionId, tslength[tslength >= 2].index)]

    if logging:
        print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
        print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))

    return train, test

In [13]:
# Split the filtered sample with the default test window and preview both parts.
train, test = split_data(filtered_data)
display(train.head(), test.head())

Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
43560,17405,33836,1457137000.0,2016-03-05,1457136000.0,2016-03-05 00:08:36.427000+00:00
43563,17405,33836,1457137000.0,2016-03-05,1457136000.0,2016-03-05 00:16:11.349000+00:00
150611,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:01:10.629000+00:00
150610,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:06:47.939000+00:00
231832,77611,376745,1463789000.0,2016-05-21,1463789000.0,2016-05-21 00:02:39.537000+00:00


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


In [15]:

def split_data_slice(data, slice_id, days_offset, days_train, days_test, logging=False):
    """Cut one train/test slice out of ``data`` by session end time.

    The slice starts ``days_offset`` days after the earliest ``Time`` in
    ``data``; training covers the next ``days_train`` days and testing the
    ``days_test`` days after that. A session is assigned by its last event:
    it must end inside the slice, and it goes to train when it ends before the
    train/test boundary, otherwise to test. Test events are restricted to
    items seen in train, and test sessions left with fewer than two events are
    removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``SessionId``, ``ItemId`` and ``Time`` (epoch seconds) columns.
    slice_id : int
        Only used to label the log output.
    days_offset, days_train, days_test : int or float
        Window positions and lengths in days, as described above.
    logging : bool, default False
        When true, print counts and spans for the full data, slice, train and test sets.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    data_start = datetime.fromtimestamp(data.Time.min(), timezone.utc)
    data_end = datetime.fromtimestamp(data.Time.max(), timezone.utc)

    # Slice boundaries; data_start is reused instead of converting Time.min() a second time.
    start = data_start + timedelta(days_offset)
    middle = start + timedelta(days_train)
    end = middle + timedelta(days_test)

    #prefilter the timespan: keep sessions whose last event lies in [start, end]
    session_max_times = data.groupby('SessionId').Time.max()
    greater_start = session_max_times[session_max_times >= start.timestamp()].index
    lower_end = session_max_times[session_max_times <= end.timestamp()].index
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    data_filtered = data[np.isin(data.SessionId, greater_start.intersection(lower_end))]

    #split to train and test
    session_max_times = data_filtered.groupby('SessionId').Time.max()
    sessions_train = session_max_times[session_max_times < middle.timestamp()].index
    sessions_test = session_max_times[session_max_times >= middle.timestamp()].index

    # Both id sets come from data_filtered, so selecting on it yields the same rows as
    # selecting on the full frame.
    train = data_filtered[np.isin(data_filtered.SessionId, sessions_train)]

    test = data_filtered[np.isin(data_filtered.SessionId, sessions_test)]
    test = test[np.isin(test.ItemId, train.ItemId)]

    tslength = test.groupby('SessionId').size()
    test = test[np.isin(test.SessionId, tslength[tslength >= 2].index)]

    if logging:
        print('Full data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
              format(slice_id, len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.isoformat(), data_end.isoformat()))
        print('Slice data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} / {}'.
              format(slice_id, len(data_filtered), data_filtered.SessionId.nunique(), data_filtered.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat(), end.date().isoformat()))
        print('Train set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
              format(slice_id, len(train), train.SessionId.nunique(), train.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat()))
        print('Test set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} \n\n'.
              format(slice_id, len(test), test.SessionId.nunique(), test.ItemId.nunique(), middle.date().isoformat(), end.date().isoformat()))

    return train, test

In [16]:
def slice_data(data):
    """Build every configured slice of ``data`` and display its train and test frames.

    Slice ``i`` starts ``DAYS_OFFSET + i * DAYS_SHIFT`` days into the data and
    uses ``DAYS_TRAIN`` / ``DAYS_TEST`` as window lengths; ``NUM_SLICES``
    slices are produced. Nothing is returned.
    """
    for slice_id in range(NUM_SLICES):
        offset_days = DAYS_OFFSET + (slice_id * DAYS_SHIFT)
        train, test = split_data_slice(
            data, slice_id, offset_days, DAYS_TRAIN, DAYS_TEST, logging=False
        )
        display(train)
        display(test)

In [17]:
# Show the train/test frames of every configured slice for the filtered sample.
slice_data(filtered_data)

Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
150611,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:01:10.629000+00:00
150610,53634,1151,1459901000.0,2016-04-06,1459901000.0,2016-04-06 00:06:47.939000+00:00
546704,184470,13780,1458779000.0,2016-03-24,1458778000.0,2016-03-24 00:15:54.069000+00:00
546702,184470,13780,1458779000.0,2016-03-24,1458778000.0,2016-03-24 00:17:27.279000+00:00
607616,208317,3719,1459555000.0,2016-04-02,1459555000.0,2016-04-02 00:00:20.351000+00:00
607618,208317,3719,1459556000.0,2016-04-02,1459555000.0,2016-04-02 00:19:26.195000+00:00
630088,217341,21115,1458778000.0,2016-03-24,1458778000.0,2016-03-24 00:00:08.074000+00:00
630089,217341,21115,1458778000.0,2016-03-24,1458778000.0,2016-03-24 00:05:54.635000+00:00
972245,387181,11114,1458778000.0,2016-03-24,1458778000.0,2016-03-24 00:13:36.531000+00:00
972242,387181,11114,1458778000.0,2016-03-24,1458778000.0,2016-03-24 00:13:57.669000+00:00


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
1006223,408766,201173,1460679000.0,2016-04-15,1460678000.0,2016-04-15 00:01:49.206000+00:00
1006221,408766,201173,1460679000.0,2016-04-15,1460678000.0,2016-04-15 00:03:36.402000+00:00


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
984151,394645,198976,1463184000.0,2016-05-14,1463184000.0,2016-05-14 00:00:15.999000+00:00
984152,394645,198976,1463184000.0,2016-05-14,1463184000.0,2016-05-14 00:07:47.945000+00:00


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
231832,77611,376745,1463789000.0,2016-05-21,1463789000.0,2016-05-21 00:02:39.537000+00:00
231833,77611,376745,1463789000.0,2016-05-21,1463789000.0,2016-05-21 00:04:58.066000+00:00
444496,146586,30700,1464307000.0,2016-05-27,1464307000.0,2016-05-27 00:01:41.216000+00:00
444497,146586,30700,1464308000.0,2016-05-27,1464307000.0,2016-05-27 00:07:39.395000+00:00
504488,168399,285230,1464567000.0,2016-05-30,1464566000.0,2016-05-30 00:08:37.399000+00:00
504490,168399,285230,1464567000.0,2016-05-30,1464566000.0,2016-05-30 00:11:20.201000+00:00
984151,394645,198976,1463184000.0,2016-05-14,1463184000.0,2016-05-14 00:00:15.999000+00:00
984152,394645,198976,1463184000.0,2016-05-14,1463184000.0,2016-05-14 00:07:47.945000+00:00


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO


Unnamed: 0,SessionId,ItemId,Time,Date,Datestamp,TimeO
