In [144]:
import numpy as np
import pandas as pd
import time
from datetime import datetime, timezone, timedelta
from datetime import datetime as dt

In [10]:
data_path = './data'

## Preprocessing

In [139]:
SESSION_LENGTH = 30 * 60 #30 minutes
MIN_SESSION_LENGTH = 2
MIN_ITEM_SUPPORT = 2
MIN_DATE = '2014-04-01'
NUM_SLICES = 2
DAYS_OFFSET = 0
DAYS_SHIFT = 2
DAYS_TRAIN = 9
DAYS_TEST = 1

In [67]:
data = pd.read_csv(os.path.join(data_path,'events.csv'))
data.shape

(2756101, 5)

In [68]:
data.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [126]:
visitcount = data.visitorid.value_counts()
random_ids = visitcount[visitcount>5].sample(3).index

In [69]:
def load_data(data, logging=False):
    
    data = data.iloc[:,:4]
    data.columns = ['Time','UserId','Type','ItemId']
    data = data.astype({'Time':'int64', 'UserId':'int32', 'Type':'str', 'ItemId':'int32'})
    data['Time'] = (data.Time/1000).astype(int)
    data.sort_values(['UserId','Time'], ascending=True, inplace=True)
    
    data['TimeTmp'] = pd.to_datetime(data.Time, unit='s')
    data.sort_values(['UserId','TimeTmp'], ascending=True, inplace=True)
    data['TimeShift'] = data['TimeTmp'].shift(1)
    data['TimeDiff'] = (data['TimeTmp'] - data['TimeShift']).dt.total_seconds().abs()
    data['SessionIdTmp'] = (data['TimeDiff'] > SESSION_LENGTH).astype(int)
    data['SessionId'] = data['SessionIdTmp'].cumsum(skipna=False)
    del data['SessionIdTmp'], data['TimeShift'], data['TimeDiff']
    
    data.sort_values(['SessionId','Time'], ascending=True, inplace=True)
    
    cart = data[data.Type == 'addtocart']
    data = data[data.Type == 'view']
    del data['Type']
    
    data_start = dt.fromtimestamp(data.Time.min(), timezone.utc)
    data_end = dt.fromtimestamp(data.Time.max(), timezone.utc)
    del data['TimeTmp']
    
    if logging:
        print('Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
              format(len(data), data.SessionId.nunique(), 
                     data.ItemId.nunique(), data_start.date().isoformat(), 
                     data_end.date().isoformat()))
    
    return data, cart

In [78]:
def filter_data(data, mindate=False, logging=False):
    
    #filter sessions of length 1
    session_lengths = data.groupby('SessionId').size()
    data = data[np.in1d(data.SessionId, session_lengths[session_lengths>1].index)]
    
    #filter item support
    item_supports = data.groupby('ItemId').size()
    data = data[np.in1d(data.ItemId, item_supports[ item_supports>= MIN_ITEM_SUPPORT].index)]
    
    #filter session length
    session_lengths = data.groupby('SessionId').size()
    data = data[np.in1d(data.SessionId, session_lengths[session_lengths>= MIN_SESSION_LENGTH].index)]
    
    #filter min date
    if mindate:
        min_datetime = datetime.strptime(MIN_DATE + ' 00:00:00', '%Y-%m-%d %H:%M:%S')
        session_max_times = data.groupby('SessionId').Time.max()
        session_keep = session_max_times[session_max_times > min_datetime.timestamp()].index
        data = data[np.in1d(data.SessionId, session_keep)]
    
    if logging:
        data_start = dt.fromtimestamp( data.Time.min(), timezone.utc)
        data_end = dt.fromtimestamp( data.Time.max(), timezone.utc)

        print('Filtered data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
              format( len(data), data.SessionId.nunique(), data.ItemId.nunique(),
                     data_start.date().isoformat(), data_end.date().isoformat()))
    return data

In [100]:
def split_data_org(data, logging=False):
    tmax = data.Time.max()
    session_max_times = data.groupby('SessionId').Time.max()
    session_train = session_max_times[session_max_times < tmax-86400].index
    session_test = session_max_times[session_max_times >= tmax-86400].index
    train = data[np.in1d(data.SessionId, session_train)]
    test = data[np.in1d(data.SessionId, session_test)]
    test = test[np.in1d(test.ItemId, train.ItemId)]
    tslength = test.groupby('SessionId').size()
    test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]
    
    tmax = train.Time.max()
    session_max_times = train.groupby('SessionId').Time.max()
    session_train = session_max_times[session_max_times < tmax-86400].index
    session_valid = session_max_times[session_max_times >= tmax-86400].index
    train_tr = train[np.in1d(train.SessionId, session_train)]
    valid = train[np.in1d(train.SessionId, session_valid)]
    valid = valid[np.in1d(valid.ItemId, train_tr.ItemId)]
    tslength = valid.groupby('SessionId').size()
    valid = valid[np.in1d(valid.SessionId, tslength[tslength>=2].index)]
    
    if logging:
        print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
        print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))
        print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), train_tr.SessionId.nunique(), train_tr.ItemId.nunique()))
        print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), valid.SessionId.nunique(), valid.ItemId.nunique()))
    
    return train, test, train_tr, valid

In [129]:
out, _ = load_data(data[data.visitorid.isin([148130,1230504,339335])])
out = filter_data(out)
out.reset_index(drop=True)

Unnamed: 0,Time,UserId,ItemId,SessionId
0,1434396644,148130,228638,1
1,1434396651,148130,228638,1
2,1437944807,148130,228638,2
3,1437945269,148130,228638,2
4,1433220727,339335,82389,3
5,1433221204,339335,82389,3
6,1437592323,1230504,339606,5
7,1437593175,1230504,339606,5


In [133]:
train, test, train_tr, valid = split_data_org(out.reset_index(drop=True))
display(train); display(test); display(train_tr); display(valid);

Unnamed: 0,Time,UserId,ItemId,SessionId
0,1434396644,148130,228638,1
1,1434396651,148130,228638,1
4,1433220727,339335,82389,3
5,1433221204,339335,82389,3
6,1437592323,1230504,339606,5
7,1437593175,1230504,339606,5


Unnamed: 0,Time,UserId,ItemId,SessionId
2,1437944807,148130,228638,2
3,1437945269,148130,228638,2


Unnamed: 0,Time,UserId,ItemId,SessionId
0,1434396644,148130,228638,1
1,1434396651,148130,228638,1
4,1433220727,339335,82389,3
5,1433221204,339335,82389,3


Unnamed: 0,Time,UserId,ItemId,SessionId


In [135]:
def split_data(data, logging=False) :
    
    data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )
    test_from = data_end - timedelta(DAYS_TEST)
    
    session_max_times = data.groupby('SessionId').Time.max()
    session_train = session_max_times[ session_max_times < test_from.timestamp() ].index
    session_test = session_max_times[ session_max_times >= test_from.timestamp() ].index
    train = data[np.in1d(data.SessionId, session_train)]
    test = data[np.in1d(data.SessionId, session_test)]
    test = test[np.in1d(test.ItemId, train.ItemId)]
    tslength = test.groupby('SessionId').size()
    test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]
    
    if logging:
        print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
        print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))
    
    return train, test

In [136]:
train, test = split_data(out)
display(train); display(test);

Unnamed: 0,Time,UserId,ItemId,SessionId
283282,1434396644,148130,228638,1
280773,1434396651,148130,228638,1
20511,1433220727,339335,82389,3
9,1433221204,339335,82389,3
2528602,1437592323,1230504,339606,5
2523559,1437593175,1230504,339606,5


Unnamed: 0,Time,UserId,ItemId,SessionId
2620448,1437944807,148130,228638,2
2625411,1437945269,148130,228638,2


In [137]:
def split_data_slice(data, slice_id, days_offset, days_train, days_test, logging=False):
    
    data_start = dt.fromtimestamp( data.Time.min(), timezone.utc )
    data_end = dt.fromtimestamp( data.Time.max(), timezone.utc )
    
    start = datetime.fromtimestamp( data.Time.min(), timezone.utc ) + timedelta( days_offset ) 
    middle =  start + timedelta( days_train )
    end =  middle + timedelta( days_test )
    
    #prefilter the timespan
    session_max_times = data.groupby('SessionId').Time.max()
    greater_start = session_max_times[session_max_times >= start.timestamp()].index
    lower_end = session_max_times[session_max_times <= end.timestamp()].index
    data_filtered = data[np.in1d(data.SessionId, greater_start.intersection( lower_end ))]
    
    #split to train and test
    session_max_times = data_filtered.groupby('SessionId').Time.max()
    sessions_train = session_max_times[session_max_times < middle.timestamp()].index
    sessions_test = session_max_times[session_max_times >= middle.timestamp()].index
    
    train = data[np.in1d(data.SessionId, sessions_train)]
    
    test = data[np.in1d(data.SessionId, sessions_test)]
    test = test[np.in1d(test.ItemId, train.ItemId)]
    
    tslength = test.groupby('SessionId').size()
    test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]
    
    if logging:
        print('Full data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
              format( slice_id, len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.isoformat(), data_end.isoformat() ) )
        print('Slice data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} / {}'.
              format( slice_id, len(data_filtered), data_filtered.SessionId.nunique(), data_filtered.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat(), end.date().isoformat() ) )
        print('Train set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
              format( slice_id, len(train), train.SessionId.nunique(), train.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat() ) )
        print('Test set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} \n\n'.
              format( slice_id, len(test), test.SessionId.nunique(), test.ItemId.nunique(), middle.date().isoformat(), end.date().isoformat() ) )

    return train, test

In [141]:
def slice_data(data):
    for slice_id in range(0, NUM_SLICES):
        train, test = split_data_slice(data, slice_id, DAYS_OFFSET+(slice_id*DAYS_SHIFT), DAYS_TRAIN, DAYS_TEST, logging=False)
        display(train); display(test);

In [142]:
slice_data(out)

Unnamed: 0,Time,UserId,ItemId,SessionId
20511,1433220727,339335,82389,3
9,1433221204,339335,82389,3


Unnamed: 0,Time,UserId,ItemId,SessionId


Unnamed: 0,Time,UserId,ItemId,SessionId


Unnamed: 0,Time,UserId,ItemId,SessionId


## Stats

In [145]:
def get_stats( dataframe, name='test' ):
    print( 'get_stats ',name )
    
    res = {}
    
    res['STATS'] = ['STATS']
    res['name'] = [name]
    res['actions'] = [len(dataframe)]
    res['items'] = [ dataframe.ItemId.nunique() ]
    res['sessions'] = [ dataframe.SessionId.nunique() ]
    res['time_start'] = [ dataframe.Time.min() ]
    res['time_end'] = [ dataframe.Time.max() ]
    
    res['unique_per_session'] = dataframe.groupby('SessionId')['ItemId'].nunique().mean()
    
    res = pd.DataFrame(res)

    res['actions_per_session'] = res['actions'] / res['sessions']
    res['actions_per_items'] = res['actions'] / res['items']
    #res['sessions_per_action'] = res['sessions'] / res['actions']
    res['sessions_per_items'] = res['sessions'] / res['items']
    #res['items_per_actions'] = res['items'] / res['actions']
    res['items_per_session'] = res['items'] / res['sessions']
    res['span'] = res['time_end'] - res['time_start']
    res['days'] = res['span'] / 1000 / 60 / 60 / 24
    
    return res

In [149]:
out.head()

Unnamed: 0,Time,UserId,ItemId,SessionId
283282,1434396644,148130,228638,1
280773,1434396651,148130,228638,1
2620448,1437944807,148130,228638,2
2625411,1437945269,148130,228638,2
20511,1433220727,339335,82389,3


In [148]:
get_stats(out, 'full').T

get_stats  full


Unnamed: 0,0
STATS,STATS
name,full
actions,8
items,3
sessions,4
time_start,1433220727
time_end,1437945269
unique_per_session,1
actions_per_session,2
actions_per_items,2.66667


In [152]:
def sequential_indicators( train, name='test' ):
    
    print( 'sequential_indicators ',name )
    
    train['ItemIdNext'] = train['ItemId'].shift(-1).where(train['SessionId'].shift(-1) == train['SessionId'], np.nan)
    #train['ItemIdNext2'] = train['ItemId'].shift(-2).where(train['SessionId'].shift(-2) == train['SessionId'], np.nan)

    sequences = pd.DataFrame()
    sequences['count'] = train.dropna(axis=0, how='any').groupby( ['ItemId','ItemIdNext'] ).size()
    #sequences = sequences[ sequences['count'] > 1 ]
    #sequences = sequences[sequences['count'] > 1]

    sequences['bin20'] = pd.cut(sequences['count'], 20, labels=range(1,21))
    
#     sequences2 = pd.DataFrame()
#     sequences2['count2'] = train.dropna(axis=0, how='any').groupby( ['ItemId','ItemIdNext','ItemIdNext2'] ).size()
#     sequences2 = sequences2[ sequences2['count2'] > 1 ]
    
    sums = pd.DataFrame()
    sums['size'] = sequences.groupby( ['count'] ).size()
    sums = sums.reset_index()
    sums['size'] = sums['size'] / sums['size'].sum()
    sums = sums[ sums['count'] > 1 ]
    sums['bin20'] = pd.cut(sums['count'], 20, labels=range(1,21))
    sums['prod'] = sums['count'] * sums['size']
    sums['prodsq'] = (sums['count']**2) * sums['size']
    sums['prodbin'] = sums['bin20'].astype(np.int8) * sums['size'].astype(np.int8)
    
    sumf = sums['prod'].sum()
    sumfsq = sums['prodsq'].sum()
    sumfbin = sums['prodbin'].sum()
    
    sums = sums[ sums['count'] > 2 ]
    
    sumf2 = sums['prod'].sum()
    sumf2sq = sums['prodsq'].sum()
    
#     sums = pd.DataFrame()
#     sums['size'] = sequences.groupby( ['bin20'] ).size()
#     sums = sums.reset_index()
#     sums['size'] = sums['size'] / sums['size'].sum()
#     sums = sums[ sums['bin20'] > 1 ]
#     sums['prod'] = sums['bin20'] * sums['size']
#     print( sums )
#     sumfb = sums['prod'].sum()
    
#     sums = pd.DataFrame()
#     sums['size2'] = sequences2.groupby( ['count2'] ).size()
#     sums = sums.reset_index()
#     sums['prod2'] = (sums['count2']**2) * sums['size2']
#     
#     sumf2 = sums['prod2'].sum()
    
    res = {}
    res['name'] = [name]
    res['seq'] = [len( sequences )]
#     res['seq2'] = [len( sequences2 )]
    res['seq_count_mean'] = [sequences['count'].mean()]
    res['seq_count_max'] = [sequences['count'].max()]
    res['seq_sum'] = [sumf]
    res['seq_sum2'] = [sumf2]
    res['seq_sum_sq'] = [sumfsq]
    res['seq_sum2_sq'] = [sumf2sq]
    res['seq_sum_bin'] = [sumfbin]
    
    res = pd.DataFrame(res)
    
    res['seq_sum_seqtrain'] = res['seq_sum'] / ( res['seq'] / len( train ) )
    res['seq_sumsq_seqtrain'] = res['seq_sum_sq'] / ( res['seq'] / len( train ) )
    res['seq_sumbin_seqtrain'] = res['seq_sum_bin'] / ( res['seq'] / len( train ) )
    res['seq_sum_tupel'] = res['seq_sum'] / res['seq']
    res['seq_sum_train'] = res['seq_sum'] / len( train )
    res['seq_sum_items'] = res['seq_sum'] / train.ItemId.nunique() 
    res['seq_sum_session'] = res['seq_sum'] / train.SessionId.nunique()
    
    res['seq_sum2_seqtrain'] = res['seq_sum2'] / ( res['seq'] / len( train ) )
    res['seq_sum2_tupel'] = res['seq_sum2'] / res['seq']
    res['seq_sum2_train'] = res['seq_sum2'] / len( train )
    res['seq_sum2_items'] = res['seq_sum2'] / train.ItemId.nunique() 
    res['seq_sum2_session'] = res['seq_sum2'] / train.SessionId.nunique()
    
    res['seq_count_mean_tupel'] = res['seq_count_mean'] / res['seq']
    res['seq_count_mean_train'] = res['seq_count_mean'] / len( train )
    res['seq_count_mean_items'] = res['seq_count_mean'] / train.ItemId.nunique() 
    res['seq_count_mean_session'] = res['seq_count_mean'] / train.SessionId.nunique()
    
    res['seq_count_max_tupel'] = res['seq_count_max'] / res['seq']
    res['seq_count_max_train'] = res['seq_count_max'] / len( train )
    res['seq_count_max_items'] = res['seq_count_max'] / train.ItemId.nunique() 
    res['seq_count_max_session'] = res['seq_count_max'] / train.SessionId.nunique()
    
    return res

In [155]:
sequential_indicators(out, 'full').T

sequential_indicators  full


Unnamed: 0,0
name,full
seq,3
seq_count_mean,1.33333
seq_count_max,2
seq_sum,0.666667
seq_sum2,0
seq_sum_sq,1.33333
seq_sum2_sq,0
seq_sum_bin,0
seq_sum_seqtrain,1.77778
