In [68]:
import csv
import tqdm
import datetime
import argparse
import numpy as np
import pandas as pd
import os
from collections import defaultdict
from sklearn import preprocessing

import warnings
# Suppress all warnings globally for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Fixed seed for NumPy's global random state; also reused as `random_state`
# when sampling rows later in the notebook.
SEED = 666
np.random.seed(SEED)

In [4]:
# Directory (relative to the notebook) that holds the raw CSV input.
dataset_path = './data/raw'

In [128]:
# Minimum number of occurrences an item needs in order to be kept by `preprocess`.
threshold_item = 2
# Minimum number of events a session needs in order to be kept by `preprocess`.
threshold_sess = 2
# Width of each time period used by `preprocess` for the split: 'week' or 'day'.
test_fraction = 'week'

In [None]:
# parser = argparse.ArgumentParser()
# parser.add_argument('--dataset', default='train-item-views.csv', type=str)
# parser.add_argument('--is_time_fraction', default=True, type=bool)  # split into different time fraction or not
# parser.add_argument('--test_fraction', default='week', type=str)  # 'day' or 'week'
# parser.add_argument('--threshold_sess', default=1, type=int)
# parser.add_argument('--threshold_item', default=4, type=int)
# args, unknown = parser.parse_known_args()

# print('Start preprocess ' + args.dataset + ':')

In [5]:
# Load the raw item-view log (semicolon-separated CSV) and show its (rows, columns) shape.
data = pd.read_csv(os.path.join(dataset_path,'train-item-views.csv'), sep=';')
data.shape

(1235380, 5)

In [76]:
# Preview the first ten rows to check the column layout.
data.head(10)

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate
0,0,,4,526309,2016-05-09
1,0,,1,1031018,2016-05-09
2,0,,2,243569,2016-05-09
3,0,,0,75848,2016-05-09
4,0,,3,1112408,2016-05-09
5,1,,33043,173912,2016-05-09
6,1,,12352,329870,2016-05-09
7,1,,35077,390072,2016-05-09
8,1,,36118,487369,2016-05-09
9,1,,129055,991416,2016-05-09


In [13]:
# Scale factor that maps the largest `timeframe` value onto 86400
# (the number of seconds in a day); the same expression is used inside `preprocess`.
86400.00 / data.timeframe.max()

0.07200048000320002

In [123]:
def preprocess(data, threshold_item=None, threshold_sess=None, test_fraction=None):
    """Clean the raw item-view log and assign every event to a time period.

    Steps:
      1. Build a per-event ``time`` (seconds) from ``eventdate`` (local
         midnight of that day) plus ``timeframe`` rescaled so that the largest
         ``timeframe`` in ``data`` corresponds to 86400 seconds.
      2. Label-encode ``sessionId`` and ``itemId`` with separate encoders.
      3. Drop sessions with a single event, then items seen fewer than
         ``threshold_item`` times, then sessions shorter than
         ``threshold_sess``.
      4. Cut the time axis backwards from the latest session end in steps of
         one week/day, keep the 17 most recent cut points, and label each
         event with ``period`` (1 = everything up to the oldest kept cut).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``sessionId``, ``userId``, ``itemId``, ``timeframe`` and
        ``eventdate`` (``'%Y-%m-%d'`` strings). The frame is not modified.
    threshold_item, threshold_sess : int, optional
        Minimum item support / session length. When ``None`` the notebook-level
        variable of the same name is used.
    test_fraction : {'week', 'day'}, optional
        Width of one period. When ``None`` the notebook-level ``test_fraction``
        is used.

    Returns
    -------
    (data, sess_map, item_map, sess_end)
        ``data`` has columns ``sessionId, itemId, time, period`` sorted by
        ``period`` then ``time``; ``sess_map`` / ``item_map`` are the fitted
        label encoders for sessions / items; ``sess_end`` maps each remaining
        encoded session id to the time of its last event.

    Raises
    ------
    ValueError
        If ``test_fraction`` is not ``'week'`` or ``'day'``, or if no session
        survives the filtering.
    """
    # Unset options fall back to the notebook configuration so that the
    # original one-argument call keeps working unchanged.
    config = globals()
    if threshold_item is None:
        threshold_item = config['threshold_item']
    if threshold_sess is None:
        threshold_sess = config['threshold_sess']
    if test_fraction is None:
        test_fraction = config['test_fraction']

    # Validate up front so a bad option fails before any heavy work.
    if test_fraction == 'week':
        step = 7 * 86400
    elif test_fraction == 'day':
        step = 86400
    else:
        raise ValueError('invalid time fraction')

    sess_map = preprocessing.LabelEncoder()
    item_map = preprocessing.LabelEncoder()

    # Work on a private copy: the caller's frame must not gain/lose columns.
    data = data.copy()

    # NOTE(review): the scale is derived from the frame that was passed in, so
    # a sample and the full dataset may use different factors - confirm intended.
    converter = 86400.00 / data.timeframe.max()

    # Parse each distinct date once instead of once per row.
    day_start = {
        day: datetime.datetime.strptime(day, "%Y-%m-%d").timestamp()
        for day in data['eventdate'].unique()
    }
    data['time'] = data['eventdate'].map(day_start) + data['timeframe'] * converter

    # Label-encode session and item ids, each with its own encoder.
    data['sessionId'] = sess_map.fit_transform(data['sessionId'].values)
    data['itemId'] = item_map.fit_transform(data['itemId'].values)

    data = data.drop(columns=['userId', 'timeframe', 'eventdate'])

    # Remove sessions whose length is 1.
    session_lengths = data.groupby('sessionId').size()
    data = data[data['sessionId'].isin(session_lengths[session_lengths > 1].index)]

    # Remove items that appear fewer than threshold_item times.
    item_supports = data.groupby('itemId').size()
    data = data[data['itemId'].isin(item_supports[item_supports >= threshold_item].index)]

    # Remove sessions that became shorter than threshold_sess.
    session_lengths = data.groupby('sessionId').size()
    data = data[data['sessionId'].isin(session_lengths[session_lengths >= threshold_sess].index)]

    if data.empty:
        raise ValueError('no sessions left after filtering')

    # Record each session's end time.
    sess_end = dict(data.groupby('sessionId').time.max())

    all_times = np.array(list(sess_end.values()))
    max_time = all_times.max()
    min_time = all_times.min()

    # Cut points walk backwards from the latest session end; only the 17 most
    # recent ones are kept, so everything older collapses into period 1.
    period_threshold = np.sort(np.arange(max_time, min_time, -step))[-17:]

    # Vectorized lookup: index of the first cut point >= time, 1-based.
    data = data.assign(period=period_threshold.searchsorted(data['time'].values) + 1)
    data = data.sort_values(['period', 'time'], ascending=True)

    return data, sess_map, item_map, sess_end

In [129]:
# processed_data, _, _, sess_end, period_threshold = preprocess(data[data.sessionId.isin([1,2,5])])
# Smoke-test `preprocess` on a reproducible 10,000-row sample; the two encoders are discarded.
# NOTE(review): this samples individual rows rather than whole sessions, so
# sessions may be truncated before filtering - confirm that is intended.
processed_data, _, _, sess_end = preprocess(data.sample(10000, random_state=SEED))
processed_data

Unnamed: 0,sessionId,itemId,time,period
369058,2886,6725,1454571000.0,1
369057,2886,6725,1454573000.0,1
849473,6672,3636,1455500000.0,2
849474,6672,3636,1455558000.0,3
1056969,8307,1602,1455928000.0,3
1056968,8307,1602,1455946000.0,3
1116587,8803,5433,1456022000.0,3
1116588,8803,5433,1456023000.0,3
43560,339,3067,1457153000.0,5
43563,339,3067,1457186000.0,5
