In [9]:
import numpy as np
import pandas as pd
import datetime as dt

In [4]:
# Shell escape: list the contents of ../data to see which raw files are present.
!ls ../data

dataset-README.txt  yoochoose-clicks.dat
yoochoose-buys.dat  yoochoose-test.dat


In [10]:
!mkdir ../data/processed

In [11]:
# Directory prefixes for input and output files. File names are appended to
# these with plain string concatenation, so each prefix must end with '/'.
# Without the trailing slash the outputs would be written next to the raw data
# as '../data/processedrsc15_*.txt' instead of inside '../data/processed/'.
PATH_TO_ORIGINAL_DATA = '../data/'
PATH_TO_PROCESSED_DATA = '../data/processed/'

In [13]:
%%time
data = pd.read_csv(PATH_TO_ORIGINAL_DATA + 'yoochoose-clicks.dat', sep=',', header=None, usecols=[0,1,2], dtype={0:np.int32, 1:str, 2:np.int64})

CPU times: user 22.4 s, sys: 1.92 s, total: 24.3 s
Wall time: 24.3 s


In [14]:
data.columns = ['SessionId', 'TimeStr', 'ItemId']

In [16]:
data.head()

Unnamed: 0,SessionId,TimeStr,ItemId
0,1,2014-04-07T10:51:09.277Z,214536502
1,1,2014-04-07T10:54:09.868Z,214536500
2,1,2014-04-07T10:54:46.998Z,214536506
3,1,2014-04-07T10:57:00.306Z,214577561
4,2,2014-04-07T13:56:37.614Z,214662742


In [17]:
%%time
data['Time'] = data.TimeStr.apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp()) #This is not UTC. It does not really matter.

CPU times: user 9min 49s, sys: 1.02 s, total: 9min 50s
Wall time: 9min 50s


In [18]:
# Remove the raw timestamp strings; the numeric Time column is used from here on.
del data['TimeStr']

In [19]:
session_lengths = data.groupby('SessionId').size()

In [20]:
data = data[np.in1d(data.SessionId, session_lengths[session_lengths>1].index)]

In [21]:
item_supports = data.groupby('ItemId').size()

In [22]:
data = data[np.in1d(data.ItemId, item_supports[item_supports>=5].index)]

In [23]:
session_lengths = data.groupby('SessionId').size()

In [24]:
data = data[np.in1d(data.SessionId, session_lengths[session_lengths>=2].index)]

In [25]:
tmax = data.Time.max()

In [26]:
session_max_times = data.groupby('SessionId').Time.max()
session_train = session_max_times[session_max_times < tmax-86400].index
session_test = session_max_times[session_max_times >= tmax-86400].index
train = data[np.in1d(data.SessionId, session_train)]
test = data[np.in1d(data.SessionId, session_test)]
test = test[np.in1d(test.ItemId, train.ItemId)]
tslength = test.groupby('SessionId').size()
test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]

In [27]:
# Report event / session / item counts for the full train set, then write it
# as a tab-separated file without the index column.
print(
    'Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
        train.shape[0],
        train['SessionId'].nunique(),
        train['ItemId'].nunique(),
    )
)
train.to_csv(
    PATH_TO_PROCESSED_DATA + 'rsc15_train_full.txt',
    sep='\t',
    index=False,
)

Full train set
	Events: 31637239
	Sessions: 7966257
	Items: 37483


In [28]:
# Report event / session / item counts for the test set, then write it as a
# tab-separated file without the index column.
print(
    'Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
        test.shape[0],
        test['SessionId'].nunique(),
        test['ItemId'].nunique(),
    )
)
test.to_csv(
    PATH_TO_PROCESSED_DATA + 'rsc15_test.txt',
    sep='\t',
    index=False,
)

Test set
	Events: 71222
	Sessions: 15324
	Items: 6751


In [29]:
tmax = train.Time.max()
session_max_times = train.groupby('SessionId').Time.max()
session_train = session_max_times[session_max_times < tmax-86400].index
session_valid = session_max_times[session_max_times >= tmax-86400].index

In [30]:
train_tr = train[np.in1d(train.SessionId, session_train)]
valid = train[np.in1d(train.SessionId, session_valid)]
valid = valid[np.in1d(valid.ItemId, train_tr.ItemId)]
tslength = valid.groupby('SessionId').size()
valid = valid[np.in1d(valid.SessionId, tslength[tslength>=2].index)]

In [31]:
# Report event / session / item counts for the reduced train set, then write
# it as a tab-separated file without the index column.
print(
    'Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
        train_tr.shape[0],
        train_tr['SessionId'].nunique(),
        train_tr['ItemId'].nunique(),
    )
)
train_tr.to_csv(
    PATH_TO_PROCESSED_DATA + 'rsc15_train_tr.txt',
    sep='\t',
    index=False,
)

Train set
	Events: 31579006
	Sessions: 7953885
	Items: 37483


In [32]:
# Report event / session / item counts for the validation set, then write it
# as a tab-separated file without the index column.
print(
    'Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
        valid.shape[0],
        valid['SessionId'].nunique(),
        valid['ItemId'].nunique(),
    )
)
valid.to_csv(
    PATH_TO_PROCESSED_DATA + 'rsc15_train_valid.txt',
    sep='\t',
    index=False,
)

Validation set
	Events: 58233
	Sessions: 12372
	Items: 6359
