In [None]:
# !mkdir -p /content/data
# !unzip -o /content/drive/MyDrive/TempData/Diginetica/diginetica.zip -d /content/data
# %cd /content/data
# !cp -r content/data/store/raw/* .
# !rm -r content
# !unzip -n dataset-train-diginetica.zip
# !rm dataset-train-diginetica.zip
# %cd /content

In [78]:
import csv
import tqdm
import datetime
import argparse
import numpy as np
import pandas as pd
import os
from collections import defaultdict

# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
SEED = 666
np.random.seed(SEED)

# Directory holding the raw dataset files; it is joined with args.dataset to locate the CSV.
# This is an absolute path, so adjust it when running outside the original environment.
dataset_path = '/content/data'

In [49]:
def parse_bool_flag(value):
    """
    Convert a command-line string into a real boolean.
    :param value: a bool, or a string such as 'true', 'False', 'yes', 'no', '1', '0' (case-insensitive)
    :return: the corresponding bool
    :raises argparse.ArgumentTypeError: if the text is not a recognised boolean spelling
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='train-item-views.csv', type=str)
# NOTE: type=bool would treat every non-empty string (even 'False') as True, so an explicit
# converter is used instead.
parser.add_argument('--is_time_fraction', default=True, type=parse_bool_flag)  # split into different time fraction or not
parser.add_argument('--test_fraction', default='week', type=str)  # 'day' or 'week'
parser.add_argument('--threshold_sess', default=1, type=int)
parser.add_argument('--threshold_item', default=4, type=int)
# parse_known_args tolerates the extra arguments a notebook kernel is launched with
args, unknown = parser.parse_known_args()

print('Start preprocess ' + args.dataset + ':')

Start preprocess train-item-views.csv:


In [39]:
def generate_name_Id_map(name, map):
    """
    Look up the Id registered for a name, registering the name first if it is unknown.
    New Ids are handed out consecutively starting at 1 (current map size + 1).
    :param name: session or item name in dataset
    :param map: dictionary map[name]=Id; it is extended in place when name is new
    :return: the Id associated with name
    """
    if name not in map:
        # first time this name is seen: give it the next free Id
        map[name] = len(map) + 1
    return map[name]

In [40]:
def read_data(dataset_path):
    """
    Load data from raw dataset.

    Only files whose name starts with 'train-' are parsed. Such a file is read twice: the first
    pass finds the largest 'timeframe' value, the second pass builds the actions. Every action
    time is the timestamp of 'eventdate' plus the row's timeframe rescaled so that the largest
    timeframe in the file maps to 86400 seconds. Rows with an empty 'eventdate' are skipped.
    For any other file name an error message is printed and empty results are returned.

    :param dataset_path: the full name of dataset including extension name
    :return sess_map: map from raw data session name to session Id, a dictionary sess_map[sess_name]=sessId
    :return item_map: map from raw data item name to item Id, a dictionary item_map[item_name]=itemId
    :return reformed_data: a list: each element is a action, which is a list of [sessId, itemId, time]
    """
    sess_map = {}
    item_map = {}
    reformed_data = []

    dataset_name = dataset_path.split('/')[-1]
    # newline='' is what the csv module asks for when it is handed a file object
    with open(dataset_path, newline='') as f:

        if dataset_name.split('-')[0] == 'train':
            # first pass: largest timeframe, used to rescale the offsets (0 when there are no rows)
            reader = csv.DictReader(f, delimiter=';')
            max_timeframe = max((int(sample['timeframe']) for sample in reader), default=0)
            # guard against an empty file or an all-zero timeframe column
            converter = 86400.00 / max_timeframe if max_timeframe > 0 else 0.0

            # second pass: load data
            f.seek(0)
            reader = csv.DictReader(f, delimiter=';')
            day_start = {}  # eventdate string -> timestamp, so each distinct date is parsed once
            for sample in tqdm.tqdm(reader, desc='Loading data'):
                sess = sample['sessionId']
                item = sample['itemId']
                date = sample['eventdate']
                timeframe = int(sample['timeframe'])
                if not date:
                    continue
                if date not in day_start:
                    # NOTE(review): a naive datetime is converted with the machine's local timezone,
                    # so absolute values depend on where this runs - confirm that is acceptable.
                    day_start[date] = int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
                time = day_start[date] + timeframe * converter
                sessId = generate_name_Id_map(sess, sess_map)
                itemId = generate_name_Id_map(item, item_map)
                reformed_data.append([sessId, itemId, time])
        else:
            print("Error: new csv data file!")

    # print raw dataset information (averages fall back to 0.0 when nothing was loaded)
    n_actions = len(reformed_data)
    print('Total number of sessions in dataset:', len(sess_map))
    print('Total number of items in dataset:', len(item_map))
    print('Total number of actions in dataset:', n_actions)
    print('Average number of actions per user:', n_actions / len(sess_map) if sess_map else 0.0)
    print('Average number of actions per item:', n_actions / len(item_map) if item_map else 0.0)

    return sess_map, item_map, reformed_data

In [73]:
# Load the raw ';'-separated CSV into a DataFrame and show its (rows, columns) shape.
data = pd.read_csv(os.path.join(dataset_path, args.dataset), sep=';')
data.shape

(1235380, 5)

In [67]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate
0,1,,81766,526309,2016-05-09
1,1,,31331,1031018,2016-05-09
2,1,,32118,243569,2016-05-09
3,1,,9654,75848,2016-05-09
4,1,,32627,1112408,2016-05-09


In [74]:
# Parse the raw file into the session/item Id maps and a flat list of [sessId, itemId, time] actions.
sess_map, item_map, reformed_data = read_data(os.path.join(dataset_path, args.dataset))

Loading data: 1235380it [00:24, 49670.64it/s]


Total number of sessions in dataset: 310324
Total number of items in dataset: 122993
Total number of actions in dataset: 1235380
Average number of actions per user: 3.9809360539307304
Average number of actions per item: 10.044311464880115


In [75]:
# Spot-check: Ids assigned to raw session '1' and raw item '81766', then the first ten actions.
display(sess_map['1'], item_map['81766'])
display(reformed_data[:10])

1

1

[[1, 1, 1462789894.50063],
 [1, 2, 1462826233.790892],
 [1, 3, 1462769537.084914],
 [1, 4, 1462757461.0924072],
 [1, 5, 1462832093.9099593],
 [1, 6, 1462764521.7474782],
 [1, 7, 1462775750.7983387],
 [1, 8, 1462780085.3712358],
 [1, 9, 1462787090.8019388],
 [1, 10, 1462823382.427883]]

In [71]:
def generate_sess_end_map(sess_end, sessId, time):
    """
    Fold one action into the per-session end-time map, keeping the latest time seen per session.
    :param sess_end: the map recording session end time, a dictionary sess_end[sessId]=end_time;
                     it is updated in place
    :param sessId: session Id of new action
    :param time: time of new action
    :return: sess_end: the same dictionary that was passed in, after the update
    """
    latest = time
    if sessId in sess_end:
        # session already known: only move its end time forward
        latest = max(time, sess_end[sessId])
    sess_end[sessId] = latest
    return sess_end

In [76]:
def short_remove(reformed_data, args):
    """
    Remove data according to threshold.

    Filtering runs in three passes, each counting on the output of the previous one:
      1. drop sessions that contain a single action (fixed rule, independent of args);
      2. drop actions whose item occurs args.threshold_item times or fewer;
      3. drop sessions left with args.threshold_sess actions or fewer.
    The passes are not repeated, so pass 3 can push an item back under the item threshold.

    :param reformed_data: loaded data, a list: each element is a action, which is a list of [sessId, itemId, time]
    :param args: args.threshold_item: an item is kept only if it appears more than this many times
                 args.threshold_sess: a session is kept only if it has more than this many actions
    :return removed_data: result data after removing (the surviving action lists, original order)
    :return sess_end: a map recording session end time, a dictionary sess_end[sessId]=end_time,
                      computed from the surviving actions only
    """
    # pass 1: remove session whose length is 1
    sess_counter = defaultdict(int)
    for action in reformed_data:
        sess_counter[action[0]] += 1
    removed_data = [action for action in reformed_data if sess_counter[action[0]] > 1]

    # pass 2: remove item which appear less or equal to threshold_item
    item_counter = defaultdict(int)
    for action in removed_data:
        item_counter[action[1]] += 1
    removed_data = [action for action in removed_data if item_counter[action[1]] > args.threshold_item]

    # pass 3: remove session whose length less or equal to threshold_sess
    sess_counter = defaultdict(int)
    for action in removed_data:
        sess_counter[action[0]] += 1
    removed_data = [action for action in removed_data if sess_counter[action[0]] > args.threshold_sess]

    # record session end time: the latest action time of every surviving session
    sess_end = dict()
    for sessId, _, time in removed_data:
        sess_end[sessId] = max(time, sess_end[sessId]) if sessId in sess_end else time

    # print information of removed data (averages fall back to 0.0 when everything was filtered out)
    n_actions = len(removed_data)
    n_sessions = len(sess_end)
    n_items = len({action[1] for action in removed_data})
    print('Number of sessions after pre-processing:', n_sessions)
    print('Number of items after pre-processing:', n_items)
    print('Number of actions after pre-processing:', n_actions)
    print('Average number of actions per session:', n_actions / n_sessions if n_sessions else 0.0)
    print('Average number of actions per item:', n_actions / n_items if n_items else 0.0)

    return removed_data, sess_end

In [79]:
# Filter out short sessions and rare items; also collect each surviving session's end time.
removed_data, sess_end = short_remove(reformed_data, args)

Number of sessions after pre-processing: 204789
Number of items after pre-processing: 43136
Number of actions after pre-processing: 993483
Average number of actions per session: 4.851251776218449
Average number of actions per item: 23.031412277448073


In [83]:
# Compare the first ten actions before and after filtering, then show the end time of session 1.
display(reformed_data[:10], removed_data[:10])
display(sess_end[1])

[[1, 1, 1462789894.50063],
 [1, 2, 1462826233.790892],
 [1, 3, 1462769537.084914],
 [1, 4, 1462757461.0924072],
 [1, 5, 1462832093.9099593],
 [1, 6, 1462764521.7474782],
 [1, 7, 1462775750.7983387],
 [1, 8, 1462780085.3712358],
 [1, 9, 1462787090.8019388],
 [1, 10, 1462823382.427883]]

[[1, 1, 1462789894.50063],
 [1, 2, 1462826233.790892],
 [1, 3, 1462769537.084914],
 [1, 4, 1462757461.0924072],
 [1, 5, 1462832093.9099593],
 [1, 6, 1462764521.7474782],
 [1, 7, 1462775750.7983387],
 [1, 8, 1462780085.3712358],
 [1, 9, 1462787090.8019388],
 [2, 11, 1462815579.2318616]]

1462832093.9099593

In [80]:
def time_partition(removed_data, session_end, args):
    """
    Partition data according to time periods.

    Thresholds are laid out backwards from the latest session end time in steps of one day or
    one week, down to (but excluding) the earliest session end time; only the 17 most recent
    thresholds are kept. Each action is then assigned, by its own time, to period
    searchsorted(thresholds, time) + 1, so everything at or before the oldest kept threshold
    lands in period 1 (the initial set).

    :param removed_data: input data, a list: each element is a action, which is a list of [sessId, itemId, time]
    :param session_end: a dictionary recording session end time, session_end[sessId]=end_time
    :param : args: args.is_time_fraction: whether to partition at all
                   args.dataset: only 'train-item-views.csv' (DIGINETICA) is supported when partitioning
                   args.test_fraction: time interval for each partition, 'week' or 'day'
    :return: time_fraction: a dictionary, the keys are different time periods, value is a list of actions in that
                            time period; when args.is_time_fraction is false, removed_data itself is returned
    :raises ValueError: if args.test_fraction or args.dataset is not supported while partitioning
    """
    if not args.is_time_fraction:
        # if not partition, hand back the action list itself (same object, not a copy)
        return removed_data

    if args.dataset != 'train-item-views.csv':
        # thresholds are only defined for DIGINETICA; fail clearly rather than with an unbound variable
        raise ValueError('time partition is not defined for dataset %r' % (args.dataset,))
    if args.test_fraction == 'week':
        step = 7 * 86400
    elif args.test_fraction == 'day':
        step = 86400
    else:
        raise ValueError('invalid time fraction')

    # split entire dataset by time interval
    time_fraction = dict()
    if not removed_data:
        return time_fraction

    all_times = np.array(list(session_end.values()))
    max_time = all_times.max()
    min_time = all_times.min()

    # for DIGINETICA, choose the most recent 16 fraction and put left dataset in initial set
    period_threshold = np.sort(np.arange(max_time, min_time, -step))[-17:]

    # look up the period of every action in one vectorised call
    action_times = np.array([action[2] for action in removed_data], dtype=float)
    periods = period_threshold.searchsorted(action_times) + 1

    # partition data according to period
    for [sessId, itemId, time], period in zip(removed_data, periods):
        time_fraction.setdefault(period, []).append([sessId, itemId, time])

    return time_fraction

In [84]:
# Bucket the filtered actions into time periods (a dict keyed by period when args.is_time_fraction is set).
time_fraction = time_partition(removed_data, sess_end, args)

In [101]:
# Show the first ten period keys (in insertion order), then the first ten actions stored under period 9.
display(list(time_fraction.keys())[:10])
display(time_fraction[9][:10])

[14, 9, 10, 7, 3, 1, 16, 5, 12, 17]

[[8, 32, 1459828315.8927727],
 [8, 33, 1459814806.0107067],
 [9, 34, 1459815714.152761],
 [9, 35, 1459829128.49019],
 [9, 35, 1459847755.3743691],
 [10, 36, 1459824829.2695284],
 [10, 37, 1459830032.5282168],
 [10, 38, 1459816146.4436429],
 [11, 39, 1459816154.3636959],
 [11, 19, 1459814873.8351588]]

In [102]:
def generating_txt(time_fraction, sess_end, args):
    """
    Generate final txt file.

    Item Ids are re-assigned a second time (consecutively from 1, in order of first appearance)
    and the actions are written as "sessId itemId" lines, ordered by action time, into the
    current working directory:
      * args.is_time_fraction true: one file per period, named period_0.txt, period_1.txt, ...
        in ascending period order. Within a period the new item Ids are handed out while the
        actions are ordered by their session's end time.
      * otherwise: train.txt and test.txt. A session goes to test.txt when it ends within the
        last day/week (args.test_fraction) before the latest action time, else to train.txt.

    NOTE: time_fraction is modified in place (re-sorted, and every action replaced by one
    carrying the new item Id), so calling this twice on the same object re-maps the Ids again.

    :param time_fraction: input data, a dictionary, the keys are different time periods,
                          value is a list of actions in that time period
                          (a plain list of actions when args.is_time_fraction is false)
    :param sess_end: session end time map, sess_end[sessId]=end_time
    :param : args: args.test_fraction: if not split, time interval for test partition
    :raises ValueError: if not split and args.test_fraction is neither 'day' nor 'week'
    """

    if args.is_time_fraction:
        periods = sorted(time_fraction.keys())

        # item map second time
        item_map = {}
        for period in periods:
            actions = time_fraction[period]
            # Ids are allocated in session-end order within the period
            actions.sort(key=lambda x: sess_end[x[0]])
            for i, [userId, itemId, time] in enumerate(actions):
                actions[i] = [userId, generate_name_Id_map(itemId, item_map), time]
            # sort action according to time sequence
            actions.sort(key=lambda x: x[2])

        # generate text file
        for i, period in enumerate(periods):
            with open('period_' + str(i) + '.txt', 'w') as file_train:
                for [userId, itemId, time] in time_fraction[period]:
                    file_train.write('%d %d\n' % (userId, itemId))
    else:
        # validate before touching the data or creating files
        if args.test_fraction == 'day':
            test_threshold = 86400
        elif args.test_fraction == 'week':
            test_threshold = 86400 * 7
        else:
            raise ValueError('invalid time fraction')

        # item map second time; re-mapping keeps the time order, so one sort is enough
        item_map = {}
        time_fraction.sort(key=lambda x: x[2])
        for i, [userId, itemId, time] in enumerate(time_fraction):
            time_fraction[i] = [userId, generate_name_Id_map(itemId, item_map), time]

        max_time = max(map(lambda x: x[2], time_fraction))

        # generate text file
        with open('test.txt', 'w') as file_test, open('train.txt', 'w') as file_train:
            for [userId, itemId, time] in time_fraction:
                if sess_end[userId] < max_time - test_threshold:
                    file_train.write('%d %d\n' % (userId, itemId))
                else:
                    file_test.write('%d %d\n' % (userId, itemId))

In [103]:
# Write the per-period (or train/test) text files into the current working directory.
generating_txt(time_fraction, sess_end, args)