<a href="https://colab.research.google.com/gist/sparsh-ai/fbaf4627cbd3fe5b45efc2f6ab50920a/t443547-preprocessing-of-diginetica-session-dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget -q --show-progress https://github.com/RecoHut-Datasets/diginetica/raw/main/train-item-views.csv



In [None]:
!head train-item-views.csv

sessionId;userId;itemId;timeframe;eventdate
1;NA;81766;526309;2016-05-09
1;NA;31331;1031018;2016-05-09
1;NA;32118;243569;2016-05-09
1;NA;9654;75848;2016-05-09
1;NA;32627;1112408;2016-05-09
1;NA;33043;173912;2016-05-09
1;NA;12352;329870;2016-05-09
1;NA;35077;390072;2016-05-09
1;NA;36118;487369;2016-05-09


## Method 1

In [None]:
# pandas is not imported by any earlier cell, so bring it into scope here;
# without this the cell fails with NameError on a fresh kernel.
import pandas as pd

# Read the click log from the working directory (the same relative location the
# other cells use) instead of the Colab-only absolute path '/content/...'.
df = pd.read_csv('train-item-views.csv', sep=';')
df.head()

Unnamed: 0,sessionId;userId;itemId;timeframe;eventdate
0,1;NA;81766;526309;2016-05-09
1,1;NA;31331;1031018;2016-05-09
2,1;NA;32118;243569;2016-05-09
3,1;NA;9654;75848;2016-05-09
4,1;NA;32627;1112408;2016-05-09


In [None]:
import time
import csv
import pickle
import operator
import datetime
import os

In [None]:
class DigineticaDataset:
    """Turn the Diginetica ``train-item-views.csv`` click log into next-item
    prediction samples.

    The CSV is read as a ``;``-separated file and must provide the columns
    ``sessionId``, ``itemId``, ``timeframe`` (parsed as an integer) and
    ``eventdate`` (``YYYY-MM-DD``).

    Args:
        path: Directory that contains ``train-item-views.csv``.
    """

    def __init__(self, path='.'):
        self.path = path

    def preprocess(self, output_dir='.', min_item_support=5, test_days=7):
        """Run the whole pipeline and write the resulting pickles.

        Steps:
          1. Group clicks by session and order each session by ``timeframe``.
          2. Drop single-click sessions, then drop items seen fewer than
             ``min_item_support`` times and sessions left with < 2 clicks.
          3. Split by date: sessions dated strictly before
             ``max_date - test_days`` go to train, sessions dated strictly
             after it go to test.
          4. Renumber items from 1 in order of first appearance in the training
             sessions; test clicks on items unseen in training are discarded.
          5. Expand every session into (prefix, next item) samples.

        Three pickle files are written into ``output_dir`` (they are binary
        pickles despite the ``.txt`` extension):
          * ``train.txt``          -> ``(prefixes, labels)`` for training
          * ``test.txt``           -> ``(prefixes, labels)`` for testing
          * ``all_train_seq.txt``  -> the full renumbered training sessions

        Args:
            output_dir: Directory receiving the three output files. Defaults to
                the current working directory.
            min_item_support: Minimum number of occurrences an item needs to be
                kept.
            test_days: Length, in days, of the trailing test window.

        Raises:
            ValueError: If the CSV holds no click rows, or if no session
                survives the filtering step.
        """
        dataset = os.path.join(self.path, 'train-item-views.csv')
        print("-- Starting @ %ss" % datetime.datetime.now())
        sess_clicks, sess_date = self._read_sessions(dataset)
        print("-- Reading data @ %ss" % datetime.datetime.now())

        self._filter_sessions(sess_clicks, sess_date, min_item_support)
        if not sess_date:
            raise ValueError('no session left after filtering %s' % dataset)

        # Split out the test set based on session dates.
        dates = list(sess_date.items())
        maxdate = max(date for _, date in dates)
        splitdate = maxdate - 86400 * test_days     # 86400 seconds per day
        print('Splitting date', splitdate)

        # NOTE: both comparisons are strict, so a session dated exactly on
        # ``splitdate`` ends up in neither split. Kept as-is on purpose so the
        # produced dataset is unchanged.
        by_date = operator.itemgetter(1)
        tra_sess = sorted((x for x in dates if x[1] < splitdate), key=by_date)
        tes_sess = sorted((x for x in dates if x[1] > splitdate), key=by_date)
        print(len(tra_sess))
        print(len(tes_sess))
        print(tra_sess[:3])
        print(tes_sess[:3])

        print("-- Splitting train set and test set @ %ss" % datetime.datetime.now())

        # Training sessions: renumber items to consecutive ids starting at 1.
        item_dict = {}
        tra_dates, tra_seqs = [], []
        for s, date in tra_sess:
            outseq = []
            for i in sess_clicks[s]:
                if i not in item_dict:
                    item_dict[i] = len(item_dict) + 1
                outseq.append(item_dict[i])
            if len(outseq) < 2:     # cannot happen after filtering; kept as a guard
                continue
            tra_dates.append(date)
            tra_seqs.append(outseq)
        print(len(item_dict) + 1)   # next unused item id (= number of items + 1)

        # Test sessions: ignore items that never appear in the training set.
        tes_dates, tes_seqs = [], []
        for s, date in tes_sess:
            outseq = [item_dict[i] for i in sess_clicks[s] if i in item_dict]
            if len(outseq) < 2:
                continue
            tes_dates.append(date)
            tes_seqs.append(outseq)

        tr_seqs, tr_dates, tr_labs = self._expand_prefixes(tra_seqs, tra_dates)
        te_seqs, te_dates, te_labs = self._expand_prefixes(tes_seqs, tes_dates)
        tra = (tr_seqs, tr_labs)
        tes = (te_seqs, te_labs)
        print(len(tr_seqs))
        print(len(te_seqs))
        print(tr_seqs[:3], tr_dates[:3], tr_labs[:3])
        print(te_seqs[:3], te_dates[:3], te_labs[:3])

        # Guard the average: a dataset shorter than the test window leaves both
        # splits empty, which used to raise ZeroDivisionError before any file
        # was written.
        n_sessions = len(tra_seqs) + len(tes_seqs)
        total_clicks = sum(len(seq) for seq in tra_seqs) + sum(len(seq) for seq in tes_seqs)
        print('avg length: ', total_clicks / float(n_sessions) if n_sessions else 0.0)

        # Use context managers so every output file is flushed and closed.
        outputs = (('train.txt', tra), ('test.txt', tes), ('all_train_seq.txt', tra_seqs))
        for filename, payload in outputs:
            with open(os.path.join(output_dir, filename), 'wb') as f:
                pickle.dump(payload, f)

        print('Done.')

    @staticmethod
    def _to_timestamp(day):
        """Convert a ``YYYY-MM-DD`` string to local-time epoch seconds."""
        return time.mktime(time.strptime(day, '%Y-%m-%d'))

    def _read_sessions(self, dataset):
        """Read the click log and return ``(sess_clicks, sess_date)``.

        ``sess_clicks`` maps a session id to its item ids ordered by
        ``timeframe``; ``sess_date`` maps a session id to the timestamp of the
        last row seen for that session.
        """
        sess_clicks = {}
        sess_date = {}
        curid = None
        curdate = None
        # newline='' is what the csv module asks for when handing it a file object.
        with open(dataset, "r", newline='') as f:
            reader = csv.DictReader(f, delimiter=';')
            for data in reader:
                sessid = data['sessionId']
                # A change of session id closes the previous session: record its
                # date once here instead of parsing the date on every row.
                if curdate and curid != sessid:
                    sess_date[curid] = self._to_timestamp(curdate)
                curid = sessid
                curdate = data['eventdate']
                click = (data['itemId'], int(data['timeframe']))
                sess_clicks.setdefault(sessid, []).append(click)

        if curdate is None:
            raise ValueError('no click events found in %s' % dataset)
        # The last session of the file is never closed by a following row.
        sess_date[curid] = self._to_timestamp(curdate)

        # Order clicks inside each session by timeframe and keep only item ids.
        by_timeframe = operator.itemgetter(1)
        for sessid in list(sess_clicks):
            ordered = sorted(sess_clicks[sessid], key=by_timeframe)
            sess_clicks[sessid] = [iid for iid, _ in ordered]
        return sess_clicks, sess_date

    @staticmethod
    def _filter_sessions(sess_clicks, sess_date, min_item_support):
        """Drop short sessions and infrequent items, mutating both dicts in place."""
        # Filter out length 1 sessions.
        for s in list(sess_clicks):
            if len(sess_clicks[s]) == 1:
                del sess_clicks[s]
                del sess_date[s]

        # Count how many times each item appears in the remaining sessions.
        iid_counts = {}
        for seq in sess_clicks.values():
            for iid in seq:
                iid_counts[iid] = iid_counts.get(iid, 0) + 1

        # Remove rare items, then sessions that became shorter than 2 clicks.
        for s in list(sess_clicks):
            filseq = [i for i in sess_clicks[s] if iid_counts[i] >= min_item_support]
            if len(filseq) < 2:
                del sess_clicks[s]
                del sess_date[s]
            else:
                sess_clicks[s] = filseq

    @staticmethod
    def _expand_prefixes(iseqs, idates):
        """Expand each sequence into (prefix, label) samples.

        For a sequence ``[a, b, c]`` this yields ``([a, b], c)`` then
        ``([a], b)``. Returns ``(prefixes, dates, labels)`` as parallel lists,
        where each date is the date of the originating sequence.
        """
        out_seqs, out_dates, labs = [], [], []
        for seq, date in zip(iseqs, idates):
            for i in range(1, len(seq)):
                labs.append(seq[-i])
                out_seqs.append(seq[:-i])
                out_dates.append(date)
        return out_seqs, out_dates, labs

In [None]:
# Run the preprocessing on the CSV located in the current working directory.
yc_data = DigineticaDataset('.')
yc_data.preprocess()

## Method 2 (pandas-based alternative; all cells below are commented out)

In [None]:
# import pandas as pd
# import numpy as np

In [None]:
# def get_session_id(df, interval):
#     df_prev = df.shift()
#     is_new_session = (df.userId != df_prev.userId) | (
#         df.timestamp - df_prev.timestamp > interval
#     )
#     session_id = is_new_session.cumsum() - 1
#     return session_id


# def group_sessions(df, interval):
#     sessionId = get_session_id(df, interval)
#     df = df.assign(sessionId=sessionId)
#     return df


# def filter_short_sessions(df, min_len=2):
#     session_len = df.groupby('sessionId', sort=False).size()
#     long_sessions = session_len[session_len >= min_len].index
#     df_long = df[df.sessionId.isin(long_sessions)]
#     return df_long


# def filter_infreq_items(df, min_support=5):
#     item_support = df.groupby('itemId', sort=False).size()
#     freq_items = item_support[item_support >= min_support].index
#     df_freq = df[df.itemId.isin(freq_items)]
#     return df_freq


# def filter_until_all_long_and_freq(df, min_len=2, min_support=5):
#     while True:
#         df_long = filter_short_sessions(df, min_len)
#         df_freq = filter_infreq_items(df_long, min_support)
#         if len(df_freq) == len(df):
#             break
#         df = df_freq
#     return df


# def truncate_long_sessions(df, max_len=20, is_sorted=False):
#     if not is_sorted:
#         df = df.sort_values(['sessionId', 'timestamp'])
#     itemIdx = df.groupby('sessionId').cumcount()
#     df_t = df[itemIdx < max_len]
#     return df_t


# def update_id(df, field):
#     labels = pd.factorize(df[field])[0]
#     kwargs = {field: labels}
#     df = df.assign(**kwargs)
#     return df


# def remove_immediate_repeats(df):
#     df_prev = df.shift()
#     is_not_repeat = (df.sessionId != df_prev.sessionId) | (df.itemId != df_prev.itemId)
#     df_no_repeat = df[is_not_repeat]
#     return df_no_repeat


# def reorder_sessions_by_endtime(df):
#     endtime = df.groupby('sessionId', sort=False).timestamp.max()
#     df_endtime = endtime.sort_values().reset_index()
#     oid2nid = dict(zip(df_endtime.sessionId, df_endtime.index))
#     sessionId_new = df.sessionId.map(oid2nid)
#     df = df.assign(sessionId=sessionId_new)
#     df = df.sort_values(['sessionId', 'timestamp'])
#     return df


# def keep_top_n_items(df, n):
#     item_support = df.groupby('itemId', sort=False).size()
#     top_items = item_support.nlargest(n).index
#     df_top = df[df.itemId.isin(top_items)]
#     return df_top


# def split_by_time(df, timedelta):
#     max_time = df.timestamp.max()
#     end_time = df.groupby('sessionId').timestamp.max()
#     split_time = max_time - timedelta
#     train_sids = end_time[end_time < split_time].index
#     df_train = df[df.sessionId.isin(train_sids)]
#     df_test = df[~df.sessionId.isin(train_sids)]
#     return df_train, df_test


# def train_test_split(df, test_split=0.2):
#     endtime = df.groupby('sessionId', sort=False).timestamp.max()
#     endtime = endtime.sort_values()
#     num_tests = int(len(endtime) * test_split)
#     test_session_ids = endtime.index[-num_tests:]
#     df_train = df[~df.sessionId.isin(test_session_ids)]
#     df_test = df[df.sessionId.isin(test_session_ids)]
#     return df_train, df_test


# def save_sessions(df, filepath):
#     df = reorder_sessions_by_endtime(df)
#     sessions = df.groupby('sessionId').itemId.apply(lambda x: ','.join(map(str, x)))
#     sessions.to_csv(filepath, sep='\t', header=False, index=False)


# def save_dataset(df_train, df_test):
#     # filter items in test but not in train
#     df_test = df_test[df_test.itemId.isin(df_train.itemId.unique())]
#     df_test = filter_short_sessions(df_test)

#     print(f'No. of Clicks: {len(df_train) + len(df_test)}')
#     print(f'No. of Items: {df_train.itemId.nunique()}')

#     # update itemId
#     train_itemId_new, uniques = pd.factorize(df_train.itemId)
#     df_train = df_train.assign(itemId=train_itemId_new)
#     oid2nid = {oid: i for i, oid in enumerate(uniques)}
#     test_itemId_new = df_test.itemId.map(oid2nid)
#     df_test = df_test.assign(itemId=test_itemId_new)

#     print(f'saving dataset to {os.getcwd()}')
#     save_sessions(df_train, 'train.txt')
#     save_sessions(df_test, 'test.txt')
#     num_items = len(uniques)
#     with open('num_items.txt', 'w') as f:
#         f.write(str(num_items))

In [None]:
# def preprocess_diginetica(csv_file):
#     print(f'reading {csv_file}...')
#     df = pd.read_csv(
#         csv_file,
#         usecols=[0, 2, 3, 4],
#         delimiter=';',
#         parse_dates=['eventdate'],
#         infer_datetime_format=True,
#     )
#     print('start preprocessing')
#     # timeframe (time since the first query in a session, in milliseconds)
#     df['timestamp'] = pd.to_timedelta(df.timeframe, unit='ms') + df.eventdate
#     df = df.drop(['eventdate', 'timeframe'], 1)
#     df = df.sort_values(['sessionId', 'timestamp'])
#     df = filter_short_sessions(df)
#     df = truncate_long_sessions(df, is_sorted=True)
#     df = filter_infreq_items(df)
#     df = filter_short_sessions(df)
#     df_train, df_test = split_by_time(df, pd.Timedelta(days=7))
#     save_dataset(df_train, df_test)

In [None]:
# preprocess_diginetica('train-item-views.csv')

**END**