<a href="https://colab.research.google.com/github/sparsh-ai/stanza/blob/S969796/nbs/T676042_Preprocessing_of_Gowalla_Session_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the gzipped Gowalla check-in dump from the Stanford SNAP site
# (-q silences wget's log output, --show-progress keeps only the progress bar).
!wget -q --show-progress https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz



In [None]:
!gunzip loc-gowalla_totalCheckins.txt.gz

In [None]:
import pandas as pd
from pandas import Timedelta
import numpy as np
import os

In [None]:
def get_session_id(df, interval):
    """Assign a zero-based session number to every row of ``df``.

    A row opens a new session when its ``userId`` differs from the previous
    row's, or when its ``timestamp`` is more than ``interval`` later than the
    previous row's. Rows are compared in their existing order, so the caller
    is expected to pass a frame already ordered by user and time.

    Args:
        df: frame with ``userId`` and ``timestamp`` columns.
        interval: largest allowed gap between consecutive rows of one session.

    Returns:
        Series aligned with ``df`` holding the session number of each row.
    """
    previous_user = df.userId.shift()
    previous_time = df.timestamp.shift()
    # The first row has no predecessor: the shifted user is missing, so the
    # inequality is True and the row always opens session 0.
    user_changed = df.userId != previous_user
    gap_exceeded = (df.timestamp - previous_time) > interval
    starts_session = user_changed | gap_exceeded
    # Running count of session starts is 1-based; shift it to start at 0.
    return starts_session.cumsum() - 1


def group_sessions(df, interval):
    """Return a copy of ``df`` with a ``sessionId`` column added.

    The session numbers come from ``get_session_id(df, interval)``; the input
    frame itself is left unmodified.
    """
    return df.assign(sessionId=get_session_id(df, interval))


def filter_short_sessions(df, min_len=2):
    """Keep only the rows whose session has at least ``min_len`` rows.

    Args:
        df: frame with a ``sessionId`` column.
        min_len: minimum number of rows a session needs in order to be kept.

    Returns:
        The subset of ``df`` belonging to sufficiently long sessions, with the
        original row order and index preserved.
    """
    kept_session_ids = (
        df.groupby('sessionId', sort=False)
        .size()
        .loc[lambda rows_per_session: rows_per_session >= min_len]
        .index
    )
    belongs_to_kept = df.sessionId.isin(kept_session_ids)
    return df[belongs_to_kept]


def filter_infreq_items(df, min_support=5):
    """Keep only the rows whose item occurs at least ``min_support`` times.

    Args:
        df: frame with an ``itemId`` column.
        min_support: minimum number of rows an item needs in order to be kept.

    Returns:
        The subset of ``df`` that refers to sufficiently frequent items, with
        the original row order and index preserved.
    """
    kept_item_ids = (
        df.groupby('itemId', sort=False)
        .size()
        .loc[lambda rows_per_item: rows_per_item >= min_support]
        .index
    )
    refers_to_kept = df.itemId.isin(kept_item_ids)
    return df[refers_to_kept]


def filter_until_all_long_and_freq(df, min_len=2, min_support=5):
    """Alternate the session-length and item-support filters until stable.

    Removing short sessions can push items below ``min_support`` and removing
    rare items can shorten sessions below ``min_len``, so both filters are
    re-applied until a full pass removes no rows.

    Args:
        df: frame with ``sessionId`` and ``itemId`` columns.
        min_len: forwarded to ``filter_short_sessions``.
        min_support: forwarded to ``filter_infreq_items``.

    Returns:
        The frame on which a full filtering pass no longer drops any row.
    """
    current = df
    while True:
        filtered = filter_infreq_items(
            filter_short_sessions(current, min_len), min_support
        )
        # Both filters only ever drop rows, so an unchanged row count means
        # nothing was removed in this pass.
        if len(filtered) == len(current):
            return current
        current = filtered


def truncate_long_sessions(df, max_len=20, is_sorted=False):
    """Keep at most the first ``max_len`` rows of every session.

    Args:
        df: frame with ``sessionId`` and ``timestamp`` columns.
        max_len: maximum number of rows retained per session.
        is_sorted: pass True to skip the sort by ``sessionId`` and
            ``timestamp``; rows are then counted in their existing order.

    Returns:
        The (possibly re-sorted) frame restricted to the leading rows of each
        session.
    """
    ordered = df if is_sorted else df.sort_values(['sessionId', 'timestamp'])
    # Zero-based position of each row inside its own session.
    position_in_session = ordered.groupby('sessionId').cumcount()
    return ordered[position_in_session < max_len]


def update_id(df, field):
    """Replace the values of column ``field`` with dense integer codes.

    Codes are produced by ``pd.factorize`` and therefore follow the order in
    which the distinct values first appear.

    Args:
        df: input frame; it is not modified.
        field: name of the column to re-encode.

    Returns:
        A new frame in which ``field`` holds the integer codes.
    """
    codes, _ = pd.factorize(df[field])
    return df.assign(**{field: codes})


def remove_immediate_repeats(df):
    """Drop rows that repeat the previous row's item within the same session.

    A row is kept when it belongs to a different session than the row before
    it, or when its item differs from the previous row's item. Rows are
    compared in their existing order.

    Args:
        df: frame with ``sessionId`` and ``itemId`` columns.

    Returns:
        The subset of ``df`` without back-to-back duplicate items.
    """
    previous_session = df.sessionId.shift()
    previous_item = df.itemId.shift()
    opens_session = df.sessionId != previous_session
    switches_item = df.itemId != previous_item
    return df[opens_session | switches_item]


def reorder_sessions_by_endtime(df):
    """Renumber sessions by ascending end time and sort the rows accordingly.

    The session whose latest ``timestamp`` is smallest becomes session 0, the
    next one session 1, and so on.

    Args:
        df: frame with ``sessionId`` and ``timestamp`` columns.

    Returns:
        A new frame with the renumbered ``sessionId`` column, sorted by
        ``sessionId`` and then ``timestamp``.
    """
    last_timestamp = df.groupby('sessionId', sort=False).timestamp.max()
    ids_by_endtime = last_timestamp.sort_values().index
    # Position in the end-time ordering becomes the new session id.
    new_id_of = {old_id: rank for rank, old_id in enumerate(ids_by_endtime)}
    renumbered = df.assign(sessionId=df.sessionId.map(new_id_of))
    return renumbered.sort_values(['sessionId', 'timestamp'])


def keep_top_n_items(df, n):
    """Keep only the rows that refer to one of the ``n`` most frequent items.

    Args:
        df: frame with an ``itemId`` column.
        n: number of items to retain, ranked by their row count.

    Returns:
        The subset of ``df`` restricted to the selected items, with the
        original row order and index preserved.
    """
    rows_per_item = df.groupby('itemId', sort=False).size()
    most_frequent = rows_per_item.nlargest(n).index
    is_top_item = df.itemId.isin(most_frequent)
    return df[is_top_item]


def split_by_time(df, timedelta):
    """Split sessions into train/test around a cutoff before the last event.

    The cutoff is the maximum ``timestamp`` in ``df`` minus ``timedelta``.
    Sessions whose last timestamp is strictly earlier than the cutoff go to
    the training frame; every other session goes to the test frame.

    Args:
        df: frame with ``sessionId`` and ``timestamp`` columns.
        timedelta: length of the trailing window reserved for testing.

    Returns:
        Tuple ``(df_train, df_test)``.
    """
    cutoff = df.timestamp.max() - timedelta
    train_session_ids = (
        df.groupby('sessionId')
        .timestamp.max()
        .loc[lambda session_end: session_end < cutoff]
        .index
    )
    in_train = df.sessionId.isin(train_session_ids)
    return df[in_train], df[~in_train]


def train_test_split(df, test_split=0.2):
    """Split sessions chronologically into a training and a test frame.

    Sessions are ranked by their last ``timestamp``. The most recent
    ``int(num_sessions * test_split)`` sessions form the test frame and all
    remaining sessions form the training frame.

    Args:
        df: frame with ``sessionId`` and ``timestamp`` columns.
        test_split: fraction of sessions assigned to the test frame. Values
            that yield fewer than zero or more than all sessions are clamped.

    Returns:
        Tuple ``(df_train, df_test)``. When the fraction rounds down to zero
        sessions, ``df_test`` is empty and ``df_train`` contains every row.
    """
    endtime = df.groupby('sessionId', sort=False).timestamp.max()
    endtime = endtime.sort_values()
    num_sessions = len(endtime)
    num_tests = min(max(int(num_sessions * test_split), 0), num_sessions)
    # Slice from an explicit start offset. The former ``index[-num_tests:]``
    # turned into ``index[-0:]`` when num_tests was 0, which selects *every*
    # session and silently moved the whole dataset into the test frame.
    test_session_ids = endtime.index[num_sessions - num_tests:]
    is_test = df.sessionId.isin(test_session_ids)
    df_train = df[~is_test]
    df_test = df[is_test]
    return df_train, df_test


def save_sessions(df, filepath):
    """Write one line per session to ``filepath``.

    Sessions are first renumbered and sorted with
    ``reorder_sessions_by_endtime``; each output line is the comma-separated
    list of that session's ``itemId`` values. Neither a header nor the
    session id is written.

    Args:
        df: frame with ``sessionId``, ``timestamp`` and ``itemId`` columns.
        filepath: destination file path.
    """
    def _comma_join(items):
        return ','.join(str(item) for item in items)

    ordered = reorder_sessions_by_endtime(df)
    session_lines = ordered.groupby('sessionId').itemId.apply(_comma_join)
    session_lines.to_csv(filepath, sep='\t', header=False, index=False)


def save_dataset(df_train, df_test):
    """Re-index items and write the train/test sessions to the working directory.

    Test rows whose item never occurs in ``df_train`` are dropped and test
    sessions that become too short are removed by ``filter_short_sessions``.
    Item ids are then re-encoded as dense integers based on the training
    data, and the files ``train.txt``, ``test.txt`` and ``num_items.txt`` are
    written to the current working directory.

    Args:
        df_train: training frame with ``sessionId``, ``timestamp``, ``itemId``.
        df_test: test frame with the same columns.
    """
    # The test set may only contain items that were seen during training.
    train_items = df_train.itemId.unique()
    df_test = filter_short_sessions(df_test[df_test.itemId.isin(train_items)])

    print(f'No. of Clicks: {len(df_train) + len(df_test)}')
    print(f'No. of Items: {df_train.itemId.nunique()}')

    # Dense item codes are defined by the training data and reused for test.
    train_codes, uniques = pd.factorize(df_train.itemId)
    code_of = dict(zip(uniques, range(len(uniques))))
    df_train = df_train.assign(itemId=train_codes)
    df_test = df_test.assign(itemId=df_test.itemId.map(code_of))

    print(f'saving dataset to {os.getcwd()}')
    for frame, filename in ((df_train, 'train.txt'), (df_test, 'test.txt')):
        save_sessions(frame, filename)
    with open('num_items.txt', 'w') as f:
        f.write(str(len(uniques)))

In [None]:
def preprocess_gowalla(csv_file, usecols, interval, n):
    """Turn the raw check-in file into session-based train/test files.

    Steps, in order: read the tab-separated file, drop rows with missing
    values, re-encode user and item ids, sort by user and time, cut the
    check-ins into sessions, drop immediate repeats, truncate long sessions,
    keep the ``n`` most frequent items, filter short sessions / rare items
    until stable, split 80/20 by session end time and save the result.

    Args:
        csv_file: path of the tab-separated, header-less input file.
        usecols: the three input columns to read; they are named ``userId``,
            ``timestamp`` and ``itemId``.
        interval: gap between consecutive check-ins of a user beyond which a
            new session starts.
        n: number of most frequent items to keep.
    """
    print(f'reading {csv_file}...')
    # NOTE: ``infer_datetime_format`` is deprecated in pandas >= 2.0; it is kept
    # because the notebook's recorded environment is pandas 1.1.5.
    checkins = pd.read_csv(
        csv_file,
        sep='\t',
        header=None,
        names=['userId', 'timestamp', 'itemId'],
        usecols=usecols,
        parse_dates=['timestamp'],
        infer_datetime_format=True,
    )
    print('start preprocessing')
    sessions = (
        checkins.dropna()
        .pipe(update_id, 'userId')
        .pipe(update_id, 'itemId')
        .sort_values(['userId', 'timestamp'])
        .pipe(group_sessions, interval)
        .pipe(remove_immediate_repeats)
        # Rows are still ordered by user and time here, hence is_sorted=True.
        .pipe(truncate_long_sessions, is_sorted=True)
        .pipe(keep_top_n_items, n)
        .pipe(filter_until_all_long_and_freq)
    )
    df_train, df_test = train_test_split(sessions, test_split=0.2)
    save_dataset(df_train, df_test)

In [None]:
# Decompressed check-in file produced by the gunzip cell.
csv_file = 'loc-gowalla_totalCheckins.txt'
# Input columns read from the file; preprocess_gowalla names them
# userId, timestamp and itemId, in that order.
usecols = [0, 1, 4]
# A gap of more than one day between a user's consecutive check-ins starts a new session.
interval = Timedelta(days=1)
# Number of most frequent items kept by keep_top_n_items.
n = 30000
preprocess_gowalla(csv_file, usecols, interval, n)

reading loc-gowalla_totalCheckins.txt...
start preprocessing
No. of Clicks: 1122788
No. of Items: 29510
saving dataset to /content


---

In [None]:
# Install the `tree` utility, used by the following cell to list the generated files.
!apt-get -qq install tree

Selecting previously unselected package tree.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../tree_1.7.0-5_amd64.deb ...
Unpacking tree (1.7.0-5) ...
Setting up tree (1.7.0-5) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [None]:
# List the working directory with human-readable sizes (-h) and a total size summary (--du).
!tree -h --du .

.
├── [376M]  loc-gowalla_totalCheckins.txt
├── [   5]  num_items.txt
├── [1.1M]  test.txt
└── [4.6M]  train.txt

 382M used in 0 directories, 4 files


In [None]:
# Record the execution environment for reproducibility: author, last-updated
# date/time, machine details and the versions of the imported packages.
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
markdown 3.3.6 requires importlib-metadata>=4.4; python_version < "3.10", but you have importlib-metadata 2.1.2 which is incompatible.[0m
Author: Sparsh A.

Last updated: 2021-11-26 11:20:30

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas : 1.1.5
IPython: 5.5.0
numpy  : 1.19.5



---

**END**