In [None]:
# default_exp utils.session

# Session
> Build per-session item sequences from interaction logs and summarize session-level datasets.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import numpy as np
import pandas as pd
from collections import Counter

In [None]:
#export
def construct_session_sequences(df, sessionID, itemID):
    """
    Build one item sequence per session from an interaction-level dataframe.

    Rows are grouped on the `sessionID` column and, for each group, the values of the
    `itemID` column are collected into a list in the order the rows appear in `df`.
    The resulting list of lists is the core training data used in the Word2Vec
    algorithm (one "sentence" per session).

    INPUTS
    ------------
    df:                 pandas dataframe
    sessionID: str      column name in the df that represents individual sessions
    itemID: str         column name in the df that represents the items within a session

    RETURNS
    ------------
    list of lists       one sublist per distinct session, ordered by ascending session
                        key (pandas' default `sort=True`). Rows whose session key is
                        NaN are dropped by `groupby`. Items keep the dtype they have
                        in `df` -- they are NOT cast to str here, so cast beforehand
                        if the downstream model needs string tokens.
    """
    # Group on the bare column name rather than a one-element list: iterating a
    # groupby keyed by a length-1 list emits a FutureWarning on pandas 1.5 and yields
    # 1-tuple keys on pandas >= 2. Selecting the item column before iterating also
    # avoids materialising a full sub-frame for every session.
    items_by_session = df.groupby(sessionID)[itemID]

    return [list(items.values) for _, items in items_by_session]

In [None]:
import pandas as pd

# Toy interaction log: session 1 has three item views, session 2 has five.
df = pd.DataFrame(
    {
        'SessionID': [1, 1, 1, 2, 2, 2, 2, 2],
        'ItemID': [111, 123, 345, 45, 334, 342, 8970, 345],
    }
)
df

Unnamed: 0,SessionID,ItemID
0,1,111
1,1,123
2,1,345
3,2,45
4,2,334
5,2,342
6,2,8970
7,2,345


In [None]:
# One list per SessionID (ascending key order); items stay in their row order.
construct_session_sequences(df, sessionID='SessionID', itemID='ItemID')

[[111, 123, 345], [45, 334, 342, 8970, 345]]

In [None]:
#export
def _format_distribution(title, values):
    """Render mean / median / min / max of a 1-D numeric array as one report paragraph."""
    return '{}:\n\tAverage: {:.2f}\n\tMedian: {}\n\tMin: {}\n\tMax: {}'.format(
        title,
        values.mean(),
        np.quantile(values, 0.5),
        values.min(),
        values.max())


def get_session_stats(df: pd.DataFrame,
                      user_col: str = 'userid',
                      seq_col: str = 'sessid',
                      ) -> str:
    """
    Summarize a session-level dataframe as a human-readable text report.

    Each row of `df` is counted as one session. The report lists the number of
    distinct items, users and sessions, the distribution of session lengths, the
    distribution of sessions per user, and the five most frequent items.

    INPUTS
    ------------
    df:                 pandas dataframe with one row per session
    user_col: str       column name in the df holding the user identifier
    seq_col: str        column name in the df whose cells are sized iterables of
                        hashable item ids (e.g. a list of items per session)

    RETURNS
    ------------
    str                 multi-line report, sections separated by blank lines

    RAISES
    ------------
    ValueError          if `df` has no rows (min/max of an empty sample is undefined)
    """
    # Fail early with a clear message; otherwise the numpy reductions below fail
    # with an opaque "zero-size array" error.
    if len(df) == 0:
        raise ValueError('Cannot compute session statistics on an empty dataframe')

    # Count item occurrences across all sessions (repeats inside a session count).
    cnt = Counter()
    for sequence in df[seq_col]:
        cnt.update(sequence)

    sequence_length = df[seq_col].map(len).values
    n_sessions_per_user = df.groupby(user_col).size().values

    stats = [
        'Number of items: {}'.format(len(cnt)),
        'Number of users: {}'.format(df[user_col].nunique()),
        'Number of sessions: {}'.format(len(df)),
        '',
        _format_distribution('Session length', sequence_length),
        '',
        _format_distribution('Sessions per user', n_sessions_per_user),
        '',
        'Most popular items: {}'.format(cnt.most_common(5)),
    ]
    return '\n'.join(stats)

Example

In [None]:
from recohut.utils.data import load_dataset
from recohut.utils.filters import filter_by_time, filter_top_k

# Load the 'music30_sample' dataset and give its five columns explicit names.
# NOTE(review): the names are assigned positionally, so this assumes the column
# order returned by load_dataset -- confirm against that helper.
df = load_dataset('music30_sample')
df.columns = ['session_id', 'user_id', 'item_id', 'ts', 'playtime']
# Interpret `ts` as Unix epoch seconds and convert it to datetime.
df['ts'] = pd.to_datetime(df['ts'], unit='s')

# let's keep only the top-1k most popular items in the last month
df = filter_by_time(df, last_months=1, ts_col='ts')
df = filter_top_k(df, topk=1000, user_col='user_id', item_col='item_id', sess_col='session_id', ts_col='ts')
# The frame displayed below has one row per session, with that session's items
# held as a list in a `sequence` column.
display(df.head())
df.info()

Unnamed: 0,session_id,sequence,ts,user_id
0,357,"[793, 3489]",2015-01-11 19:17:54,4296
1,359,[1762],2015-01-11 23:22:15,4296
2,394,[1256],2015-01-11 20:17:50,30980
3,4127,"[1948, 1364, 2060, 1115, 6488, 2060]",2015-01-16 14:01:36,28117
4,6400,"[687, 1394]",2015-01-09 12:49:38,35247


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6765 entries, 0 to 6764
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   session_id  6765 non-null   int64         
 1   sequence    6765 non-null   object        
 2   ts          6765 non-null   datetime64[ns]
 3   user_id     6765 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 211.5+ KB


In [None]:
# Each row's `sequence` cell is a list of item ids, so it is passed as `seq_col`.
stats = get_session_stats(df, user_col='user_id', seq_col='sequence')
print(stats)

Number of items: 1000
Number of users: 4165
Number of sessions: 6765

Session length:
	Average: 4.29
	Median: 3.0
	Min: 1
	Max: 148

Sessions per user:
	Average: 1.62
	Median: 1.0
	Min: 1
	Max: 13

Most popular items: [('443', 207), ('1065', 155), ('67', 146), ('2308', 138), ('658', 131)]


In [None]:
#hide
# Record author, last-updated timestamp, machine info and package versions.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-26 08:50:09

recohut: 0.0.7

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython: 5.5.0
pandas : 1.1.5
numpy  : 1.19.5

