<a href="https://colab.research.google.com/github/RecoHut-Projects/recohut/blob/master/tutorials/preprocessing/T859611_Preprocessing_Music_Sessions_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing Music Session Dataset

In [None]:
import numpy as np
import pandas as pd
from datetime import timezone, datetime, timedelta
import random
import time
import sys

In [None]:
'''
preprocessing method ["info","org","days_test","slice"]
    info: just load and show info
    org: from gru4rec (last day => test set)
    days_test: adapted from gru4rec (last N days => test set)
    slice: new (create multiple train-test-combinations with a sliding window approach  
'''
# The method is asked for interactively; pressing Enter without typing anything selects "slice".
METHOD = input('Preprocessing method (info/org/days_test/slice):') or 'slice'
assert(METHOD in 'info/org/days_test/slice'.split('/')), 'Invalid Preprocessing method.'

'''
data config (all methods) // change dataset here
'''
# The dataset is asked for interactively as well; pressing Enter selects "30music".
# PATH is the folder the raw csv is read from, PATH_PROCESSED the folder the
# train/test files are written to; both are derived from the dataset code.
DATASET_CODE = input('Dataset (30music/nowplaying/aotm):') or '30music'
assert(DATASET_CODE in '30music/nowplaying/aotm'.split('/')), 'Invalid dataset.'

PATH = './{}/raw/'.format(DATASET_CODE)
PATH_PROCESSED = './{}/slices/'.format(DATASET_CODE)
# base name (without the ".csv" extension) of the raw file of each dataset
_filenames = {'30music':'30music-200ks','nowplaying':'nowplaying','aotm':'playlists-aotm'}
FILE = _filenames[DATASET_CODE]

'''
filtering config (all methods)
'''
# sessions with fewer events and items with fewer occurrences are removed by filter_data
MIN_SESSION_LENGTH = 5
MIN_ITEM_SUPPORT = 2

'''
days test default config
'''
# number of trailing days used as test set by the "days_test" method
DAYS_FOR_TEST = 4

'''
slicing default config
'''
NUM_SLICES = 5 #number of train/test slices to create
DAYS_OFFSET = 0 #offset in days from the first date in the data set
DAYS_SHIFT = 60 #number of days the training start date is shifted after creating one slice
#each slice consists of...
DAYS_TRAIN = 90 #...this many days of training data followed by...
DAYS_TEST = 5 #...this many days of test data (also the default days_test of preprocess_days_test)

In [None]:
# Download the archive of the selected dataset and unpack it into the working directory.
# NOTE(review): the later cells read './<dataset>/raw/<file>.csv' and write to
# './<dataset>/slices/', so the archive presumably unpacks into that layout -- confirm
# against the zip contents.
if DATASET_CODE=='30music':
    !wget -q --show-progress https://github.com/RecoHut-Datasets/30music/raw/v1/30music.zip
    !unzip 30music.zip
elif DATASET_CODE=='nowplaying':
    !wget -q --show-progress https://github.com/RecoHut-Datasets/nowplaying/raw/v2/nowplaying.zip
    !unzip nowplaying.zip
elif DATASET_CODE=='aotm':
    !wget -q --show-progress https://github.com/RecoHut-Datasets/aotm/raw/v1/aotm.zip
    !unzip aotm.zip

In [None]:
def load_data( file ) :
    """
    Read a tab-separated event log and return it ordered by session and time.

    Parameters
    ----------
    file : str
        Path of the raw file WITHOUT the '.csv' extension (it is appended here).
        The file must provide the columns SessionId, ItemId and Time; Time is
        interpreted as a Unix timestamp in seconds.

    Returns
    -------
    pandas.DataFrame
        All events, sorted by SessionId and then Time (original row labels kept).
        A short summary of the loaded data is printed as a side effect.
    """
    events = pd.read_csv( file+'.csv', sep='\t' ).sort_values( by=['SessionId','Time'] )

    #first and last event of the log, used for the printed time span only
    first_event = datetime.fromtimestamp( events.Time.min(), timezone.utc )
    last_event = datetime.fromtimestamp( events.Time.max(), timezone.utc )

    summary = 'Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'
    print( summary.format( len(events),
                           events.SessionId.nunique(),
                           events.ItemId.nunique(),
                           first_event.date().isoformat(),
                           last_event.date().isoformat() ) )

    return events

In [None]:
def filter_data( data, min_item_support, min_session_length ) :
    """
    Remove rare items and short sessions from an event log and print a summary.

    Three passes are applied once, in this order:
      1. sessions consisting of a single event are dropped,
      2. items with fewer than min_item_support remaining events are dropped,
      3. sessions left with fewer than min_session_length events are dropped.
    The passes are not repeated, so pass 3 can push an item below
    min_item_support again.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with the columns SessionId, ItemId and Time (Unix seconds).
    min_item_support : int
        Minimum number of events an item needs to be kept.
    min_session_length : int
        Minimum number of events a session needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered events (possibly empty). The input frame is not modified.
    """
    #drop single-event sessions first so that they do not count towards the item support
    session_lengths = data.groupby('SessionId').size()
    data = data[data.SessionId.isin( session_lengths[ session_lengths>1 ].index )]

    #filter item support
    item_supports = data.groupby('ItemId').size()
    data = data[data.ItemId.isin( item_supports[ item_supports>= min_item_support ].index )]

    #filter session length (sessions may have become shorter by removing rare items)
    session_lengths = data.groupby('SessionId').size()
    data = data[data.SessionId.isin( session_lengths[ session_lengths>= min_session_length ].index )]

    #output
    if len(data) > 0 :
        span_start = datetime.fromtimestamp( data.Time.min(), timezone.utc ).date().isoformat()
        span_end = datetime.fromtimestamp( data.Time.max(), timezone.utc ).date().isoformat()
    else :
        #Time.min()/max() are NaN for an empty frame and cannot be converted to a datetime
        span_start = span_end = 'n/a'

    print('Filtered data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'.
          format( len(data), data.SessionId.nunique(), data.ItemId.nunique(), span_start, span_end ) )

    return data

In [None]:
def _split_off_last_day( data ) :
    """
    Split an event log into the sessions ending in the last 24 hours and the rest.

    Returns the tuple (older, recent). 'recent' holds the sessions whose last
    event lies within 86400 seconds of the latest event in data, reduced to
    items that also occur in 'older' and to sessions that still have at least
    two events afterwards. 'older' holds all other sessions, unchanged.
    """
    cutoff = data.Time.max() - 86400
    session_max_times = data.groupby('SessionId').Time.max()
    sessions_older = session_max_times[session_max_times < cutoff].index
    sessions_recent = session_max_times[session_max_times >= cutoff].index

    older = data[data.SessionId.isin( sessions_older )]
    recent = data[data.SessionId.isin( sessions_recent )]

    #items unknown to the older part cannot be predicted, and a session needs
    #at least two events to provide an input and a target
    recent = recent[recent.ItemId.isin( older.ItemId )]
    recent_lengths = recent.groupby('SessionId').size()
    recent = recent[recent.SessionId.isin( recent_lengths[recent_lengths>=2].index )]

    return older, recent


def split_data_org( data, output_file ) :
    """
    Create the gru4rec-style split: last day => test set, day before => validation set.

    Four tab-separated files are written, each named output_file plus a suffix:
      _train_full.txt   all sessions that ended more than a day before the last event
      _test.txt         sessions of the last day
      _train_tr.txt     _train_full without its own last day
      _train_valid.txt  the last day of _train_full
    Test and validation sessions only keep items known to their training part
    and are dropped if fewer than two events remain. A summary of every set is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with the columns SessionId, ItemId and Time (Unix seconds).
    output_file : str
        Path prefix of the files to write; the directory must already exist.
    """
    train, test = _split_off_last_day( data )
    print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
    train.to_csv(output_file + '_train_full.txt', sep='\t', index=False)
    print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))
    test.to_csv(output_file + '_test.txt', sep='\t', index=False)

    #repeat the same split on the training data to obtain a validation set
    train_tr, valid = _split_off_last_day( train )
    print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), train_tr.SessionId.nunique(), train_tr.ItemId.nunique()))
    train_tr.to_csv( output_file + '_train_tr.txt', sep='\t', index=False)
    print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), valid.SessionId.nunique(), valid.ItemId.nunique()))
    valid.to_csv( output_file + '_train_valid.txt', sep='\t', index=False)

In [None]:
def split_data( data, output_file, days_test ) :
    """
    Split an event log into a training set and a test set of the last days_test days.

    A session belongs to the test set if its last event is at most days_test days
    older than the latest event in data; all other sessions form the training set.
    Test sessions only keep items that occur in the training set and are dropped
    if fewer than two events remain. Both sets are written as tab-separated files
    (output_file + '_train_full.txt' / '_test.txt') and summarised on stdout.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with the columns SessionId, ItemId and Time (Unix seconds).
    output_file : str
        Path prefix of the files to write; the directory must already exist.
    days_test : int or float
        Length of the test period in days.
    """
    data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )
    test_from = ( data_end - timedelta( days_test ) ).timestamp()

    #a session is assigned as a whole, according to the time of its last event
    session_max_times = data.groupby('SessionId').Time.max()
    session_train = session_max_times[ session_max_times < test_from ].index
    session_test = session_max_times[ session_max_times >= test_from ].index

    train = data[data.SessionId.isin( session_train )]
    test = data[data.SessionId.isin( session_test )]

    #items unknown to the training set cannot be predicted, and a session needs
    #at least two events to provide an input and a target
    test = test[test.ItemId.isin( train.ItemId )]
    tslength = test.groupby('SessionId').size()
    test = test[test.SessionId.isin( tslength[tslength>=2].index )]

    print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), train.SessionId.nunique(), train.ItemId.nunique()))
    train.to_csv(output_file + '_train_full.txt', sep='\t', index=False)
    print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), test.SessionId.nunique(), test.ItemId.nunique()))
    test.to_csv(output_file + '_test.txt', sep='\t', index=False)

In [None]:
def slice_data( data, output_file, num_slices, days_offset, days_shift, days_train, days_test ): 
    """
    Create num_slices train/test splits with a sliding window.

    Slice i starts days_offset + i*days_shift days after the first event in data;
    the actual splitting and file writing is delegated to split_data_slice.
    """
    slice_id = 0
    while slice_id < num_slices :
        #every slice starts days_shift days later than the previous one
        slice_offset = days_offset + ( slice_id * days_shift )
        split_data_slice( data, output_file, slice_id, slice_offset, days_train, days_test )
        slice_id += 1

In [None]:
def split_data_slice( data, output_file, slice_id, days_offset, days_train, days_test ) :
    """
    Cut one train/test slice out of an event log and write it to disk.

    The slice window starts days_offset days after the first event in data.
    The first days_train days form the training period, the following days_test
    days the test period. Sessions are assigned as a whole by the time of their
    last event:
      start <= last event <  middle  => training set
      middle <= last event <= end    => test set
    Test sessions only keep items that occur in the training set and are dropped
    if fewer than two events remain. The sets are written as tab-separated files
    output_file + '_train_full.<slice_id>.txt' and output_file + '_test.<slice_id>.txt';
    summaries of the full data, the slice and both sets are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with the columns SessionId, ItemId and Time (Unix seconds).
    output_file : str
        Path prefix of the files to write; the directory must already exist.
    slice_id : int
        Number of the slice, used in the file names and the printed summaries.
    days_offset : int or float
        Days between the first event in data and the start of the slice.
    days_train : int or float
        Length of the training period in days.
    days_test : int or float
        Length of the test period in days.
    """
    data_start = datetime.fromtimestamp( data.Time.min(), timezone.utc )
    data_end = datetime.fromtimestamp( data.Time.max(), timezone.utc )

    print('Full data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
          format( slice_id, len(data), data.SessionId.nunique(), data.ItemId.nunique(), data_start.isoformat(), data_end.isoformat() ) )

    #window boundaries of this slice
    start = data_start + timedelta( days_offset )
    middle = start + timedelta( days_train )
    end = middle + timedelta( days_test )

    #prefilter the timespan: keep the sessions whose last event lies inside the window
    session_max_times = data.groupby('SessionId').Time.max()
    in_window = ( session_max_times >= start.timestamp() ) & ( session_max_times <= end.timestamp() )
    data_filtered = data[data.SessionId.isin( session_max_times[in_window].index )]

    print('Slice data set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} / {}'.
          format( slice_id, len(data_filtered), data_filtered.SessionId.nunique(), data_filtered.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat(), end.date().isoformat() ) )

    #split to train and test
    session_max_times = data_filtered.groupby('SessionId').Time.max()
    sessions_train = session_max_times[session_max_times < middle.timestamp()].index
    sessions_test = session_max_times[session_max_times >= middle.timestamp()].index

    train = data_filtered[data_filtered.SessionId.isin( sessions_train )]

    print('Train set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}'.
          format( slice_id, len(train), train.SessionId.nunique(), train.ItemId.nunique(), start.date().isoformat(), middle.date().isoformat() ) )

    train.to_csv(output_file + '_train_full.'+str(slice_id)+'.txt', sep='\t', index=False)

    #items unknown to the training set cannot be predicted, and a session needs
    #at least two events to provide an input and a target
    test = data_filtered[data_filtered.SessionId.isin( sessions_test )]
    test = test[test.ItemId.isin( train.ItemId )]

    tslength = test.groupby('SessionId').size()
    test = test[test.SessionId.isin( tslength[tslength>=2].index )]

    print('Test set {}\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {} \n\n'.
          format( slice_id, len(test), test.SessionId.nunique(), test.ItemId.nunique(), middle.date().isoformat(), end.date().isoformat() ) )

    test.to_csv(output_file + '_test.'+str(slice_id)+'.txt', sep='\t', index=False)

In [None]:
#preprocessing from original gru4rec (last day => test set)
def preprocess_org( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH ):
    """
    Load path+file, filter it and write the gru4rec-style split to path_proc+file.

    Loading, filtering and splitting are delegated to load_data, filter_data and
    split_data_org; see those functions for the meaning of the thresholds and the
    names of the written files.
    """
    events = filter_data( load_data( path+file ), min_item_support, min_session_length )
    split_data_org( events, path_proc+file )

In [None]:
#preprocessing adapted from original gru4rec (last N days => test set)
def preprocess_days_test( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH, days_test=DAYS_TEST ):
    """
    Load path+file, filter it and write a split with the last days_test days as test set.

    Loading, filtering and splitting are delegated to load_data, filter_data and
    split_data. Note that days_test defaults to DAYS_TEST (the slicing config),
    not to DAYS_FOR_TEST; pass DAYS_FOR_TEST explicitly to use the "days test" config.
    """
    events = filter_data( load_data( path+file ), min_item_support, min_session_length )
    split_data( events, path_proc+file, days_test )

In [None]:
#preprocessing to create several train/test slices with a sliding window
def preprocess_slices( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH,
                       num_slices = NUM_SLICES, days_offset = DAYS_OFFSET, days_shift = DAYS_SHIFT, days_train = DAYS_TRAIN, days_test=DAYS_TEST ):
    """
    Load path+file, filter it and write num_slices train/test slices to path_proc+file.

    Loading, filtering and slicing are delegated to load_data, filter_data and
    slice_data; the window parameters are passed through to slice_data unchanged.
    """
    events = filter_data( load_data( path+file ), min_item_support, min_session_length )
    slice_data( events, path_proc+file, num_slices, days_offset, days_shift, days_train, days_test )

In [None]:
#only load and filter the data to show their statistics
def preprocess_info( path=PATH, file=FILE, path_proc=PATH_PROCESSED, min_item_support=MIN_ITEM_SUPPORT, min_session_length=MIN_SESSION_LENGTH ):
    """
    Print the statistics of path+file before and after filtering.

    Nothing is written to disk and nothing is returned; path_proc is accepted
    for signature parity with the other preprocess_* functions but is not used.
    """
    filter_data( load_data( path+file ), min_item_support, min_session_length )

In [None]:
if __name__ == '__main__':
    # Run the preprocessing method selected in the config cell on the selected dataset.

    print( "START preprocessing ", METHOD )
    # sc measures CPU time ("c" in the final message), st wall-clock time ("s").
    sc, st = time.process_time(), time.time()

    if METHOD == "info":
        # path_proc has to be passed as well: without it the positional arguments shift
        # and MIN_SESSION_LENGTH would be used as the minimum item support.
        preprocess_info( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH )

    elif METHOD == "org":
        preprocess_org( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH )

    elif METHOD == "days_test":
        preprocess_days_test( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH, DAYS_FOR_TEST )

    elif METHOD == "slice":
        preprocess_slices( PATH, FILE, PATH_PROCESSED, MIN_ITEM_SUPPORT, MIN_SESSION_LENGTH, NUM_SLICES, DAYS_OFFSET, DAYS_SHIFT, DAYS_TRAIN, DAYS_TEST )
    else: 
        # only reachable if the assert in the config cell is skipped (e.g. python -O)
        print( "Invalid method ", METHOD )

    print( "END preproccessing ", (time.process_time() - sc), "c ", (time.time() - st), "s" )

START preprocessing  slice
Loaded data set
	Events: 3707857
	Sessions: 200000
	Items: 1203432
	Span: 2014-01-20 / 2015-01-20


Filtered data set
	Events: 2887349
	Sessions: 169576
	Items: 449037
	Span: 2014-01-20 / 2015-01-20


Full data set 0
	Events: 2887349
	Sessions: 169576
	Items: 449037
	Span: 2014-01-20T09:24:25+00:00 / 2015-01-20T09:23:17+00:00
Slice data set 0
	Events: 682013
	Sessions: 38617
	Items: 223276
	Span: 2014-01-20 / 2014-04-20 / 2014-04-25
Train set 0
	Events: 648300
	Sessions: 36620
	Items: 216054
	Span: 2014-01-20 / 2014-04-20
Test set 0
	Events: 23718
	Sessions: 1794
	Items: 16305
	Span: 2014-04-20 / 2014-04-25 


Full data set 1
	Events: 2887349
	Sessions: 169576
	Items: 449037
	Span: 2014-01-20T09:24:25+00:00 / 2015-01-20T09:23:17+00:00
Slice data set 1
	Events: 646004
	Sessions: 36539
	Items: 216903
	Span: 2014-03-21 / 2014-06-19 / 2014-06-24
Train set 1
	Events: 615759
	Sessions: 34759
	Items: 210736
	Span: 2014-03-21 / 2014-06-19
Test set 1
	Events: 21021
	S

---

In [None]:
# !apt-get -qq install tree
# !rm -r sample_data

In [None]:
# !tree -h --du .

.
├── [256M]  30music
│   ├── [137M]  raw
│   │   └── [137M]  30music-200ks.csv
│   └── [118M]  slices
│       ├── [899K]  30music-200ks_test.0.txt
│       ├── [796K]  30music-200ks_test.1.txt
│       ├── [804K]  30music-200ks_test.2.txt
│       ├── [863K]  30music-200ks_test.3.txt
│       ├── [1.2M]  30music-200ks_test.4.txt
│       ├── [ 24M]  30music-200ks_train_full.0.txt
│       ├── [ 23M]  30music-200ks_train_full.1.txt
│       ├── [ 21M]  30music-200ks_train_full.2.txt
│       ├── [ 22M]  30music-200ks_train_full.3.txt
│       └── [ 24M]  30music-200ks_train_full.4.txt
└── [ 27M]  30music.zip

 282M used in 3 directories, 12 files


In [None]:
# !pip install -q watermark
# %reload_ext watermark
# %watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-04 15:27:08

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython: 5.5.0
numpy  : 1.19.5
pandas : 1.1.5



---

**END**