## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
from tqdm import tqdm
tqdm.pandas() # registers progress_apply etc. on pandas objects

## 1. Load data from previous step

In [2]:
import pickle

# Load the interactions frame produced by the previous step.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load dumps that were created by a trusted run of the earlier notebook.
# The context manager makes sure the file handle is closed after loading.
with open("pickle_dumps/interactions_df.p", "rb") as pickle_file:
    interactions_df = pickle.load(pickle_file)

## 2. Pre-process interactions

In [4]:
def preprocess_classic(df):
    """
    Goal: - Remove reconsumption items (repeated user/item pairs, first occurrence is kept)
          - Attach per-item and per-user interaction counts

    Prints the number of rows before and after de-duplication and displays
    the first five rows of the result.

    :input df: Dataframe containing user_id, item_id and recommend
    :return: de-duplicated dataframe with the extra columns count_item
             (rows per item_id) and count_user (rows per user_id); both
             counts are taken after de-duplication
    """
    n_rows_before = len(df)

    # one row per (user, item) pair
    deduped = df.drop_duplicates(subset=["user_id", "item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(n_rows_before, len(deduped)))

    # nr of users that interacted with each item
    item_counts = (
        deduped.groupby('item_id', as_index=False)['user_id']
        .size()
        .rename(columns={'size': 'count_item'})
    )
    # nr of items each user interacted with
    user_counts = (
        deduped.groupby('user_id', as_index=False)['item_id']
        .size()
        .rename(columns={'size': 'count_user'})
    )

    # left joins keep every de-duplicated interaction row
    enriched = (
        deduped
        .merge(item_counts, how='left', on=['item_id'])
        .merge(user_counts, how='left', on=['user_id'])
    )
    display(enriched.head(5))
    return enriched

# Only the two identifiers and the recommend flag are passed on to the preprocessing.
interactions_df_processed = preprocess_classic(
    interactions_df[['user_id', 'item_id', "recommend"]]
)


After drop_duplicates (reconsumption items): 3176223 -> 2832522


Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,76561198007483075,35140,True,3173,6
1,76561197970402776,707610,True,1,7
2,76561198060686749,328100,True,22,55
3,76561198023491401,35140,True,3173,109
4,76561198115331805,35140,True,3173,1


In [5]:
# Report how many distinct users and items remain after preprocessing.
print("Information about dataset after preprocessing")
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")

Infromation about dataset after preprocessing
number of unique users: 1485611
number of unique items: 14513


In [6]:
# Show the preprocessed interactions frame as the cell result.
interactions_df_processed

Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,76561198007483075,35140,True,3173,6
1,76561197970402776,707610,True,1,7
2,76561198060686749,328100,True,22,55
3,76561198023491401,35140,True,3173,109
4,76561198115331805,35140,True,3173,1
...,...,...,...,...,...
2832517,76561197968169662,252490,True,29685,1
2832518,76561198042664856,252490,True,29685,1
2832519,76561198095476531,252490,True,29685,2
2832520,76561197962161824,252490,True,29685,1


In [7]:
# Lookup table used by map_to_consecutive_id (original key -> consecutive int).
# It is shared module state: clear it before numbering a different kind of key.
dct = {}
def map_to_consecutive_id(uuid):
    """
    Return the consecutive integer ID that belongs to ``uuid``.

    The first unseen key gets 0, the next unseen key 1, and so on; a key
    that was seen before gets the ID it was given the first time. The
    mapping is stored in the module-level ``dct``.

    :input uuid: hashable key to translate
    :return: integer ID of the key
    """
    # The default len(dct) is evaluated before any insertion, so a new key
    # receives the current table size as its ID; known keys keep theirs.
    return dct.setdefault(uuid, len(dct))

#1) convert user_ids to consecutive integer IDs (numbered in order of first appearance)
user_id_int = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

#2) convert item_ids to consecutive integer IDs
# The lookup table is shared with step 1, so it has to be emptied first;
# otherwise item IDs would continue counting after the user IDs.
dct.clear()
item_id_int = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)

# Build the result directly from the remapped IDs and the recommend flag,
# without adding temporary *_int columns to the frame first.
interactions_df_processed = pd.DataFrame({
    "user_id": user_id_int,
    "item_id": item_id_int,
    "recommend": interactions_df_processed["recommend"],
})
interactions_df_processed

100%|██████████| 2832522/2832522 [00:02<00:00, 1082071.83it/s]
100%|██████████| 2832522/2832522 [00:01<00:00, 1735727.86it/s]


Unnamed: 0,user_id,item_id,recommend
0,0,0,True
1,1,1,True
2,2,2,True
3,3,0,True
4,4,0,True
...,...,...,...
2832517,1485607,13997,True
2832518,1485608,13997,True
2832519,561635,13997,True
2832520,1485609,13997,True


## 3. Store the preprocessed dataset as pickle file for further use

In [8]:
import pickle

# Persist the preprocessed interactions; the context manager flushes and
# closes the file even if pickling fails part-way.
with open("pickle_dumps/interactions_df_processed.p", "wb") as pickle_file:
    pickle.dump(interactions_df_processed, pickle_file)

## 4. Create train/test split

Different options:
- **Time-based split**, i.e. split interactions before/after a certain date. Keep all users with both training and test interactions for evaluation. Repeat for different train/test windows, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. split and keep the first $x$ interactions for training and the remaining $|I_u| - x$ for testing, where $I_u = \{i_1,\ldots,i_k\}$ represents the user's history sorted by time
- **Random**, i.e. take $x$ random items for training and the remaining $|I_u| - x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionally, use techniques such as cross-validation: create multiple train/test splits and report both the average and the standard deviation.

In [9]:
#Session-based split:
# Collapse the interactions into one row per user, holding the user's
# item_ids and recommend flags as lists.
# NOTE(review): the lists follow the row order of interactions_df_processed;
# no explicit sort on time is visible here - confirm the rows are chronological.
sessions_df = (
    interactions_df_processed
    .groupby('user_id', as_index=False)[['item_id', 'recommend']]
    .agg(list)
)
display(sessions_df.head(10))

def split(items, percentage_train):
    """
    Cut a sequence into a leading training part and a trailing test part.

    The training part holds the first floor(len(items) * percentage_train)
    elements, the test part everything after that. A single-element
    sequence with percentage_train < 1 therefore ends up entirely in test.

    :input items: sliceable sequence (e.g. list of item ids)
    :input percentage_train: fraction of the elements that goes to training
    :return: tuple (train_items, test_items)
    """
    cut = math.floor(percentage_train * len(items))
    train_part, test_part = items[:cut], items[cut:]
    return train_part, test_part

percentage_train = 0.8

# Split every user's item list once and unpack the (train, test) pair,
# instead of calling split() twice per user.
split_pairs = sessions_df['item_id'].apply(lambda items: split(items, percentage_train))
sessions_df['train'] = split_pairs.apply(lambda pair: pair[0])
sessions_df['test'] = split_pairs.apply(lambda pair: pair[1])

sessions_df.head(10)

Unnamed: 0,user_id,item_id,recommend
0,0,"[0, 3871, 6589, 11847, 12663, 14499]","[True, True, True, True, True, True]"
1,1,"[1, 3228, 4392, 8009, 10348, 11846, 13666]","[True, True, True, True, True, True, True]"
2,2,"[2, 135, 735, 1035, 1284, 160, 1559, 2422, 253...","[True, True, True, True, True, True, True, Tru..."
3,3,"[0, 135, 284, 494, 611, 724, 902, 1020, 1211, ...","[True, True, True, True, True, True, True, Tru..."
4,4,[0],[True]
5,5,[3],[True]
6,6,"[4, 1622, 12464]","[True, True, True]"
7,7,"[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161...","[True, True, True, True, True, True, True, Tru..."
8,8,[5],[True]
9,9,"[5, 3732, 3783, 8059]","[True, True, True, True]"


Unnamed: 0,user_id,item_id,recommend,train,test
0,0,"[0, 3871, 6589, 11847, 12663, 14499]","[True, True, True, True, True, True]","[0, 3871, 6589, 11847]","[12663, 14499]"
1,1,"[1, 3228, 4392, 8009, 10348, 11846, 13666]","[True, True, True, True, True, True, True]","[1, 3228, 4392, 8009, 10348]","[11846, 13666]"
2,2,"[2, 135, 735, 1035, 1284, 160, 1559, 2422, 253...","[True, True, True, True, True, True, True, Tru...","[2, 135, 735, 1035, 1284, 160, 1559, 2422, 253...","[9014, 10166, 9395, 10850, 11086, 11182, 12081..."
3,3,"[0, 135, 284, 494, 611, 724, 902, 1020, 1211, ...","[True, True, True, True, True, True, True, Tru...","[0, 135, 284, 494, 611, 724, 902, 1020, 1211, ...","[12246, 12423, 12488, 12616, 12644, 12678, 129..."
4,4,[0],[True],[],[0]
5,5,[3],[True],[],[3]
6,6,"[4, 1622, 12464]","[True, True, True]","[4, 1622]",[12464]
7,7,"[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161...","[True, True, True, True, True, True, True, Tru...","[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161...","[11747, 11770, 11778, 11782, 11807, 11717, 118..."
8,8,[5],[True],[],[5]
9,9,"[5, 3732, 3783, 8059]","[True, True, True, True]","[5, 3732, 3783]",[8059]


In [10]:
#User-based split:
# Every user, with the complete item list, is assigned to either train or test.
# NOTE(review): sessions_df is rebuilt here, so the 'train'/'test' columns that
# the session-based split added to the earlier sessions_df are not carried
# over - confirm that this is intended.
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id', "recommend"]].agg(list)
display(sessions_df.head(10))

percentage_train = 0.8
# Fixed seed so that the shuffle, and with it the train/test assignment,
# is identical on every run of the notebook.
random_seed = 42

# shuffle rows
sessions_df = sessions_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
sessions_count = sessions_df.shape[0]
# number of users that go to the training set
train_index = round(percentage_train * sessions_count)

train_df = sessions_df[:train_index].reset_index(drop=True)
test_df = sessions_df[train_index:].reset_index(drop=True)

train_df.head(10)


Unnamed: 0,user_id,item_id,recommend
0,0,"[0, 3871, 6589, 11847, 12663, 14499]","[True, True, True, True, True, True]"
1,1,"[1, 3228, 4392, 8009, 10348, 11846, 13666]","[True, True, True, True, True, True, True]"
2,2,"[2, 135, 735, 1035, 1284, 160, 1559, 2422, 253...","[True, True, True, True, True, True, True, Tru..."
3,3,"[0, 135, 284, 494, 611, 724, 902, 1020, 1211, ...","[True, True, True, True, True, True, True, Tru..."
4,4,[0],[True]
5,5,[3],[True]
6,6,"[4, 1622, 12464]","[True, True, True]"
7,7,"[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161...","[True, True, True, True, True, True, True, Tru..."
8,8,[5],[True]
9,9,"[5, 3732, 3783, 8059]","[True, True, True, True]"


Unnamed: 0,user_id,item_id,recommend
0,1484551,[13997],[True]
1,414775,[2678],[True]
2,1288479,[12663],[True]
3,1352198,[13752],[True]
4,1024232,[8549],[True]
5,1329935,[13215],[True]
6,574291,"[4129, 11124]","[True, True]"
7,672676,"[5076, 7889]","[True, True]"
8,969042,[7966],[True]
9,260765,"[1528, 2621, 5832, 9451, 11847, 12254, 13629]","[True, True, True, True, True, True, True]"


## 5. Store the train/test split dataset as pickle file for further use

In [11]:
import pickle

# Persist the per-user sessions frame and the train/test partitions.
# Each file is opened in a context manager so the handle is flushed and
# closed as soon as its dump is written.
with open("pickle_dumps/sessions_df.p", "wb") as pickle_file:
    pickle.dump(sessions_df, pickle_file)
with open("pickle_dumps/train_df.p", "wb") as pickle_file:
    pickle.dump(train_df, pickle_file)
with open("pickle_dumps/test_df.p", "wb") as pickle_file:
    pickle.dump(test_df, pickle_file)