## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas() #for progres_apply etc.

## 1. Load data from previous step

In [2]:
import pickle
interactions_df = pickle.load(open("pickle_dumps/interactions_df.p", "rb"))

## 2. Pre-process interactions

In [3]:
def preprocess_classic(df):
    """
    Goal: - Remove reconsumption items

    :input df: Dataframe containing user_id, item_id and recommend
    """
    before = df.shape[0]
    #drop reconsumption items
    df = df.drop_duplicates(subset=["user_id","item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(before,df.shape[0]))
    #compute item/user counts
    g1 = df.groupby('item_id', as_index=False)['user_id'].size() # nr of users interacted with item
    g1 = g1.rename({'size': 'count_item'}, axis='columns')
    g2 = df.groupby('user_id', as_index=False)['item_id'].size() # nr of items user interacted with
    g2 = g2.rename({'size': 'count_user'}, axis='columns')
    df = pd.merge(df, g1, how='left', on=['item_id'])
    df = pd.merge(df, g2, how='left', on=['user_id'])
    display(df.head(5))
    return df

interactions_df_processed = interactions_df[['user_id','item_id', "recommend"]]
interactions_df_processed = preprocess_classic(interactions_df_processed)


After drop_duplicates (reconsumption items): 3176223 -> 2832522


Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,76561198007483075,35140,True,3173,6
1,76561197970402776,707610,True,1,7
2,76561198060686749,328100,True,22,55
3,76561198023491401,35140,True,3173,109
4,76561198115331805,35140,True,3173,1


In [4]:
print("Infromation about dataset after preprocessing")
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")

Infromation about dataset after preprocessing
number of unique users: 1485611
number of unique items: 14513


In [5]:
interactions_df_processed

Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,76561198007483075,35140,True,3173,6
1,76561197970402776,707610,True,1,7
2,76561198060686749,328100,True,22,55
3,76561198023491401,35140,True,3173,109
4,76561198115331805,35140,True,3173,1
...,...,...,...,...,...
2832517,76561197968169662,252490,True,29685,1
2832518,76561198042664856,252490,True,29685,1
2832519,76561198095476531,252490,True,29685,2
2832520,76561197962161824,252490,True,29685,1


In [6]:
dct = {}
def map_to_consecutive_id(uuid):
    if uuid in dct:
        return dct[uuid]
    else:
        id = len(dct)
        dct[uuid] = id
        return id

#1) convert user user_ids to consecutive integer ID's
interactions_df_processed['user_id_int'] = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

#2) convert item_ids to to consecutive integer ID's
dct.clear()
interactions_df_processed['item_id_int'] = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)

interactions_df_processed.head()

interactions_df_processed = interactions_df_processed[["user_id_int", "item_id_int", "recommend"]]
interactions_df_processed = interactions_df_processed.rename(columns={"item_id_int": "item_id", "user_id_int": "user_id"})
interactions_df_processed

100%|█████████████████████████████| 2832522/2832522 [00:03<00:00, 713334.28it/s]
100%|████████████████████████████| 2832522/2832522 [00:02<00:00, 1253404.62it/s]


Unnamed: 0,user_id,item_id,recommend
0,0,0,True
1,1,1,True
2,2,2,True
3,3,0,True
4,4,0,True
...,...,...,...
2832517,1485607,13997,True
2832518,1485608,13997,True
2832519,561635,13997,True
2832520,1485609,13997,True


## 3. Store the preprocessed dataset as pickle file for further use

In [7]:
import pickle
pickle.dump(interactions_df_processed, open("pickle_dumps/interactions_df_processed.p", "wb"))
pickle.dump(dct, open("pickle_dumps/item_dct.p", "wb"))

## 4. Create train/test split

Different options:
- **Time-based split**, i.e. split interactions before/after certain date. Keep all users with both training and test interactions for evaluations. Repeat for different train/test window, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. split and keep first $x$ interactions for training and $|I_u| - x$  for testing, where $I_u = {i_1,\ldots,i_k}$ represents the user's history sorted on time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionaly, use techniques such as cross-validation and create multiple train/test splits and report both average and standard deviation.

In [8]:
def convert_uid_df_to_iid_df(df):
    new = {}
    for _, row in df.iterrows():
        for item in row["item_id"]:
            if item in new:
                new[item].append(row["user_id"])
            else:
                new[item] = [row["user_id"]]
    return pd.DataFrame(new.items(), columns=["item_id", "user_id"])

    

In [9]:
#User-based split:
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id', "recommend"]].agg(list)
sessions_df = sessions_df[sessions_df.item_id.apply(lambda x: len(x) > 1)] # only users with more then one interaction

display(sessions_df.head(10))



test_size = 0.025

train_df, test_df = train_test_split(sessions_df, test_size=test_size)

Unnamed: 0,user_id,item_id,recommend
0,0,"[0, 3871, 6589, 11847, 12663, 14499]","[True, True, True, True, True, True]"
1,1,"[1, 3228, 4392, 8009, 10348, 11846, 13666]","[True, True, True, True, True, True, True]"
2,2,"[2, 135, 735, 1035, 1284, 160, 1559, 2422, 253...","[True, True, True, True, True, True, True, Tru..."
3,3,"[0, 135, 284, 494, 611, 724, 902, 1020, 1211, ...","[True, True, True, True, True, True, True, Tru..."
6,6,"[4, 1622, 12464]","[True, True, True]"
7,7,"[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161...","[True, True, True, True, True, True, True, Tru..."
9,9,"[5, 3732, 3783, 8059]","[True, True, True, True]"
10,10,"[6, 11, 36, 121, 119, 65, 241, 268, 267, 385, ...","[True, True, True, True, True, True, True, Tru..."
11,11,"[7, 222, 694, 259, 1096, 1183, 2053, 2868, 315...","[True, True, True, True, True, True, True, Tru..."
13,13,"[8, 36, 380, 473, 938, 1163, 1315, 1390, 1471,...","[True, True, True, True, True, True, True, Tru..."


In [10]:
print("train size:", train_df.size)
print("test size:", test_df.size)

train size: 1316226
test size: 33750


In [11]:
train_df.head(10)

Unnamed: 0,user_id,item_id,recommend
668923,668923,"[5050, 5430, 10363]","[True, True, True]"
133620,133620,"[754, 2433, 2464, 3229, 3859, 7847, 9362, 1050...","[True, True, True, True, True, True, True, Tru..."
43244,43244,"[94, 2752, 7063, 13586]","[True, True, True, True]"
116373,116373,"[676, 13775]","[True, True]"
179662,179662,"[988, 6769]","[True, True]"
254551,254551,"[1528, 1612]","[True, True]"
201985,201985,"[1284, 5563, 12129, 13586]","[True, True, True, True]"
909190,909190,"[7063, 9364]","[True, True]"
249130,249130,"[1513, 6544, 7765, 10365]","[True, True, True, True]"
1205970,1205970,"[11520, 12313]","[True, True]"


In [12]:
#User-based split:
article_train_df = convert_uid_df_to_iid_df(train_df)
article_test_df = convert_uid_df_to_iid_df(test_df)

In [13]:
print("train size:", article_train_df.size)
print("test size:", article_test_df.size)

train size: 27880
test size: 11226


In [14]:
def createSplit(user_items):
    splits = []
    
#     scores = cross_val_score(clf, X, y, cv=5)
    tmp = np.array_split(user_items, 4)
    
    for l in tmp:
        if len(l) > 0:
            splits.append(list(l))
    
    res = []
    for i in splits:
        join = []
        for j in splits:
            if i != j:
                join += j
        res.append((join, i))
                    
                

    return res

In [15]:
article_test_df["test_split"] = article_test_df["user_id"].apply(createSplit)

In [16]:
article_test_df.head(10)

Unnamed: 0,item_id,user_id,test_split
0,9170,"[1051830, 35996, 19426, 115020, 1033596, 39811...","[([1053366, 1057662, 664838, 1053544, 131511, ..."
1,10076,"[1051830, 1057469, 36275, 1125450, 71719, 6306...","[([421604, 157857, 223440, 987362, 879739, 932..."
2,2752,"[394789, 48183, 196830, 403300, 416536, 26408,...","[([413737, 409243, 265571, 394247, 358662, 409..."
3,3111,"[394789, 195357, 200031, 222386, 459675, 45874...","[([105077, 106237, 466886, 469725, 55233, 4881..."
4,3229,"[394789, 275011, 469816, 443263, 473562, 12273...","[([43584, 461234, 210561, 500771, 489190, 4938..."
5,3716,"[394789, 359281, 616497, 575586, 454043, 48953...","[([124400, 129632, 522978, 602240, 585482, 611..."
6,8135,"[394789, 36275, 22503, 969392, 147003, 61342, ...","[([216987, 472495, 519565, 198723, 62224, 6901..."
7,9559,"[394789, 28079, 348988, 73403, 165682, 65438, ...","[([65438, 1074945, 107794, 188063, 85150, 1122..."
8,12107,"[394789, 380580, 17537, 45774, 222386, 77349, ...","[([109103, 217396, 288792, 1016722, 613652, 10..."
9,14133,"[394789, 89843, 157629, 414323, 300475, 20442,...","[([157629, 414323, 300475, 20442, 591201, 3290..."


## 5. Store the train/test split dataset as pickle file for further use

In [17]:
import pickle
pickle.dump(train_df, open("pickle_dumps/train_df.p", "wb"))
pickle.dump(test_df, open("pickle_dumps/test_df.p", "wb"))
pickle.dump(article_train_df, open("pickle_dumps/article_train_df.p", "wb"))
pickle.dump(article_test_df, open("pickle_dumps/article_test_df.p", "wb"))