## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas() #for progres_apply etc.

## 1. Load data from previous step

In [2]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code while unpickling, so only
# load dumps written by the previous notebook step, never files from an untrusted source.
# The context manager closes the file handle once the frame is loaded.
with open("pickle_dumps/interactions_df.p", "rb") as interactions_file:
    interactions_df = pickle.load(interactions_file)

## 2. Pre-process interactions

In [3]:
def preprocess_classic(df):
    """
    Deduplicate user/item interactions and attach popularity counts.

    Steps:
      1. Keep only the first row of every (user_id, item_id) pair, i.e. drop
         reconsumption items, and print the row count before and after.
      2. Add ``count_item``: number of remaining rows per item_id.
      3. Add ``count_user``: number of remaining rows per user_id.
      4. Display the first five rows of the result.

    :input df: Dataframe containing at least user_id and item_id
    :return: new Dataframe with the extra columns count_item and count_user
    """
    n_rows_before = len(df)

    # drop reconsumption items
    deduped = df.drop_duplicates(subset=["user_id", "item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(n_rows_before, deduped.shape[0]))

    # nr of users interacted with item
    item_counts = (
        deduped.groupby('item_id', as_index=False)['user_id']
        .size()
        .rename(columns={'size': 'count_item'})
    )
    # nr of items user interacted with
    user_counts = (
        deduped.groupby('user_id', as_index=False)['item_id']
        .size()
        .rename(columns={'size': 'count_user'})
    )

    # left merges keep every deduplicated interaction and append its two counts
    enriched = (
        deduped
        .merge(item_counts, how='left', on=['item_id'])
        .merge(user_counts, how='left', on=['user_id'])
    )
    display(enriched.head(5))
    return enriched

# Only these three columns are needed downstream; deduplicate them and add the counts.
interactions_df_processed = preprocess_classic(
    interactions_df[['user_id', 'item_id', "recommend"]]
)


After drop_duplicates (reconsumption items): 3176223 -> 2832522


Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,76561198007483075,35140,True,3173,6
1,76561197970402776,707610,True,1,7
2,76561198060686749,328100,True,22,55
3,76561198023491401,35140,True,3173,109
4,76561198115331805,35140,True,3173,1


In [4]:
# Report how many distinct users and items remain after preprocessing.
print("Infromation about dataset after preprocessing")
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")

Infromation about dataset after preprocessing
number of unique users: 1485611
number of unique items: 14513


In [5]:
# Show the processed interactions, including the count_item / count_user columns.
interactions_df_processed

Unnamed: 0,user_id,item_id,recommend,count_item,count_user
0,76561198007483075,35140,True,3173,6
1,76561197970402776,707610,True,1,7
2,76561198060686749,328100,True,22,55
3,76561198023491401,35140,True,3173,109
4,76561198115331805,35140,True,3173,1
...,...,...,...,...,...
2832517,76561197968169662,252490,True,29685,1
2832518,76561198042664856,252490,True,29685,1
2832519,76561198095476531,252490,True,29685,2
2832520,76561197962161824,252490,True,29685,1


In [6]:
# Shared lookup table: raw identifier -> consecutive integer ID.
dct = {}
def map_to_consecutive_id(uuid):
    """
    Return the consecutive integer ID that belongs to ``uuid``.

    The first identifier seen gets 0, the next unseen one gets 1, and so on.
    Assignments are remembered in the module-level ``dct``, so a repeated
    identifier always maps to the same ID until ``dct`` is cleared.

    :input uuid: hashable raw identifier (e.g. a user_id or item_id)
    :return: integer ID of ``uuid``
    """
    # len(dct) is evaluated before the insert, so an unseen key gets the next free ID;
    # a known key simply returns the ID stored earlier.
    return dct.setdefault(uuid, len(dct))

# 1) convert user_ids to consecutive integer IDs (in order of first appearance)
interactions_df_processed['user_id_int'] = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

# 2) convert item_ids to consecutive integer IDs
# The lookup table is shared between both passes, so empty it first. From here on
# `dct` only holds the item mapping; the user mapping is discarded.
dct.clear()
interactions_df_processed['item_id_int'] = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)

# Keep only the integer IDs and give them the canonical column names again.
interactions_df_processed = (
    interactions_df_processed[["user_id_int", "item_id_int", "recommend"]]
    .rename(columns={"item_id_int": "item_id", "user_id_int": "user_id"})
)
interactions_df_processed

100%|██████████| 2832522/2832522 [00:03<00:00, 904959.83it/s] 
100%|██████████| 2832522/2832522 [00:02<00:00, 1318081.28it/s]


Unnamed: 0,user_id,item_id,recommend
0,0,0,True
1,1,1,True
2,2,2,True
3,3,0,True
4,4,0,True
...,...,...,...
2832517,1485607,13997,True
2832518,1485608,13997,True
2832519,561635,13997,True
2832520,1485609,13997,True


## 3. Store the preprocessed dataset as pickle file for further use

In [7]:
import pickle

# Context managers make sure each dump is flushed and its file handle closed.
with open("pickle_dumps/interactions_df_processed.p", "wb") as processed_file:
    pickle.dump(interactions_df_processed, processed_file)

# `dct` was cleared before the item pass, so it holds the raw item_id -> integer ID mapping.
with open("pickle_dumps/item_dct.p", "wb") as item_dct_file:
    pickle.dump(dct, item_dct_file)

## 4. Create train/test split

Different options:
- **Time-based split**, i.e. split interactions before/after a certain date. Keep all users with both training and test interactions for evaluation. Repeat for different train/test windows, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. keep the first $x$ interactions for training and the remaining $|I_u| - x$ for testing, where $I_u = \{i_1,\ldots,i_k\}$ represents the user's history sorted by time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionally, use techniques such as cross-validation to create multiple train/test splits, and report both the average and the standard deviation of the results.

In [8]:
#User-based split: one row per user, holding the lists of that user's items and recommend flags
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id', "recommend"]].agg(list)
sessions_df = sessions_df[sessions_df.item_id.apply(lambda x: len(x) > 1)] # only users with more than one interaction

display(sessions_df.head(10))

test_size = 0.025

# Whole users go either to train or to test. The fixed seed makes the split
# reproducible across kernel restarts; without it every run saves different pickles.
train_df, test_df = train_test_split(sessions_df, test_size=test_size, random_state=42)

Unnamed: 0,user_id,item_id,recommend
0,0,"[0, 3871, 6589, 11847, 12663, 14499]","[True, True, True, True, True, True]"
1,1,"[1, 3228, 4392, 8009, 10348, 11846, 13666]","[True, True, True, True, True, True, True]"
2,2,"[2, 135, 735, 1035, 1284, 160, 1559, 2422, 253...","[True, True, True, True, True, True, True, Tru..."
3,3,"[0, 135, 284, 494, 611, 724, 902, 1020, 1211, ...","[True, True, True, True, True, True, True, Tru..."
6,6,"[4, 1622, 12464]","[True, True, True]"
7,7,"[2, 22, 29, 52, 50, 90, 92, 101, 120, 128, 161...","[True, True, True, True, True, True, True, Tru..."
9,9,"[5, 3732, 3783, 8059]","[True, True, True, True]"
10,10,"[6, 11, 36, 121, 119, 65, 241, 268, 267, 385, ...","[True, True, True, True, True, True, True, Tru..."
11,11,"[7, 222, 694, 259, 1096, 1183, 2053, 2868, 315...","[True, True, True, True, True, True, True, Tru..."
13,13,"[8, 36, 380, 473, 938, 1163, 1315, 1390, 1471,...","[True, True, True, True, True, True, True, Tru..."


In [9]:
# NOTE: DataFrame.size is the number of cells (rows x columns), not the number of rows;
# use len(train_df) / len(test_df) if the number of users per split is wanted.
print("train size:", train_df.size)
print("test size:", test_df.size)

train size: 1316226
test size: 33750


In [10]:
# Peek at the first ten training rows (one row per user after the split).
train_df.head(10)

Unnamed: 0,user_id,item_id,recommend
642076,642076,"[4706, 5624]","[True, True]"
703220,703220,"[5400, 7362]","[True, True]"
549051,549051,"[4105, 11396]","[True, True]"
106660,106660,"[577, 691, 3290, 5662, 6179, 8028, 9277, 10476...","[True, True, True, True, True, True, True, Tru..."
196155,196155,"[1215, 4579, 7964]","[True, True, True]"
1244536,1244536,"[12071, 12129, 13525]","[True, True, True]"
268838,268838,"[1661, 5746]","[True, True]"
353247,353247,"[2283, 4206, 5544, 9016, 11110, 14055]","[True, True, True, True, True, True]"
442621,442621,"[2678, 6769, 12232]","[True, True, True]"
576026,576026,"[3783, 3716, 5747, 12019, 13757, 13997]","[True, True, True, True, True, True]"


In [11]:
#Item-based split: one row per item, holding the lists of users who interacted with it and their recommend flags
article_df = interactions_df_processed.groupby(by='item_id', as_index=False)[['user_id', "recommend"]].agg(list)
article_df = article_df[article_df.user_id.apply(lambda x: len(x) > 1)] # only items with more than one user interaction

display(article_df.head(10))

test_size = 0.025

# Whole items go either to train or to test. The fixed seed makes the split
# reproducible across kernel restarts; without it every run saves different pickles.
article_train_df, article_test_df = train_test_split(article_df, test_size=test_size, random_state=42)

Unnamed: 0,item_id,user_id,recommend
0,0,"[0, 3, 4, 14, 68, 69, 72, 73, 74, 75, 76, 77, ...","[True, True, True, True, True, True, True, Tru..."
2,2,"[2, 7, 70, 71, 79, 152, 217, 225, 226, 285, 28...","[True, True, True, True, True, True, True, Tru..."
4,4,"[6, 23, 44, 45, 46, 90, 126, 159, 160, 168, 17...","[True, True, True, True, True, True, True, Tru..."
5,5,"[8, 9, 27, 28, 51]","[True, True, True, True, True]"
6,6,"[10, 29, 52, 89]","[True, True, True, True]"
7,7,"[11, 12, 53, 88, 101, 124, 136, 162, 169, 178,...","[True, True, True, True, True, True, True, Tru..."
8,8,"[13, 32]","[True, True]"
9,9,"[15, 36]","[True, True]"
10,10,"[16, 37, 38, 39, 58, 59, 64, 81, 82, 83, 95, 9...","[True, True, True, True, True, True, True, Tru..."
11,11,"[17, 18, 40, 60, 65, 80, 93, 94, 109, 165, 172...","[True, True, True, True, True, True, True, Tru..."


In [12]:
# NOTE: DataFrame.size is the number of cells (rows x columns), not the number of rows;
# use len(article_train_df) / len(article_test_df) if the number of items per split is wanted.
print("train size:", article_train_df.size)
print("test size:", article_test_df.size)

train size: 38580
test size: 990


In [13]:
def createSplit(user_items, n_splits=4):
    """
    Build leave-one-fold-out (train, test) pairs from a sequence.

    ``user_items`` is cut, in order, into up to ``n_splits`` contiguous folds with
    ``np.array_split``; folds that come out empty (fewer elements than
    ``n_splits``) are dropped. Every remaining fold is used once as the test
    part, with all other folds concatenated in order as the train part.

    :input user_items: sequence to split (e.g. the list of user ids of one item)
    :input n_splits: maximum number of folds (default 4)
    :return: list of ``(train, test)`` tuples of lists, one per non-empty fold;
             an empty list for empty input, and ``[([], [x])]`` for a single element
    """
    folds = [list(chunk) for chunk in np.array_split(user_items, n_splits) if len(chunk) > 0]

    res = []
    for held_out_pos, held_out in enumerate(folds):
        # Exclude the held-out fold by position, not by value: comparing contents
        # would also drop any other fold that happens to hold the same values.
        train = [
            element
            for pos, fold in enumerate(folds)
            if pos != held_out_pos
            for element in fold
        ]
        res.append((train, held_out))

    return res

In [14]:
# article_test_df is a selection of article_df returned by train_test_split; writing a
# column into it in place can trigger pandas' SettingWithCopyWarning. Building a new
# frame with .assign adds the same column and keeps the cell safe to re-run.
article_test_df = article_test_df.assign(test_split=article_test_df["user_id"].apply(createSplit))

In [15]:
# Peek at the first ten held-out items, including the test_split column built by createSplit.
article_test_df.head(10)

Unnamed: 0,item_id,user_id,recommend,test_split
7546,7546,"[931158, 931159, 311746, 183346, 931163, 93118...","[True, True, True, True, True, True, True, Tru...","[([16988, 15759, 329863, 931602, 341784, 93163..."
9945,9945,"[476924, 206153, 14318, 1097305, 1097306, 1097...","[True, True, True, True, True, True, True, Tru...","[([1097540, 494279, 1097542, 658148, 1097545, ..."
11634,11634,"[1210230, 339530, 268428, 6681, 1008341, 40511...","[True, True, True, True, True, True, True]","[([268428, 6681, 1008341, 405110, 1210262], [1..."
12976,12976,"[6868, 1300742, 286562, 1053159, 529018]","[True, True, True, True, True]","[([286562, 1053159, 529018], [6868, 1300742]),..."
14267,14267,"[960655, 893009, 810480, 660505]","[True, True, True, True]","[([893009, 810480, 660505], [960655]), ([96065..."
1994,1994,"[298584, 298587, 298645, 298646, 159791, 29864...","[True, True, True, True, True, True, True, Tru...","[([298712, 298713, 298714, 298738, 126068, 298..."
11513,11513,"[403984, 7, 1204610, 7508, 129, 1204651, 20475...","[True, True, True, True, True, True, True, True]","[([1204610, 7508, 129, 1204651, 204759, 120468..."
11157,11157,"[207664, 471432, 462945, 1175901, 322079, 1175...","[True, True, True, True, True, True, True, Tru...","[([1175901, 322079, 1175904, 703503, 107823, 3..."
3742,3742,"[508019, 261719, 508023, 508026, 508028, 24295...","[True, True, True, True, True, True, True, Tru...","[([508774, 508778, 3125, 104759, 123702, 50882..."
737,737,"[130005, 130007]","[True, True]","[([130007], [130005]), ([130005], [130007])]"


## 5. Store the train/test split dataset as pickle file for further use

In [16]:
import pickle

# Each split is written to pickle_dumps/<name>.p; the context manager makes sure
# every dump is flushed and its file handle closed.
split_frames = {
    "train_df": train_df,
    "test_df": test_df,
    "article_train_df": article_train_df,
    "article_test_df": article_test_df,
}
for split_name, split_frame in split_frames.items():
    with open(f"pickle_dumps/{split_name}.p", "wb") as split_file:
        pickle.dump(split_frame, split_file)