## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas() #for progres_apply etc.

## 1. Load data from previous step

In [2]:
import pickle
# Load the interactions frame from the pickle_dumps directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that you produced yourself.
with open("pickle_dumps/interactions_df.p", "rb") as f:
    interactions_df = pickle.load(f)

## 2. Pre-process interactions

In [3]:
def preprocess_classic(df):
    """
    Deduplicate interactions and attach per-item / per-user interaction counts.

    Repeated (user_id, item_id) pairs ("reconsumption") are collapsed to their
    first occurrence. Two columns are then left-merged onto the remaining rows:
    ``count_item`` (number of rows per item_id) and ``count_user`` (number of
    rows per user_id), both computed after deduplication. The function prints
    the row count before and after deduplication and displays the first five
    rows of the result.

    :input df: Dataframe containing user_id, item_id
    :return: new Dataframe with the columns of df plus count_item and count_user
    """
    n_rows_raw = len(df)
    deduped = df.drop_duplicates(subset=["user_id", "item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(n_rows_raw, len(deduped)))

    # nr of users interacted with item
    per_item = (
        deduped.groupby('item_id', as_index=False)['user_id']
        .size()
        .rename(columns={'size': 'count_item'})
    )
    # nr of items user interacted with
    per_user = (
        deduped.groupby('user_id', as_index=False)['item_id']
        .size()
        .rename(columns={'size': 'count_user'})
    )

    enriched = deduped.merge(per_item, how='left', on=['item_id'])
    enriched = enriched.merge(per_user, how='left', on=['user_id'])
    display(enriched.head(5))
    return enriched

# Keep only the user_id and item_id columns and run the preprocessing on them.
interactions_df_processed = preprocess_classic(interactions_df[['user_id', 'item_id']])


After drop_duplicates (reconsumption items): 5153209 -> 5094082


Unnamed: 0,user_id,item_id,count_item,count_user
0,76561197970982479,10,9611,277
1,76561197970982479,20,6268,277
2,76561197970982479,30,3431,277
3,76561197970982479,40,3242,277
4,76561197970982479,50,6216,277


In [4]:
# Report the number of distinct users and distinct items left in the preprocessed frame.
print("Infromation about dataset after preprocessing")
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")

Infromation about dataset after preprocessing
number of unique users: 70912
number of unique items: 10978


In [5]:
# Display the preprocessed frame: user_id, item_id, count_item, count_user.
interactions_df_processed

Unnamed: 0,user_id,item_id,count_item,count_user
0,76561197970982479,10,9611,277
1,76561197970982479,20,6268,277
2,76561197970982479,30,3431,277
3,76561197970982479,40,3242,277
4,76561197970982479,50,6216,277
...,...,...,...,...
5094077,76561198329548331,346330,906,7
5094078,76561198329548331,373330,472,7
5094079,76561198329548331,388490,329,7
5094080,76561198329548331,521570,195,7


In [6]:
# Lookup table used by map_to_consecutive_id: raw id -> consecutive integer id.
# It lives at module level so callers can clear() it between columns and
# pickle it afterwards.
dct = {}
def map_to_consecutive_id(uuid):
    """
    Return the consecutive integer id assigned to ``uuid``.

    The first time a value is seen it receives the next free integer
    (0, 1, 2, ... in order of first appearance) and the assignment is stored
    in the module-level ``dct``; later calls with the same value return the
    stored id.

    :input uuid: hashable raw identifier
    :return: integer id of ``uuid`` in ``dct``
    """
    # setdefault evaluates len(dct) before inserting, so a new key receives the
    # current table size as its id; an existing key keeps its stored id.
    return dct.setdefault(uuid, len(dct))

# 1) map raw user_ids to consecutive integer ids (in order of first appearance)
user_id_int = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

# 2) map raw item_ids the same way; the lookup table is reset first, so after
#    this step dct holds only the item_id mapping
dct.clear()
item_id_int = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)

# Build the result from the two mapped columns only (same index as before),
# instead of adding temporary *_int columns to the frame and then
# selecting and renaming them.
interactions_df_processed = pd.DataFrame({"user_id": user_id_int, "item_id": item_id_int})
interactions_df_processed

100%|██████████| 5094082/5094082 [00:03<00:00, 1300171.07it/s]
100%|██████████| 5094082/5094082 [00:03<00:00, 1291602.55it/s]


Unnamed: 0,user_id,item_id
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
5094077,70911,1922
5094078,70911,3041
5094079,70911,2073
5094080,70911,3510


## 3. Store the preprocessed dataset as pickle file for further use

In [7]:
import pickle
# Persist the re-indexed interactions and the lookup table. At this point dct
# holds the raw item_id -> integer id mapping, because it was cleared before
# the item ids were mapped.
# The with-blocks close (and flush) each file handle as soon as it is written.
with open("pickle_dumps/interactions_df_processed.p", "wb") as f:
    pickle.dump(interactions_df_processed, f)
with open("pickle_dumps/item_dct.p", "wb") as f:
    pickle.dump(dct, f)

## 4. Create train/test split

Different options:
- **Time-based split**, i.e. split interactions before/after a certain date. Keep all users with both training and test interactions for evaluation. Repeat for different train/test windows, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. keep the first $x$ interactions for training and the remaining $|I_u| - x$ for testing, where $I_u = \{i_1,\ldots,i_k\}$ represents the user's history sorted by time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionally, use techniques such as cross-validation to create multiple train/test splits, and report both the average and the standard deviation.

In [8]:
def convert_uid_df_to_iid_df(df):
    """
    Invert a per-user frame into a per-item frame.

    :input df: Dataframe with one row per user; ``user_id`` holds a scalar id
               and ``item_id`` holds an iterable of item ids
    :return: Dataframe with columns ``item_id`` (scalar) and ``user_id`` (list
             of the users whose row contained that item). Items appear in order
             of first occurrence; users are listed in row order, once per
             occurrence of the item in their row.
    """
    users_by_item = {}
    # Iterate the two columns directly rather than with iterrows(), which
    # builds a full Series object for every row.
    for user, items in zip(df["user_id"], df["item_id"]):
        for item in items:
            users_by_item.setdefault(item, []).append(user)
    return pd.DataFrame(list(users_by_item.items()), columns=["item_id", "user_id"])

    

In [9]:
# User-based split: one row per user, with item_id holding the list of that user's items
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id']].agg(list)

# Keep only users whose number of items lies strictly between the two bounds,
# then cap the sample at the first MAX_USERS of them.
MIN_ITEMS_PER_USER = 100  # exclusive lower bound
MAX_ITEMS_PER_USER = 600  # exclusive upper bound
MAX_USERS = 1100
sessions_df = sessions_df[sessions_df.item_id.apply(lambda x: MIN_ITEMS_PER_USER < len(x) < MAX_ITEMS_PER_USER)]
sessions_df = sessions_df.head(MAX_USERS)
display(sessions_df.head(10))

test_size = 0.091
SPLIT_SEED = 42  # fixed seed so that re-running the notebook reproduces the same train/test rows

train_df, test_df = train_test_split(sessions_df, test_size=test_size, random_state=SPLIT_SEED)

Unnamed: 0,user_id,item_id
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"[1021, 1022, 1023, 745, 19, 20, 21, 22, 23, 24..."
3,3,"[0, 1, 2, 3, 4, 5, 6, 7, 277, 278, 8, 9, 1021,..."
4,4,"[8, 1, 4, 6, 7, 0, 2, 3, 5, 277, 278, 1075, 10..."
5,5,"[4, 9, 20, 280, 1407, 19, 21, 22, 23, 24, 25, ..."
6,6,"[9, 10, 280, 20, 21, 303, 31, 1, 4, 6, 7, 19, ..."
7,7,"[19, 21, 20, 22, 23, 280, 1143, 17, 18, 25, 31..."
9,9,"[280, 46, 47, 1222, 0, 1, 2, 3, 4, 5, 6, 277, ..."
11,11,"[14, 280, 304, 1932, 1, 4, 6, 7, 19, 138, 20, ..."
14,14,"[280, 63, 64, 1033, 336, 337, 101, 128, 140, 1..."


In [10]:
# Number of users (rows) on each side of the user-level split.
print("train size:", train_df.shape[0])
print("test size:", test_df.shape[0])

train size: 999
test size: 101


In [12]:
# Peek at the first ten rows of the training frame.
train_df.head(10)

Unnamed: 0,user_id,item_id
2277,2277,"[19, 9, 138, 20, 21, 22, 23, 1237, 14, 1213, 1..."
2503,2503,"[20, 30, 280, 1220, 1619, 42, 40, 26, 476, 27,..."
1579,1579,"[14, 1213, 16, 357, 30, 975, 1109, 1110, 1111,..."
117,117,"[9, 8, 20, 1021, 1022, 1023, 1668, 1669, 280, ..."
3032,3032,"[9, 280, 3299, 304, 1932, 19, 20, 21, 22, 23, ..."
900,900,"[9, 30, 280, 42, 94, 1222, 2481, 63, 64, 460, ..."
3171,3171,"[9, 280, 17, 18, 19, 20, 21, 22, 23, 24, 25, 3..."
631,631,"[9, 8, 20, 21, 1021, 1022, 1023, 280, 143, 144..."
2622,2622,"[280, 24, 1, 4, 6, 7, 19, 138, 20, 21, 22, 23,..."
2470,2470,"[284, 285, 286, 299, 301, 302, 20, 1109, 1110,..."


In [19]:
# Item-based view of the split: invert each user-level frame so that every row
# is one item and its user_id column lists the users found for that item.
article_train_df = convert_uid_df_to_iid_df(train_df)
# keep only items with more than 1 and fewer than 400 users
article_train_df = article_train_df[article_train_df.user_id.apply(lambda users: 1 < len(users) < 400)]

article_test_df = convert_uid_df_to_iid_df(test_df)
# same filter for the test side: more than 1 and fewer than 400 users per item
article_test_df = article_test_df[article_test_df.user_id.apply(lambda users: 1 < len(users) < 400)]


In [20]:
# Number of items (rows) kept in each item-level frame after filtering.
print("train size:", article_train_df.shape[0])
print("test size:", article_test_df.shape[0])

train size: 5036
test size: 1974


In [21]:
def createSplit(user_items, n_splits=4):
    """
    Build leave-one-fold-out splits of a sequence.

    ``user_items`` is cut into ``n_splits`` consecutive chunks with
    ``np.array_split``; empty chunks (produced when there are fewer elements
    than chunks) are dropped. For every remaining chunk one
    ``(train, held_out)`` pair is produced, where ``held_out`` is that chunk
    and ``train`` is the concatenation of all other chunks in their original
    order.

    :input user_items: sequence of ids to split
    :input n_splits: number of chunks to cut the sequence into (default 4)
    :return: list of ``(train, held_out)`` tuples of lists, one per non-empty chunk
    """
    folds = [
        list(chunk)
        for chunk in np.array_split(user_items, n_splits)
        if len(chunk) > 0
    ]

    res = []
    for held_out_idx, held_out in enumerate(folds):
        train = []
        for idx, fold in enumerate(folds):
            # Exclude the held-out fold by position, not by value: two folds
            # with equal contents must still count as different folds.
            if idx != held_out_idx:
                train += fold
        res.append((train, held_out))

    return res

In [22]:
# Attach the (train, held_out) folds built by createSplit to every test item.
# assign() returns a new frame, so the boolean-filtered slice is not written
# to in place.
article_test_df = article_test_df.assign(test_split=article_test_df["user_id"].apply(createSplit))

In [23]:
# Peek at the first ten test items together with their test_split column.
article_test_df.head(10)

Unnamed: 0,item_id,user_id,test_split
0,280,"[488, 641, 965, 1274, 2538, 2144, 2964, 32, 28...","[([119, 2502, 2935, 1565, 866, 2316, 2390, 277..."
1,19,"[488, 641, 1274, 2538, 2964, 32, 539, 1944, 31...","[([1565, 2316, 2390, 2771, 527, 3135, 2353, 80..."
2,20,"[488, 641, 1274, 2964, 32, 539, 1944, 3140, 26...","[([1565, 2316, 2390, 2771, 527, 3135, 2353, 80..."
3,21,"[488, 641, 1274, 2538, 2964, 32, 539, 1944, 31...","[([2502, 1565, 2316, 2390, 2771, 527, 3135, 23..."
4,22,"[488, 641, 1274, 2964, 32, 539, 3140, 2637, 14...","[([2316, 2390, 2771, 527, 3135, 802, 1890, 146..."
5,23,"[488, 641, 1274, 2964, 32, 539, 3140, 2637, 14...","[([2316, 2390, 2771, 527, 3135, 802, 1890, 146..."
6,24,"[488, 641, 965, 32, 539, 1944, 3140, 2637, 228...","[([2390, 2771, 527, 3135, 2353, 802, 1890, 146..."
7,25,"[488, 641, 1274, 2964, 32, 539, 3140, 2637, 14...","[([1565, 2316, 2390, 2771, 527, 3135, 802, 189..."
8,46,"[488, 2538, 2964, 3140, 2284, 1472, 2114, 68, ...","[([2935, 866, 2316, 527, 3135, 2353, 802, 1466..."
9,47,"[488, 2538, 2964, 3140, 2284, 1472, 2114, 68, ...","[([2935, 866, 2316, 527, 3135, 2353, 802, 1466..."


## 5. Store the train/test split dataset as pickle file for further use

In [24]:
import pickle
# Write each split frame to pickle_dumps/<name>.p. The with-block closes (and
# flushes) every file handle before the next one is opened.
for name, frame in (
    ("train_df", train_df),
    ("test_df", test_df),
    ("article_train_df", article_train_df),
    ("article_test_df", article_test_df),
):
    with open(f"pickle_dumps/{name}.p", "wb") as f:
        pickle.dump(frame, f)