## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas() #for progres_apply etc.

## 1. Load data from previous step

In [2]:
import pickle
# NOTE: unpickling can execute arbitrary code -- only load dumps written by the
# previous notebook step, never files from an untrusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("pickle_dumps/interactions_df.p", "rb") as interactions_file:
    interactions_df = pickle.load(interactions_file)

## 2. Pre-process interactions

In [3]:
def preprocess_classic(df):
    """
    Deduplicate interactions and annotate every row with popularity counts.

    Steps:
      1. Keep one row per (user_id, item_id) pair, i.e. drop reconsumption.
      2. Add ``count_item``: number of rows (users) per item after step 1.
      3. Add ``count_user``: number of rows (items) per user after step 1.

    Prints the row count before/after deduplication and displays the first
    five rows of the result.

    :input df: Dataframe containing user_id, item_id
    :return: deduplicated dataframe with count_item and count_user columns added
    """
    n_rows_before = df.shape[0]

    # a single row per (user, item) pair
    deduped = df.drop_duplicates(subset=["user_id", "item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(n_rows_before, deduped.shape[0]))

    # number of users that interacted with each item
    item_counts = (
        deduped.groupby('item_id', as_index=False)['user_id']
        .size()
        .rename(columns={'size': 'count_item'})
    )
    # number of items each user interacted with
    user_counts = (
        deduped.groupby('user_id', as_index=False)['item_id']
        .size()
        .rename(columns={'size': 'count_user'})
    )

    # left joins keep every deduplicated interaction row
    annotated = deduped.merge(item_counts, how='left', on=['item_id'])
    annotated = annotated.merge(user_counts, how='left', on=['user_id'])

    # preview of the first 5 rows
    display(annotated.head(5))
    return annotated

# Only the (user, item) pair columns are passed on to the classic preprocessing step.
interactions_df_processed = preprocess_classic(interactions_df[['user_id', 'item_id']])


After drop_duplicates (reconsumption items): 5153209 -> 5094082


Unnamed: 0,user_id,item_id,count_item,count_user
0,76561197970982479,10,9611,277
1,76561197970982479,20,6268,277
2,76561197970982479,30,3431,277
3,76561197970982479,40,3242,277
4,76561197970982479,50,6216,277


In [4]:
# Count the distinct users and items that remain after preprocessing.
n_unique_users = interactions_df_processed['user_id'].nunique()
n_unique_items = interactions_df_processed['item_id'].nunique()

print("Infromation about dataset after preprocessing")
print(f"number of unique users: {n_unique_users}")
print(f"number of unique items: {n_unique_items}")

Infromation about dataset after preprocessing
number of unique users: 70912
number of unique items: 10978


In [5]:
# Show the processed frame; pandas abbreviates long frames to their first and last rows.
interactions_df_processed

Unnamed: 0,user_id,item_id,count_item,count_user
0,76561197970982479,10,9611,277
1,76561197970982479,20,6268,277
2,76561197970982479,30,3431,277
3,76561197970982479,40,3242,277
4,76561197970982479,50,6216,277
...,...,...,...,...
5094077,76561198329548331,346330,906,7
5094078,76561198329548331,373330,472,7
5094079,76561198329548331,388490,329,7
5094080,76561198329548331,521570,195,7


In [6]:
# Module-level lookup table shared by every call: original id -> consecutive int id.
dct = {}
def map_to_consecutive_id(uuid):
    """
    Translate an arbitrary id into a dense 0-based integer id.

    The first time an id is seen it receives the next free integer (the
    current size of ``dct``); later calls with the same id return that
    same integer.

    :uuid original id to convert (must be hashable)
    :return: the consecutive integer id assigned to ``uuid``
    """
    # len(dct) is evaluated before the lookup, so a new key gets the next free index
    # while an already-known key keeps its stored value.
    return dct.setdefault(uuid, len(dct))

# 1) convert user_ids to consecutive integer IDs (numbered in order of first appearance)
interactions_df_processed['user_id_int'] = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

# Keep a copy of the user mapping: `dct` is wiped and reused for items below,
# which would otherwise make the integer user ids impossible to map back.
user_dct = dict(dct)

# 2) convert item_ids to consecutive integer IDs; after this step `dct` holds the item mapping
dct.clear()
interactions_df_processed['item_id_int'] = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)

# only keep the integer id columns in the dataframe
interactions_df_processed = interactions_df_processed[["user_id_int", "item_id_int"]]

# rename them back to the canonical column names
interactions_df_processed = interactions_df_processed.rename(columns={"item_id_int": "item_id", "user_id_int": "user_id"})
interactions_df_processed

100%|████████████████████████████| 5094082/5094082 [00:03<00:00, 1356149.41it/s]
100%|████████████████████████████| 5094082/5094082 [00:03<00:00, 1341104.01it/s]


Unnamed: 0,user_id,item_id
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
5094077,70911,1922
5094078,70911,3041
5094079,70911,2073
5094080,70911,3510


## 3. Store the preprocessed dataset as pickle file for further use

In [7]:
import pickle
# Persist the processed interactions; the context manager flushes and closes
# the file even if pickling raises.
with open("pickle_dumps/interactions_df_processed.p", "wb") as processed_file:
    pickle.dump(interactions_df_processed, processed_file)

# `dct` was cleared before the item ids were mapped, so at this point it
# contains the original item_id -> consecutive integer id mapping.
with open("pickle_dumps/item_dct.p", "wb") as item_dct_file:
    pickle.dump(dct, item_dct_file)

## 4. Create train/test split

Different options:
- **Time-based split**, i.e. split interactions before/after a certain date. Keep all users with both training and test interactions for evaluation. Repeat for different train/test windows, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. split and keep first $x$ interactions for training and $|I_u| - x$ for testing, where $I_u = \{i_1,\ldots,i_k\}$ represents the user's history sorted on time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionally, use techniques such as cross-validation to create multiple train/test splits, and report both the average and the standard deviation.

In [8]:
def convert_uid_df_to_iid_df(df):
    """
    Transpose a per-user dataframe into a per-item dataframe.

    Input rows look like {"user_id": <int>, "item_id": <list<int>>}; the result
    has one row per distinct item, {"item_id": <int>, "user_id": <list<int>>},
    listing every user whose item list contains that item. Items appear in
    order of first occurrence, users in input row order.
    """
    users_by_item = {}
    for _, record in df.iterrows():
        for item in record["item_id"]:
            # create the bucket on first sight of the item, then add this user to it
            users_by_item.setdefault(item, []).append(record["user_id"])
    return pd.DataFrame(users_by_item.items(), columns=["item_id", "user_id"])

    

In [9]:
# User-based split for article association rules:
# one row per user, holding the list of items that user interacted with
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id']].agg(list)
# keep only users with more than 100 and fewer than 600 interactions
sessions_df = sessions_df[sessions_df.item_id.apply(lambda x: len(x) > 100 and len(x) < 600)]
# cap the sample at the first 1100 qualifying users
sessions_df = sessions_df.head(1100)
display(sessions_df.head(10))

test_size = 0.091
# fixed seed so that Restart & Run All reproduces exactly the same split
split_seed = 42

# create train/test split with +- 100 test users and 1000 train users
train_df, test_df = train_test_split(sessions_df, test_size=test_size, random_state=split_seed)

Unnamed: 0,user_id,item_id
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"[1021, 1022, 1023, 745, 19, 20, 21, 22, 23, 24..."
3,3,"[0, 1, 2, 3, 4, 5, 6, 7, 277, 278, 8, 9, 1021,..."
4,4,"[8, 1, 4, 6, 7, 0, 2, 3, 5, 277, 278, 1075, 10..."
5,5,"[4, 9, 20, 280, 1407, 19, 21, 22, 23, 24, 25, ..."
6,6,"[9, 10, 280, 20, 21, 303, 31, 1, 4, 6, 7, 19, ..."
7,7,"[19, 21, 20, 22, 23, 280, 1143, 17, 18, 25, 31..."
9,9,"[280, 46, 47, 1222, 0, 1, 2, 3, 4, 5, 6, 277, ..."
11,11,"[14, 280, 304, 1932, 1, 4, 6, 7, 19, 138, 20, ..."
14,14,"[280, 63, 64, 1033, 336, 337, 101, 128, 140, 1..."


In [10]:
# Number of users (rows) on each side of the split.
print("train size:", train_df.shape[0])
print("test size:", test_df.shape[0])

train size: 999
test size: 101


In [11]:
# Preview the first 10 training rows (one row per user with that user's item list).
train_df.head(10)

Unnamed: 0,user_id,item_id
1943,1943,"[19, 21, 9, 1209, 1210, 1211, 744, 279, 280, 7..."
1011,1011,"[280, 1, 4, 6, 7, 19, 138, 20, 21, 22, 23, 25,..."
1815,1815,"[19, 21, 9, 1, 4, 6, 7, 10, 280, 1143, 281, 12..."
224,224,"[8, 20, 1021, 1022, 1023, 9, 280, 40, 27, 30, ..."
276,276,"[8, 19, 21, 9, 20, 280, 1143, 40, 27, 1081, 10..."
2053,2053,"[9, 8, 20, 21, 280, 1078, 1143, 1144, 281, 121..."
2206,2206,"[19, 21, 9, 20, 22, 23, 280, 24, 25, 38, 48, 6..."
2975,2975,"[9, 280, 19, 20, 21, 22, 23, 24, 25, 1321, 161..."
2408,2408,"[4, 19, 9, 138, 20, 21, 22, 23, 303, 284, 285,..."
656,656,"[280, 9, 1078, 281, 2648, 17, 18, 97, 43, 46, ..."


In [12]:
# User-based split for user association rules:
# transpose both frames so every row is an item with the list of users who interacted with it.
def _has_usable_user_count(users):
    """Keep items that have more than 1 and fewer than 400 users."""
    return 1 < len(users) < 400

# training side
article_train_df = convert_uid_df_to_iid_df(train_df)
article_train_df = article_train_df[article_train_df["user_id"].apply(_has_usable_user_count)]

# test side, filtered with the same bounds
article_test_df = convert_uid_df_to_iid_df(test_df)
article_test_df = article_test_df[article_test_df["user_id"].apply(_has_usable_user_count)]


In [13]:
# Number of items (rows) left in each transposed frame after filtering.
print("train size:", article_train_df.shape[0])
print("test size:", article_test_df.shape[0])

train size: 5019
test size: 2069


In [14]:
def createSplit(user_items, n_splits=4):
    """
    Build (train, test) pairs for k-fold cross validation over one list.

    The list is cut into ``n_splits`` contiguous chunks of near-equal size.
    Empty chunks (when there are fewer elements than folds) are dropped.
    Each remaining chunk is used once as the test part, with all other
    chunks concatenated in order as the train part.

    :user_items: sequence of ids to split
    :n_splits: number of folds to cut the sequence into (default 4)
    :return: list of (train_list, test_list) tuples, one per non-empty fold
    """
    # np.array_split accepts lengths that are not divisible by n_splits
    folds = [list(chunk) for chunk in np.array_split(user_items, n_splits) if len(chunk) > 0]

    res = []
    for held_out_idx, held_out in enumerate(folds):
        train = []
        for idx, fold in enumerate(folds):
            # Compare fold positions, not fold contents: two different folds
            # holding equal values must still end up in each other's train part.
            if idx != held_out_idx:
                train += fold
        res.append((train, held_out))

    return res

In [15]:
# Attach the cross-validation folds as a new column. `assign` returns a fresh frame rather than
# writing into `article_test_df`, which is a boolean-filtered selection of another frame and
# would otherwise raise pandas' SettingWithCopyWarning on column assignment.
article_test_df = article_test_df.assign(test_split=article_test_df["user_id"].apply(createSplit))

In [16]:
# Preview the first 10 test items together with their generated (train, test) folds.
article_test_df.head(10)

Unnamed: 0,item_id,user_id,test_split
0,9,"[1587, 2265, 554, 1280, 2794, 498, 2863, 2876,...","[([2469, 772, 2799, 3128, 3107, 2528, 380, 551..."
1,0,"[1587, 554, 2104, 2863, 1914, 2174, 2549, 772,...","[([2528, 380, 551, 1345, 651, 575, 2607, 3040,..."
2,2,"[1587, 2104, 2863, 1693, 2549, 772, 2528, 2179...","[([2549, 772, 2528, 2179, 526, 256, 262, 2353,..."
3,3,"[1587, 2104, 2863, 2549, 772, 2528, 2179, 526,...","[([2549, 772, 2528, 2179, 526, 256, 262, 2353]..."
4,5,"[1587, 2104, 2863, 2549, 772, 2528, 2179, 526,...","[([2549, 772, 2528, 2179, 526, 256, 262, 2353]..."
5,277,"[1587, 554, 2104, 2863, 1914, 2174, 2549, 772,...","[([2528, 380, 551, 1345, 651, 575, 2607, 3040,..."
6,278,"[1587, 554, 2104, 2863, 1914, 2174, 2549, 772,...","[([2528, 380, 551, 1345, 651, 575, 2607, 3040,..."
7,280,"[1587, 1113, 563, 2265, 554, 2867, 1280, 1376,...","[([1693, 2758, 1915, 221, 2549, 2469, 772, 292..."
8,19,"[1587, 554, 1280, 1376, 498, 2863, 2876, 1344,...","[([772, 2929, 2799, 3128, 2528, 380, 1521, 134..."
9,20,"[1587, 554, 498, 2863, 2876, 1344, 2320, 2164,...","[([2929, 2799, 2528, 380, 1345, 1261, 651, 106..."


## 5. Store the train/test split dataset as pickle file for further use

In [17]:
import pickle
# Target path -> object to persist for the next notebook step.
split_outputs = {
    "pickle_dumps/train_df.p": train_df,
    "pickle_dumps/test_df.p": test_df,
    "pickle_dumps/article_train_df.p": article_train_df,
    "pickle_dumps/article_test_df.p": article_test_df,
}
for dump_path, frame in split_outputs.items():
    # the context manager flushes and closes each file, even if pickling fails
    with open(dump_path, "wb") as dump_file:
        pickle.dump(frame, dump_file)