## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
tqdm.pandas() # registers .progress_apply() etc. on pandas objects

## 1. Load data from previous step

In [2]:
import pickle
# Load the interactions dataframe written by the previous step.
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open("pickle_dumps/interactions_df.p", "rb") as pickle_file:
    interactions_df = pickle.load(pickle_file)

## 2. Pre-process interactions

In [3]:
def preprocess_classic(df):
    """
    Deduplicate interactions and annotate every row with group sizes.

    Steps:
      - drop repeated (user_id, item_id) pairs (reconsumption items),
      - add `count_item`: number of rows per item_id after deduplication,
      - add `count_user`: number of rows per user_id after deduplication.

    Prints the row count before/after deduplication and displays the first
    five rows of the result.

    :input df: Dataframe containing user_id, item_id
    :return: new dataframe with the extra columns count_item and count_user
    """
    n_rows_before = df.shape[0]

    # keep only the first occurrence of every (user, item) pair
    deduplicated = df.drop_duplicates(subset=["user_id", "item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(n_rows_before, deduplicated.shape[0]))

    # group sizes as two-column lookup frames: key column + renamed 'size'
    item_counts = (
        deduplicated.groupby('item_id', as_index=False)['user_id']
        .size()
        .rename(columns={'size': 'count_item'})
    )
    user_counts = (
        deduplicated.groupby('user_id', as_index=False)['item_id']
        .size()
        .rename(columns={'size': 'count_user'})
    )

    # attach both counts to every interaction row
    annotated = (
        deduplicated
        .merge(item_counts, how='left', on=['item_id'])
        .merge(user_counts, how='left', on=['user_id'])
    )

    # show the first 5 rows of the result
    display(annotated.head(5))
    return annotated

# Only the user_id / item_id columns are passed on to the preprocessing step.
interactions_df_processed = preprocess_classic(interactions_df[['user_id', 'item_id']])


After drop_duplicates (reconsumption items): 5153209 -> 5094082


Unnamed: 0,user_id,item_id,count_item,count_user
0,76561197970982479,10,9611,277
1,76561197970982479,20,6268,277
2,76561197970982479,30,3431,277
3,76561197970982479,40,3242,277
4,76561197970982479,50,6216,277


In [4]:
# Count the distinct users and items that remain after preprocessing.
n_unique_users = interactions_df_processed['user_id'].nunique()
n_unique_items = interactions_df_processed['item_id'].nunique()
print("Infromation about dataset after preprocessing")
print(f"number of unique users: {n_unique_users}")
print(f"number of unique items: {n_unique_items}")

Infromation about dataset after preprocessing
number of unique users: 70912
number of unique items: 10978


In [5]:
# Display the preprocessed frame (the notebook renders a truncated head/tail view).
interactions_df_processed

Unnamed: 0,user_id,item_id,count_item,count_user
0,76561197970982479,10,9611,277
1,76561197970982479,20,6268,277
2,76561197970982479,30,3431,277
3,76561197970982479,40,3242,277
4,76561197970982479,50,6216,277
...,...,...,...,...
5094077,76561198329548331,346330,906,7
5094078,76561198329548331,373330,472,7
5094079,76561198329548331,388490,329,7
5094080,76561198329548331,521570,195,7


In [6]:
# Shared lookup table: original id -> consecutive integer id.
# It is filled by map_to_consecutive_id and has to be cleared before it is
# reused for a different id space.
dct = {}
def map_to_consecutive_id(uuid):
    """
    Translate an arbitrary hashable id into a dense integer id.

    Ids are numbered 0, 1, 2, ... in order of first appearance; an id that was
    seen before gets the number it was given the first time. The mapping is
    stored in the module-level `dct`.

    :uuid original id to convert
    :return: consecutive integer id assigned to `uuid`
    """
    # len(dct) is evaluated before the lookup, so an unseen key receives the
    # next free integer while a known key keeps its existing value.
    return dct.setdefault(uuid, len(dct))

# 1) convert user_ids to consecutive integer IDs
#    (relies on dct being empty, as initialised at the top of this cell)
user_id_int = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

# 2) convert item_ids to consecutive integer IDs
#    The lookup table is shared, so it is emptied first. This discards the
#    user mapping; afterwards dct holds only the item_id -> integer mapping.
dct.clear()
item_id_int = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)

# Keep only the two integer id columns, under the original column names.
# Building the result directly avoids writing temporary *_int columns into the
# input frame and then selecting/renaming them again.
interactions_df_processed = pd.DataFrame({"user_id": user_id_int, "item_id": item_id_int})
interactions_df_processed

100%|████████████████████████████| 5094082/5094082 [00:03<00:00, 1332795.24it/s]
100%|████████████████████████████| 5094082/5094082 [00:03<00:00, 1326150.78it/s]


Unnamed: 0,user_id,item_id
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
5094077,70911,1922
5094078,70911,3041
5094079,70911,2073
5094080,70911,3510


## 3. Store the preprocessed dataset as pickle file for further use

In [7]:
import pickle
# Persist the re-indexed interactions and the lookup table held in dct.
# `with` guarantees each file is flushed and closed as soon as it is written.
with open("pickle_dumps/interactions_df_processed.p", "wb") as pickle_file:
    pickle.dump(interactions_df_processed, pickle_file)
with open("pickle_dumps/item_dct.p", "wb") as pickle_file:
    pickle.dump(dct, pickle_file)

## 4. Create train/test split

Different options:
- **Time-based split**, i.e. split interactions before/after certain date. Keep all users with both training and test interactions for evaluations. Repeat for different train/test window, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. keep the first $x$ interactions for training and the remaining $|I_u| - x$ for testing, where $I_u = \{i_1,\ldots,i_k\}$ represents the user's history sorted by time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionally, use techniques such as cross-validation to create multiple train/test splits, and report both the average and the standard deviation of the results.

In [8]:
def convert_uid_df_to_iid_df(df):
    """
    Transpose a per-user dataframe into a per-item dataframe.

    Input rows look like  {"user_id": <int>, "item_id": <list<int>>};
    output rows look like {"item_id": <int>, "user_id": <list<int>>}.
    Items appear in order of first occurrence, and each item's user list
    follows the row order of `df`.
    """
    users_by_item = {}
    for user, items in zip(df["user_id"], df["item_id"]):
        for item in items:
            # setdefault creates the user list the first time an item is seen
            users_by_item.setdefault(item, []).append(user)
    return pd.DataFrame(users_by_item.items(), columns=["item_id", "user_id"])

    

In [9]:
def convert_test_uid_df_to_iid_df(df, n_splits=4):
    """
    Transpose the per-user test dataframe into a per-item dataframe, fold by fold.

    Input rows look like {"user_id": <int>, "test_split": <list of (join, chunk) tuples>}.
    Output rows look like {"item_id": <int>, "test_split": <list of n_splits user lists>},
    where the user list at position k holds every user whose fold k contains
    the item in its first element (`join`).

    :param df: dataframe with the columns user_id and test_split
    :param n_splits: number of fold slots per item (default 4); folds beyond
                     this count are ignored
    :return: dataframe with the columns item_id and test_split
    """
    users_by_item = {}
    for user, folds in zip(df["user_id"], df["test_split"]):
        # Slicing instead of indexing range(n_splits) tolerates users that have
        # fewer folds than n_splits (their missing slots simply stay empty).
        for fold_idx, fold in enumerate(folds[:n_splits]):
            for item in fold[0]:
                if item not in users_by_item:
                    # one independent, initially empty user list per fold slot
                    users_by_item[item] = [[] for _ in range(n_splits)]
                users_by_item[item][fold_idx].append(user)
    return pd.DataFrame(users_by_item.items(), columns=["item_id", "test_split"])

In [10]:
def splitUser(user_items, n_splits=4):
    """
    Build leave-one-chunk-out folds from one user's item list.

    The items are cut into `n_splits` contiguous chunks of near-equal size
    (np.array_split); empty chunks are dropped. For every remaining chunk a
    pair (join, chunk) is produced, where `join` is the concatenation of all
    other chunks in their original order.

    :param user_items: sequence of item ids
    :param n_splits: number of chunks to cut the sequence into (default 4)
    :return: list of (join, chunk) tuples, one per non-empty chunk;
             an empty list for empty input
    """
    chunks = [list(chunk) for chunk in np.array_split(user_items, n_splits) if len(chunk) > 0]

    folds = []
    for held_out_idx, held_out in enumerate(chunks):
        # Select the other chunks by position, not by value: comparing the
        # lists themselves would also drop chunks that merely have equal contents.
        join = [
            item
            for idx, chunk in enumerate(chunks)
            if idx != held_out_idx
            for item in chunk
        ]
        folds.append((join, held_out))
    return folds

In [11]:
#User-based split for article association rules:
# one row per user, holding the list of all items that user interacted with
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id']].agg(list)
# only users with more than 100 and fewer than 600 interactions
sessions_df = sessions_df[sessions_df.item_id.apply(lambda x: len(x) > 100 and len(x) < 600)]
sessions_df = sessions_df.head(1100)
display(sessions_df.head(10))

test_size = 0.091
split_seed = 42  # fixed seed so the train/test split is reproducible across runs

# create train/test split with +- 100 test users and 1000 train users
train_df, test_df = train_test_split(sessions_df, test_size=test_size, random_state=split_seed)

# assign() returns a new frame, which avoids writing a column into the frame
# handed back by train_test_split (a potential SettingWithCopyWarning)
test_df = test_df.assign(test_split=test_df["item_id"].apply(splitUser))

Unnamed: 0,user_id,item_id
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,2,"[1021, 1022, 1023, 745, 19, 20, 21, 22, 23, 24..."
3,3,"[0, 1, 2, 3, 4, 5, 6, 7, 277, 278, 8, 9, 1021,..."
4,4,"[8, 1, 4, 6, 7, 0, 2, 3, 5, 277, 278, 1075, 10..."
5,5,"[4, 9, 20, 280, 1407, 19, 21, 22, 23, 24, 25, ..."
6,6,"[9, 10, 280, 20, 21, 303, 31, 1, 4, 6, 7, 19, ..."
7,7,"[19, 21, 20, 22, 23, 280, 1143, 17, 18, 25, 31..."
9,9,"[280, 46, 47, 1222, 0, 1, 2, 3, 4, 5, 6, 277, ..."
11,11,"[14, 280, 304, 1932, 1, 4, 6, 7, 19, 138, 20, ..."
14,14,"[280, 63, 64, 1033, 336, 337, 101, 128, 140, 1..."


In [12]:
# Report the number of rows (users) in each split.
print("train size:", train_df.shape[0])
print("test size:", test_df.shape[0])

train size: 999
test size: 101


In [13]:
# Peek at the first 10 rows of the training split.
train_df.head(10)

Unnamed: 0,user_id,item_id
872,872,"[19, 21, 20, 22, 23, 10, 280, 17, 18, 25, 1081..."
1334,1334,"[8, 4, 6, 7, 9, 2764, 17, 18, 142, 19, 20, 21,..."
194,194,"[30, 1804, 24, 1411, 46, 47, 1674, 63, 64, 110..."
2097,2097,"[19, 21, 280, 39, 97, 46, 47, 1222, 4235, 63, ..."
2794,2794,"[9, 280, 67, 63, 64, 94, 95, 96, 338, 101, 114..."
864,864,"[8, 1, 6, 19, 21, 9, 280, 1143, 1079, 1080, 10..."
497,497,"[9, 280, 1377, 1619, 63, 64, 460, 83, 140, 65,..."
1150,1150,"[9, 280, 31, 1089, 52, 1222, 2352, 147, 83, 11..."
2356,2356,"[8, 9, 280, 1085, 518, 40, 26, 476, 27, 58, 12..."
89,89,"[1209, 1210, 1211, 2738, 9, 280, 1804, 40, 27,..."


In [14]:
# Peek at the first 10 rows of the test split, including the test_split column.
test_df.head(10)

Unnamed: 0,user_id,item_id,test_split
1528,1528,"[280, 19, 20, 21, 22, 23, 24, 25, 2090, 7547, ...","[([1155, 1688, 454, 4267, 183, 5117, 159, 468,..."
1246,1246,"[19, 21, 9, 8, 20, 1209, 1210, 1211, 280, 1619...","[([27, 153, 81, 148, 95, 96, 129, 2087, 152, 1..."
1724,1724,"[9, 280, 5110, 1410, 1222, 460, 83, 94, 95, 96...","[([1698, 1906, 1299, 1361, 190, 1907, 191, 103..."
2187,2187,"[9, 280, 1410, 1411, 68, 69, 63, 64, 83, 1229,...","[([1111, 1254, 1143, 1627, 1214, 2647, 1144, 2..."
1220,1220,"[1, 4, 6, 7, 280, 284, 19, 20, 21, 22, 23, 24,...","[([157, 395, 171, 763, 423, 432, 160, 178, 174..."
915,915,"[9, 138, 22, 1, 4, 6, 7, 280, 19, 20, 21, 23, ...","[([3866, 122, 101, 351, 65, 114, 1388, 128, 12..."
2003,2003,"[19, 21, 20, 22, 23, 9, 280, 25, 1377, 97, 46,...","[([176, 1579, 451, 1279, 1280, 1286, 2107, 472..."
2956,2956,"[0, 277, 278, 19, 21, 20, 22, 23, 10, 9, 280, ...","[([112, 336, 337, 101, 114, 348, 128, 139, 140..."
2533,2533,"[9, 280, 97, 46, 47, 2353, 63, 64, 82, 140, 98...","[([1308, 197, 525, 540, 558, 209, 210, 1326, 5..."
105,105,"[1021, 1022, 1023, 9, 280, 1622, 1625, 38, 315...","[([1139, 408, 410, 1140, 419, 1142, 2104, 173,..."


In [15]:
#User-based split for user association rules:
# transpose the training users into one row per item (item_id -> list of users)
article_train_df = convert_uid_df_to_iid_df(train_df)
# keep items whose user list has more than 1 and fewer than 400 entries
users_per_item = article_train_df["user_id"].apply(len)
article_train_df = article_train_df[(users_per_item > 1) & (users_per_item < 400)]

# transpose the test users the same way, keeping the per-fold structure
# (no user-count filter is applied to the test items)
article_test_df = convert_test_uid_df_to_iid_df(test_df)


In [16]:
# Report the number of rows (items) in each transposed split.
print("train size:", article_train_df.shape[0])
print("test size:", article_test_df.shape[0])

train size: 5051
test size: 3078


In [17]:
# Peek at the first 10 rows of the transposed (per-item) test split.
article_test_df.head(10)

Unnamed: 0,item_id,test_split
0,1155,"[[1528, 105, 1767, 824, 1851, 1671, 2672, 1900..."
1,1688,"[[1528, 1767, 2832, 2610], [], [1528, 1767, 28..."
2,454,"[[1528, 2187, 915, 105, 141, 554, 1767, 549, 7..."
3,4267,"[[1528], [], [1528], [1528]]"
4,183,"[[1528, 2956, 105, 554, 1925, 824, 2832, 1851,..."
5,5117,"[[1528], [], [1528], [1528]]"
6,159,"[[1528, 105, 603, 2426, 2731, 74, 1940, 1336, ..."
7,468,"[[1528, 2362], [192, 456], [1528, 2362, 192, 4..."
8,187,"[[1528, 1246, 1220, 2956, 141, 1620, 554, 1925..."
9,82,"[[1528, 105, 824, 670, 2832, 2370, 1851, 1787,..."


## 5. Store the train/test split dataset as pickle file for further use

In [18]:
import pickle
# Persist all four split dataframes. The context manager makes sure every
# file is flushed and closed right after it has been written.
split_files = {
    "pickle_dumps/train_df.p": train_df,
    "pickle_dumps/test_df.p": test_df,
    "pickle_dumps/article_train_df.p": article_train_df,
    "pickle_dumps/article_test_df.p": article_test_df,
}
for path, frame in split_files.items():
    with open(path, "wb") as pickle_file:
        pickle.dump(frame, pickle_file)