## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
import json
import pickle
import os
tqdm.pandas() # for progress_apply etc.

In [2]:
# Index of the pre-computed split to work with; it is used as the first
# component of the (split, subset) keys when looking frames up in `dataset`.
seed_index = 2
# Directory prefix (including the trailing slash) prepended to the input file name.
path = "data/"

In [3]:
def load_interactions(path, n_splits=5):
    """Load the pickled interaction splits and unpack them into one frame per split.

    The pickle must contain a DataFrame with the columns ``user_id``,
    ``steam_id``, ``interactions``, ``train``, ``val`` and ``test``. Every cell
    of the last four columns is converted to an ``np.int32`` array. The cells of
    ``train``/``val``/``test`` are then indexed as ``cell[split, channel]``:
    channel 0 becomes ``item_id``, channel 1 ``playtime_forever`` and channel 2
    ``playtime_2weeks``.

    Parameters
    ----------
    path : str
        Location of the pickle file. It is joined onto the current working
        directory, so relative paths resolve against it and absolute paths
        are used unchanged.
    n_splits : int, default 5
        Number of splits to unpack (split indices ``0 .. n_splits - 1``).

    Returns
    -------
    dict
        Maps ``(split, subset)`` -- ``subset`` being ``'train'``, ``'val'`` or
        ``'test'`` -- to a DataFrame with the columns ``user_id``,
        ``steam_id``, ``item_id``, ``playtime_forever`` and ``playtime_2weeks``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the pickled frame.
    """
    # NOTE(security): unpickling can execute arbitrary code -- only load files
    # that come from a trusted source.
    df = pd.read_pickle(os.path.join(os.getcwd(), path))

    # Convert column by column with Series.map instead of DataFrame.applymap:
    # applymap is deprecated in recent pandas releases, Series.map is available
    # in all of them and produces the same element-wise result.
    for column in ['interactions', 'train', 'val', 'test']:
        df[column] = df[column].map(lambda x: np.array(x, dtype=np.int32))

    # Output column name -> position along the second axis of each cell array.
    channels = {'item_id': 0, 'playtime_forever': 1, 'playtime_2weeks': 2}

    interactions_dict = {}
    for split in trange(n_splits):
        for column in ['train', 'val', 'test']:
            frame = {'user_id': df['user_id'], 'steam_id': df['steam_id']}
            for name, channel in channels.items():
                # apply() runs eagerly, so `split` and `channel` are read at
                # their current loop values.
                frame[name] = df[column].apply(lambda x: x[split, channel])
            interactions_dict[split, column] = pd.DataFrame(frame)
    return interactions_dict

In [4]:
# Build the file location with os.path.join so the lookup keeps working even
# when `path` is configured without a trailing separator.
dataset = load_interactions(os.path.join(path, "interactions_splits.pkl.gz"))

100%|██████████| 5/5 [00:00<00:00,  6.91it/s]


In [5]:
# Preview the training frame of the split selected by `seed_index`.
dataset[seed_index, "train"]

Unnamed: 0,user_id,steam_id,item_id,playtime_forever,playtime_2weeks
0,76561197981203305,76561197981203305,"[2991, 2621, 1941, 2061, 1912, 5250, 458, 1680...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,bosslucek,76561198029968002,"[3041, 1125, 3827, 1500, 2865, 351, 514, 2858,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,icantwait,76561197971666535,"[266, 2003, 472, 4443, 983, 550, 953, 1983, 28...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,76561198067911521,76561198067911521,"[1530, 860, 1058, 55, 309, 1021, 2643, 572, 55...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,kushziller,76561198021307778,"[3282, 1417, 3335, 268, 3912, 1079, 3866, 1367...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
54185,76561198056783123,76561198056783123,[45],[0],[0]
54186,76561197972619838,76561197972619838,[1],[118],[0]
54187,76561197965631636,76561197965631636,[1],[0],[0]
54188,SlimShady9,76561198067952943,[47],[0],[0]


In [6]:
def _with_positional_user_id(split_df):
    """Return the ``item_id`` column of ``split_df`` plus a ``user_id`` column set to the row index.

    ``assign`` builds a new frame, so the frames stored in ``dataset`` are left
    untouched and no column is written onto a selection of another frame.
    """
    return split_df[["item_id"]].assign(user_id=split_df.index)


# Replace the original user ids by the row index of each user, using the same
# helper for both subsets so train and test stay consistent.
train_df = _with_positional_user_id(dataset[seed_index, "train"])
test_df = _with_positional_user_id(dataset[seed_index, "test"])

In [7]:
# Preview the reduced training frame: item_id lists plus the index-based user_id.
train_df

Unnamed: 0,item_id,user_id
0,"[2991, 2621, 1941, 2061, 1912, 5250, 458, 1680...",0
1,"[3041, 1125, 3827, 1500, 2865, 351, 514, 2858,...",1
2,"[266, 2003, 472, 4443, 983, 550, 953, 1983, 28...",2
3,"[1530, 860, 1058, 55, 309, 1021, 2643, 572, 55...",3
4,"[3282, 1417, 3335, 268, 3912, 1079, 3866, 1367...",4
...,...,...
54185,[45],54185
54186,[1],54186
54187,[1],54187
54188,[47],54188


In [8]:
# Preview the reduced test frame: item_id lists plus the index-based user_id.
test_df

Unnamed: 0,item_id,user_id
0,"[2105, 682, 524, 952, 1368, 274, 745, 1584, 35...",0
1,"[3640, 415, 516, 244, 1838, 905, 1279, 4676, 4...",1
2,"[89, 181, 2060, 1709, 298, 1039, 1684, 548, 15...",2
3,"[248, 499, 1528, 829, 3921, 1376, 3890, 142, 4...",3
4,"[946, 1102, 3966, 1125, 766, 4227, 2016, 586, ...",4
...,...,...
54185,[122],54185
54186,[4],54186
54187,[93],54187
54188,[32],54188


## 2. Invert interactions

In [9]:
def convert_uid_df_to_iid_df(df):
    """
    Invert a user -> items frame into an item -> users frame.

    Takes a dataframe whose rows look like {"user_id": <id>, "item_id": <list<int>>}
    and returns the transposed dataframe with rows {"item_id": <int>, "user_id": <list<id>>}.

    Items appear in the order in which they are first encountered while
    scanning the rows top to bottom; the users of each item are listed in row
    order. An input without rows yields an empty frame with the same two columns.
    """
    users_by_item = {}
    # Walk the two columns in lockstep instead of using iterrows(), which
    # builds a full Series object for every row and is needlessly slow here.
    for user, items in zip(df["user_id"], df["item_id"]):
        for item in items:
            users_by_item.setdefault(item, []).append(user)
    return pd.DataFrame(list(users_by_item.items()), columns=["item_id", "user_id"])

In [10]:
# User-based split for user association rules:
# invert the train frame first and the test frame second, so that each result
# holds one row per item together with the list of users attached to it.
article_train_df, article_test_df = (
    convert_uid_df_to_iid_df(user_frame) for user_frame in (train_df, test_df)
)

In [11]:
# Report how many distinct items ended up in each inverted frame.
for label, inverted_frame in (("train size:", article_train_df), ("test size:", article_test_df)):
    print(label, len(inverted_frame))

train size: 7045
test size: 5758


In [12]:
# Show the first ten rows of the inverted test frame (one row per item_id).
article_test_df.head(10)

Unnamed: 0,item_id,user_id
0,2105,"[0, 42, 49, 51, 62, 107, 123, 142, 252, 287, 3..."
1,682,"[0, 23, 52, 64, 85, 165, 169, 173, 230, 256, 2..."
2,524,"[0, 6, 38, 59, 84, 97, 119, 128, 174, 183, 207..."
3,952,"[0, 20, 42, 52, 70, 77, 85, 114, 118, 155, 211..."
4,1368,"[0, 12, 14, 123, 140, 158, 170, 174, 216, 248,..."
5,274,"[0, 44, 64, 97, 106, 143, 188, 221, 262, 282, ..."
6,745,"[0, 32, 33, 46, 57, 62, 64, 102, 116, 126, 138..."
7,1584,"[0, 28, 99, 117, 131, 161, 252, 253, 298, 406,..."
8,3505,"[0, 4, 7, 662, 852]"
9,2452,"[0, 70, 97, 383, 742, 1721, 2273, 3330, 3925, ..."


## 5. Store the train/test splits and their inverted (item-to-users) frames as pickle files for further use

In [13]:
# Persist the four frames under pickle_dumps/<name>_<seed_index>.p.
# `pickle` and `os` are already imported in the first cell of the notebook.
os.makedirs("pickle_dumps", exist_ok=True)  # a fresh checkout may not have the folder yet

frames_to_store = {
    "train_df": train_df,
    "test_df": test_df,
    "article_train_df": article_train_df,
    "article_test_df": article_test_df,
}
for frame_name, frame in frames_to_store.items():
    # The context manager closes (and thereby flushes) every file handle
    # deterministically instead of leaving that to garbage collection.
    with open(f"pickle_dumps/{frame_name}_{seed_index}.p", "wb") as handle:
        pickle.dump(frame, handle)