## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
import json
import pickle
import os
tqdm.pandas() # for progress_apply etc.

In [2]:
# Index of the split to work with: it selects dataset[seed_index, ...] below
# and is embedded in the names of the pickle files written at the end.
seed_index = 0
# Location (directory prefix) of the input data files.
path = "data/"

In [3]:
def load_interactions(path, n_splits=5):
    """Load the pickled interaction splits and unpack them per split.

    The pickle is expected to hold a DataFrame with the columns ``user_id``,
    ``steam_id``, ``interactions``, ``train``, ``val`` and ``test``. Every
    cell of the last four columns is converted to an ``int32`` array; cells
    of ``train``/``val``/``test`` are then indexed as ``cell[split, k]`` with
    ``k = 0`` (item ids), ``k = 1`` (``playtime_forever``) and ``k = 2``
    (``playtime_2weeks``).

    Parameters
    ----------
    path : str
        Path of the pickle file, joined onto the current working directory.
    n_splits : int, default 5
        Number of splits to unpack (split indices ``0 .. n_splits - 1``).

    Returns
    -------
    dict
        Maps ``(split, column)`` with ``column`` in ``'train'``, ``'val'``,
        ``'test'`` to a DataFrame with the columns ``user_id``, ``steam_id``,
        ``item_id``, ``playtime_forever`` and ``playtime_2weeks``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(os.path.join(os.getcwd(), path))

    # Convert column by column with Series.apply; DataFrame.applymap is
    # deprecated since pandas 2.1 and emits a FutureWarning there.
    for array_column in ['interactions', 'train', 'val', 'test']:
        df[array_column] = df[array_column].apply(lambda x: np.array(x, dtype=np.int32))

    interactions_dict = {}
    for split in trange(n_splits):
        for column in ['train', 'val', 'test']:
            interactions_dict[split, column] = pd.DataFrame({
                'user_id': df['user_id'],
                'steam_id': df['steam_id'],
                'item_id': df[column].apply(lambda x: x[split, 0]),
                'playtime_forever': df[column].apply(lambda x: x[split, 1]),
                'playtime_2weeks': df[column].apply(lambda x: x[split, 2])})
    return interactions_dict

In [4]:
# Build the file location with os.path.join so it is correct whether or not
# `path` ends with a path separator.
dataset = load_interactions(os.path.join(path, "interactions_splits.pkl.gz"))

100%|██████████| 5/5 [00:00<00:00,  5.86it/s]


In [5]:
# Show the training portion of the selected split.
dataset[seed_index, "train"]

Unnamed: 0,user_id,steam_id,item_id,playtime_forever,playtime_2weeks
0,76561197981203305,76561197981203305,"[3485, 2370, 163, 2188, 2484, 2130, 3197, 413,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,bosslucek,76561198029968002,"[470, 3223, 1912, 4349, 2249, 380, 3860, 1483,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,76561198067911521,76561198067911521,"[173, 139, 2088, 2132, 285, 352, 678, 521, 798...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,icantwait,76561197971666535,"[206, 299, 354, 1125, 2196, 2839, 1752, 1410, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,kushziller,76561198021307778,"[417, 1897, 4786, 840, 1637, 3957, 926, 505, 4...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
62965,76561198064213407,76561198064213407,[0],[147],[0]
62966,Mish2002,76561198046430892,[80],[266],[0]
62967,76561198072381918,76561198072381918,[23],[381],[0]
62968,76561198095572106,76561198095572106,[11],[0],[0]


In [6]:
# Keep only the item lists and overwrite user_id with the row index label,
# so that every user is identified by its position in the frame.
train_df = (
    dataset[seed_index, "train"][["item_id", "user_id"]]
    .assign(user_id=lambda frame: frame.index)
)

test_df = (
    dataset[seed_index, "test"][["item_id", "user_id"]]
    .assign(user_id=lambda frame: frame.index)
)

In [7]:
# Training frame after user_id has been replaced by the row index.
train_df

Unnamed: 0,item_id,user_id
0,"[3485, 2370, 163, 2188, 2484, 2130, 3197, 413,...",0
1,"[470, 3223, 1912, 4349, 2249, 380, 3860, 1483,...",1
2,"[173, 139, 2088, 2132, 285, 352, 678, 521, 798...",2
3,"[206, 299, 354, 1125, 2196, 2839, 1752, 1410, ...",3
4,"[417, 1897, 4786, 840, 1637, 3957, 926, 505, 4...",4
...,...,...
62965,[0],62965
62966,[80],62966
62967,[23],62967
62968,[11],62968


In [8]:
# Test frame after user_id has been replaced by the row index.
test_df

Unnamed: 0,item_id,user_id
0,"[2955, 457, 1430, 2348, 1716, 102, 464, 2417, ...",0
1,"[56, 2486, 573, 1662, 3174, 3132, 292, 4443, 1...",1
2,"[3994, 187, 2602, 4571, 1231, 1620, 1182, 926,...",2
3,"[840, 596, 1148, 890, 815, 1096, 3442, 1660, 8...",3
4,"[980, 1094, 2780, 1095, 3803, 2973, 2052, 981,...",4
...,...,...
62965,[3],62965
62966,[19],62966
62967,[79],62967
62968,[5],62968


## 2. Invert interactions

In [9]:
def convert_uid_df_to_iid_df(df):
    """
    Transpose a user -> items dataframe into an item -> users dataframe.

    Converts rows of the form {"user_id": <int>, "item_id": <list<int>>}
    into rows of the form {"item_id": <int>, "user_id": <list<int>>}.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a "user_id" column and an "item_id" column whose cells
        are iterables of item ids.

    Returns
    -------
    pd.DataFrame
        One row per distinct item, in order of first appearance, with the
        columns "item_id" and "user_id". "user_id" holds the list of users
        whose item list contains the item, in input row order; a user is
        listed once per occurrence of the item in its list.
    """
    users_by_item = {}
    # Walk the two columns directly: DataFrame.iterrows() builds a Series for
    # every row, which is slow and coerces the row to a single dtype.
    for user, items in zip(df["user_id"], df["item_id"]):
        for item in items:
            users_by_item.setdefault(item, []).append(user)
    return pd.DataFrame(list(users_by_item.items()), columns=["item_id", "user_id"])

In [10]:
# User-based split for user association rules:
# transpose the per-user train data into a per-item dataframe
article_train_df = convert_uid_df_to_iid_df(train_df)

# transpose the per-user test data into a per-item dataframe
article_test_df = convert_uid_df_to_iid_df(test_df)

In [11]:
# Row counts of the transposed frames, i.e. the number of distinct items in each split.
print("train size:", len(article_train_df))
print("test size:", len(article_test_df))

train size: 7121
test size: 5829


In [12]:
# Preview the first ten items of the test split together with their user lists.
article_test_df.head(10)

Unnamed: 0,item_id,user_id
0,2955,"[0, 292, 1046, 3729, 6851, 14362, 18624, 30638]"
1,457,"[0, 17, 30, 52, 76, 84, 101, 125, 176, 214, 22..."
2,1430,"[0, 23, 39, 49, 69, 78, 211, 629, 782, 842, 10..."
3,2348,"[0, 168, 305, 812, 1819, 2196, 2810, 7946, 881..."
4,1716,"[0, 222, 223, 282, 481, 561, 624, 636, 763, 81..."
5,102,"[0, 28, 31, 33, 38, 54, 66, 72, 74, 90, 94, 10..."
6,464,"[0, 5, 16, 19, 29, 30, 44, 56, 72, 74, 119, 12..."
7,2417,"[0, 12, 99, 102, 151, 188, 408, 881, 1060, 117..."
8,822,"[0, 22, 28, 62, 104, 107, 135, 174, 198, 250, ..."
9,51,"[0, 11, 15, 21, 28, 37, 51, 86, 88, 90, 91, 11..."


## 5. Store the train/test split datasets as pickle files for further use

In [13]:
# Persist the per-user and per-item frames of the selected split.
# `pickle` and `os` are already imported in the first cell of the notebook.
os.makedirs("pickle_dumps", exist_ok=True)  # a fresh checkout may lack the output directory

frames_to_store = {
    "train_df": train_df,
    "test_df": test_df,
    "article_train_df": article_train_df,
    "article_test_df": article_test_df,
}
for frame_name, frame in frames_to_store.items():
    # The context manager flushes and closes every file deterministically,
    # instead of leaving the handle open until garbage collection.
    with open(f"pickle_dumps/{frame_name}_{seed_index}.p", "wb") as handle:
        pickle.dump(frame, handle)