## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import pickle
import os
tqdm.pandas()  # for progress_apply etc.

In [2]:
# Directory that holds the pickled interactions split read by this notebook.
path = "data2/"
# Index of the split to process; it selects the input file name and is also
# used as the suffix of the pickle files written at the end of the notebook.
seed_index = 0

In [3]:
def unpack_split(df, col):
    """
    Split the indexable entries of ``df[col]`` into three separate columns.

    Position 0 of every entry becomes ``'item_id'``, position 1 becomes
    ``'playtime_forever'`` and position 2 becomes ``'playtime_2weeks'``.
    The returned DataFrame keeps the index of ``df``.
    """
    packed = df[col]
    field_names = ('item_id', 'playtime_forever', 'playtime_2weeks')
    # `pos=pos` binds the current position at definition time so every
    # lambda extracts its own field rather than the last loop value.
    return pd.DataFrame({
        name: packed.apply(lambda entry, pos=pos: entry[pos])
        for pos, name in enumerate(field_names)
    })

# Read the pickled interactions split that matches the configured seed index.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
dataset = pd.read_pickle(os.path.join(path, f'interactions_split_{seed_index}.pkl'))

# For each split: keep only element 0 of every per-user entry, name that
# column "item_id", and copy the frame index into an explicit "user_id" column.
train_df = (
    pd.DataFrame(dataset["train"].apply(lambda entry: entry[0]))
    .rename(columns={"train": "item_id"})
    .assign(user_id=lambda frame: frame.index)
)

test_df = (
    pd.DataFrame(dataset["test"].apply(lambda entry: entry[0]))
    .rename(columns={"test": "item_id"})
    .assign(user_id=lambda frame: frame.index)
)

In [4]:
# Preview the per-user training frame (columns: item_id, user_id).
train_df

Unnamed: 0,item_id,user_id
0,"[300, 924, 2877, 1588, 192, 35, 3596, 1531, 24...",0
1,"[5277, 1179, 1897, 141, 2825, 169, 1053, 1936,...",1
2,"[1206, 55, 323, 440, 58, 3946, 698, 54, 3106, ...",2
3,"[1198, 2839, 342, 2052, 332, 1132, 2512, 475, ...",3
4,"[2083, 126, 106, 2249, 345, 4934, 669, 1957, 0...",4
...,...,...
62977,[0],62977
62978,[3],62978
62979,[198],62979
62980,[14],62980


In [5]:
# Preview the per-user test frame (columns: item_id, user_id).
test_df

Unnamed: 0,item_id,user_id
0,"[682, 561, 4927, 817, 217, 266, 53, 3062, 185,...",0
1,"[213, 1900, 4025, 1456, 3141, 1438, 1757, 459,...",1
2,"[2460, 1989, 1301, 1857, 2507, 104, 56, 215, 1...",2
3,"[681, 1804, 643, 729, 805, 915, 1029, 716, 193...",3
4,"[3, 1407, 1938, 540, 1006, 2205, 2291, 1746, 3...",4
...,...,...
62977,[2],62977
62978,[2],62978
62979,[5],62979
62980,[137],62980


## 2. Invert interactions

In [6]:
def convert_uid_df_to_iid_df(df):
    """
    Transpose a per-user interaction frame into a per-item interaction frame.

    Converts rows of the form {"user_id": <int>, "item_id": <list<int>>}
    into rows of the form {"item_id": <int>, "user_id": <list<int>>}.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a "user_id" column and an "item_id" column whose
        values are iterables of item ids.

    Returns
    -------
    pd.DataFrame
        One row per distinct item with the columns "item_id" and "user_id".
        Items appear in order of first occurrence; each user list follows
        the row order of ``df``. A user is listed once per occurrence of the
        item in that user's list, so duplicates are preserved.
        An empty ``df`` yields an empty frame with the same two columns.
    """
    users_by_item = {}
    # Iterate the two columns directly instead of using iterrows(): this
    # avoids building a Series object for every row, which is slow on
    # large frames, while visiting the rows in the same order.
    for user, items in zip(df["user_id"], df["item_id"]):
        for item in items:
            users_by_item.setdefault(item, []).append(user)
    return pd.DataFrame(list(users_by_item.items()), columns=["item_id", "user_id"])

In [7]:
# User-based split for user association rules:
# transpose both the train and the test frame from one row per user
# to one row per item (train first, then test).
article_train_df, article_test_df = (
    convert_uid_df_to_iid_df(train_df),
    convert_uid_df_to_iid_df(test_df),
)

In [8]:
# Report how many distinct items each transposed frame contains.
for label, frame in (("train size:", article_train_df), ("test size:", article_test_df)):
    print(label, len(frame))

train size: 7095
test size: 5836


In [9]:
# Show the first 10 rows of the item-indexed test frame.
article_test_df.head(10)

Unnamed: 0,item_id,user_id
0,682,"[0, 11, 13, 31, 32, 39, 62, 74, 79, 91, 115, 1..."
1,561,"[0, 4, 10, 15, 22, 25, 47, 59, 76, 88, 105, 12..."
2,4927,"[0, 22, 296, 622]"
3,817,"[0, 58, 90, 116, 138, 148, 202, 210, 215, 218,..."
4,217,"[0, 9, 50, 87, 103, 109, 143, 176, 177, 194, 2..."
5,266,"[0, 12, 23, 24, 33, 49, 59, 83, 92, 115, 120, ..."
6,53,"[0, 12, 74, 101, 165, 179, 182, 184, 187, 196,..."
7,3062,"[0, 29, 269, 774, 845, 994, 2955, 4498, 6014, ..."
8,185,"[0, 3, 5, 7, 23, 32, 33, 36, 62, 66, 71, 74, 8..."
9,2844,"[0, 221, 1727, 1772, 2137, 3702, 3965, 4612, 4..."


## 5. Store the train/test split datasets as pickle files for further use

In [10]:
# Persist the per-user and per-item frames for later notebooks.
# `pickle` and `os` are imported in the notebook's import cell.
dump_dir = "pickle_dumps"
# Create the output directory if it is missing so a fresh checkout can run
# this cell without a manual mkdir.
os.makedirs(dump_dir, exist_ok=True)

frames_to_dump = {
    "train_df": train_df,
    "test_df": test_df,
    "article_train_df": article_train_df,
    "article_test_df": article_test_df,
}
for frame_name, frame in frames_to_dump.items():
    # The `with` block guarantees the file handle is flushed and closed,
    # even if pickling raises part-way through.
    with open(f"{dump_dir}/{frame_name}_{seed_index}.p", "wb") as dump_file:
        pickle.dump(frame, dump_file)