In [1]:
import ast
import copy
import itertools
import time
import warnings

import numpy as np
import pandas as pd
# Silence every warning for the rest of the session (this also hides pandas
# parser and chained-assignment warnings raised by later cells).
warnings.filterwarnings("ignore")
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Show all columns, and up to 100 characters per cell, when rendering frames.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth',100)

In [2]:
# Interpolated into the output paths of the form f"./Dataset/{dataset_name}/...".
# NOTE(review): the leading "/" produces a doubled separator ("./Dataset//Movie/...") — confirm this is intended.
dataset_name = "/Movie"

# 1. Read Data

In [None]:
# Load the ratings file into columns userid / itemid / rating / timestamp.
# A separator longer than one character is only supported by the Python parsing
# engine; requesting it explicitly avoids relying on pandas' silent fallback.
rating_df = pd.read_csv(
    '../Dataset/ml-1m/ratings.dat',
    sep="::",
    names=["userid", "itemid", "rating", "timestamp"],
    engine="python",
)
rating_df
# Fraction of missing values per column, largest first.
(rating_df.isna().sum()/rating_df.shape[0]).sort_values(ascending=False)
# Largest user id and item id that occur in the ratings.
rating_df["userid"].max()
rating_df["itemid"].max()

In [None]:
# Load the users file into columns userid / gender / age / occupation / zip_code.
# The two-character separator requires the Python parsing engine, so it is
# requested explicitly instead of relying on pandas' silent fallback.
user_df = pd.read_csv(
    '../Dataset/ml-1m/users.dat',
    sep="::",
    names=["userid", "gender", "age", "occupation", "zip_code"],
    engine="python",
)
user_df
# Fraction of missing values per column, largest first.
(user_df.isna().sum()/user_df.shape[0]).sort_values(ascending=False)

In [None]:
# Load the movies file into columns itemid / title / genres (latin-1 encoded).
# The two-character separator requires the Python parsing engine, so it is
# requested explicitly instead of relying on pandas' silent fallback.
item_df = pd.read_csv(
    '../Dataset/ml-1m/movies.dat',
    sep="::",
    names=["itemid", "title", "genres"],
    encoding="latin-1",
    engine="python",
)
item_df
# Fraction of missing values per column, largest first.
(item_df.isna().sum()/item_df.shape[0]).sort_values(ascending=False)

# 2. Filter

In [None]:
# Keep only the ratings whose item id is present in the movie table.
item_title_exist_idset = item_df["itemid"].tolist()
has_known_item = rating_df["itemid"].isin(item_title_exist_idset)
rating_df_fliter = rating_df[has_known_item].reset_index(drop=True)
rating_df_fliter

In [None]:
# Binary label: a rating of 4 or more is positive (1), anything else negative (0).
is_positive = rating_df_fliter["rating"] >= 4
rating_df_fliter["label"] = is_positive.astype("int64")
rating_df_fliter

In [None]:
# Share of all kept ratings that carry the positive label.
n_positive = rating_df_fliter[rating_df_fliter["label"] == 1].shape[0]
n_positive / rating_df_fliter.shape[0]

In [None]:
# Popularity of an item = how many positive-labelled ratings it received.
positive_ratings = rating_df_fliter[rating_df_fliter["label"] == 1]
item_pop = (
    positive_ratings["itemid"]
    .value_counts()
    .rename_axis("itemid")
    .reset_index(name="pop")
)
item_pop

In [None]:
# Order the ratings by user and then by timestamp (in place), so that the
# per-user lists collected below follow rating time.
rating_df_fliter.sort_values(by=["userid", "timestamp"], ascending=True, inplace=True)
ratings_by_user = rating_df_fliter.groupby(['userid'])
sequence_df = ratings_by_user.agg(
    itemid_seq=("itemid", list),
    label_seq=("label", list),
).reset_index()
sequence_df

In [None]:
def get_seq(row):
    """Split one row's item sequence into positively and negatively labelled items.

    Args:
        row: object exposing ``itemid_seq`` and ``label_seq`` as parallel sequences.

    Returns:
        A 5-tuple ``(pos_seq, neg_seq, seq_length, pos_seq_length, neg_seq_length)``:
        items whose label equals 1, all remaining items, the length of
        ``itemid_seq``, and the lengths of the two split lists.
    """
    labelled_items = list(zip(row.itemid_seq, row.label_seq))
    pos_seq = [item for item, label in labelled_items if label == 1]
    neg_seq = [item for item, label in labelled_items if label != 1]
    return pos_seq, neg_seq, len(row.itemid_seq), len(pos_seq), len(neg_seq)

# Spread the five values returned by get_seq over five new columns.
split_columns = ["pos_seq", "neg_seq", "seq_length", "pos_seq_length", "neg_seq_length"]
sequence_df[split_columns] = sequence_df.apply(get_seq, axis=1, result_type="expand")
sequence_df

In [None]:
# Keep users with at least 12 positive and at least 10 negative interactions.
enough_positives = sequence_df["pos_seq_length"] >= 12
enough_negatives = sequence_df["neg_seq_length"] >= 10
sequence_df_fliter = sequence_df[enough_positives & enough_negatives].reset_index(drop=True)
sequence_df_fliter

In [None]:
def get_new_df(df, n):
    """Return a copy of ``df`` with the last ``n`` items dropped from every ``pos_seq``.

    Args:
        df: frame with a ``pos_seq`` column holding lists.
        n: non-negative number of trailing items to drop. ``0`` leaves the
            sequences unchanged; a value larger than a sequence empties it.

    Returns:
        A new frame (``df`` itself is not modified) whose ``pos_seq`` is
        truncated and whose ``pos_seq_length`` is recomputed to match.
    """
    df_new = copy.deepcopy(df)
    # ``x[:-n]`` would yield an empty list for n == 0, so the cut position is
    # computed explicitly and clamped at 0 for sequences shorter than n.
    df_new["pos_seq"] = df_new["pos_seq"].apply(lambda x: x[:max(len(x) - n, 0)])
    df_new["pos_seq_length"] = df_new["pos_seq"].apply(len)

    return df_new

# data_index 0 marks the untouched sequences; data_index i marks the copy
# whose positive sequences lost their last i items.
sequence_df_fliter["data_index"] = 0
sequence_df_fliter_list = [sequence_df_fliter]
for i in range(1, 2):
    sequence_df_fliter_new = get_new_df(sequence_df_fliter, i)
    sequence_df_fliter_new["data_index"] = i
    sequence_df_fliter_list.append(sequence_df_fliter_new)

# Stack all variants under a fresh 0..N-1 index.
full_sequence_df = pd.concat(sequence_df_fliter_list, ignore_index=True)
full_sequence_df

# 3. Generate Test Data for Top-k Ranking

In [None]:
import random
# Seed Python's global RNG, which the random.sample / random.shuffle calls in
# the helper functions draw from. The seed is applied only when this cell runs:
# re-running a later cell on its own continues the RNG stream and therefore
# produces different samples.
random.seed(2023)

def get_pos_neg_target(row):
    """Return ``(last item of row.pos_seq, last item of row.neg_seq)``."""
    last_positive = row.pos_seq[-1]
    last_negative = row.neg_seq[-1]
    return last_positive, last_negative


def get_full_candidate(row, n_candidate):
    """Build a shuffled candidate list of ``n_candidate`` items for one row.

    The list always contains ``row.pos_id_target`` and ``row.neg_id_target``;
    the remaining ``n_candidate - 2`` entries are sampled without replacement
    from ``row.neg_seq`` after one occurrence of the negative target has been
    removed. ``row.neg_seq`` itself is left untouched. Uses the global
    ``random`` state (one ``sample`` call followed by one ``shuffle`` call).
    """
    other_negatives = copy.deepcopy(row.neg_seq)
    other_negatives.remove(row.neg_id_target)
    fillers = random.sample(other_negatives, n_candidate - 2)

    candidate = [row.pos_id_target, row.neg_id_target]
    candidate.extend(fillers)
    random.shuffle(candidate)
    return candidate


def get_sub_candidate(row, n_candidate):
    """Pick a shuffled subset of ``n_candidate`` items from ``row.full_candidate``.

    The subset always contains ``row.pos_id_target`` and ``row.neg_id_target``;
    the other ``n_candidate - 2`` entries are sampled without replacement from
    the rest of ``row.full_candidate``, which is not modified. Uses the global
    ``random`` state (one ``sample`` call followed by one ``shuffle`` call).
    """
    remaining = copy.deepcopy(row.full_candidate)
    for target in (row.pos_id_target, row.neg_id_target):
        remaining.remove(target)
    fillers = random.sample(list(remaining), n_candidate - 2)

    candidate = [row.pos_id_target, row.neg_id_target]
    candidate.extend(fillers)
    random.shuffle(candidate)
    return candidate


def get_full_history(row, n_history):
    """Return up to ``n_history`` items of ``row.pos_seq`` that precede its last item."""
    # The final element is excluded; the window ends right before it.
    window_start = -1 - n_history
    return row.pos_seq[window_start:-1]


def get_sub_history(row, n_history):
    """Return the last ``n_history`` items of ``row.full_history``.

    Args:
        row: object exposing ``full_history`` as a sliceable sequence.
        n_history: number of trailing items to keep. If it exceeds the length
            of the history, the whole history is returned; if it is zero or
            negative, an empty sequence is returned.

    Returns:
        A new sequence of the same type as ``row.full_history``.
    """
    if n_history <= 0:
        # ``seq[-0:]`` is ``seq[0:]`` (everything), so "keep nothing" needs an
        # explicit branch.
        return row.full_history[:0]
    return row.full_history[-n_history:]

def get_pos_target_index(row):
    """Return the position of ``row["pos_id_target"]`` inside ``row["itemid_candidate"]``."""
    candidates = row["itemid_candidate"]
    target = row["pos_id_target"]
    return candidates.index(target)

In [None]:
# Take an explicit copy of the needed columns: the assignments below add new
# columns, and doing that on a column-subset of another frame is a chained
# assignment (pandas only warns about it, and warnings are silenced here).
full_sequence_df_new = full_sequence_df[["userid","pos_seq","neg_seq","data_index"]].copy()
# Targets are the last positive and the last negative item of each row.
full_sequence_df_new[["pos_id_target", "neg_id_target"]] = full_sequence_df_new.apply(get_pos_neg_target, axis=1, result_type="expand")
# Ten candidates per row: both targets plus eight sampled negatives.
full_sequence_df_new["full_candidate"] = full_sequence_df_new.apply(get_full_candidate, n_candidate=10, axis=1)

In [None]:
# History = up to 10 positive items that come right before the last positive item.
full_history_column = full_sequence_df_new.apply(get_full_history, axis=1, n_history=10)
full_sequence_df_new["full_history"] = full_history_column
full_sequence_df_new

In [None]:
# Persist the intermediate frame as a tab-separated file without the index.
full_sequence_path = f"./Dataset/{dataset_name}/processed/full_sequence_df.csv"
full_sequence_df_new.to_csv(full_sequence_path, sep="\t", index=False)

In [None]:
def get_pairwise_data(row):
    """Turn one candidate list into (positive, negative) pairs in random order.

    The positive item is ``row.itemid_candidate[row.pos_target_index]``. For
    every other candidate, in list order, a two-element pair holding the
    positive and that candidate is shuffled with the global ``random`` state.

    Returns:
        ``(pair_list, answer_list)`` where ``answer_list[k]`` is the index
        (0 or 1) of the positive item inside ``pair_list[k]``.
    """
    candidate = row.itemid_candidate
    pos = candidate[row.pos_target_index]

    # Work on a copy so the row's candidate list keeps its positive entry.
    negatives = copy.deepcopy(candidate)
    negatives.remove(pos)

    def shuffled_pair(neg):
        pair = [pos, neg]
        random.shuffle(pair)
        return pair

    pair_list = [shuffled_pair(neg) for neg in negatives]
    answer_list = [pair.index(pos) for pair in pair_list]
    return pair_list, answer_list

In [None]:
def get_topk_final_data(df, candidate_sizes=(2, 5, 10), history_sizes=(1, 3, 5, 10)):
    """Build and save one evaluation file per (candidate size, history size) pair.

    For every combination, a copy of ``df`` gets a sub-sampled candidate list,
    a truncated history, the index of the positive target in the candidates,
    and the pairwise comparison data; its rows are then shuffled with a fixed
    ``random_state`` and written as a tab-separated file to
    ``./Dataset/{dataset_name}/LLM/topk_candidate@{n_candidate}_history@{n_history}.csv``.

    Args:
        df: frame providing ``full_candidate``, ``full_history``,
            ``pos_id_target`` and ``neg_id_target`` columns. It is not modified.
        candidate_sizes: candidate-list sizes to generate. Each must be at
            least 2 and at most the length of ``full_candidate``, because the
            extra candidates are drawn with ``random.sample``.
        history_sizes: history lengths to generate.

    Notes:
        Combinations are processed in the given order, candidate sizes in the
        outer loop; since sampling uses the global ``random`` state, changing
        the order or the sizes changes the sampled candidates. The defaults
        reproduce the original hard-coded grid. Relies on the module-level
        ``dataset_name`` for the output directory.
    """
    for n_candidate in candidate_sizes:
        for n_history in history_sizes:
            print(f"n_candidate:{n_candidate} ; n_history{n_history}")
            LLM_top1_data = copy.deepcopy(df)
            LLM_top1_data["itemid_candidate"] = LLM_top1_data.apply(get_sub_candidate, n_candidate=n_candidate, axis=1)
            LLM_top1_data["itemid_history"] = LLM_top1_data.apply(get_sub_history, n_history=n_history, axis=1)
            LLM_top1_data["pos_target_index"] = LLM_top1_data.apply(get_pos_target_index, axis=1)
            # Shuffle the rows reproducibly before generating the pairwise data.
            LLM_top1_data = LLM_top1_data.sample(frac=1.0, random_state=2023).reset_index(drop=True)
            LLM_top1_data[["pair_itemid_candidate", "pair_pos_target_index"]] = LLM_top1_data.apply(get_pairwise_data, axis=1, result_type="expand")
            LLM_top1_data.to_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@{n_candidate}_history@{n_history}.csv", sep="\t", index=False)

In [None]:
# Writes one topk_candidate@{n}_history@{m}.csv file per configuration into
# ./Dataset/{dataset_name}/LLM/.
get_topk_final_data(full_sequence_df_new)

# 4. Generate Datamaps

In [None]:
def process_movielens_name(s):
    """Clean a raw movie title of the form ``"Name, The (Alt Name) (1995)"``.

    Steps:
        1. Drop the last 7 characters (the trailing ``" (YYYY)"`` year tag).
        2. Drop everything from the first ``" ("`` onwards (alternative titles).
        3. If the result ends with ``", The"`` or ``", A"``, move that article
           to the front: ``"Usual Suspects, The"`` -> ``"The Usual Suspects"``.

    Args:
        s: raw title string; assumed to end with a 7-character year tag.

    Returns:
        The cleaned title.
    """
    s = s[:-7]
    s = s.split(" (")[0]
    for pattern in [", The", ", A"]:
        if s.endswith(pattern):
            article = pattern.split(", ")[1]
            # Cut off only the trailing suffix. ``str.replace`` would also
            # delete every earlier occurrence of the same text in the title
            # (e.g. "Cat, A Dog, A" would become "A Cat Dog").
            return article + " " + s[:-len(pattern)]
    return s
# Overwrite the raw titles with their cleaned form.
# NOTE: the column is replaced in place, so running this cell a second time
# would strip a further 7 characters from titles that are already cleaned.
item_df["title"] = item_df["title"].apply(process_movielens_name)
item_df

In [None]:
# Lookup table: item id -> cleaned title.
title_by_id = item_df.set_index("itemid")["title"]
id2item_dict = title_by_id.to_dict()
id2item_dict
# Reverse lookup: cleaned title -> item id. When several items share a title,
# a dict can keep only one id per title.
id_by_title = item_df.set_index("title")["itemid"]
item2id_dict = id_by_title.to_dict()
item2id_dict

In [None]:
import json

# Bundle both lookup tables and store them as one JSON document.
# JSON object keys are always strings, so the integer item ids used as keys
# are written out as strings.
datamaps = {
    "id2item_dict": id2item_dict,
    "item2id_dict": item2id_dict,
}
json_str = json.dumps(datamaps)
with open(f"./Dataset/{dataset_name}/LLM/datamaps.json", 'w') as out:
    out.write(json_str)


In [None]:
# Popularity lookup: item id -> number of positive ratings.
item2pop_dict = item_pop.set_index("itemid")["pop"].to_dict()
# Items that never received a positive rating get popularity 0.
for i in id2item_dict:
    item2pop_dict.setdefault(i, 0)
item2pop_dict

json_str = json.dumps(item2pop_dict)
with open(f"./Dataset/{dataset_name}/LLM/popularity_datamaps.json", 'w') as out:
    out.write(json_str)

## Example of Loading Final Evaluation Data 

In [3]:
read_example_df = pd.read_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv", delimiter="\t")
# Copy the selected columns so the parsed values below are assigned to an
# independent frame rather than to a column-subset of the loaded one.
read_example_df = read_example_df[["userid", "itemid_history", "pos_target_index", "itemid_candidate", "pair_itemid_candidate", "pair_pos_target_index"]].copy()
# List-valued columns are stored as text in the CSV. ast.literal_eval parses
# Python literals only, so file contents can never be executed as code the way
# they could with eval.
for col in ['itemid_history', 'itemid_candidate', "pair_itemid_candidate", "pair_pos_target_index"]:
    read_example_df[col] = read_example_df[col].apply(ast.literal_eval)

read_example_df

Unnamed: 0,userid,itemid_history,pos_target_index,itemid_candidate,pair_itemid_candidate,pair_pos_target_index
0,2424,"[1653, 1917, 2797, 2017, 3578]",3,"[1552, 1907, 1249, 1466, 2628]","[[1552, 1466], [1907, 1466], [1249, 1466], [1466, 2628]]","[1, 1, 1, 0]"
1,1026,"[3556, 3902, 3185, 3578, 3598]",3,"[3952, 1069, 3749, 3570, 2940]","[[3952, 3570], [3570, 1069], [3570, 3749], [2940, 3570]]","[1, 0, 0, 1]"
2,5680,"[1589, 1597, 1711, 1459, 2187]",1,"[2076, 1805, 1092, 3519, 1210]","[[2076, 1805], [1805, 1092], [3519, 1805], [1210, 1805]]","[1, 0, 1, 1]"
3,123,"[381, 14, 3809, 1661, 3051]",2,"[3791, 450, 1352, 2600, 1079]","[[1352, 3791], [450, 1352], [1352, 2600], [1352, 1079]]","[0, 1, 0, 0]"
4,1531,"[2455, 3481, 1965, 480, 2454]",1,"[144, 3450, 3457, 23, 3917]","[[144, 3450], [3457, 3450], [23, 3450], [3917, 3450]]","[1, 1, 1, 1]"
...,...,...,...,...,...,...
9933,1072,"[3379, 1387, 898, 58, 1674]",4,"[34, 2396, 2724, 3545, 296]","[[34, 296], [2396, 296], [2724, 296], [296, 3545]]","[1, 1, 1, 0]"
9934,1316,"[2686, 3185, 2070, 2396, 2791]",1,"[2699, 3929, 3596, 1129, 2572]","[[2699, 3929], [3929, 3596], [3929, 1129], [3929, 2572]]","[1, 0, 0, 0]"
9935,3343,"[318, 3743, 3861, 3752, 3617]",0,"[3798, 3263, 3785, 528, 3578]","[[3798, 3263], [3785, 3798], [3798, 528], [3578, 3798]]","[0, 1, 0, 1]"
9936,840,"[1955, 1372, 580, 1927, 2302]",3,"[2070, 3606, 148, 2762, 2968]","[[2762, 2070], [2762, 3606], [2762, 148], [2762, 2968]]","[0, 0, 0, 0]"


In [4]:
read_example_df = pd.read_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv", delimiter="\t")
read_example_df = read_example_df[["userid", "pos_target_index", "itemid_candidate"]]
# Strip the first and last character (the surrounding brackets of the stored list text).
read_example_df["itemid_candidate"] = read_example_df["itemid_candidate"].map(lambda text: text[1:-1])
# One column per candidate position, then convert every piece to an integer.
candidate = read_example_df['itemid_candidate'].str.split(',', expand=True)
for col in candidate.columns:
    candidate[col] = candidate[col].map(int)
candidate.values

array([[1552, 1907, 1249, 1466, 2628],
       [3952, 1069, 3749, 3570, 2940],
       [2076, 1805, 1092, 3519, 1210],
       ...,
       [3798, 3263, 3785,  528, 3578],
       [2070, 3606,  148, 2762, 2968],
       [2860,    2, 1127, 2153,  597]])