In [1]:
import pandas as pd
import numpy as np
import warnings
import time
import itertools
import copy
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Render all DataFrame columns, and up to 100 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth',100)

In [2]:
# Dataset sub-directory used when building output paths.
# NOTE(review): the value keeps a leading "/", so paths written as
# f"./Dataset/{dataset_name}/..." contain a doubled slash ("./Dataset//Music/...").
dataset_name = "/Music"

# 1. Read Data

In [None]:
# Load the item metadata table (tab-separated, latin-1 encoded).
item_df = pd.read_csv('../Dataset/Amazon_CDs_Vinyl/Amazon_CDs_and_Vinyl.item',sep="\t",encoding="latin-1")
# Replace the column names read from the file with short, uniform ones.
item_df.columns = ["itemid","title","categories","brand","sales_type","sales_rank","price"]
# Row count before any filtering.
item_df.shape[0]
# Keep only rows whose sales_type is exactly "CDs & Vinyl".
item_df = item_df[item_df["sales_type"]=="CDs & Vinyl"].reset_index(drop=True)
item_df
# Fraction of missing values per column, largest first.
(item_df.isna().sum()/item_df.shape[0]).sort_values(ascending=False)

In [None]:
# Drop the two columns that are not used downstream, then discard every row
# that still has a missing value in any remaining column.
item_df = item_df.drop(columns=["sales_rank", "price"])
item_df = item_df.dropna().reset_index(drop=True)
# Normalise titles to str and remove the rows whose title starts with "<".
# A boolean mask is used instead of rewriting such titles to a "-1" sentinel,
# so a real title equal to "-1" is kept and an empty title cannot raise.
item_df["title"] = item_df["title"].astype(str)
item_df = item_df[~item_df["title"].str.startswith("<")].reset_index(drop=True)
item_df

In [None]:
# Load the user-item interaction table (tab-separated).
rating_df = pd.read_csv('../Dataset/Amazon_CDs_Vinyl/Amazon_CDs_and_Vinyl.inter',sep="\t")
rating_df.columns = ["userid","itemid","rating","timestamp"]
rating_df = rating_df[rating_df["itemid"].isin(item_df.itemid.tolist())].reset_index(drop=True) # filter out interactions whose item is not in item_df
rating_df
# Fraction of missing values per column, largest first.
(rating_df.isna().sum()/rating_df.shape[0]).sort_values(ascending=False)
# Number of distinct users and distinct items after the filter.
rating_df["userid"].nunique()
rating_df["itemid"].nunique()

In [None]:
# Count interactions per item: after grouping by itemid, the one remaining
# column ("userid") holds the number of non-null user ids for each item.
item_interactions_count = rating_df[["userid","itemid"]].groupby(["itemid"]).count()
item_interactions_count.columns = ["count"]
# Most-interacted items first. No `kind` is passed, so the order among items
# with equal counts is whatever sort_values' default algorithm produces.
item_interactions_count.sort_values(["count"] , inplace=True, ascending=False)
item_interactions_count

In [None]:
# Preview the first 50,000 rows of the count table (it is sorted by count, descending).
item_interactions_count[:50000]

# 2. Filter

In [None]:
# Item ids of the first 50,000 rows of item_interactions_count (sorted by count, descending).
# NOTE(review): despite the "idset" name this is a plain list.
item_title_exist_idset = item_interactions_count[:50000].index.tolist()
# Keep only the interactions whose item is among those ids.
rating_df_fliter = rating_df[rating_df["itemid"].isin(item_title_exist_idset)].reset_index(drop=True)
rating_df_fliter

In [None]:
# Number of interactions per distinct rating value in the filtered table.
rating_df_fliter["rating"].value_counts()

In [None]:
# Binary label per interaction: 1 when the rating is at least 4, otherwise 0.
rating_df_fliter["label"] = rating_df_fliter["rating"].map(lambda rating: 1 if rating >= 4 else 0)
rating_df_fliter
# Share of interactions that received label 1.
len(rating_df_fliter[rating_df_fliter["label"] == 1]) / len(rating_df_fliter)

In [None]:
# Popularity per item = number of label==1 interactions. value_counts() leaves
# the item id in the index, so that axis is named "itemid" and the counts are
# moved into a regular column called "pop".
item_pop = rating_df_fliter[rating_df_fliter["label"]==1]["itemid"].value_counts().rename_axis("itemid").reset_index(name="pop")
item_pop

In [None]:
# Order the interactions by user and then by timestamp (ascending) before
# collecting them into per-user lists.
rating_df_fliter.sort_values(["userid", "timestamp"] , inplace=True, ascending=True) 
rating_df_fliter = rating_df_fliter.reset_index(drop=True)
# One row per user: the item ids and their labels as two parallel lists.
sequence_df = rating_df_fliter.groupby(['userid']).agg(
    itemid_seq=("itemid", list),
    label_seq=("label", list)
).reset_index()
sequence_df

In [None]:
def get_seq(row):
    """Split one user's interaction sequence into positive and negative items.

    Args:
        row: object exposing ``itemid_seq`` and ``label_seq``, two parallel
            sequences (item ids and their labels).

    Returns:
        A 5-tuple ``(pos_seq, neg_seq, seq_length, pos_length, neg_length)``
        where ``pos_seq`` holds the items whose label equals 1, ``neg_seq``
        holds every other item, and the three integers are the lengths of
        ``itemid_seq``, ``pos_seq`` and ``neg_seq``. Item order is preserved.
    """
    pos_seq, neg_seq = [], []
    for position, label in enumerate(row.label_seq):
        # Route the item at the same position into the matching bucket.
        bucket = pos_seq if label == 1 else neg_seq
        bucket.append(row.itemid_seq[position])
    return pos_seq, neg_seq, len(row.itemid_seq), len(pos_seq), len(neg_seq)


# Run get_seq on every row; result_type="expand" spreads the returned 5-tuple
# across the five new columns listed on the left-hand side.
sequence_df[["pos_seq","neg_seq", "seq_length", "pos_seq_length", "neg_seq_length"]] = sequence_df.apply(get_seq, axis=1, result_type="expand") 
sequence_df

In [None]:
# Keep users that have at least 20 positive items and at least 10 negative items.
sequence_df_fliter = sequence_df[(sequence_df["pos_seq_length"]>=20) & (sequence_df["neg_seq_length"]>=10)].reset_index(drop=True)
# Restrict item_df to items that occur in at least one retained user's sequence.
item_df = item_df[item_df["itemid"].isin(list(itertools.chain.from_iterable(sequence_df_fliter["itemid_seq"].values.tolist())))].reset_index(drop=True)
sequence_df_fliter
item_df

# 3. LabelEncoder

In [None]:
# Map raw user ids and item ids to consecutive integer codes with LabelEncoder.
from sklearn import preprocessing

user_le = preprocessing.LabelEncoder()
user_le.fit(sequence_df_fliter['userid'].values.tolist())
print("user id unique nums:", len(user_le.classes_))
# The encoded values are wrapped in a fresh Series (default 0..n-1 index), so
# the assignment lines up by index with sequence_df_fliter, whose index was
# reset when it was created.
sequence_df_fliter['userid'] = pd.Series(user_le.transform(sequence_df_fliter['userid']))

# The item encoder is fitted on every item id that appears in any retained sequence.
item_le = preprocessing.LabelEncoder()
item_le.fit(list(itertools.chain.from_iterable(sequence_df_fliter["itemid_seq"].values.tolist())))
print("item id unique nums:", len(item_le.classes_))
# Only pos_seq / neg_seq are encoded here; itemid_seq keeps the raw item ids.
sequence_df_fliter['pos_seq'] = sequence_df_fliter['pos_seq'].apply(lambda x: list(item_le.transform(x)))
sequence_df_fliter['neg_seq'] = sequence_df_fliter['neg_seq'].apply(lambda x: list(item_le.transform(x)))

sequence_df_fliter

In [None]:
# Keep popularity rows only for items still present in item_df, then replace
# the raw item ids with the codes assigned by item_le.
item_pop = item_pop[item_pop["itemid"].isin(list(item_df['itemid'].values.tolist()))].reset_index(drop=True)
item_pop["itemid"] = pd.Series(item_le.transform(item_pop['itemid']))

# Same re-encoding for the item metadata table.
item_df["itemid"] = pd.Series(item_le.transform(item_df['itemid']))

In [None]:
# Write the encoded item table as a tab-separated file without the index column.
item_df.to_csv(f"./Dataset/{dataset_name}/processed/full_item_df.csv", sep="\t", index=False)

# 4. Generate Test Data for Top-K Ranking

In [None]:
def get_new_df(df, n):
    """Return a copy of ``df`` whose positive sequences lose their last ``n`` items.

    Args:
        df: DataFrame with a ``pos_seq`` column of sliceable sequences.
        n: number of trailing items to cut from every ``pos_seq``
            (the slice ``seq[:-n]`` is applied as-is).

    Returns:
        A deep copy of ``df`` with ``pos_seq`` truncated and
        ``pos_seq_length`` recomputed; ``df`` itself is left untouched.
    """
    truncated = copy.deepcopy(df)
    truncated["pos_seq"] = truncated["pos_seq"].map(lambda seq: seq[:-n])
    truncated["pos_seq_length"] = truncated["pos_seq"].map(len)
    return truncated

# data_index records how many trailing positives were cut from a row:
# 0 for the untouched sequences, 1..9 for the truncated variants.
sequence_df_fliter["data_index"] = 0
sequence_df_fliter_list = [sequence_df_fliter]
for i in range(1, 10):
    sequence_df_fliter_new = get_new_df(sequence_df_fliter, i)
    sequence_df_fliter_new["data_index"] = i
    sequence_df_fliter_list.append(sequence_df_fliter_new)

# Stack the original frame and its nine truncated copies into one table.
full_sequence_df = pd.concat(sequence_df_fliter_list).reset_index(drop=True)
full_sequence_df

In [None]:
import random
# Fix the state of Python's `random` module, which the candidate sampling and
# shuffling helpers in this notebook draw from.
random.seed(2023)

def get_pos_neg_target(row):
    """Return the last positive item and the last negative item of ``row``.

    Args:
        row: object exposing non-empty ``pos_seq`` and ``neg_seq`` sequences.

    Returns:
        Tuple ``(pos_target, neg_target)``.
    """
    pos_target = row.pos_seq[-1]
    neg_target = row.neg_seq[-1]
    return pos_target, neg_target


def get_full_candidate(row, n_candidate):
    """Build a shuffled candidate list of ``n_candidate`` items for one row.

    The list always contains ``row.pos_id_target`` and ``row.neg_id_target``;
    the remaining ``n_candidate - 2`` entries are sampled without replacement
    from ``row.neg_seq`` after one occurrence of the negative target has been
    removed. ``row.neg_seq`` itself is not modified.

    Args:
        row: object exposing ``neg_seq``, ``pos_id_target`` and ``neg_id_target``.
        n_candidate: total number of candidates to return.

    Returns:
        A list of ``n_candidate`` item ids in random order.
    """
    pool = copy.deepcopy(row.neg_seq)
    pool.remove(row.neg_id_target)
    picked = [row.pos_id_target, row.neg_id_target]
    picked.extend(random.sample(pool, n_candidate - 2))
    random.shuffle(picked)

    return picked


def get_sub_candidate(row, n_candidate):
    """Draw a smaller shuffled candidate list out of ``row.full_candidate``.

    Both targets are always kept; the other ``n_candidate - 2`` entries are
    sampled without replacement from the rest of ``row.full_candidate``
    (one occurrence of each target removed first). The input list is not
    modified.

    Args:
        row: object exposing ``full_candidate``, ``pos_id_target`` and
            ``neg_id_target``.
        n_candidate: total number of candidates to return.

    Returns:
        A list of ``n_candidate`` item ids in random order.
    """
    remaining = copy.deepcopy(row.full_candidate)
    for target in (row.pos_id_target, row.neg_id_target):
        remaining.remove(target)
    picked = [row.pos_id_target, row.neg_id_target]
    picked.extend(random.sample(list(remaining), n_candidate - 2))
    random.shuffle(picked)

    return picked


def get_full_history(row, n_history):
    """Return up to ``n_history`` positives that directly precede the last one.

    Args:
        row: object exposing a ``pos_seq`` sequence.
        n_history: maximum number of items to return.

    Returns:
        The slice of ``pos_seq`` that ends just before its final element and
        spans at most ``n_history`` items.
    """
    start = -(n_history + 1)
    return row.pos_seq[start:-1]


def get_sub_history(row, n_history):
    """Return the trailing ``n_history`` entries of ``row.full_history``.

    Args:
        row: object exposing a ``full_history`` sequence.
        n_history: number of trailing items to keep (used directly as the
            negative slice start).

    Returns:
        ``row.full_history[-n_history:]``.
    """
    return row.full_history[-n_history:]

def get_pos_target_index(row):
    """Return the position of the positive target inside the candidate list.

    Args:
        row: mapping with keys ``"itemid_candidate"`` (a list) and
            ``"pos_id_target"``.

    Returns:
        Index of the first occurrence of the positive target in the list.
    """
    candidates = row["itemid_candidate"]
    target = row["pos_id_target"]
    return candidates.index(target)

In [None]:
# Take an explicit copy of the selected columns: the assignments below then add
# columns to an independent frame rather than to a slice of full_sequence_df,
# which would otherwise trigger pandas' SettingWithCopyWarning.
full_sequence_df_new = full_sequence_df[["userid","pos_seq","neg_seq","data_index"]].copy()
# Targets are the last positive and the last negative item of each row.
full_sequence_df_new[["pos_id_target", "neg_id_target"]] = full_sequence_df_new.apply(get_pos_neg_target, axis=1, result_type="expand")
# 10 candidates per row: both targets plus 8 further items sampled from neg_seq.
full_sequence_df_new["full_candidate"] = full_sequence_df_new.apply(get_full_candidate, n_candidate=10, axis=1)

In [None]:
# Per row: up to 10 positives immediately preceding the last element of pos_seq
# (the last element itself, i.e. the positive target, is excluded).
full_sequence_df_new["full_history"] = full_sequence_df_new.apply(get_full_history, n_history=10, axis=1)
full_sequence_df_new

In [None]:
# Write the full sequence table as a tab-separated file without the index column.
full_sequence_df_new.to_csv(f"./Dataset/{dataset_name}/processed/full_sequence_df.csv", sep="\t", index=False)

In [None]:
def get_pairwise_data(row):
    """Pair the positive candidate with every other candidate, in random order.

    Args:
        row: object exposing ``itemid_candidate`` (list of item ids) and
            ``pos_target_index`` (position of the positive item in that list).

    Returns:
        Tuple ``(pair_list, answer_list)``: ``pair_list[k]`` is a two-item list
        holding the positive item and the k-th remaining candidate in shuffled
        order, and ``answer_list[k]`` is the index (0 or 1) of the positive
        item inside that pair. ``row.itemid_candidate`` is not modified.
    """
    items = row.itemid_candidate
    positive = items[row.pos_target_index]
    negatives = copy.deepcopy(items)
    negatives.remove(positive)

    def _shuffled_pair(negative):
        # One shuffle per pair, in candidate order.
        duo = [positive, negative]
        random.shuffle(duo)
        return duo

    pair_list = [_shuffled_pair(negative) for negative in negatives]
    answer_list = [duo.index(positive) for duo in pair_list]

    return pair_list, answer_list

In [None]:
def get_topk_final_data(df):
    """Write one evaluation CSV per (candidate size, history size) combination.

    For every ``n_candidate`` in 2, 5, 10 and every ``n_history`` in 1, 3, 5, 10
    a deep copy of ``df`` receives a sub-sampled candidate list, a shortened
    history, the index of the positive target, and the pairwise data; the rows
    are shuffled with a fixed ``random_state`` and the frame is saved as a
    tab-separated file under the dataset's ``LLM`` directory.

    Args:
        df: frame providing the columns read by ``get_sub_candidate`` and
            ``get_sub_history`` (``full_candidate``, ``pos_id_target``,
            ``neg_id_target``, ``full_history``).

    Returns:
        None. The results are written to disk.
    """
    settings = [(n_cand, n_hist) for n_cand in [2, 5, 10] for n_hist in [1, 3, 5, 10]]
    for n_candidate, n_history in settings:
        print(f"n_candidate:{n_candidate} ; n_history{n_history}")
        frame = copy.deepcopy(df)
        frame["itemid_candidate"] = frame.apply(get_sub_candidate, n_candidate=n_candidate, axis=1)
        frame["itemid_history"] = frame.apply(get_sub_history, n_history=n_history, axis=1)
        frame["pos_target_index"] = frame.apply(get_pos_target_index, axis=1)
        # Shuffle the rows before the pairwise data is generated.
        frame = frame.sample(frac=1.0, random_state=2023).reset_index(drop=True)
        frame[["pair_itemid_candidate", "pair_pos_target_index"]] = frame.apply(get_pairwise_data, axis=1, result_type="expand")
        frame.to_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@{n_candidate}_history@{n_history}.csv", sep="\t", index=False)

In [None]:
# Generate and save every candidate/history configuration from the full sequence table.
get_topk_final_data(full_sequence_df_new)

# 5. Generate Datamaps

In [None]:
def process_name(s):
    """Clean an item title with a fixed series of substring replacements.

    Applied in this order: ``&amp;`` -> ``&``, two spaces -> one space,
    newline -> space, backslash -> space. Each replacement is a single pass.

    Args:
        s: the title string.

    Returns:
        The cleaned string.
    """
    replacements = (
        ("&amp;", "&"),
        ("  ", " "),
        ("\n", " "),
        ("\\", " "),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    return s
# Clean every title in place with process_name.
item_df["title"] = item_df["title"].apply(lambda x: process_name(x))
item_df

In [None]:
# Encoded item id -> cleaned title.
id2item_dict = item_df.set_index("itemid")["title"].to_dict()
id2item_dict
# Cleaned title -> encoded item id.
# NOTE(review): if two items share a title, only one id can survive under that
# key in this dict; confirm whether duplicate titles occur.
item2id_dict = item_df.set_index("title")["itemid"].to_dict()
item2id_dict

In [None]:
import json

# Bundle both lookup tables and store them as a single JSON document.
datamaps = {
    "id2item_dict": id2item_dict,
    "item2id_dict": item2id_dict,
}
json_str = json.dumps(datamaps)
with open(f"./Dataset/{dataset_name}/LLM/datamaps.json", 'w') as out:
    out.write(json_str)
    

In [None]:
# Encoded item id -> popularity count taken from item_pop.
item2pop_dict = item_pop.set_index("itemid")["pop"].to_dict()
# Items known to id2item_dict but absent from item_pop get popularity 0.
for i in id2item_dict:
    item2pop_dict.setdefault(i, 0)
item2pop_dict

# Store the popularity lookup as JSON.
json_str = json.dumps(item2pop_dict)
with open(f"./Dataset/{dataset_name}/LLM/popularity_datamaps.json", 'w') as out:
    out.write(json_str)

## Example of Loading Final Evaluation Data 

In [3]:
import ast

# Load one generated evaluation file and keep the columns used for evaluation.
# .copy() makes the selection an independent frame, so the column assignments
# below do not go through a slice of the frame returned by read_csv.
read_example_df = pd.read_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv", delimiter="\t")
read_example_df = read_example_df[["userid", "itemid_history", "pos_target_index", "itemid_candidate", "pair_itemid_candidate", "pair_pos_target_index"]].copy()
# The list-valued columns come back from the CSV as text such as "[1, 2, 3]".
# ast.literal_eval parses Python literals only, so unlike eval() it cannot
# execute arbitrary code stored in the file.
for col in ['itemid_history', 'itemid_candidate', "pair_itemid_candidate", "pair_pos_target_index"]:
    read_example_df[col] = read_example_df[col].apply(ast.literal_eval)

read_example_df

Unnamed: 0,userid,itemid_history,pos_target_index,itemid_candidate,pair_itemid_candidate,pair_pos_target_index
0,368,"[27737, 27781, 27790, 27796, 27826]",2,"[28426, 21108, 27850, 16683, 13451]","[[27850, 28426], [27850, 21108], [16683, 27850], [13451, 27850]]","[0, 0, 1, 1]"
1,208,"[4186, 6194, 8744, 11293, 23311]",2,"[20267, 18222, 23529, 2581, 25927]","[[23529, 20267], [23529, 18222], [2581, 23529], [25927, 23529]]","[0, 0, 1, 1]"
2,216,"[26957, 28656, 6368, 3468, 25025]",3,"[30264, 22236, 30606, 4696, 18873]","[[30264, 4696], [4696, 22236], [4696, 30606], [4696, 18873]]","[1, 0, 0, 0]"
3,345,"[4590, 18403, 18497, 19024, 10122]",4,"[24564, 5935, 6284, 18015, 9256]","[[9256, 24564], [9256, 5935], [9256, 6284], [18015, 9256]]","[0, 0, 0, 1]"
4,317,"[5386, 175, 2460, 2415, 6867]",2,"[8835, 19746, 7714, 2169, 25016]","[[8835, 7714], [7714, 19746], [7714, 2169], [7714, 25016]]","[1, 0, 0, 0]"
...,...,...,...,...,...,...
9085,398,"[29512, 3807, 29575, 29657, 22694]",2,"[2767, 19867, 17303, 28289, 29067]","[[2767, 17303], [19867, 17303], [28289, 17303], [29067, 17303]]","[1, 1, 1, 1]"
9086,595,"[6055, 20687, 20256, 14337, 16046]",2,"[16335, 21952, 20674, 12043, 15151]","[[20674, 16335], [21952, 20674], [20674, 12043], [20674, 15151]]","[0, 1, 0, 0]"
9087,16,"[5407, 5813, 10697, 13929, 3645]",2,"[3342, 5437, 15959, 1635, 3534]","[[3342, 15959], [15959, 5437], [1635, 15959], [3534, 15959]]","[1, 0, 1, 1]"
9088,203,"[7728, 11668, 9318, 23837, 896]",3,"[27678, 24409, 22184, 22454, 6852]","[[27678, 22454], [24409, 22454], [22454, 22184], [22454, 6852]]","[1, 1, 0, 0]"


In [4]:
# Load one generated evaluation file and turn the candidate column into an
# integer matrix with one row per example.
read_example_df = pd.read_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv", delimiter="\t")
# .copy() makes the selection an independent frame, so the assignment below
# does not write into a slice of the frame returned by read_csv.
read_example_df = read_example_df[["userid", "pos_target_index", "itemid_candidate"]].copy()
# Strip the surrounding "[" and "]" from the stored list text.
read_example_df["itemid_candidate"] = read_example_df["itemid_candidate"].apply(lambda x: x[1:-1])
# Split on commas into one column per candidate, then convert each to int
# (int() tolerates the space that follows each comma).
candidate = read_example_df['itemid_candidate'].str.split(',', expand=True)
for col in candidate.columns:
    candidate[col] = candidate[col].apply(lambda x: int(x))
candidate.values

array([[28426, 21108, 27850, 16683, 13451],
       [20267, 18222, 23529,  2581, 25927],
       [30264, 22236, 30606,  4696, 18873],
       ...,
       [ 3342,  5437, 15959,  1635,  3534],
       [27678, 24409, 22184, 22454,  6852],
       [26769, 27124, 28331, 27772, 25304]])