In [1]:
import pandas as pd
import numpy as np
import warnings
import time
import itertools
import copy
warnings.filterwarnings("ignore")
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth',100)

In [2]:
# Dataset folder name; it is interpolated into the "./Dataset/{dataset_name}/..." paths used in later cells.
# NOTE(review): the leading "/" makes those paths contain "//" (e.g. "./Dataset//Book/...") — confirm this is intended.
dataset_name = "/Book"

# 1. Read Data

In [None]:
# Load the raw item metadata (tab-separated, latin-1 encoded) and give the columns short names.
item_df = pd.read_csv(
    '../Dataset/Amazon_Books/Amazon_Books.item',
    sep="\t",
    encoding="latin-1",
)
item_df.columns = [
    "itemid", "sales_type", "sales_rank", "categories", "title", "price", "brand",
]
# Row count before filtering.
len(item_df)
# Keep only rows whose sales_type is "Books".
is_book = item_df["sales_type"] == "Books"
item_df = item_df.loc[is_book].reset_index(drop=True)
item_df
# Fraction of missing values per column, largest first.
item_df.isna().sum().div(len(item_df)).sort_values(ascending=False)

In [None]:
# Drop the metadata columns that are not needed, then remove rows with any missing value
# (per the original note: items without a title).
unused_columns = ["categories","brand","sales_type","sales_rank","price"]
item_df = item_df.drop(columns=unused_columns)
item_df = item_df.dropna().reset_index(drop=True)
item_df

In [None]:
# Load the raw interactions (tab-separated) and give the columns short names.
rating_df = pd.read_csv(
    '../Dataset/Amazon_Books/Amazon_Books.inter',
    sep="\t",
)
rating_df.columns = ["userid", "itemid", "rating", "timestamp"]
# Keep only interactions whose item is still present in item_df.
known_item_ids = item_df.itemid.tolist()
rating_df = rating_df.loc[rating_df["itemid"].isin(known_item_ids)].reset_index(drop=True)
rating_df
# Fraction of missing values per column, largest first.
rating_df.isna().sum().div(len(rating_df)).sort_values(ascending=False)
# Number of distinct users and items left.
rating_df["userid"].nunique()
rating_df["itemid"].nunique()

In [None]:
# Count interactions per item and order the items from most to least interacted.
item_interactions_count = (
    rating_df
    .groupby("itemid")["userid"]
    .count()
    .to_frame("count")
    .sort_values("count", ascending=False)
)
item_interactions_count

In [None]:
# Preview the first 10000 rows of the count table (the same slice the filtering cell below selects).
item_interactions_count[:10000]

# 2. Filter

In [None]:
# Keep only the interactions whose item is among the first 10000 entries of the count table.
top_item_index = item_interactions_count.index[:10000]
item_title_exist_idset = top_item_index.tolist()
in_top_items = rating_df["itemid"].isin(item_title_exist_idset)
rating_df_fliter = rating_df.loc[in_top_items].reset_index(drop=True)
rating_df_fliter

In [None]:
# Show how often each rating value occurs in the filtered interactions.
rating_df_fliter["rating"].value_counts()

In [None]:
# Binarise ratings: 4 and above becomes label 1, everything else label 0.
def _rating_to_label(rating):
    if rating >= 4:
        return 1
    return 0

rating_df_fliter["label"] = rating_df_fliter["rating"].map(_rating_to_label)
rating_df_fliter
# Share of interactions labelled 1.
n_positive = rating_df_fliter[rating_df_fliter["label"]==1].shape[0]
n_positive / rating_df_fliter.shape[0]

In [None]:
# Popularity of an item = number of label-1 interactions it received.
positive_item_ids = rating_df_fliter.loc[rating_df_fliter["label"]==1, "itemid"]
item_pop = (
    positive_item_ids
    .value_counts()
    .rename_axis("itemid")
    .reset_index(name="pop")
)
item_pop

In [None]:
# Order every user's interactions by timestamp, then collect them into per-user lists.
rating_df_fliter = (
    rating_df_fliter
    .sort_values(["userid", "timestamp"], ascending=True)
    .reset_index(drop=True)
)
sequence_df = (
    rating_df_fliter
    .groupby(['userid'])
    .agg(itemid_seq=("itemid", list), label_seq=("label", list))
    .reset_index()
)
sequence_df

In [None]:
def get_seq(row):
    """Split a row's item sequence into label-1 items and all other items.

    Reads ``row.itemid_seq`` and ``row.label_seq`` position by position and
    returns ``(pos_seq, neg_seq, total_length, pos_length, neg_length)``,
    where ``total_length`` is ``len(row.itemid_seq)``. Relative order inside
    each of the two lists follows the input order.
    """
    liked, disliked = [], []
    for item, flag in zip(row.itemid_seq, row.label_seq):
        bucket = liked if flag == 1 else disliked
        bucket.append(item)
    return liked, disliked, len(row.itemid_seq), len(liked), len(disliked)

# Expand the five values returned by get_seq into five new columns.
seq_columns = ["pos_seq","neg_seq", "seq_length", "pos_seq_length", "neg_seq_length"]
sequence_df[seq_columns] = sequence_df.apply(get_seq, axis=1, result_type="expand")
sequence_df

In [None]:
# Number of users that satisfy different minimum lengths for the positive / negative sequences.
sequence_df[(sequence_df["pos_seq_length"]>=6) & (sequence_df["neg_seq_length"]>=6)].shape[0]
sequence_df[(sequence_df["pos_seq_length"]>=12) & (sequence_df["neg_seq_length"]>=10)].shape[0]
sequence_df[(sequence_df["pos_seq_length"]>=13) & (sequence_df["neg_seq_length"]>=10)].shape[0]

In [None]:
# Keep users with at least 13 positives and 10 negatives, then keep only the items they interacted with.
enough_pos = sequence_df["pos_seq_length"] >= 13
enough_neg = sequence_df["neg_seq_length"] >= 10
sequence_df_fliter = sequence_df.loc[enough_pos & enough_neg].reset_index(drop=True)
kept_item_ids = list(
    itertools.chain.from_iterable(sequence_df_fliter["itemid_seq"].values.tolist())
)
item_df = item_df.loc[item_df["itemid"].isin(kept_item_ids)].reset_index(drop=True)
sequence_df_fliter
item_df

# 3. LabelEncoder

In [None]:
# LabelEncoder
from sklearn import preprocessing

# Replace raw user ids by the integer codes of a LabelEncoder fitted on them.
user_le = preprocessing.LabelEncoder()
user_le.fit(sequence_df_fliter['userid'].values.tolist())
print("user id unique nums:", len(user_le.classes_))
sequence_df_fliter['userid'] = pd.Series(user_le.transform(sequence_df_fliter['userid']))


# The item encoder is fitted on every item id that occurs in a kept user's sequence.
item_le = preprocessing.LabelEncoder()
item_le.fit(list(itertools.chain.from_iterable(sequence_df_fliter["itemid_seq"].values.tolist())))
print("item id unique nums:", len(item_le.classes_))
# .tolist() converts the encoded array into plain Python ints. list(array) would keep NumPy
# scalars inside the lists; when such lists are later written with to_csv their text form
# depends on the NumPy version (NumPy >= 2 prints scalars as "np.int64(5)"), which the
# string-splitting loader at the end of this notebook cannot parse.
sequence_df_fliter['pos_seq'] = sequence_df_fliter['pos_seq'].apply(lambda x: item_le.transform(x).tolist())
sequence_df_fliter['neg_seq'] = sequence_df_fliter['neg_seq'].apply(lambda x: item_le.transform(x).tolist())

sequence_df_fliter

In [None]:
# Restrict the popularity table to the surviving items, then replace raw item ids by their encoded ids.
surviving_item_ids = list(item_df['itemid'].values.tolist())
item_pop = item_pop.loc[item_pop["itemid"].isin(surviving_item_ids)].reset_index(drop=True)
encoded_pop_ids = item_le.transform(item_pop['itemid'])
item_pop["itemid"] = pd.Series(encoded_pop_ids)

encoded_item_ids = item_le.transform(item_df['itemid'])
item_df["itemid"] = pd.Series(encoded_item_ids)

In [None]:
# Write the encoded item table as a tab-separated file without the index column.
# NOTE(review): nothing in this notebook creates the target directory — presumably it must already exist; confirm.
item_df.to_csv(f"./Dataset/{dataset_name}/processed/full_item_df.csv", sep="\t", index=False)

# 4. Generate Test Data for Top-K Ranking

In [None]:
def get_new_df(df, n):
    """Return a copy of ``df`` whose ``pos_seq`` lists have their last ``n`` items removed.

    ``pos_seq_length`` is recomputed from the shortened lists. The input frame
    is not modified.

    Parameters
    ----------
    df : DataFrame with a list-valued ``pos_seq`` column.
    n : number of trailing items to drop (expected to be >= 0). ``n == 0``
        keeps every sequence unchanged; ``n`` larger than a sequence yields
        an empty list.
    """
    df_new = copy.deepcopy(df)

    def _drop_tail(seq):
        # seq[:-n] would return an empty list for n == 0, so compute the end
        # position explicitly instead of relying on a negative index.
        return seq[:max(len(seq) - n, 0)]

    df_new["pos_seq"] = df_new["pos_seq"].apply(_drop_tail)
    df_new["pos_seq_length"] = df_new["pos_seq"].apply(len)

    return df_new

# data_index 0 is the untouched frame; data_index i is the frame with the last i positives removed.
sequence_df_fliter["data_index"] = 0
sequence_df_fliter_list = [sequence_df_fliter]
for i in range(1, 3):
    sequence_df_fliter_new = get_new_df(sequence_df_fliter, i)
    sequence_df_fliter_new["data_index"] = i
    sequence_df_fliter_list.append(sequence_df_fliter_new)

# Stack the variants into one frame with a fresh 0..n-1 index.
full_sequence_df = pd.concat(sequence_df_fliter_list).reset_index(drop=True)
full_sequence_df

In [None]:
import random
random.seed(2023)

def get_pos_neg_target(row):
    """Return the last element of ``row.pos_seq`` and the last element of ``row.neg_seq`` as a pair."""
    pos_target = row.pos_seq[-1]
    neg_target = row.neg_seq[-1]
    return pos_target, neg_target


def get_full_candidate(row, n_candidate):
    """Build a shuffled candidate list of ``n_candidate`` distinct items for one row.

    The list always contains ``row.pos_id_target`` and ``row.neg_id_target``;
    the remaining ``n_candidate - 2`` items are sampled without replacement
    from ``row.neg_seq``.

    Both targets and repeated items are excluded from the sampling pool, so
    neither target can appear a second time in the result. ``row.neg_seq`` is
    not modified.

    Raises
    ------
    ValueError
        From ``random.sample`` when the pool holds fewer than
        ``n_candidate - 2`` items (or ``n_candidate < 2``).
    """
    targets = (row.pos_id_target, row.neg_id_target)
    # dict.fromkeys drops repeats while keeping first-seen order, so for a
    # neg_seq without repeats the pool is exactly "neg_seq minus the negative
    # target" and the random draw is the same as with a plain remove().
    pool = [item for item in dict.fromkeys(row.neg_seq) if item not in targets]
    candidate = [row.pos_id_target, row.neg_id_target] + random.sample(pool, n_candidate - 2)
    random.shuffle(candidate)

    return candidate


def get_sub_candidate(row, n_candidate):
    """Pick a shuffled subset of ``n_candidate`` items from ``row.full_candidate``.

    The subset always contains ``row.pos_id_target`` and ``row.neg_id_target``;
    the other ``n_candidate - 2`` items are sampled without replacement from
    what is left of ``row.full_candidate`` after removing one occurrence of
    each target. ``row.full_candidate`` itself is left untouched.
    """
    pos_item, neg_item = row.pos_id_target, row.neg_id_target
    remaining = copy.deepcopy(row.full_candidate)
    for target in (pos_item, neg_item):
        remaining.remove(target)
    extras = random.sample(list(remaining), n_candidate - 2)
    candidate = [pos_item, neg_item, *extras]
    random.shuffle(candidate)

    return candidate


def get_full_history(row, n_history):
    """Return up to ``n_history`` items of ``row.pos_seq`` that directly precede its last item."""
    # The last element is excluded; the window ends right before it.
    return row.pos_seq[-(n_history + 1):-1]


def get_sub_history(row, n_history):
    """Return the last ``n_history`` items of ``row.full_history``.

    ``n_history == 0`` yields an empty history; a value larger than the
    history length yields the whole history.
    """
    if n_history == 0:
        # A "-0" start index would slice from the beginning and return everything.
        return row.full_history[:0]
    history = row.full_history[-n_history:]
    return history

def get_pos_target_index(row):
    """Return the position of ``row["pos_id_target"]`` inside ``row["itemid_candidate"]``."""
    target = row["pos_id_target"]
    candidates = row["itemid_candidate"]
    return candidates.index(target)

In [None]:
# Work on an explicit copy of the selected columns: the columns added below are then written to
# an independent frame instead of to a column-subset of full_sequence_df (chained assignment).
full_sequence_df_new = full_sequence_df[["userid","pos_seq","neg_seq","data_index"]].copy()
# Targets are the last positive and the last negative item of each row.
full_sequence_df_new[["pos_id_target", "neg_id_target"]] = full_sequence_df_new.apply(get_pos_neg_target, axis=1, result_type="expand")
# 10 candidates per row: both targets plus 8 items sampled from the row's negatives.
full_sequence_df_new["full_candidate"] = full_sequence_df_new.apply(get_full_candidate, n_candidate=10, axis=1)

In [None]:
# For every row, store up to 10 positives that directly precede the positive target as its full history.
full_sequence_df_new["full_history"] = full_sequence_df_new.apply(get_full_history, n_history=10, axis=1)
full_sequence_df_new

In [None]:
# Write the frame with targets, candidates and histories as a tab-separated file without the index column.
full_sequence_df_new.to_csv(f"./Dataset/{dataset_name}/processed/full_sequence_df.csv", sep="\t", index=False)

In [None]:
def get_pairwise_data(row):
    """Turn one row's candidate list into (positive, other) pairs in random order.

    The positive item is ``row.itemid_candidate[row.pos_target_index]``. One
    occurrence of it is removed from a copy of the candidates, and each
    remaining item is paired with it. Every pair is shuffled in place, and the
    position of the positive item inside the shuffled pair is recorded.

    Returns ``(pair_list, answer_list)`` where ``answer_list[k]`` is the index
    of the positive item within ``pair_list[k]``.
    """
    items = row.itemid_candidate
    target = items[row.pos_target_index]
    others = copy.deepcopy(items)
    others.remove(target)

    pairs, answers = [], []
    for other in others:
        duo = [target, other]
        random.shuffle(duo)
        pairs.append(duo)
        answers.append(duo.index(target))

    return pairs, answers

In [None]:
def get_topk_final_data(df):
    """Write one evaluation CSV per (n_candidate, n_history) combination.

    For n_candidate in [2, 5, 10] and n_history in [1, 3, 5, 10], a copy of
    ``df`` gets a sub-sampled candidate list, a truncated history, the index of
    the positive target inside the candidates and the pairwise data. Rows are
    shuffled with a fixed ``random_state`` before the pairwise columns are
    added. Each result is saved as a tab-separated file under
    ``./Dataset/{dataset_name}/LLM/``. Returns nothing.
    """
    for n_candidate, n_history in itertools.product([2,5,10], [1,3,5,10]):
        print(f"n_candidate:{n_candidate} ; n_history{n_history}")
        frame = copy.deepcopy(df)
        frame["itemid_candidate"] = frame.apply(get_sub_candidate, n_candidate=n_candidate, axis=1)
        frame["itemid_history"] = frame.apply(get_sub_history, n_history=n_history, axis=1)
        frame["pos_target_index"] = frame.apply(get_pos_target_index, axis=1)
        # Shuffle the rows reproducibly before the pairwise columns are generated.
        frame = frame.sample(frac=1.0, random_state=2023).reset_index(drop=True)
        pair_columns = ["pair_itemid_candidate", "pair_pos_target_index"]
        frame[pair_columns] = frame.apply(get_pairwise_data, axis=1, result_type="expand")
        out_path = f"./Dataset/{dataset_name}/LLM/topk_candidate@{n_candidate}_history@{n_history}.csv"
        frame.to_csv(out_path, sep="\t", index=False)

In [None]:
# Generate and write the evaluation files for every (n_candidate, n_history) setting.
get_topk_final_data(full_sequence_df_new)

# 5. Generate Datamaps

In [None]:
def process_name(s):
    """Clean an item title string.

    Decodes the HTML entity ``&amp;``, turns newlines and backslashes into
    spaces, and collapses every run of spaces into a single space.
    """
    s = s.replace("&amp;", "&")
    s = s.replace("\n", " ")
    s = s.replace("\\", " ")
    # Collapse last and repeatedly: the replacements above can create new
    # double spaces, and a single pass leaves runs of three or more behind.
    while "  " in s:
        s = s.replace("  ", " ")
    return s
# Clean every title with process_name.
item_df["title"] = item_df["title"].map(process_name)
item_df

In [None]:
# Lookup tables between encoded item ids and titles, in both directions.
id2item_dict = item_df.set_index("itemid")["title"].to_dict()
id2item_dict
# NOTE(review): a dict holds each title only once, so if several items share a title,
# item2id_dict can map that title to just one itemid — confirm titles are unique or that this is acceptable.
item2id_dict = item_df.set_index("title")["itemid"].to_dict()
item2id_dict

In [None]:
import json

# Bundle both lookup tables and write them to one JSON file.
datamaps = {
    "id2item_dict": id2item_dict,
    "item2id_dict": item2id_dict,
}
json_str = json.dumps(datamaps)
with open(f"./Dataset/{dataset_name}/LLM/datamaps.json", 'w') as out:
    out.write(json_str)
    

In [None]:
# Popularity per encoded item id; items without an entry in item_pop get popularity 0.
item2pop_dict = item_pop.set_index("itemid")["pop"].to_dict()
for item_id in id2item_dict:
    item2pop_dict.setdefault(item_id, 0)
item2pop_dict

# Write the popularity table to its own JSON file.
json_str = json.dumps(item2pop_dict)
with open(f"./Dataset/{dataset_name}/LLM/popularity_datamaps.json", 'w') as out:
    out.write(json_str)

## Example of Loading Final Evaluation Data 

In [3]:
# Load one generated evaluation file and keep a subset of its columns.
read_example_df = pd.read_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv", delimiter="\t")
read_example_df = read_example_df[["userid", "itemid_history", "pos_target_index", "itemid_candidate", "pair_itemid_candidate", "pair_pos_target_index"]]
# The list-valued columns are read back from the CSV as strings, so each cell is parsed into a Python object.
# NOTE(review): eval() executes arbitrary expressions. If these files can come from an untrusted source,
# ast.literal_eval would be the safer parser — first confirm every cell holds only plain literals.
for col in ['itemid_history', 'itemid_candidate', "pair_itemid_candidate", "pair_pos_target_index"]:
    read_example_df[col] = read_example_df[col].apply(lambda x: eval(x))
    
read_example_df

Unnamed: 0,userid,itemid_history,pos_target_index,itemid_candidate,pair_itemid_candidate,pair_pos_target_index
0,2320,"[2155, 1452, 5497, 3648, 372]",3,"[4020, 2033, 3152, 8805, 113]","[[4020, 8805], [2033, 8805], [3152, 8805], [113, 8805]]","[1, 1, 1, 1]"
1,587,"[3943, 7409, 3670, 4575, 7972]",3,"[2769, 5957, 1958, 6116, 2768]","[[6116, 2769], [6116, 5957], [6116, 1958], [2768, 6116]]","[0, 0, 0, 1]"
2,3218,"[87, 7417, 109, 7957, 6329]",0,"[5548, 8912, 6000, 5207, 4549]","[[8912, 5548], [5548, 6000], [5548, 5207], [5548, 4549]]","[1, 0, 0, 0]"
3,3065,"[1023, 414, 5346, 5437, 1037]",0,"[1208, 4687, 1313, 1184, 4407]","[[1208, 4687], [1208, 1313], [1184, 1208], [4407, 1208]]","[0, 0, 1, 1]"
4,2677,"[4486, 1890, 703, 1267, 5135]",4,"[2215, 3863, 2815, 1216, 8989]","[[8989, 2215], [3863, 8989], [2815, 8989], [1216, 8989]]","[0, 1, 1, 1]"
...,...,...,...,...,...,...
10264,2429,"[5671, 6901, 7839, 2641, 1127]",4,"[3903, 9028, 612, 2218, 5541]","[[3903, 5541], [5541, 9028], [5541, 612], [5541, 2218]]","[1, 0, 0, 0]"
10265,2626,"[4589, 6106, 8470, 8536, 6394]",0,"[6395, 6393, 7761, 8500, 9252]","[[6393, 6395], [7761, 6395], [8500, 6395], [9252, 6395]]","[1, 1, 1, 1]"
10266,2743,"[3201, 6, 1256, 2733, 9204]",3,"[9055, 2614, 2878, 2237, 2617]","[[9055, 2237], [2237, 2614], [2878, 2237], [2237, 2617]]","[1, 0, 1, 0]"
10267,2234,"[4673, 4862, 5289, 6937, 379]",4,"[55, 2703, 191, 2233, 1513]","[[55, 1513], [2703, 1513], [191, 1513], [2233, 1513]]","[1, 1, 1, 1]"


In [4]:
# Load the same evaluation file and turn the candidate column into a 2-D integer array.
example_path = f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv"
read_example_df = pd.read_csv(example_path, delimiter="\t")
read_example_df = read_example_df[["userid", "pos_target_index", "itemid_candidate"]]
# Strip the first and last character (the surrounding brackets) of every candidate string.
read_example_df["itemid_candidate"] = read_example_df["itemid_candidate"].str[1:-1]
# One column per candidate position, each converted from text to int.
candidate = read_example_df['itemid_candidate'].str.split(',', expand=True)
for col in candidate.columns:
    candidate[col] = candidate[col].map(int)
candidate.values

array([[4020, 2033, 3152, 8805,  113],
       [2769, 5957, 1958, 6116, 2768],
       [5548, 8912, 6000, 5207, 4549],
       ...,
       [9055, 2614, 2878, 2237, 2617],
       [  55, 2703,  191, 2233, 1513],
       [4850, 8556, 4648, 1105, 6980]])