In [1]:
import ast
import copy
import itertools
import random
import time
import warnings

import numpy as np
import pandas as pd
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
from IPython.core.interactiveshell import InteractiveShell
# Render the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Show all columns and up to 100 characters per cell when displaying DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth',100)

In [2]:
# Dataset sub-directory that is interpolated into every output path below
# (f"./Dataset/{dataset_name}/...").
# NOTE(review): because of the leading "/", the formatted paths contain a
# double slash ("./Dataset//News/...") — confirm this is intended.
dataset_name = "/News"

# 1. Read Data

In [None]:
# Load the header-less, tab-separated news metadata and name its columns.
MIND_dev_news_df = pd.read_csv("../Dataset/MINDsmall_dev/news.tsv", sep="\t", header=None)
MIND_dev_news_df.columns = [
    "newsid", "category", "subcategory", "title", "abstract",
    "url", "title_entities", "abstract_entities",
]
# Discard the URL and entity columns; only the id and text fields are kept.
MIND_dev_news_df = MIND_dev_news_df.drop(columns=['url', 'title_entities', 'abstract_entities'])
MIND_dev_news_df
# Fraction of missing values per column, largest first.
(MIND_dev_news_df.isna().sum() / len(MIND_dev_news_df)).sort_values(ascending=False)

In [None]:
# Keep only news rows without any missing field and renumber the index from 0.
MIND_dev_news_df_fliter = MIND_dev_news_df.dropna().reset_index(drop=True)
MIND_dev_news_df_fliter

In [None]:
# News ids that occur exactly once in (all news + complete news): these are the
# rows dropna() removed, provided each newsid is unique in the raw file.
drop_newsid_list = (
    pd.concat([MIND_dev_news_df, MIND_dev_news_df_fliter])
    .drop_duplicates(subset=["newsid"], keep=False)["newsid"]
    .tolist()
)
drop_newsid_list

In [None]:
# Load the header-less, tab-separated impression log and name its columns.
MIND_dev_df = pd.read_csv("../Dataset/MINDsmall_dev/behaviors.tsv", header=None, delimiter="\t")
MIND_dev_df.columns = ["impressionid","userid","time","history","impression"]
# Split the space-separated id strings into lists. A missing cell is read as
# NaN; it must become an empty list, because str(NaN).split(' ') would yield
# the bogus pseudo-item ['nan'] (counted later in history lengths/popularity).
MIND_dev_df["history"] = MIND_dev_df["history"].apply(lambda x: [] if pd.isna(x) else str(x).split(' '))
MIND_dev_df["impression"] = MIND_dev_df["impression"].apply(lambda x: [] if pd.isna(x) else str(x).split(' '))
MIND_dev_df

# 2. Filter

In [None]:
def get_clean_impression_and_history(row, dropped_ids=None):
    """Clean one behaviour row: drop unusable news and keep a single positive.

    Parameters
    ----------
    row : object with ``impression`` and ``history`` attributes
        ``impression`` is an iterable of ``"<newsid>-<label>"`` strings and
        ``history`` an iterable of news ids.
    dropped_ids : set of str, optional
        News ids to discard. When omitted, the module-level
        ``drop_newsid_list`` is used (converted to a set once per call).

    Returns
    -------
    list
        ``[impression_list, label_list, pos_target, history_list,
        impression_pos_list]`` where

        * ``impression_list`` / ``label_list`` hold every kept non-positive
          impression plus only the FIRST positive one, in original order;
        * ``pos_target`` is that first positive id, or ``-1`` if none exists;
        * ``history_list`` is ``row.history`` without dropped ids;
        * ``impression_pos_list`` holds ALL kept positive ids.

    Raises
    ------
    ValueError
        If an impression entry has no ``-`` separator or a non-integer label.
    """
    # Hash lookups instead of scanning a list for every impression/history id.
    dropped = set(drop_newsid_list) if dropped_ids is None else dropped_ids

    impression_list = []
    impression_pos_list = []
    label_list = []
    pos_target = -1
    for entry in row.impression:
        # Split on the last hyphen only, so ids containing "-" stay intact.
        news_id, label = entry.rsplit("-", 1)
        if news_id in dropped:
            continue
        label = int(label)
        if label == 1:
            impression_pos_list.append(news_id)
            if pos_target != -1:
                # A positive was already kept; later positives are only
                # recorded in impression_pos_list.
                continue
            pos_target = news_id
        impression_list.append(news_id)
        label_list.append(label)

    # Order-preserving filter; row.history itself is left untouched.
    history_list = [news_id for news_id in row.history if news_id not in dropped]

    return [impression_list, label_list, pos_target, history_list, impression_pos_list]

In [None]:
# Clean every behaviour row and spread the five returned values over new columns.
MIND_dev_df[["impression_list", "label_list", "pos_target", "history_list", "impression_pos_list"]] = MIND_dev_df.apply(
    get_clean_impression_and_history, axis=1, result_type="expand"
)
# The raw columns are superseded by the cleaned ones.
MIND_dev_df = MIND_dev_df.drop(columns=['time', 'impression', 'history'])
MIND_dev_df

In [None]:
# Flatten all clicked impressions and all history items into two flat lists.
impression_list = list(itertools.chain.from_iterable(MIND_dev_df["impression_pos_list"]))
history_list = list(itertools.chain.from_iterable(MIND_dev_df["history_list"]))
# Popularity of an item = number of occurrences among clicks and histories.
impression_list = pd.Series(impression_list + history_list)
item_pop = (
    impression_list
    .value_counts()
    .rename_axis("itemid")
    .reset_index(name="pop")
)
item_pop

In [None]:
# Drop impressions without a positive target (-1 is the "no positive" marker).
MIND_dev_df = MIND_dev_df.loc[MIND_dev_df["pos_target"].ne(-1)].reset_index(drop=True)
MIND_dev_df["history_len"] = MIND_dev_df["history_list"].apply(len)
MIND_dev_df["impression_len"] = MIND_dev_df["impression_list"].apply(len)
# Keep rows with at least 10 history items and at least 10 candidate impressions.
MIND_dev_df_fliter = MIND_dev_df.loc[
    MIND_dev_df["history_len"].ge(10) & MIND_dev_df["impression_len"].ge(10)
].reset_index(drop=True)
MIND_dev_df_fliter

In [None]:
def truncat_data(row, n_candidate=10, n_history=10):
    """Truncate one row's candidates and history to fixed maximum sizes.

    Parameters
    ----------
    row : object with ``impression_list``, ``pos_target`` and ``history_list``
        ``impression_list`` must contain ``pos_target`` and at least one
        other item.
    n_candidate : int, default 10
        Maximum candidate-list length. The positive and the first negative
        are always included, so the result never has fewer than two items.
    n_history : int, default 10
        Number of most recent history items to keep (``0`` keeps none).

    Returns
    -------
    tuple
        ``(neg_target, candidate, history)`` where ``neg_target`` is the first
        non-positive impression, ``candidate`` is
        ``[pos_target, neg_target, <following negatives>]`` and ``history`` is
        the tail of ``row.history_list``.

    Raises
    ------
    ValueError
        If ``pos_target`` is not in ``impression_list``.
    IndexError
        If ``impression_list`` holds nothing but ``pos_target``.
    """
    # Work on a copy so the row's own list is not modified.
    negatives = list(row.impression_list)
    negatives.remove(row.pos_target)
    neg_target = negatives.pop(0)
    # Two slots are taken by the positive and the first negative; the remaining
    # slots follow n_candidate (previously hard-coded to 8, ignoring it).
    candidate = [row.pos_target, neg_target] + negatives[:max(n_candidate - 2, 0)]
    # Guard n_history == 0: a plain [-0:] slice would return the whole list.
    history = row.history_list[-n_history:] if n_history > 0 else []

    return neg_target, candidate, history

# Truncate every row and store the first negative, the candidate list and the
# history tail returned by truncat_data in three new columns.
MIND_dev_df_fliter[["neg_target", "full_candidate", "full_history"]] = MIND_dev_df_fliter.apply(truncat_data, n_candidate=10, n_history=10, axis=1, result_type="expand")
# Number of rows, then the frame itself (both are displayed).
MIND_dev_df_fliter.shape[0]
MIND_dev_df_fliter

In [None]:
# Keep the first 10000 filtered rows (in file order) as the working set.
MIND_dev_df_fliter_new = MIND_dev_df_fliter[:10000].reset_index(drop=True)

In [None]:
# Distinct users, and distinct items appearing in any candidate list or history.
test_user_set = set(MIND_dev_df_fliter_new["userid"])
test_item_set = set().union(
    *MIND_dev_df_fliter_new["full_candidate"],
    *MIND_dev_df_fliter_new["full_history"],
)
# Displayed: number of distinct candidate items, users, and all items.
len(set().union(*MIND_dev_df_fliter_new["full_candidate"]))
len(test_user_set)
len(test_item_set)

# 3. LabelEncoder

In [None]:
from sklearn import preprocessing

# Map raw user ids to consecutive integers.
user_le = preprocessing.LabelEncoder().fit(list(test_user_set))
print("user id unique nums:", len(user_le.classes_))

# Map raw news ids to consecutive integers.
item_le = preprocessing.LabelEncoder().fit(list(test_item_set))
print("item id unique nums:", len(item_le.classes_))

In [None]:
# Keep popularity rows only for items that occur in some candidate list.
item_pop = item_pop.loc[
    item_pop["itemid"].isin(set().union(*MIND_dev_df_fliter_new["full_candidate"]))
].reset_index(drop=True)
# Replace raw news ids by their encoded integer ids.
item_pop["itemid"] = item_le.transform(item_pop["itemid"])
item_pop

In [None]:
# Replace raw user/news ids by their label-encoded integer ids.
# NOTE: this cell overwrites its own inputs, so it must run exactly once per kernel.
MIND_dev_df_fliter_new["userid"] = pd.Series(user_le.transform(MIND_dev_df_fliter_new['userid']))
# .tolist() yields plain Python ints. list(ndarray) would keep np.int64 scalars,
# whose repr under NumPy >= 2 is "np.int64(5)"; since these list columns are
# written to the TSV as text, that would break every parser of the file.
MIND_dev_df_fliter_new["full_candidate"] = MIND_dev_df_fliter_new["full_candidate"].apply(lambda x: item_le.transform(x).tolist())
MIND_dev_df_fliter_new["full_history"] = MIND_dev_df_fliter_new["full_history"].apply(lambda x: item_le.transform(x).tolist())
MIND_dev_df_fliter_new["pos_id_target"] = pd.Series(item_le.transform(MIND_dev_df_fliter_new['pos_target']))
MIND_dev_df_fliter_new["neg_id_target"] = pd.Series(item_le.transform(MIND_dev_df_fliter_new['neg_target']))

MIND_dev_df_fliter_new
MIND_dev_df_fliter_new.to_csv(f"./Dataset/{dataset_name}/processed/MIND_dev_df_fliter.csv", sep="\t", index=False)

# 4. Generate Test Data for Top-K Ranking

In [None]:
def get_sub_history_and_candidate(row, n_candidate=10, n_history=10):
    """Build a shuffled candidate list and a truncated history for one row.

    Requires the ``random`` module to be imported at notebook level; the
    caller is responsible for seeding it.

    Parameters
    ----------
    row : object with ``full_candidate``, ``pos_id_target``,
        ``neg_id_target`` and ``full_history``
        ``full_candidate`` must contain both target ids.
    n_candidate : int, default 10
        Maximum candidate-list length. Both targets are always included, so
        the result never has fewer than two items.
    n_history : int, default 10
        Number of most recent history items to keep (``0`` keeps none).

    Returns
    -------
    tuple
        ``(candidate, history, pos_target_index)``: the shuffled candidate
        ids, the history tail, and the position of the positive id inside
        ``candidate``. All ids are plain Python ``int``.

    Raises
    ------
    ValueError
        If either target id is missing from ``row.full_candidate``.
    """
    # Convert to built-in ints: these lists are later written to CSV as text,
    # and NumPy >= 2 renders np.int64 scalars as "np.int64(5)" inside a list.
    pos_id = int(row.pos_id_target)
    neg_id = int(row.neg_id_target)
    others = [int(item) for item in row.full_candidate]
    others.remove(pos_id)
    others.remove(neg_id)
    # max(..., 0) keeps n_candidate < 2 from turning into a negative slice
    # bound, which would silently return almost all remaining items.
    candidate = [pos_id, neg_id] + others[:max(n_candidate - 2, 0)]
    random.shuffle(candidate)
    pos_target_index = candidate.index(pos_id)
    # Guard n_history == 0: a plain [-0:] slice would return the whole list.
    history = [int(item) for item in row.full_history[-n_history:]] if n_history > 0 else []

    return candidate, history, pos_target_index

def get_pairwise_data(row):
    """Pair the positive candidate with every other candidate, in random order.

    Reads ``row.itemid_candidate`` and ``row.pos_target_index``. For each
    non-positive candidate (in list order) a two-element pair containing the
    positive and that candidate is shuffled with ``random.shuffle``.

    Returns ``(pair_list, answer_list)`` where ``answer_list[k]`` is the
    position (0 or 1) of the positive item inside ``pair_list[k]``.
    """
    items = row.itemid_candidate
    positive = items[row.pos_target_index]

    # Copy before removing so the row's own candidate list stays intact.
    negatives = copy.deepcopy(items)
    negatives.remove(positive)

    pair_list, answer_list = [], []
    for negative in negatives:
        duo = [positive, negative]
        random.shuffle(duo)
        pair_list.append(duo)
        answer_list.append(duo.index(positive))

    return pair_list, answer_list

def get_topk_final_data(df, candidate_sizes=(2, 5, 10), history_sizes=(1, 3, 5, 10), seed=2023):
    """Generate and save one evaluation file per (n_candidate, n_history) pair.

    For every combination a deep copy of ``df`` gets shuffled candidates, a
    truncated history and the positive's index (via
    ``get_sub_history_and_candidate``), is row-shuffled, gets pairwise
    comparison data (via ``get_pairwise_data``) and is written as a
    tab-separated file to
    ``./Dataset/{dataset_name}/LLM/topk_candidate@{n_candidate}_history@{n_history}.csv``
    using the module-level ``dataset_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the columns read by the two helper functions.
    candidate_sizes : iterable of int, default (2, 5, 10)
        Candidate-list sizes to generate.
    history_sizes : iterable of int, default (1, 3, 5, 10)
        History lengths to generate.
    seed : int, default 2023
        Seed for ``random`` (re-applied for every combination) and for the
        row shuffle.

    Returns
    -------
    None
    """
    for n_candidate in candidate_sizes:
        for n_history in history_sizes:
            print(f"n_candidate:{n_candidate} ; n_history:{n_history}")
            LLM_top1_data = copy.deepcopy(df)
            # Re-seed for each combination so every file is reproducible on its own.
            random.seed(seed)
            LLM_top1_data[["itemid_candidate", "itemid_history", "pos_target_index"]] = LLM_top1_data.apply(get_sub_history_and_candidate, n_candidate=n_candidate, n_history=n_history, axis=1, result_type="expand")
            LLM_top1_data = LLM_top1_data.sample(frac=1.0, random_state=seed).reset_index(drop=True)
            LLM_top1_data[["pair_itemid_candidate", "pair_pos_target_index"]] = LLM_top1_data.apply(get_pairwise_data, axis=1, result_type="expand")
            LLM_top1_data.to_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@{n_candidate}_history@{n_history}.csv", sep="\t", index=False)

In [None]:
# Write one evaluation file per (n_candidate, n_history) combination.
get_topk_final_data(MIND_dev_df_fliter_new)

# 5. Generate Datamaps

In [None]:
# Keep only the news that appear in the evaluation candidates or histories.
MIND_dev_news_df_fliter = MIND_dev_news_df_fliter.loc[
    lambda frame: frame["newsid"].isin(test_item_set)
].reset_index(drop=True)
MIND_dev_news_df_fliter
# Fraction of missing values per column, largest first.
(MIND_dev_news_df_fliter.isna().sum() / len(MIND_dev_news_df_fliter)).sort_values(ascending=False)

In [None]:
# Add the label-encoded integer id of every news item. The new Series has a
# default 0..n-1 index, which lines up with the frame because its index was
# reset in the previous cell.
MIND_dev_news_df_fliter["itemid"] = pd.Series(item_le.transform(MIND_dev_news_df_fliter['newsid']))
MIND_dev_news_df_fliter

In [None]:
# Save the item table as a tab-separated file.
# NOTE(review): this runs before the title/abstract clean-up in the next cell,
# so the saved text is the uncleaned version — confirm this is intended.
MIND_dev_news_df_fliter.to_csv(f"./Dataset/{dataset_name}/processed/full_item.csv", sep="\t", index=False)

In [None]:
def process_name(s):
    """Return ``s`` with a few substitutions applied, one after another.

    In order: ``&amp;`` becomes ``&``, each pair of spaces becomes one space,
    newlines become spaces, and backslashes become spaces. Every substitution
    is a single pass, so longer space runs and spaces introduced by the later
    steps are not collapsed further.
    """
    substitutions = (
        ("&amp;", "&"),
        ("  ", " "),
        ("\n", " "),
        ("\\", " "),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    return s
import json

# Clean the text columns in place before building the lookup tables.
MIND_dev_news_df_fliter["title"] = MIND_dev_news_df_fliter["title"].apply(process_name)
MIND_dev_news_df_fliter["abstract"] = MIND_dev_news_df_fliter["abstract"].apply(process_name)

# Forward (item id -> title) and reverse (title -> item id) lookups.
id2item_dict = MIND_dev_news_df_fliter.set_index("itemid")["title"].to_dict()
id2item_dict
item2id_dict = MIND_dev_news_df_fliter.set_index("title")["itemid"].to_dict()
item2id_dict

datamaps = {
    "id2item_dict": id2item_dict,
    "item2id_dict": item2id_dict,
}

# Serialise both lookups into a single JSON file.
json_str = json.dumps(datamaps)
with open(f"./Dataset/{dataset_name}/LLM/title_datamaps.json", 'w') as out:
    out.write(json_str)
    

In [None]:
import json

# Forward (item id -> abstract) and reverse (abstract -> item id) lookups.
id2item_dict = MIND_dev_news_df_fliter.set_index("itemid")["abstract"].to_dict()
id2item_dict
item2id_dict = MIND_dev_news_df_fliter.set_index("abstract")["itemid"].to_dict()
item2id_dict

datamaps = {
    "id2item_dict": id2item_dict,
    "item2id_dict": item2id_dict,
}

# Serialise both lookups into a single JSON file.
json_str = json.dumps(datamaps)
with open(f"./Dataset/{dataset_name}/LLM/abstract_datamaps.json", 'w') as out:
    out.write(json_str)

In [None]:
# Item id -> popularity count, as computed earlier.
item2pop_dict = item_pop.set_index("itemid")["pop"].to_dict()
# Items known to the id lookup but without a popularity entry get a count of 0.
for item_id in id2item_dict:
    item2pop_dict.setdefault(item_id, 0)
item2pop_dict

json_str = json.dumps(item2pop_dict)
with open(f"./Dataset/{dataset_name}/LLM/popularity_datamaps.json", 'w') as out:
    out.write(json_str)

## Example of Loading Final Evaluation Data 

In [3]:
read_example_df = pd.read_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv", delimiter="\t")
read_example_df = read_example_df[["userid", "itemid_history", "pos_target_index", "itemid_candidate", "pair_itemid_candidate", "pair_pos_target_index"]]
# The list columns are stored as text such as "[1, 2, 3]". Parse them with
# ast.literal_eval, which only accepts Python literals, instead of eval(),
# which would execute arbitrary code found in the file.
for col in ['itemid_history', 'itemid_candidate', "pair_itemid_candidate", "pair_pos_target_index"]:
    read_example_df[col] = read_example_df[col].apply(ast.literal_eval)

read_example_df

Unnamed: 0,userid,itemid_history,pos_target_index,itemid_candidate,pair_itemid_candidate,pair_pos_target_index
0,2223,"[315, 4611, 11549, 9355, 6963]",3,"[10805, 5205, 1256, 11960, 8488]","[[11960, 10805], [11960, 5205], [1256, 11960], [11960, 8488]]","[0, 0, 1, 0]"
1,6951,"[12195, 11577, 7086, 89, 3786]",3,"[9766, 7805, 5777, 1374, 1663]","[[1374, 9766], [7805, 1374], [1374, 5777], [1663, 1374]]","[0, 1, 0, 1]"
2,8289,"[12301, 6691, 2020, 5144, 11053]",0,"[11436, 5073, 3589, 11386, 9675]","[[11436, 5073], [11436, 3589], [11436, 11386], [9675, 11436]]","[0, 0, 0, 1]"
3,6217,"[4806, 7064, 468, 8379, 7139]",4,"[9673, 3219, 11257, 8609, 4579]","[[9673, 4579], [4579, 3219], [11257, 4579], [4579, 8609]]","[1, 0, 1, 0]"
4,8188,"[3457, 5503, 11137, 4090, 4098]",4,"[5044, 6230, 3689, 11161, 4739]","[[4739, 5044], [6230, 4739], [4739, 3689], [4739, 11161]]","[0, 1, 0, 0]"
...,...,...,...,...,...,...
9995,6047,"[6392, 2427, 10769, 10951, 10751]",3,"[8704, 8855, 7041, 7981, 11518]","[[7981, 8704], [8855, 7981], [7981, 7041], [11518, 7981]]","[0, 1, 0, 1]"
9996,8165,"[5925, 1366, 5742, 4277, 6735]",3,"[9252, 5777, 10513, 9843, 7654]","[[9843, 9252], [9843, 5777], [9843, 10513], [9843, 7654]]","[0, 0, 0, 0]"
9997,6227,"[6994, 8853, 11844, 6063, 1286]",0,"[4757, 10240, 7886, 10350, 7529]","[[4757, 10240], [7886, 4757], [4757, 10350], [4757, 7529]]","[0, 1, 0, 0]"
9998,4203,"[7771, 5144, 11482, 9193, 12196]",3,"[9641, 11518, 6077, 2445, 11870]","[[9641, 2445], [2445, 11518], [6077, 2445], [2445, 11870]]","[1, 0, 1, 0]"


In [4]:
# Alternative loader: parse the candidate column into an integer matrix
# using plain string operations.
read_example_df = pd.read_csv(f"./Dataset/{dataset_name}/LLM/topk_candidate@5_history@5.csv", sep="\t")
read_example_df = read_example_df[["userid", "pos_target_index", "itemid_candidate"]]
# Strip the surrounding "[" and "]" from the stringified list.
read_example_df["itemid_candidate"] = read_example_df["itemid_candidate"].apply(lambda text: text[1:-1])
# One column per candidate position, then convert every entry to int.
candidate = read_example_df['itemid_candidate'].str.split(',', expand=True)
for col in candidate.columns:
    candidate[col] = candidate[col].apply(int)
candidate.values

array([[10805,  5205,  1256, 11960,  8488],
       [ 9766,  7805,  5777,  1374,  1663],
       [11436,  5073,  3589, 11386,  9675],
       ...,
       [ 4757, 10240,  7886, 10350,  7529],
       [ 9641, 11518,  6077,  2445, 11870],
       [10476,  3948,  5447, 10342,  2968]])