In [None]:
import pandas as pd
import gzip
import numpy as np


food_path = "dataset/meta_Grocery_and_Gourmet_Food.json.gz"
kitchen_path = 'dataset/meta_Home_and_Kitchen.json.gz'

movie_path = 'dataset/meta_Movies_and_TV.json.gz'
book_path = 'dataset/meta_Books.json.gz'

sport_path = 'dataset/meta_Sports_and_Outdoors_2018.json.gz'
cloth_path = 'dataset/meta_Clothing_Shoes_and_Jewelry_2018.json.gz'

paths = [food_path, kitchen_path, movie_path, book_path, sport_path, cloth_path]
domains = ['Food', 'Kitchen', 'Movie', 'Book', 'Sport', 'Clothing']


def get_item_list(file_path):
    """获取商品列表，商品id已设为id，使用iloc查询指定商品"""
    # 逐行读取文件
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # 处理每一行
    data = []
    for line in lines:
        row = line.strip().split('\t')  # 按制表符分割
        data.append(row)
    # 转换为DataFrame
    column = ['origin_index', 'asin', 'id']
    df = pd.DataFrame(data, columns=column)
    df.set_index('id', inplace=True)
    return df

def get_user_list(file_path):
    """使用dataset/Food-Kitchen/userlist.txt, 按行读取每一个用户id
    """
    # 逐行读取文件
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # 处理每一行
    data = []
    for line in lines:
        row = line.strip().split('\t')  # 按制表符分割
        data.append(row)
    # 转换为DataFrame
    column = ['asin', 'id']
    df = pd.DataFrame(data, columns=column)
    return df

In [None]:
def check_messing_value(df_item_list, df_meta_data):
    df_item_list['isin_meta_data'] = df_item_list['asin'].isin(df_meta_data['asin'])
    count = df_item_list['isin_meta_data'].value_counts()
    return count

In [None]:
# for i in range(len(domains)):
#     item_path = f'dataset-single_domain\{domains[i]}\list.txt'
#     df_item_list = get_item_list(item_path)
#     df_meta_data = get_meta_data(paths[i])
#     count = check_messing_value(df_item_list, df_meta_data)
#     print(f'{domains[i]}:')
#     print(count)

In [None]:
from openai import OpenAI
from tqdm import tqdm
import codecs
import tiktoken


client = OpenAI(
    base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='sk-1bf40b7f0bdf41cf9e68509aa6dc1296'
)
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def get_item_embedding(df, asin):
    # 查询特定的asin
    item = df[df["asin"] == asin]
    if not item.empty:
        title = item['title'].item()
        category = item['categories'].item()
        description = item['description'].item()
        text_desc = f'title: {title}\ncategory: {category}\ndescription: {description}'
        if len(text_desc) >= 8192:
            text_desc = f'title: {title}\ncategory: {category}\n'
        response = client.embeddings.create(
            input=text_desc,
            model="text-embedding-v3",
            dimensions=1024,
            encoding_format="float"
        )

        return response.data[0].embedding
    else:
        return [0] * 1024
    
def get_user_embedding(user, df_meta_data, df_item, interactions):
    """使用用户交互序列直接生成用户画像"""
    titles = []
    str_list = []
    text_desc = "Below is a user's purchase record of items in the Book and Movie categories. Please note the temporal information: the higher the sequence number, the more recent the record. Use this sequence to characterize the user embedding.\n"
    for id in interactions[user]:
        asin = df_item.iloc[id]['asin']
        item = df_meta_data[df_meta_data["asin"] == asin]
        if not item.empty:
            title = item['title'].item()
            titles.append(title)
    num_tokens = len(encoding.encode(text_desc))
    for i in range(len(titles)):
        num_tokens += len(encoding.encode(f'{i+1}. {titles[i]}\n'))
        if num_tokens >= 6000:
            print(text_desc)
            str_list.append(text_desc)
            if len(str_list) >= 10:
                str_list = str_list[-10:]
            text_desc = ''
            num_tokens = 0
        text_desc += f'{i+1}. {titles[i]}\n'
    str_list.append(text_desc)
    print(f"User: {user}, list length: {len(str_list)}")
    response = client.embeddings.create(
        input=str_list,
        model="text-embedding-v3",
        dimensions=1024,
        encoding_format="float"
    )
    return response.data[0].embedding
    
def split_string(input_string, max_length=8192):
    str_list = [input_string[i:i + max_length] for i in range(0, len(input_string), max_length)]
    return '\n'.join(str_list)

def read_train_data(train_file):
    with codecs.open(train_file, "r", encoding="utf-8") as infile:
        train_data = []
        user = []
        for id, line in enumerate(infile):
            res = []
            line = line.strip().split("\t")
            user.append(int(line[0]))

            line = line[2:]  # 交互的一系列物品
            for w in line:
                w = w.split("|")
                res.append((int(w[0]), int(w[1])))
            res.sort(key=lambda x: x[1])  # 按照时间顺序排列

            res_2 = []
            for r in res:
                res_2.append(r[0])
            train_data.append(res_2)

    return train_data, user

In [None]:
# for i in range(len(domains)):
#     file_path = f'dataset-single_domain\{domains[i]}\list.txt'
#     df_item_list = get_item_list(file_path)
#     df_meta_data = get_meta_data(paths[i])
#     df_item_list['embedding'] = df_item_list['asin'].apply(lambda x: get_embedding(df_meta_data, x))
#     df_item_list.to_csv(f'dataset/embedded_{domains[i]}.csv')
#     print(f'{domains[i]} domain done!')

In [None]:
def save_to_np_embeddings(domains, user_num):
    """将用户及商品嵌入转化为numpy文件，便于读取。处理过程中，仅对训练集中的用户生成了用户画像，而测试集与验证集中的用户没有用户画像，即冷启动问题
    此处将缺失的用户画像补充为0向量，两个域的商品嵌入合并为一个列表
    """
    domain_x = domains.split('-')[0]
    domain_y = domains.split('-')[1]
    # df_x = pd.read_csv(f'dataset/{domains}/embedded_{domain_x}.csv', index_col='index')
    # df_y = pd.read_csv(f'dataset/{domains}/embedded_{domain_y}.csv', index_col='index')
    df_user = pd.read_csv(f'dataset/{domains}/embedded_reasoning_user.csv')
    # tqdm.pandas(desc=f"Loading {domain_x} Embeddings")
    # df_x['embedding'] = df_x.embedding.progress_apply(lambda x: np.array(eval(x), dtype=np.float32))
    # tqdm.pandas(desc=f"Loading {domain_y} Embeddings")
    # df_y['embedding'] = df_y.embedding.progress_apply(lambda x: np.array(eval(x), dtype=np.float32))
    tqdm.pandas(desc=f"Loading User Embeddings")
    df_user['embedding'] = df_user.embedding.progress_apply(lambda x: np.array(eval(x), dtype=np.float32))
    all_user_ids = list(range(0, user_num))
    # 找出缺失的用户ID
    missing_user_ids = [uid for uid in all_user_ids if uid not in df_user['id'].values]
    # 创建一个包含缺失用户ID的补充数据框
    supplement_df = pd.DataFrame({'id': missing_user_ids, 'embedding': [[0]*1024]*len(missing_user_ids)})
    # 将补充的数据框与原始数据框合并
    df_user = pd.concat([df_user, supplement_df]).sort_values(by='id').reset_index(drop=True)
    # array_x = np.vstack(df_x['embedding'].to_numpy())
    # array_y = np.vstack(df_y['embedding'].to_numpy())
    # item_embeddings = np.vstack((array_x, array_y, np.zeros(1024, dtype=np.float32)))
    user_embeddings = np.vstack(df_user['embedding'].to_numpy())
    # np.save(f'dataset/{domains}/item_embeddings.npy', item_embeddings)
    np.save(f'dataset/{domains}/reasoning_user_embeddings.npy', user_embeddings)

In [None]:
# load_embeddings('Food-Kitchen', 16579)

In [None]:
save_to_np_embeddings('Movie-Book', 15352)

In [None]:
def get_user_interaction(domains):
    """获取所有用户及其对应的用户交互
    """
    # 从训练集中抽取所有用户交互，保留时间排序
    train_file = f'dataset/{domains}/traindata_new.txt'
    with codecs.open(train_file, "r", encoding="utf-8") as infile:
            train_data = []
            user_list = []
            for id, line in enumerate(infile):
                res = []
                line = line.strip().split("\t")
                user_list.append(int(line[0]))

                line = line[2:]  # 交互的一系列物品
                for w in line:
                    w = w.split("|")
                    res.append((int(w[0]), int(w[1])))
                res.sort(key=lambda x: x[1])  # 按照时间顺序排列

                res_2 = []
                for r in res:
                    res_2.append(r[0])
                train_data.append(res_2)
    user_interaction = dict()
    for user, data in zip(user_list, train_data):
        user_interaction.setdefault(user, []).append(data)
    for user in user_interaction.keys():
        user_interaction[user] = list(dict.fromkeys(sum(user_interaction[user], [])))

In [None]:
domain = 'Sport-Clothing'
user_interaction = get_user_interaction(domain)
file_path_A = f'dataset/{domain}/Alist.txt'
file_path_B = f'dataset/{domain}/Blist.txt'
df_item_Alist = get_item_list(file_path_A)
df_item_Blist = get_item_list(file_path_B)
# item数量直接加和为两域之和
df_item = pd.concat([df_item_Alist, df_item_Blist])
df_meta_data_A = get_meta_data(sport_path)
df_meta_data_B = get_meta_data(cloth_path)
df_meta_data = pd.concat([df_meta_data_A, df_meta_data_B])
df_user = pd.DataFrame(user_interaction.keys(), columns=['id'])

In [None]:
# tqdm.pandas(desc="Getting User Embeddings")
# df_user['embedding'] = df_user['id'].apply(lambda x: get_user_embedding(x, df_meta_data, df_item, user_interaction))
# df_user.to_csv('dataset/Movie-Book/embedded_user.csv')

In [None]:
# df_text = pd.merge(df_item, df_meta_data, on='asin', how='left')

In [None]:
# filtered_df = df_text[df_text['id'].isin(user_interaction[7])]

In [None]:
# filtered_df = df_text.iloc[user_interaction[7]]

In [None]:
def get_interaction_metadata(df_text, user_interaction):
    text = ''
    for num, id in enumerate(user_interaction):
        item = df_text.iloc[id]
        title = str(item['title'])
        category = str(item['categories'])
        text += f'{num+1}. ' + title + '  ' + category + '\n'
    return text

In [None]:
import json

def generate_request_data(custom_id, content):
    """
    生成单个请求数据
    :param custom_id: 自定义ID
    :param content: 用户问题内容
    :return: 请求数据的字典格式
    """
    prompt = "Below is a user's purchase record of items in the Toy-Game domains. Please note the temporal information: the higher the sequence number, the more recent the record. Use this sequence to characterize the user profile. The user profile should be concise and refined. And please avoid inappropriate content!"
    request_data = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "qwq-plus",
            "messages": [
                {"role": "system", "content": "You are a recommendation expert, capable of profiling users based on their interaction records. The user profile should be concise and refined."},
                {"role": "user", "content": prompt+content}
            ]
        }
    }
    return request_data

def generate_embedding_data(custom_id, content):
    """
    生成单个请求数据
    :param custom_id: 自定义ID
    :param content: 用户问题内容
    :return: 请求数据的字典格式
    """
    prompt = "Use this user profile to generate user embedding.\n"
    request_data = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/embeddings",
        "body": {
            "model": "text-embedding-v3",
            "input": prompt+content,
            "encoding_format": "float"
        }
    }
    return request_data

def save_to_json_file(data_list, file_name):
    """
    将生成的数据列表保存为JSON文件
    :param data_list: 数据列表
    :param file_name: 保存的文件名
    """
    with open(file_name, "w", encoding="utf-8") as file:
        for data in data_list:
            json.dump(data, file, ensure_ascii=False)
            file.write("\n")  # 每条数据占一行

def get_user_reasoning_embedding(profile):
    """使用用户交互序列推理用户画像并生成用户嵌入"""
    response = client.embeddings.create(
        input=profile,
        model="text-embedding-v3",
        dimensions=1024,
        encoding_format="float"
    )
    return response.data[0].embedding

In [None]:
file_path_A = 'dataset/Movie-Book/Alist.txt'
file_path_B = 'dataset/Movie-Book/Blist.txt'
df_item_Alist = get_item_list(file_path_A)
df_item_Blist = get_item_list(file_path_B)
# item数量直接加和为两域之和
df_item = pd.concat([df_item_Alist, df_item_Blist])
df_meta_data_A = get_meta_data(movie_path)
df_meta_data_B = get_meta_data(book_path)
df_meta_data = pd.concat([df_meta_data_A, df_meta_data_B])
df_text = pd.merge(df_item, df_meta_data, on='asin', how='left')

# 从训练集中抽取所有用户交互，保留时间排序
train_file = 'dataset/Movie-Book/traindata_new.txt'
with codecs.open(train_file, "r", encoding="utf-8") as infile:
        train_data = []
        user_list = []
        for id, line in enumerate(infile):
            res = []
            line = line.strip().split("\t")
            user_list.append(int(line[0]))

            line = line[2:]  # 交互的一系列物品
            for w in line:
                w = w.split("|")
                res.append((int(w[0]), int(w[1])))
            res.sort(key=lambda x: x[1])  # 按照时间顺序排列

            res_2 = []
            for r in res:
                res_2.append(r[0])
            train_data.append(res_2)
user_interaction = dict()
for user, data in zip(user_list, train_data):
    user_interaction.setdefault(user, []).append(data)
for user in user_interaction.keys():
    user_interaction[user] = list(dict.fromkeys(sum(user_interaction[user], [])))
# 生成用户画像推理文件
data_list = []
for id in user_interaction.keys():
    data_list.append(generate_request_data(id, get_interaction_metadata(df_text, user_interaction[id])))

save_to_json_file(data_list, "movie-book_user_interaction.jsonl")

In [None]:
file_path_A = 'dataset/Movie-Book/Alist.txt'
file_path_B = 'dataset/Movie-Book/Blist.txt'
df_item_Alist = get_item_list(file_path_A)
df_item_Blist = get_item_list(file_path_B)
# item数量直接加和为两域之和
df_item = pd.concat([df_item_Alist, df_item_Blist])
df_meta_data_A = get_meta_data(movie_path)
df_meta_data_B = get_meta_data(book_path)
df_meta_data = pd.concat([df_meta_data_A, df_meta_data_B])
df_text = pd.merge(df_item, df_meta_data, on='asin', how='left')

# 从训练集中抽取所有用户交互，保留时间排序
train_file = 'dataset/Movie-Book/traindata_new.txt'
with codecs.open(train_file, "r", encoding="utf-8") as infile:
        train_data = []
        user_list = []
        for id, line in enumerate(infile):
            res = []
            line = line.strip().split("\t")
            user_list.append(int(line[0]))

            line = line[2:]  # 交互的一系列物品
            for w in line:
                w = w.split("|")
                res.append((int(w[0]), int(w[1])))
            res.sort(key=lambda x: x[1])  # 按照时间顺序排列

            res_2 = []
            for r in res:
                res_2.append(r[0])
            train_data.append(res_2)
user_interaction = dict()
for user, data in zip(user_list, train_data):
    user_interaction.setdefault(user, []).append(data)
for user in user_interaction.keys():
    user_interaction[user] = list(dict.fromkeys(sum(user_interaction[user], [])))
# 生成用户画像推理文件
data_list = []
for id in user_interaction.keys():
    # 使用原始id，确保交互列表中custom_id唯一
    data_list.append(generate_request_data(id, get_interaction_metadata(df_text, user_interaction[id])))

save_to_json_file(data_list, "movie-book_user_interaction.jsonl")

In [None]:
user_interaction = dict()
for user, data in zip(user_list, train_data):
    user_interaction.setdefault(user, []).append(data)
for user in user_interaction.keys():
    user_interaction[user] = list(dict.fromkeys(sum(user_interaction[user], [])))
# 生成用户画像推理文件
data_list = []
for id in user_interaction.keys():
    # 使用原始id，确保交互列表中custom_id唯一
    data_list.append(generate_request_data(id, get_interaction_metadata(df_text, user_interaction[id])))

save_to_json_file(data_list, "movie-book_user_interaction.jsonl")

In [None]:
# 生成用户画像批处理文件
file_path = 'dataset/Movie-Book/movie-book_user_profile.jsonl'

# 打开文件并逐行读取
data_list = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 解析每一行的 JSON 数据
        data = json.loads(line)
        # 获取 custom_id 和 content
        custom_id = data.get('custom_id')
        content = data.get('response', {}).get('body', {}).get('choices', [{}])[0].get('message', {}).get('content', '')
        data_list.append(generate_embedding_data(custom_id, content))

save_to_json_file(data_list, "dataset/Movie-Book/movie-book_embedding_request.jsonl")

In [None]:
def split_string(input_string, max_length=5000):
    """
    将输入字符串拆分为最大长度为max_length的字符串列表
    :param input_string: 需要拆分的字符串
    :param max_length: 每个子字符串的最大长度，默认为5000
    :return: 拆分后的字符串列表
    """
    return [input_string[i:i+max_length] for i in range(0, len(input_string), max_length)]


def online_user_embeddings(domains):
    """
    看起来不能使用批量推理生成嵌入，官方文档中对于能否使用text-embedding-v3模糊不清
    """
    file_path = f'dataset/{domains}/{domains.lower()}_user_profile.jsonl'
    # 打开文件并逐行读取
    data_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # 解析每一行的 JSON 数据
            data = json.loads(line)
            # 获取 custom_id 和 content
            custom_id = data.get('custom_id')
            content = data.get('response', {}).get('body', {}).get('choices', [{}])[0].get('message', {}).get('content', '')
            data_list.append((custom_id, split_string(content)))
    df_user_profile = pd.DataFrame(data_list, columns=['id', 'user profile'])
    tqdm.pandas(desc="Getting Reasoned User Embeddings")
    df_user_profile['embedding'] = df_user_profile['user profile'].progress_apply(lambda x: get_user_reasoning_embedding(x))
    df_user_profile[['id', 'embedding']].to_csv(f'dataset/{domains}/embedded_reasoning_user.csv')

In [None]:
online_user_embeddings('Movie-Book')

In [None]:
file_path = 'dataset/Movie-Book/movie-book_user_profile.jsonl'
# 打开文件并逐行读取
data_list = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 解析每一行的 JSON 数据
        data = json.loads(line)
        # 获取 custom_id 和 content
        custom_id = data.get('custom_id')
        content = data.get('response', {}).get('body', {}).get('choices', [{}])[0].get('message', {}).get('content', '')
        data_list.append((custom_id, content))
df_user_profile = pd.DataFrame(data_list, columns=['id', 'user profile'])

In [None]:
text = '''**User Profile:**  
A health-conscious, convenience-driven home cook with a growing interest in sleep comfort.  

**Key Insights:**  
1. **Home Cooking & Kitchen Tools**: Recent purchases (Toaster Oven Broiler, Spatter Cover) indicate a focus on kitchen appliances and gadgets for meal preparation.  
2. **Snack-Oriented Diet**: Frequent buys of dried fruits, nuts, and ready-to-eat meals (GoPicnic, Tropical Flora, Dried Peaches) suggest a preference for convenient, portable, and possibly healthy snacks. Bulk purchases (e.g., 225-pack snacks) imply bulk buying for regular consumption.  
3. **Bedding Comfort Upgrades**: Recent shifts toward cooling pillows and quilted pillows (most recent items) signal a priority for sleep comfort and ergonomic bedding solutions.  
4. **Occasional Gifting**: Dried fruit trays may indicate occasional gifting or presentation-focused purchases.  

**Behavioral Pattern**: Balances practicality (small appliances, snacks) with quality-of-life improvements (bedding upgrades), leaning toward convenience and health.

'''

print(text.replace('\n', '\\n').replace('"', '\\"'))

In [None]:
"User Profile:\nPrimary Interests (Recent & Frequent Purchases):\nCoffee Enthusiast: Regularly buys premium coffee (Senseo, Douwe Egberts) in pods and ground varieties, with a preference for decaf (items 1, 4, 6, 7, 9).\nHome Organization & Furniture: Prioritizes functional storage solutions (bookshelves, garment racks, shelves) and ergonomic furniture (home office desks, bed elevators, folding bookcases) (items 3, 5, 10, 12, 14–15).\nHome Comfort & Health: Invests in sleep/neck support (pillows, bed elevators) and maintenance (vacuum filters, can opener) (items 2, 5, 8, 11).\nSecondary Interests (Occasional/Decor):\nHome Décor: Adds aesthetic touches like posters, floating shelves, and themed shower curtains (items 13, 16).\nOn-the-Go Essentials: Uses travel mugs (item 17), suggesting a need for portable convenience.\nTemporal Trends:\nRecent purchases (last 3 items) emphasize functional decor (shelf, shower curtain, travel mug), indicating a focus on practical aesthetics.\nConsistent coffee buying reflects a stable routine, while home storage/organization items suggest ongoing efforts to optimize living/working spaces.\nKey Traits:\nEfficiency-Oriented: Prefers ready-to-use products (coffee pods, pre-packaged items).\nErgonomic Awareness: Chooses products for comfort and health (neck pillows, bed elevators).\nBrand Loyal: Repeats purchases from trusted brands (Senseo, Douwe Egberts, Umbra).\nRefined Summary:\nA coffee-loving, home-organization-focused individual optimizing their living/working spaces with ergonomic furniture, functional storage, and subtle decor, while maintaining routines with preferred coffee brands. Recent activity highlights a blend of practicality and aesthetic upgrades.\n"

### 处理Toy-Game数据集

In [1]:
import pandas as pd
import gzip
import json
import datetime

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [2]:
"""处理用户交互记录"""
columns = ['asin', 'reviewerID', 'rating', 'timestamp']
user_log_toy = pd.read_csv('dataset/generate_LLM_embeddings\Toy-Game\Toys_and_Games.csv', names=columns)
user_log_game = pd.read_csv('dataset/generate_LLM_embeddings\Toy-Game\Video_Games.csv', names=columns)
# 将时间戳转换为日期时间
user_log_toy['date'] = pd.to_datetime(user_log_toy['timestamp'], unit='s')
user_log_game['date'] = pd.to_datetime(user_log_game['timestamp'], unit='s')

# 定义2014年的开始时间
start_of_2014 = datetime.datetime(2014, 1, 1)

# 筛选2014年以后的数据
user_log_toy = user_log_toy[user_log_toy['date'] >= start_of_2014]
user_log_game = user_log_game[user_log_game['date'] >= start_of_2014]

In [3]:
user_log_toy

Unnamed: 0,asin,reviewerID,rating,timestamp,date
0,0020232233,A1IDMI31WEANAF,2.0,1474502400,2016-09-22
1,0020232233,A4BCEVVZ4Y3V3,1.0,1474156800,2016-09-18
2,0020232233,A2EZ9PY1IHHBX0,3.0,1473638400,2016-09-12
3,0020232233,A139PXTTC2LGHZ,5.0,1488412800,2017-03-02
4,0020232233,A3IB33V29XIL8O,1.0,1486512000,2017-02-08
...,...,...,...,...,...
8201226,B01HJBAKIO,A3OCDEVI6FGUWU,5.0,1512604800,2017-12-07
8201227,B01HJHA7GI,A1KTVUVADLKWZO,5.0,1453507200,2016-01-23
8201228,B01HJHA7GI,A2QCA9OE62IPZ4,5.0,1423353600,2015-02-08
8201229,B01HJHA7GI,A3N28JAZYS4L9O,5.0,1419984000,2014-12-31


In [4]:
# 对 user_log_toy 按 reviewerID 分组
grouped_toy = user_log_toy.groupby('reviewerID').apply(lambda x: [
    {'asin': asin, 'rating': rating, 'timestamp': timestamp}
    for asin, rating, timestamp in zip(x['asin'], x['rating'], x['timestamp'])
]).reset_index(name='data_toy')

# 对 user_log_game 按 reviewerID 分组
grouped_game = user_log_game.groupby('reviewerID').apply(lambda x: [
    {'asin': asin, 'rating': rating, 'timestamp': timestamp}
    for asin, rating, timestamp in zip(x['asin'], x['rating'], x['timestamp'])
]).reset_index(name='data_game')

  grouped_toy = user_log_toy.groupby('reviewerID').apply(lambda x: [
  grouped_game = user_log_game.groupby('reviewerID').apply(lambda x: [


In [18]:
grouped_toy

Unnamed: 0,reviewerID,data_toy
3,A0001528BGUBOEVR6T5U,"[{'asin': 'B0019PU8XE', 'rating': 5.0, 'timest..."
14,A0010158IH80M4C0LOJ1,"[{'asin': 'B00PHQG6GY', 'rating': 5.0, 'timest..."
15,A001170867ZBE9FORRQL,"[{'asin': 'B00H8OB6E0', 'rating': 5.0, 'timest..."
28,A00222906VX8GH7X6J6B,"[{'asin': 'B000A42YLY', 'rating': 5.0, 'timest..."
29,A0022678B6GE9F3FOSBS,"[{'asin': 'B005G14SGU', 'rating': 5.0, 'timest..."
...,...,...
3592100,AZZYVIRS854I7,"[{'asin': 'B007EA4UBY', 'rating': 5.0, 'timest..."
3592102,AZZYW4YOE1B6E,"[{'asin': 'B004S6DV2G', 'rating': 5.0, 'timest..."
3592107,AZZZ6G9WZTNNX,"[{'asin': 'B000IBPD76', 'rating': 5.0, 'timest..."
3592124,AZZZYAYJQSDOJ,"[{'asin': 'B00EBUVCW0', 'rating': 5.0, 'timest..."


In [19]:
# 确保两个域的交互记录在3条以上
grouped_toy = grouped_toy[grouped_toy['data_toy'].apply(lambda x: len(x) >= 3)]
grouped_game = grouped_game[grouped_game['data_game'].apply(lambda x: len(x) >= 3)]

# 合并两个分组结果
merged_data = pd.merge(grouped_game, grouped_toy, on='reviewerID', how='inner')
merged_data['all_data'] = merged_data.apply(lambda x: x['data_toy']+x['data_game'], axis=1)
merged_data

Unnamed: 0,reviewerID,data_game,data_toy,all_data
0,A0059486XI1Z0P98KP35,"[{'asin': 'B000WE8JES', 'rating': 5.0, 'timest...","[{'asin': 'B002QI4LYK', 'rating': 5.0, 'timest...","[{'asin': 'B002QI4LYK', 'rating': 5.0, 'timest..."
1,A0163982I33BFLFLDW0T,"[{'asin': 'B002EE4VQY', 'rating': 5.0, 'timest...","[{'asin': 'B008RDZCS2', 'rating': 5.0, 'timest...","[{'asin': 'B008RDZCS2', 'rating': 5.0, 'timest..."
2,A0220159ZRNBTRKLG08H,"[{'asin': 'B000084318', 'rating': 5.0, 'timest...","[{'asin': 'B00U9UB74O', 'rating': 3.0, 'timest...","[{'asin': 'B00U9UB74O', 'rating': 3.0, 'timest..."
3,A0266076X6KPZ6CCHGVS,"[{'asin': 'B00003OTI3', 'rating': 5.0, 'timest...","[{'asin': 'B00C2P72J8', 'rating': 5.0, 'timest...","[{'asin': 'B00C2P72J8', 'rating': 5.0, 'timest..."
4,A02836981FYG9912C66F,"[{'asin': 'B00JIJUB7G', 'rating': 4.0, 'timest...","[{'asin': 'B01AW1R5XQ', 'rating': 5.0, 'timest...","[{'asin': 'B01AW1R5XQ', 'rating': 5.0, 'timest..."
...,...,...,...,...
24967,AZZ1KF8RAO1BR,"[{'asin': 'B00BMFIXOW', 'rating': 2.0, 'timest...","[{'asin': 'B001B1VJJI', 'rating': 1.0, 'timest...","[{'asin': 'B001B1VJJI', 'rating': 1.0, 'timest..."
24968,AZZJBKJX833IV,"[{'asin': 'B00D7823Q6', 'rating': 5.0, 'timest...","[{'asin': 'B000VLXDAC', 'rating': 5.0, 'timest...","[{'asin': 'B000VLXDAC', 'rating': 5.0, 'timest..."
24969,AZZRS2XK17RFQ,"[{'asin': 'B00BT2BFKW', 'rating': 5.0, 'timest...","[{'asin': 'B0001XNTJA', 'rating': 5.0, 'timest...","[{'asin': 'B0001XNTJA', 'rating': 5.0, 'timest..."
24970,AZZT1ERHBSNQ8,"[{'asin': 'B00BU3ZLJQ', 'rating': 5.0, 'timest...","[{'asin': 'B00004TFT1', 'rating': 5.0, 'timest...","[{'asin': 'B00004TFT1', 'rating': 5.0, 'timest..."


In [21]:
# 过滤掉交互记录长度小于10的用户
filtered_data = merged_data[merged_data['all_data'].apply(lambda x: len(x) >= 10)]

# 计算每个用户的交互记录长度
filtered_data['length'] = filtered_data['all_data'].apply(len)

# 计算交互记录的平均长度
average_length = filtered_data['length'].mean()
average_length # = 15

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['length'] = filtered_data['all_data'].apply(len)


20.963336329780404

In [22]:
filtered_data

Unnamed: 0,reviewerID,data_game,data_toy,all_data,length
1,A0163982I33BFLFLDW0T,"[{'asin': 'B002EE4VQY', 'rating': 5.0, 'timest...","[{'asin': 'B008RDZCS2', 'rating': 5.0, 'timest...","[{'asin': 'B008RDZCS2', 'rating': 5.0, 'timest...",15
3,A0266076X6KPZ6CCHGVS,"[{'asin': 'B00003OTI3', 'rating': 5.0, 'timest...","[{'asin': 'B00C2P72J8', 'rating': 5.0, 'timest...","[{'asin': 'B00C2P72J8', 'rating': 5.0, 'timest...",12
4,A02836981FYG9912C66F,"[{'asin': 'B00JIJUB7G', 'rating': 4.0, 'timest...","[{'asin': 'B01AW1R5XQ', 'rating': 5.0, 'timest...","[{'asin': 'B01AW1R5XQ', 'rating': 5.0, 'timest...",10
6,A0422204VM5KZUEMVY96,"[{'asin': 'B00C7103DO', 'rating': 3.0, 'timest...","[{'asin': 'B000BMV0HE', 'rating': 3.0, 'timest...","[{'asin': 'B000BMV0HE', 'rating': 3.0, 'timest...",21
7,A049248150WLX2UGA57G,"[{'asin': 'B00DBDPOZ4', 'rating': 5.0, 'timest...","[{'asin': 'B007XLEKW8', 'rating': 5.0, 'timest...","[{'asin': 'B007XLEKW8', 'rating': 5.0, 'timest...",11
...,...,...,...,...,...
24964,AZY7LNNZZE02P,"[{'asin': 'B00DB9JV5W', 'rating': 3.0, 'timest...","[{'asin': 'B019NX6UMQ', 'rating': 5.0, 'timest...","[{'asin': 'B019NX6UMQ', 'rating': 5.0, 'timest...",18
24966,AZYU8M791SIFC,"[{'asin': 'B0000296O5', 'rating': 5.0, 'timest...","[{'asin': 'B005KY1KY6', 'rating': 5.0, 'timest...","[{'asin': 'B005KY1KY6', 'rating': 5.0, 'timest...",21
24968,AZZJBKJX833IV,"[{'asin': 'B00D7823Q6', 'rating': 5.0, 'timest...","[{'asin': 'B000VLXDAC', 'rating': 5.0, 'timest...","[{'asin': 'B000VLXDAC', 'rating': 5.0, 'timest...",12
24970,AZZT1ERHBSNQ8,"[{'asin': 'B00BU3ZLJQ', 'rating': 5.0, 'timest...","[{'asin': 'B00004TFT1', 'rating': 5.0, 'timest...","[{'asin': 'B00004TFT1', 'rating': 5.0, 'timest...",79


In [23]:
# 获取所有唯一的 reviewerID 和 asin
unique_reviewerIDs = pd.Series(filtered_data['reviewerID'].unique())
toy_unique_asins = pd.Series([item['asin'] for sublist in filtered_data['data_toy'] for item in sublist]).unique()
game_unique_asins = pd.Series([item['asin'] for sublist in filtered_data['data_game'] for item in sublist]).unique()
# 创建 reviewerID 和 asin 的映射字典
reviewerID_map = {reviewerID: idx for idx, reviewerID in enumerate(unique_reviewerIDs)}
toy_asin_map = {asin: idx for idx, asin in enumerate(toy_unique_asins)}
game_asin_map = {asin: idx for idx, asin in enumerate(game_unique_asins)}

In [24]:
len(reviewerID_map), len(toy_asin_map), len(game_asin_map)

(15574, 86491, 22832)

In [25]:
# 将 reviewerID 和 asin 映射为整数
filtered_data['reviewerID_int'] = filtered_data['reviewerID'].map(reviewerID_map)
filtered_data['toy_data_int'] = filtered_data['data_toy'].apply(
    lambda x: [{'asin_int': toy_asin_map[item['asin']], 'rating': item['rating'], 'timestamp': item['timestamp']} for item in x]
)
filtered_data['game_data_int'] = filtered_data['data_game'].apply(
    lambda x: [{'asin_int': game_asin_map[item['asin']]+len(toy_asin_map), 'rating': item['rating'], 'timestamp': item['timestamp']} for item in x]
)
filtered_data['all_data_int'] = filtered_data.apply(lambda x: x['toy_data_int']+x['game_data_int'], axis=1)
# 按时间戳排序
filtered_data['all_data_sorted'] = filtered_data['all_data_int'].apply(
    lambda x: sorted(x, key=lambda k: k['timestamp'])
)
# 查看转换后的数据
filtered_data[['reviewerID_int', 'all_data_sorted']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['reviewerID_int'] = filtered_data['reviewerID'].map(reviewerID_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['toy_data_int'] = filtered_data['data_toy'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['game_data_int'] = filtered_data['data_game'

Unnamed: 0,reviewerID_int,all_data_sorted
1,0,"[{'asin_int': 0, 'rating': 5.0, 'timestamp': 1..."
3,1,"[{'asin_int': 12, 'rating': 5.0, 'timestamp': ..."
4,2,"[{'asin_int': 86505, 'rating': 3.0, 'timestamp..."
6,3,"[{'asin_int': 30, 'rating': 5.0, 'timestamp': ..."
7,4,"[{'asin_int': 40, 'rating': 5.0, 'timestamp': ..."
...,...,...
24964,15569,"[{'asin_int': 5571, 'rating': 4.0, 'timestamp'..."
24966,15570,"[{'asin_int': 86456, 'rating': 5.0, 'timestamp..."
24968,15571,"[{'asin_int': 6001, 'rating': 5.0, 'timestamp'..."
24970,15572,"[{'asin_int': 77004, 'rating': 5.0, 'timestamp..."


In [27]:
filtered_data[['reviewerID_int', 'all_data_sorted']].to_csv("dataset/generate_LLM_embeddings/Toy-Game/megered_Toy-Game.csv")

In [28]:
reviewerID_map_df = pd.DataFrame(list(reviewerID_map.items()), columns=['reviewerID', 'reviewerID_index'])
toy_asin_map_df = pd.DataFrame(list(toy_asin_map.items()), columns=['asin', 'asin_index'])
game_asin_map_df = pd.DataFrame(list(game_asin_map.items()), columns=['asin', 'asin_index'])
reviewerID_map_df.to_csv('dataset/Toy-Game/user_list.csv', index=False)
toy_asin_map_df.to_csv('dataset/Toy-Game/Alist.csv', index=False)
game_asin_map_df.to_csv('dataset/Toy-Game/Blist.csv', index=False)

In [29]:
import datetime
# 拆分为训练集、验证集和测试集
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

train_data = []
val_data = []
test_data = []

for _, row in filtered_data.iterrows():
    sequence = row['all_data_sorted']
    # 确保序列长度至少为3，以便拆分
    if len(sequence) >=3:
        # 按比例拆分
        train_end = int(len(sequence) * train_ratio)
        val_end = int(len(sequence) * (train_ratio + val_ratio))
        
        train_sequence = sequence[:train_end]
        val_sequence = sequence[train_end:val_end]
        test_sequence = sequence[val_end:]
        
        # 滑动窗口拆分序列，窗口大小为15
        def split_sequence(seq):
            result = []
            start = 0
            while start + 15 <= len(seq):
                result.append(seq[start:start + 15])
                start += 15
            # 处理剩余部分
            if start < len(seq):
                result.append(seq[start:])
            return result
        
        train_sequences = split_sequence(train_sequence)
        val_sequences = split_sequence(val_sequence)
        test_sequences = split_sequence(test_sequence)

        for seq in train_sequences:
            if len(seq) > 3:
                train_data.append({
                    'reviewerID_int': row['reviewerID_int'],
                    'sequence': seq
                })
        for seq in val_sequences:
            if len(seq) > 3:
                val_data.append({
                    'reviewerID_int': row['reviewerID_int'],
                    'sequence': seq
                })
        for seq in test_sequences:
            if len(seq) > 3:
                test_data.append({
                    'reviewerID_int': row['reviewerID_int'],
                    'sequence': seq
                })

# 将结果保存到文本文件
def save_sequence(data, filename):
    with open(filename, 'w') as f:
        for item in data:
            seq_str = '\t'.join([f"{x['asin_int']}|{x['timestamp']}|{datetime.datetime.fromtimestamp(x['timestamp']).strftime('%Y-%m-%d %H:%M:%S')}|" for x in item['sequence']])
            f.write(f"{item['reviewerID_int']}\t{len(item['sequence'])}\t{seq_str}\n")

save_sequence(train_data, 'dataset/Toy-Game/traindata.txt')
save_sequence(val_data, 'dataset/Toy-Game/validdata.txt')
save_sequence(test_data, 'dataset/Toy-Game/testdata.txt')

## Toy-Game LLM Embeddings

In [None]:
"""处理元数据"""
toy_meta = getDF('dataset/generate_LLM_embeddings/Toy-Game\meta_Toys_and_Games.json.gz')
game_meta = getDF('dataset/generate_LLM_embeddings/Toy-Game\meta_Video_Games.json.gz')

In [None]:
toy_item = pd.read_csv("dataset\Toy-Game\Alist.csv")
game_item = pd.read_csv("dataset\Toy-Game\Blist.csv")
df_toy = pd.merge(toy_item, toy_meta, on='asin', how='inner')
df_game = pd.merge(game_item, game_meta, on='asin', how='inner')

In [None]:
df_toy = df_toy.drop_duplicates(subset='asin')
df_game = df_game.drop_duplicates(subset='asin')

In [None]:
from openai import OpenAI
from tqdm import tqdm


client = OpenAI(
    base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='sk-1bf40b7f0bdf41cf9e68509aa6dc1296'
)

def split_string(input_string, max_length=5000):
    """
    将输入字符串拆分为最大长度为max_length的字符串列表
    :param input_string: 需要拆分的字符串
    :param max_length: 每个子字符串的最大长度，默认为5000
    :return: 拆分后的字符串列表
    """
    li = [input_string[i:i+max_length] for i in range(0, len(input_string), max_length)]
    if len(li) > 8:
        li = li[:8]
    return li

def get_embedding(df, asin):
    # 查询特定的asin
    item = df[df["asin"] == asin]

    if not item.empty:
        title = item['title'].item()
        category = item['category'].item()
        description = item['description'].item()
        text_desc = f'title: {title}\ncategory: {category}\ndescription: {description}'
        response = client.embeddings.create(
            input=split_string(text_desc, 4000),
            model="text-embedding-v3",
            dimensions=1024,
            encoding_format="float"
        )

        return response.data[0].embedding
    else:
        return [0] * 1024

In [None]:
import os
embeddings = []
part_number = 0
total = len(game_item)

for index, series in tqdm(enumerate(game_item.iterrows()), total=total):
    embedding = get_embedding(df_game, series[1]['asin'])
    embeddings.append([series[1]['asin_index'], series[1]['asin'], embedding])
    
    # 每 5000 次迭代保存一次中间结果
    if (index + 1) % 5000 == 0:
        part_df = pd.DataFrame(embeddings, columns=['id', 'asin', 'embedding'])
        part_df.to_csv(f'dataset/game_embeddings_part_{part_number}.csv', index=False)
        embeddings = []  # 重置 embeddings 列表
        part_number += 1

# 保存最后剩余的部分（如果有的话）
if embeddings:
    part_df = pd.DataFrame(embeddings, columns=['id', 'asin', 'embedding'])
    part_df.to_csv(f'dataset/game_embeddings_part_{part_number}.csv', index=False)

In [None]:
# 合并所有部分
csv_files = [f for f in os.listdir('dataset') if f.startswith('game_embeddings_part_') and f.endswith('.csv')]
csv_files.sort()  # 按照文件名排序

# 读取所有中间文件并合并
final_df = pd.concat([pd.read_csv(os.path.join('dataset', f)) for f in csv_files], ignore_index=True)

# 保存最终的合并文件
final_df.to_csv('dataset/game_embeddings_final.csv', index=False)


In [None]:
embeddings = []
part_number = 13  # 假设已经保存到40000次
total = len(toy_item)

# 计算上次保存的索引位置
last_index = 65000 - 1  # 40000次迭代对应40000条数据

# 从上次保存的位置继续
for index in tqdm(range(total), total=total):
    if index <= last_index:
        continue  # 跳过已经处理的条目
    series = toy_item.iloc[index]
    embedding = get_embedding(df_toy, series['asin'])
    embeddings.append([series['asin_index'], series['asin'], embedding])
    
    # 每5000次迭代保存一次中间结果
    if (index + 1) % 5000 == 0:
        part_df = pd.DataFrame(embeddings, columns=['id', 'asin', 'embedding'])
        part_df.to_csv(f'dataset/toy_embeddings_part_{part_number}.csv', index=False)
        embeddings = []  # 重置embeddings列表
        part_number += 1

# 保存最后剩余的部分（如果有的话）
if embeddings:
    part_df = pd.DataFrame(embeddings, columns=['id', 'asin', 'embedding'])
    part_df.to_csv(f'dataset/toy_embeddings_part_{part_number}.csv', index=False)

In [None]:
# 合并所有部分
csv_files = [f for f in os.listdir('dataset') if f.startswith('toy_embeddings_part_') and f.endswith('.csv')]
csv_files.sort()  # 按照文件名排序

# 读取所有中间文件并合并
final_df = pd.concat([pd.read_csv(os.path.join('dataset', f)) for f in csv_files], ignore_index=True)

# 保存最终的合并文件
final_df.to_csv('dataset/toy_embeddings_final.csv', index=False)


In [None]:
# 合并所有部分
csv_files = [f for f in os.listdir('dataset') if f.startswith('toy_embeddings_part_') and f.endswith('.csv')]
csv_files.sort()  # 按照文件名排序

# 读取所有中间文件并合并
df_toy = pd.concat([pd.read_csv(os.path.join('dataset', f)) for f in csv_files], ignore_index=True)

# 合并所有部分
csv_files = [f for f in os.listdir('dataset') if f.startswith('game_embeddings_part_') and f.endswith('.csv')]
csv_files.sort()  # 按照文件名排序

# 读取所有中间文件并合并
df_game = pd.concat([pd.read_csv(os.path.join('dataset', f)) for f in csv_files], ignore_index=True)


In [None]:
import numpy as np
tqdm.pandas(desc=f"Loading toy Embeddings")
df_toy['embedding_np'] = df_toy.embedding.progress_apply(lambda x: np.array(eval(x), dtype=np.float32))
tqdm.pandas(desc=f"Loading game Embeddings")
df_game['embedding_np'] = df_game.embedding.progress_apply(lambda x: np.array(eval(x), dtype=np.float32))

In [None]:
array_x = np.vstack(df_toy['embedding_np'].to_numpy())
array_y = np.vstack(df_game['embedding_np'].to_numpy())
item_embeddings = np.vstack((array_x, array_y, np.zeros(1024, dtype=np.float32)))
np.save(f'dataset/Toy-Game/item_embeddings.npy', item_embeddings)

In [None]:

# all_user_ids = list(range(0, user_num))
# # 找出缺失的用户ID
# missing_user_ids = [uid for uid in all_user_ids if uid not in df_user['id'].values]
# # 创建一个包含缺失用户ID的补充数据框
# supplement_df = pd.DataFrame({'id': missing_user_ids, 'embedding': [[0]*1024]*len(missing_user_ids)})
# # 将补充的数据框与原始数据框合并
df_user = pd.concat([df_user, supplement_df]).sort_values(by='id').reset_index(drop=True)
# array_x = np.vstack(df_x['embedding'].to_numpy())
# array_y = np.vstack(df_y['embedding'].to_numpy())
# item_embeddings = np.vstack((array_x, array_y, np.zeros(1024, dtype=np.float32)))
user_embeddings = np.vstack(df_user['embedding_np'].to_numpy())
# np.save(f'dataset/{domains}/item_embeddings.npy', item_embeddings)
np.save(f'dataset/{domains}/reasoning_user_embeddings.npy', user_embeddings)

In [None]:
# 从训练集中抽取所有用户交互，保留时间排序
train_file = f'dataset/Toy-Game/traindata.txt'
with codecs.open(train_file, "r", encoding="utf-8") as infile:
    train_data = []
    user_list = []
    for id, line in enumerate(infile):
        res = []
        line = line.strip().split("\t")
        user_list.append(int(line[0]))

        line = line[2:]  # 交互的一系列物品
        for w in line:
            w = w.split("|")
            res.append((int(w[0]), int(w[1])))
        res.sort(key=lambda x: x[1])  # 按照时间顺序排列

        res_2 = []
        for r in res:
            res_2.append(r[0])
        train_data.append(res_2)
user_interaction = dict()
for user, data in zip(user_list, train_data):
    user_interaction.setdefault(user, []).append(data)
for user in user_interaction.keys():
    user_interaction[user] = list(dict.fromkeys(sum(user_interaction[user], [])))

In [None]:
file_path_A = 'dataset/Toy-Game/Alist.csv'
file_path_B = 'dataset/Toy-Game/Blist.csv'
df_item_Alist = pd.read_csv(file_path_A)
df_item_Blist = pd.read_csv(file_path_B)
# item数量直接加和为两域之和
df_item = pd.concat([df_item_Alist, df_item_Blist])
df_meta_data_A = getDF('dataset/generate_LLM_embeddings/Toy-Game\meta_Toys_and_Games.json.gz')
df_meta_data_B = getDF('dataset/generate_LLM_embeddings/Toy-Game\meta_Video_Games.json.gz')
df_meta_data = pd.concat([df_meta_data_A, df_meta_data_B])
df_text = pd.merge(df_item, df_meta_data, on='asin', how='left')

In [None]:
def generate_request_data(custom_id, content):
    """
    生成单个请求数据
    :param custom_id: 自定义ID
    :param content: 用户问题内容
    :return: 请求数据的字典格式
    """
    prompt = "Below is a user's purchase record of items in the Toy-Game domains. Please note the temporal information: the higher the sequence number, the more recent the record. Use this sequence to characterize the user profile. The user profile should be concise and refined. And please avoid inappropriate content!"
    request_data = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "qwq-plus",
            "messages": [
                {"role": "system", "content": "You are a recommendation expert, capable of profiling users based on their interaction records. The user profile should be concise and refined."},
                {"role": "user", "content": prompt+content}
            ]
        }
    }
    return request_data

def get_interaction_metadata(df_text, user_interaction):
    text = ''
    for num, id in enumerate(user_interaction):
        item = df_text.iloc[id]
        title = str(item['title'])
        category = str(item['category'])
        text += f'{num+1}. ' + title + '  ' + category + '\n'
    return text

In [None]:
file_path = 'dataset/440e74b0-6410-4463-a0fb-e15cd9e1f04c_1746977751857_error.jsonl'
# 打开文件并逐行读取
error_id = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 解析每一行的 JSON 数据
        data = json.loads(line)
        # 获取 custom_id 和 content
        custom_id = data.get('custom_id')
        error_id.append(custom_id)

In [None]:
error_id = [int(x) for x in error_id]

In [None]:
data_list = []
for id in user_interaction.keys():
    # 使用原始id，确保交互列表中custom_id唯一
    if id in error_id:
        data_list.append(generate_request_data(id, get_interaction_metadata(df_text, user_interaction[id])))

save_to_json_file(data_list, "dataset/toy-game_user_interaction_2.jsonl")

In [None]:
file_path = 'dataset/toy-game_user_interaction_2.jsonl'
# 打开文件并逐行读取
data_list = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 解析每一行的 JSON 数据
        data = json.loads(line)
        # 获取 custom_id 和 content
        custom_id = data.get('custom_id')
        content = data.get('body').get('messages')[1].get('content', '')
        data_list.append((custom_id, content))
df_user_profile = pd.DataFrame(data_list, columns=['id', 'user profile'])

In [None]:
file_path = 'dataset/toy-game_user_profile.jsonl'
# 打开文件并逐行读取
data_list = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # 解析每一行的 JSON 数据
        data = json.loads(line)
        # 获取 custom_id 和 content
        custom_id = data.get('custom_id')
        content = data.get('response', {}).get('body', {}).get('choices', [{}])[0].get('message', {}).get('content', '')
        data_list.append((custom_id, split_string(content)))
df_user_profile = pd.DataFrame(data_list, columns=['id', 'user profile'])

In [None]:
df_user_profile['id_int'] = df_user_profile['id'].apply(int) 

In [None]:
import os
embeddings = []
part_number = 0
total = len(df_user_profile)
i=0
for index, series in tqdm(enumerate(df_user_profile.iterrows()), total=total):
    embedding =  get_user_reasoning_embedding(series[1]['user profile'])
    embeddings.append([series[1]['id'], embedding])
    
    # 每 5000 次迭代保存一次中间结果
    if (index + 1) % 3000 == 0:
        part_df = pd.DataFrame(embeddings, columns=['id', 'embedding'])
        part_df.to_csv(f'dataset/user_embeddings_part_{part_number}.csv', index=False)
        embeddings = []  # 重置 embeddings 列表
        part_number += 1
    i += 1

# 保存最后剩余的部分（如果有的话）
if embeddings:
    part_df = pd.DataFrame(embeddings, columns=['id', 'embedding'])
    part_df.to_csv(f'dataset/user_embeddings_part_{part_number}.csv', index=False)

In [None]:
import os
# 合并所有部分
csv_files = [f for f in os.listdir('dataset') if f.startswith('user_embeddings_part_') and f.endswith('.csv')]
csv_files.sort()  # 按照文件名排序

# 读取所有中间文件并合并
df_user = pd.concat([pd.read_csv(os.path.join('dataset', f)) for f in csv_files], ignore_index=True)
df_user['embedding_np'] = df_user.embedding.apply(lambda x: np.array(eval(x), dtype=np.float32))

In [None]:
all_user_ids = list(range(0, 15574))
# 找出缺失的用户ID
missing_user_ids = [uid for uid in all_user_ids if uid not in df_user['id'].values]
# 创建一个包含缺失用户ID的补充数据框
supplement_df = pd.DataFrame({'id': missing_user_ids, 'embedding_np': [np.zeros([1024], dtype=np.float32)]*len(missing_user_ids)})
# supplement_df['embedding_np'] = supplement_df['embedding'].apply(lambda x: np.array(x, dtype=np.float32).squeeze())
# # 将补充的数据框与原始数据框合并
df_user = pd.concat([df_user, supplement_df]).sort_values(by='id').reset_index(drop=True)


In [None]:
df_user['embedding_np'].iloc[100]

In [None]:
user_embeddings = np.vstack(df_user['embedding_np'].to_numpy())
# np.save(f'dataset/{domains}/item_embeddings.npy', item_embeddings)
np.save(f'dataset/Toy-Game/reasoning_user_embeddings.npy', user_embeddings)