In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import threading
import json
import pickle
import os

# source
name = "ml-100k"
inter_path = f"/media/data/dataset/{name}/{name}.inter"
user_path = f"/media/data/dataset/{name}/{name}.user"
item_path = f"/media/data/dataset/{name}/{name}.item"

# target
# /media/data/model/Llama-2-7b-hf
# /media/data/model/Qwen2.5-14B-Instruct
# /media/data/model/Llama-3-8B-Instruct
pre_model = 'Llama-3-8B-Instruct'
item_embeddings_path = f"/media/data/dataset/llm/{pre_model}/{name}/item_embeddings.npy"
user_embeddings_path = f"/media/data/dataset/llm/{pre_model}/{name}/user_embeddings.npy"
id_map_path = f"/media/data/dataset/llm/{pre_model}/{name}/id_map.pkl"
embedding_map_path = f"/media/data/dataset/llm/{pre_model}/{name}/embedding_map.pkl"
# 获取父级目录
parent_dir = os.path.dirname(item_embeddings_path)
# 判断父级目录是否存在，不存在则创建
if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
    print(f"创建目录: {parent_dir}")
else:
    print(f"目录已存在: {parent_dir}")

# other
model_path = f'/media/data/model/{pre_model}'
embedding_dim = 4096  # 嵌入维度, llama:4096,qwen:5120
global_idx = 0
span1 = 3
span2 = 3
is_load_generated = False
# 设置 CUDA_VISIBLE_DEVICES 使程序只看到 GPU 0,1,2,3
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

创建目录: /media/data/dataset/llm/Llama-3-8B-Instruct/ml-100k


In [2]:
import logging
 
# 配置日志系统
logging.basicConfig(filename='log/logging.log', filemode='w', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
# 测试打印输出同时记录到日志
# logging.debug('这是一个调试信息')
# logging.info('这是一个信息信息')
# logging.warning('这是一个警告信息')
# logging.error('这是一个错误信息')
# logging.critical('这是一个严重错误信息')

In [3]:
# 读数据
inter_data = pd.read_csv(inter_path, sep="\t").sort_values(by=['user_id:token'], ascending=[True])
user_data = pd.read_csv(user_path, sep="\t")
item_data = pd.read_csv(item_path, sep="\t")
inter_data

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
41842,1,46,4,876893230
38751,1,257,4,874965954
8976,1,12,5,878542960
3248,1,74,1,889751736
3260,1,134,4,875073067
...,...,...,...,...
95594,943,217,3,888640067
77956,943,94,4,888639929
76855,943,943,5,888639614
94966,943,566,4,888639886


In [4]:
user_data = user_data[user_data['user_id:token'].isin(inter_data['user_id:token'])]
user_data

Unnamed: 0,user_id:token,age:token,gender:token,occupation:token,zip_code:token
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [5]:
item_data = item_data[item_data['item_id:token'].isin(inter_data['item_id:token'])]
item_data

Unnamed: 0,item_id:token,movie_title:token_seq,release_year:token,genre:token_seq
0,1,Toy Story,1995,Animation Children's Comedy
1,2,GoldenEye,1995,Action Adventure Thriller
2,3,Four Rooms,1995,Thriller
3,4,Get Shorty,1995,Action Comedy Drama
4,5,Copycat,1995,Crime Drama Thriller
...,...,...,...,...
1677,1678,Mat' i syn,1997,Drama
1678,1679,B. Monkey,1998,Romance Thriller
1679,1680,Sliding Doors,1998,Drama Romance
1680,1681,You So Crazy,1994,Comedy


In [6]:
uid_map = {'[PAD]': 0, '0': 0}
idx = 1
for id in user_data['user_id:token']:
    uid_map[str(id)] = idx
    idx += 1
    
iid_map = {'[PAD]': 0}
idx = 1
for id in item_data['item_id:token']:
    iid_map[str(id)] = idx
    idx += 1
id_map = {'uid': uid_map,'iid': iid_map}

# # 写入JSON字符串到文件
# with open(id_map_path, 'w') as file:
#     json_data = json.dumps(id_map)  # 将字典转换为JSON字符串
#    file.write(json_data)

# 将字典保存为 Pickle 文件
with open(id_map_path, 'wb') as pf:
    pickle.dump(id_map, pf)

# with open(id_map_path, 'rb') as pf:
#     pk_data = pickle.load(pf)
# print(pk_data)

In [7]:
if model_path.lower().__contains__('llama'):
    def get_llama_model():
        ## LLM处理
        from transformers import LlamaTokenizer, LlamaModel, PreTrainedTokenizerFast
        import torch
        
        # 加载LLaMA模型和tokenizer
        # 加载 tokenizer （不需要放到 CUDA 上）
        if model_path.lower().__contains__('llama-3'):
            tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)
        else:
            tokenizer = LlamaTokenizer.from_pretrained(model_path)
        # 加载模型并移动到 cuda 上
        llm_model = LlamaModel.from_pretrained(model_path,device_map='auto')
        return tokenizer,llm_model
    span,embedding_dim = span1,4096
    tokenizer,llm_model = get_llama_model()
elif model_path.lower().__contains__('qwen'):
    def get_qwen_model():
        from transformers import AutoTokenizer, AutoModel
        import torch
        
        # 模型路径，可以替换为 Hugging Face Hub 上的路径，如 'Qwen/Qwen-7B'
        # model_path = 'Qwen/Qwen-7B'
        
        # 加载 Qwen 模型和 tokenizer
        # 加载 tokenizer （不需要放到 CUDA 上）
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        # 加载模型并移动到 CUDA 上
        llm_model = AutoModel.from_pretrained(model_path, device_map='auto', trust_remote_code=True)
        return tokenizer,llm_model
    span,embedding_dim = span2,5120
    tokenizer,llm_model = get_qwen_model()
span,embedding_dim

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.31s/it]


(3, 4096)

In [8]:
def get_text_embedding(texts,count=0):
    """Generate embeddings for the given text using LLaMA model."""
    if texts == None or texts == []:
        return None
    try:
        # 将 eos_token 设置为 pad_token, 如果没有pad_token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # print(text)
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        # 移到 cuda
        inputs = {key: value.to('cuda') for key, value in inputs.items()}
        outputs = llm_model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()  # 取平均作为句子嵌入
    except Exception as e:
        print(f'报错{count}:', e)
        if count>5:
            return None
        count != 1
        return get_text_embedding(texts,count)
    

# 生成用户嵌入
def generate_user_embedding(rows):
    texts = []
    for i,row in rows.iterrows():
        if name.lower().__contains__('ml-'):
            text = f"User id: user_{row['user_id:token']}, Age: {row['age:token']}, Gender: {row['gender:token']}, Occupation: {row['occupation:token']}"
            # text = f"User id: user_{row['user_id:token']}, Age: {row['age:token']}, Gender: {row['gender:token']}, Occupation: {row['occupation:token']}, Favorite categories: {get_favorite_category(row)[:10]}, Best rated categories: {get_best_rated_category(row)[:10]}"
        elif name.lower().__contains__('yelp'):
            text = f"User id: user_{id_map['uid'][row['user_id:token']]}, Username: {row['user_name:token']}, User review count: {row['user_review_count:float']}, Yelping since: {row['yelping_since:float']}, User useful: {row['user_useful:float']}, User funny: {row['user_funny:float']}, User cool: {row['user_cool:float']}, Elite: {row['elite:token']}, Fans: {row['fans:float']}, Average stars: {row['average_stars:float']}, Compliment hot: {row['compliment_hot:float']}, Compliment more: {row['compliment_more:float']}, Compliment profile: {row['compliment_profile:float']}, Compliment cute: {row['compliment_cute:float']}"
        texts.append(text)
    return get_text_embedding(texts)

# 生成物品嵌入
def generate_item_embedding(rows):
    texts = []
    for i,row in rows.iterrows():
        if name.lower().__contains__('ml-'):
            text = f"Item id: item_{row['item_id:token']}, Movie title: {row['movie_title:token_seq']}, Release year: {row['release_year:token']}, Category: {row['genre:token_seq']}"
            # text = f"Item id: item_{row['item_id:token']}, Movie title: {row['movie_title:token_seq']}, Release year: {row['release_year:token']}, Category: {category_map[row['class:token_seq']]}, Popularity degree: {get_popularity_degress(row)}"
        elif name.lower().__contains__('yelp'):
            text = f"Item id: item_{id_map['iid'][row['item_id:token']]}, Item name: {row['item_name:token_seq']}, Address: {row['address:token_seq']}, City: {row['city:token_seq']}, State: {row['state:token']}, Postal code: {row['postal_code:token']}, Latitude: {row['latitude:float']}, Longitude: {row['longitude:float']}, Item stars: {row['item_stars:float']}, Item review count: {row['item_review_count:float']}, Categories: {row['categories:token_seq']}"
        texts.append(text)
    return get_text_embedding(texts)

In [9]:
# 创建一个锁
lock = threading.Lock()
embedding_map = {'u':{},'i':{}}
if is_load_generated and os.path.exists(embedding_map_path):
    with open(embedding_map_path, 'rb') as pf:
        pk_data = pickle.load(pf)
        embedding_map['u'] = {k:np.array(v,np.float32) for k,v in pk_data['u'].items()}
        embedding_map['i'] = {k:np.array(v,np.float32) for k,v in pk_data['i'].items()}

# 定义一个简单的线程任务
def task(name,rows,generate_embedding,_embedding_map):
    i = 0
    while True:
        with lock:
            global global_idx
            i = global_idx
            global_idx += span
        # print('i:',i)
        if i>len(rows):
            break
        mini_rows = rows.iloc[i:i+span]
        logging.info(i)
        # 假设 generate_embedding() 是你生成嵌入的函数，返回一个 (embedding_dim,) 的向量
        i1 = i+1
        if _embedding_map.__contains__(i1) and _embedding_map[i1] != None:
            continue
        es = generate_embedding(mini_rows)
        for e in es:
            print(f'{name}.idx:{i1}', end=',')
            if e is None:
                print(f'{i1} is None')
            _embedding_map[i1] = np.array(e).astype(np.float32)
            i1 += 1

In [10]:
# 假设你有一个嵌入维度为 embedding_dim 的嵌入向量，并且你知道将要存储的行数
def preprocess_save(file_path, rows, generate_embedding):
    global global_idx
    global_idx = 0 # 置零
    embeddings = []
    if file_path.__contains__('user'):
        _embedding_map = embedding_map['u']
    else:
        _embedding_map = embedding_map['i']
    # 第0行为随机数据, 因为recbole对于数据集开始编号是1的, 会自动创建一行编号为0的数据, 与其对齐
    # embeddings.append(np.random.rand(1,embedding_dim))  # 这里用随机数模拟嵌入
    print(0, end=',')
    logging.info(0)
    if not _embedding_map.__contains__(0):
        _embedding_map[0] = np.random.rand(embedding_dim).astype(np.float32)

    ts = []
    for i in range(len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))):
        # 创建线程
        t = threading.Thread(target=task, args=(f"t{i}",rows,generate_embedding,_embedding_map))
        ts.append(t)
    for t in ts:
        # 启动线程
        t.start()
    for t in ts:
        # 等待线程执行完毕
        t.join()

    print("All tasks completed.")
    # print(_embedding_map)
    for key in list(sorted(_embedding_map.keys())):
        embeddings += [_embedding_map[key]]

    embeddings = np.array(embeddings)
    np.save(file_path, embeddings)

# embeddings = []
# for key in list(sorted(_embedding_map.keys())):
#     embeddings += _embedding_map[key].tolist()

# embeddings = np.array(embeddings)
# np.save(item_embeddings_path, embeddings)

# user_embeddings = np.load(user_embeddings_path).astype(np.float32)
# user_embeddings

In [11]:
preprocess_save(user_embeddings_path,user_data,generate_user_embedding)
preprocess_save(item_embeddings_path,item_data,generate_item_embedding)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


0,t2.idx:7,t2.idx:8,t2.idx:9,t1.idx:4,t1.idx:5,t1.idx:6,t0.idx:1,t0.idx:2,t0.idx:3,t3.idx:10,t3.idx:11,t3.idx:12,t2.idx:13,t2.idx:14,t2.idx:15,t1.idx:16,t1.idx:17,t1.idx:18,t3.idx:22,t3.idx:23,t3.idx:24,t0.idx:19,t0.idx:20,t0.idx:21,t2.idx:25,t2.idx:26,t2.idx:27,t1.idx:28,t1.idx:29,t1.idx:30,t3.idx:31,t3.idx:32,t3.idx:33,t0.idx:34,t0.idx:35,t0.idx:36,t2.idx:37,t2.idx:38,t2.idx:39,t1.idx:40,t1.idx:41,t1.idx:42,t3.idx:43,t3.idx:44,t3.idx:45,t0.idx:46,t0.idx:47,t0.idx:48,t2.idx:49,t2.idx:50,t2.idx:51,t1.idx:52,t1.idx:53,t1.idx:54,t3.idx:55,t3.idx:56,t3.idx:57,t0.idx:58,t0.idx:59,t0.idx:60,t2.idx:61,t2.idx:62,t2.idx:63,t1.idx:64,t1.idx:65,t1.idx:66,t0.idx:70,t0.idx:71,t0.idx:72,t3.idx:67,t3.idx:68,t3.idx:69,t2.idx:73,t2.idx:74,t2.idx:75,t1.idx:76,t1.idx:77,t1.idx:78,t0.idx:79,t0.idx:80,t0.idx:81,t3.idx:82,t3.idx:83,t3.idx:84,t2.idx:85,t2.idx:86,t2.idx:87,t1.idx:88,t1.idx:89,t1.idx:90,t0.idx:91,t0.idx:92,t0.idx:93,t3.idx:94,t3.idx:95,t3.idx:96,t2.idx:97,t2.idx:98,t2.idx:99,t1.idx:100,t1.idx

In [12]:
# print(embedding_map)
# 写入JSON字符串到文件
with open(embedding_map_path, 'wb') as pf:
    pickle.dump(embedding_map, pf)

In [13]:
user_embeddings = np.load(user_embeddings_path).astype(np.float32)
item_embeddings = np.load(item_embeddings_path).astype(np.float32)

In [17]:
user_embeddings.shape,item_embeddings.shape,user_embeddings,item_embeddings

((944, 4096),
 (1683, 4096),
 array([[ 0.16461854,  0.8668479 ,  0.7278544 , ...,  0.6194043 ,
          0.53017735,  0.63462377],
        [ 1.4085696 , -0.6134708 , -1.1689081 , ...,  1.4445527 ,
         -0.03760876, -0.18269014],
        [ 1.2879995 , -0.71393013, -1.1580178 , ...,  1.1581529 ,
          0.04795215,  0.04411626],
        ...,
        [ 1.2510451 , -0.90023917, -0.90365666, ...,  1.0010967 ,
         -0.06259985,  0.0562149 ],
        [ 1.3481363 , -1.0954129 , -1.0587771 , ...,  1.1806401 ,
          0.17428328, -0.00409546],
        [ 1.2543057 , -0.8039214 , -0.91442794, ...,  1.0759352 ,
         -0.05160716,  0.04265343]], dtype=float32),
 array([[ 0.17525849,  0.8271101 ,  0.05368988, ...,  0.6110595 ,
          0.6907715 ,  0.8112663 ],
        [ 0.243016  , -1.5813007 ,  0.31605965, ..., -1.139971  ,
          0.8699425 , -0.48286134],
        [-0.04603429, -1.0574435 , -0.2554197 , ..., -1.1478007 ,
          1.2476206 , -0.05056527],
        ...,
        [ 

(944, 4096)