In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import threading
import json
import pickle
import os

# source
name = "lfm1b-tracks"
inter_path = f"/media/data/dataset/{name}/{name}.inter"
user_path = f"/media/data/dataset/{name}/{name}.user"
item_path = f"/media/data/dataset/{name}/{name}.item"

# target
# /media/data/model/Llama-2-7b-hf
# /media/data/model/Qwen2.5-14B-Instruct
# /media/data/model/Llama-3-8B-Instruct
pre_model = 'Llama-2-7b-hf'
item_embeddings_path = f"/media/data/dataset/llm/{pre_model}/{name}/item_embeddings.npy"
user_embeddings_path = f"/media/data/dataset/llm/{pre_model}/{name}/user_embeddings.npy"
id_map_path = f"/media/data/dataset/llm/{pre_model}/{name}/id_map.pkl"
embedding_map_path = f"/media/data/dataset/llm/{pre_model}/{name}/embedding_map.pkl"
# 获取父级目录
parent_dir = os.path.dirname(item_embeddings_path)
# 判断父级目录是否存在，不存在则创建
if not os.path.exists(parent_dir):
    os.makedirs(parent_dir)
    print(f"创建目录: {parent_dir}")
else:
    print(f"目录已存在: {parent_dir}")

# other
model_path = f'/media/data/model/{pre_model}'
embedding_dim = 4096  # 嵌入维度, llama:4096,qwen:5120
global_idx = 0
span1 = 3
span2 = 3
is_load_generated = False
# 设置 CUDA_VISIBLE_DEVICES 使程序只看到 GPU 0,1,2,3
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

创建目录: /media/data/dataset/llm/Llama-2-7b-hf/lfm1b-tracks


In [2]:
import logging
 
# 配置日志系统
logging.basicConfig(filename='log/logging.log', filemode='w', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
# 测试打印输出同时记录到日志
# logging.debug('这是一个调试信息')
# logging.info('这是一个信息信息')
# logging.warning('这是一个警告信息')
# logging.error('这是一个错误信息')
# logging.critical('这是一个严重错误信息')

In [3]:
# 读数据
inter_data = pd.read_csv(inter_path, sep="\t").sort_values(by=['user_id:token'], ascending=[True])
user_data = pd.read_csv(user_path, sep="\t")
item_data = pd.read_csv(item_path, sep="\t")
inter_data

Unnamed: 0,user_id:token,item_id:token,tracks_id:token,timestamp:float,num_repeat:float
318575,14308,25422,25422,1335299391,22
318576,14308,25425,25425,1335298720,7
318577,14308,25426,25426,1335298488,8
318578,14308,2896,2896,1334584044,1
318579,14308,2470,2470,1334337839,4
...,...,...,...,...,...
843148,50062636,17013,17013,1340600044,1
843139,50062636,17012,17012,1343970154,3
843147,50062636,22004,22004,1340636088,1
843145,50062636,17011,17011,1340687478,1


In [10]:
inter_data = inter_data.drop(["tracks_id:token"], axis=1)
inter_data

Unnamed: 0,user_id:token,item_id:token,timestamp:float,num_repeat:float
318575,14308,25422,1335299391,22
318576,14308,25425,1335298720,7
318577,14308,25426,1335298488,8
318578,14308,2896,1334584044,1
318579,14308,2470,1334337839,4
...,...,...,...,...
843148,50062636,17013,1340600044,1
843139,50062636,17012,1343970154,3
843147,50062636,22004,1340636088,1
843145,50062636,17011,1340687478,1


In [5]:
user_data = user_data[user_data['user_id:token'].isin(inter_data['user_id:token'])]
user_data

Unnamed: 0,user_id:token,country:token,age:float,gender:token,playcount:float,registered_timestamp:float,novelty_artist_avg_month:float,novelty_artist_avg_6months:float,novelty_artist_avg_year:float,mainstreaminess_avg_month:float,...,relative le per hour14:float,relative le per hour15:float,relative le per hour16:float,relative le per hour17:float,relative le per hour18:float,relative le per hour19:float,relative le per hour20:float,relative le per hour21:float,relative le per hour22:float,relative le per hour23:float
16,14308,BR,31,m,70351,1058659200,0.135882,0.059137,0.210488,0.052656,...,0.0748,0.0903,0.0848,0.0755,0.0981,0.1405,0.1345,0.1096,0.0705,0.0328
23,1000450,US,36,m,36710,1044900323,0.285544,0.029612,0.457858,0.054102,...,0.0142,0.0086,0.0104,0.0276,0.0527,0.0397,0.0456,0.0405,0.0379,0.0533
44,1006640,,-1,,1027,1053198502,0.421583,0.031030,0.641081,0.036840,...,0.0078,0.0078,0.0229,0.0397,0.0565,0.0453,0.0432,0.0449,0.0341,0.0272
58,1009632,US,53,m,8191,1057177534,0.468647,0.033494,0.588588,0.042126,...,0.0135,0.0049,0.0113,0.0091,0.0073,0.0081,0.0124,0.0221,0.0202,0.0254
68,1018238,SE,31,m,31877,1071605554,0.551111,0.034087,0.648918,0.058300,...,0.0739,0.0766,0.0844,0.0827,0.0885,0.0682,0.0525,0.0289,0.0254,0.0191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119618,50003016,,-1,n,228,1338062841,0.721986,0.043962,0.814765,0.050861,...,0.0118,,0.0017,0.0084,0.0860,0.0556,0.0152,0.0304,0.0911,0.0691
119640,50004548,IT,16,m,964,1338069715,0.170627,0.070566,0.340557,0.030124,...,0.0879,0.0804,0.0633,0.0880,0.0940,0.0656,0.0346,0.0456,0.0777,0.0922
119862,50025079,CA,20,m,660,1338174147,0.783784,0.000000,,0.000000,...,0.0293,0.0247,0.0262,0.0231,,,,0.0062,,0.0200
120063,50049082,VN,18,m,13547,1338300055,0.093783,0.035657,0.160526,0.053951,...,0.0723,0.0787,0.0792,0.0590,0.0484,0.0336,0.0122,0.0111,0.0094,0.0095


In [12]:
item_data['item:token'] = item_data['tracks_id:token']
item_data = item_data.drop(["tracks_id:token"], axis=1)
item_data

Unnamed: 0,name:token_seq,artists_id:token,item:token
0,A Matter of Time,3,1
1,Hangar 18,1,2
2,Up the Downstair,4,3
6,Light Fuse And Get Away,7,7
7,Tornado Of Souls,1,8
...,...,...,...
27251,Dead Silence,4377,27252
27252,Swallowed Up by the Ocean,4377,27253
27261,One Day In My Garden,4399,27262
27262,The 2nd Law: Unsustainable,153,27263


In [13]:
item_data = item_data[['item:token','name:token_seq','artists_id:token']]
item_data

Unnamed: 0,item:token,name:token_seq,artists_id:token
0,1,A Matter of Time,3
1,2,Hangar 18,1
2,3,Up the Downstair,4
6,7,Light Fuse And Get Away,7
7,8,Tornado Of Souls,1
...,...,...,...
27251,27252,Dead Silence,4377
27252,27253,Swallowed Up by the Ocean,4377
27261,27262,One Day In My Garden,4399
27262,27263,The 2nd Law: Unsustainable,153


In [8]:
item_data = item_data[item_data['tracks_id:token'].isin(inter_data['tracks_id:token'])]
item_data

Unnamed: 0,tracks_id:token,name:token_seq,artists_id:token
0,1,A Matter of Time,3
1,2,Hangar 18,1
2,3,Up the Downstair,4
6,7,Light Fuse And Get Away,7
7,8,Tornado Of Souls,1
...,...,...,...
27251,27252,Dead Silence,4377
27252,27253,Swallowed Up by the Ocean,4377
27261,27262,One Day In My Garden,4399
27262,27263,The 2nd Law: Unsustainable,153


In [None]:
inter_data.to_csv(inter_path, sep="\t", index=False)  # index=False 不保存行索引
user_data.to_csv(user_path, sep="\t", index=False)  # index=False 不保存行索引
item_data.to_csv(item_path, sep="\t", index=False)  # index=False 不保存行索引