In [1]:
import pandas as pd
import os

base_path = "/mnt/data/wangshu/hcarag/FB15k"

node_df_raw = pd.read_csv(os.path.join(base_path, "./fb15k_description.tsv"), sep="\t")
relation_df_raw = pd.read_csv(
    os.path.join(base_path, "./freebase_mtr100_mte100-all.txt"), sep="\t", header=None
)
relation_df_raw.columns = ["head", "relation", "tail"]

print(node_df_raw.head(2))
relation_df_raw.head(2)

   node_id        mid                                   node_description
0        0  /m/02y21l                Sire Records, American record label
1        1  /m/02gyl0  Theodore Bikel, Austrian-American actor, folk ...


Unnamed: 0,head,relation,tail
0,/m/027rn,/location/country/form_of_government,/m/06cx9
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0


In [3]:
# 将\N替换为空字符串
node_df = node_df_raw.copy()
# node_df['node_description'] = node_df['node_description'].replace('\\N', '')
node_df['node_description'] = node_df['node_description'].replace('\\N', None)

# 删除node_description为空的行
node_df = node_df[node_df['node_description'].notnull()]

# 使用str.split方法按第一个逗号分割
node_df[['name', 'description']] = node_df['node_description'].str.split(',', n=1, expand=True)


# 将空字符串替换为None
node_df['name'] = node_df['name'].replace('', None)
node_df['description'] = node_df['description'].replace('', None)
node_df['node_description'] = node_df['node_description'].replace('', None)

# 统计name和description字段中空缺的数量
name_missing_count = node_df['name'].isnull().sum()
description_missing_count = node_df['description'].isnull().sum()

# 打印结果
print(node_df['node_description'].isnull().sum())
print(f"name字段空缺的数量: {name_missing_count}")
print(f"description字段空缺的数量: {description_missing_count}")

node_df

0
name字段空缺的数量: 0
description字段空缺的数量: 114


Unnamed: 0,node_id,mid,node_description,name,description
0,0,/m/02y21l,"Sire Records, American record label",Sire Records,American record label
1,1,/m/02gyl0,"Theodore Bikel, Austrian-American actor, folk ...",Theodore Bikel,"Austrian-American actor, folk singer, musicia..."
2,2,/m/01zsfx,"Bulawayo, City in Zimbabwe",Bulawayo,City in Zimbabwe
3,3,/m/06lq2g,"Chill-out music, umbrella term for different t...",Chill-out music,umbrella term for different types of electron...
4,4,/m/01qzt1,"Classic rock, radio format",Classic rock,radio format
...,...,...,...,...,...
14946,14946,/m/0c6cwg,"Taliban insurgency, insurgency during the War ...",Taliban insurgency,insurgency during the War in Afghanistan
14947,14947,/m/04qy5,"Legion of Merit, military award of the United ...",Legion of Merit,military award of the United States Armed Forces
14948,14948,/m/014zz1,"Hurdy-gurdy, musical instrument",Hurdy-gurdy,musical instrument
14949,14949,/m/01l4g5,"Loreena McKennitt, Canadian musician and composer",Loreena McKennitt,Canadian musician and composer


In [4]:
relation_df = relation_df_raw.copy()
print(relation_df.shape)

# 创建 mid 到 node_id 的映射字典
mid_to_id = dict(zip(node_df["mid"], (node_df["node_id"])))
mid_to_name = dict(zip(node_df["mid"], node_df["name"]))

# 使用 map 添加 head_id 和 tail_id
relation_df["head_id"] = relation_df["head"].map(mid_to_id)
relation_df["tail_id"] = relation_df["tail"].map(mid_to_id)

# 将 head_id 和 tail_id 转换为整数类型
relation_df["head_id"] = relation_df["head_id"].astype('Int64')  # 使用 Int64 以处理 NaN
relation_df["tail_id"] = relation_df["tail_id"].astype('Int64')

relation_df['source'] = relation_df['head'].map(mid_to_name)
relation_df['target'] = relation_df['tail'].map(mid_to_name)

# 统计未找到的 head 或 tail 的行数
missing_heads = relation_df["head_id"].isnull().sum()
missing_tails = relation_df["tail_id"].isnull().sum()

# 统计 head_id 或 tail_id 为 NaN 的行数
missing_head_or_tail = (
    relation_df["head_id"].isna().sum() + relation_df["tail_id"].isna().sum()
)

# 输出结果
print(relation_df.head(2))
print(f"Number of rows with missing head: {missing_heads}")
print(f"Number of rows with missing tail: {missing_tails}")
print(f"有 {missing_head_or_tail} 行的 head 或 tail 没有找到对应的 id")

relation_df = relation_df[relation_df["head_id"].notnull()]
relation_df = relation_df[relation_df["tail_id"].notnull()]
print(relation_df.shape)

# 重命名列
relation_df = relation_df.rename(columns={"relation": "description"})
# 添加编号为 human_readable_id 列
relation_df['human_readable_id'] = relation_df.reset_index().index + 1  # 从1开始编号



(592213, 3)
        head                                           relation       tail  \
0   /m/027rn               /location/country/form_of_government   /m/06cx9   
1  /m/017dcd  /tv/tv_program/regular_cast./tv/regular_tv_app...  /m/06v8s0   

   head_id  tail_id                        source      target  
0      345    10667            Dominican Republic    Republic  
1     9796     1985  Mighty Morphin Power Rangers  Wendee Lee  
Number of rows with missing head: 40439
Number of rows with missing tail: 44324
有 84763 行的 head 或 tail 没有找到对应的 id
(511027, 7)


In [5]:
out_degree_counts = relation_df['head_id'].value_counts()

in_degree_counts = relation_df['tail_id'].value_counts()

# 合并两个计数
degree_counts = out_degree_counts.add(in_degree_counts, fill_value=0)

node_df['out_degree'] = node_df['node_id'].map(out_degree_counts).fillna(0).astype(int)
node_df['in_degree'] = node_df['node_id'].map(in_degree_counts).fillna(0).astype(int)

# 将结果添加到 node_df
node_df['degree'] = node_df['node_id'].map(degree_counts).fillna(0).astype(int)

node_df.head(2)


Unnamed: 0,node_id,mid,node_description,name,description,out_degree,in_degree,degree
0,0,/m/02y21l,"Sire Records, American record label",Sire Records,American record label,25,25,50
1,1,/m/02gyl0,"Theodore Bikel, Austrian-American actor, folk ...",Theodore Bikel,"Austrian-American actor, folk singer, musicia...",24,17,41


In [6]:
# 创建 mid 到 node_id 的映射字典
id_to_degree = dict(zip(node_df["node_id"], node_df["degree"]))

relation_df["source_degree"] = relation_df["head_id"].map(id_to_degree)
relation_df["target_degree"] = relation_df["tail_id"].map(id_to_degree)
relation_df.head(2)

Unnamed: 0,head,description,tail,head_id,tail_id,source,target,human_readable_id,source_degree,target_degree
0,/m/027rn,/location/country/form_of_government,/m/06cx9,345,10667,Dominican Republic,Republic,1,88,158
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0,9796,1985,Mighty Morphin Power Rangers,Wendee Lee,2,18,30


In [18]:
import sys
import os

# 添加 src 文件夹到 sys.path
current_dir = os.getcwd()  # 获取当前工作目录
src_path = os.path.join(current_dir, "../../src")
print(src_path)
sys.path.append(os.path.abspath(src_path))

from src.utils import entity_embedding

class Args:
    def __init__(self):
        self.embedding_local = False
        # self.embedding_model_local = "nomic-embed-text-v1"
        self.embedding_api_key = "ollama"
        self.embedding_api_base = "http://localhost:11434/v1"
        self.embedding_model = "nomic-embed-text"
        # 其他参数...

args = Args()
# 处理 node_df 中的 description 字段
node_df["description"] = node_df["description"].replace(
    [None, "", "None"], "."
)  # 替换 None 和空字符串为 “.”

node_df = entity_embedding(node_df, args, embed_colname="description_embedding")

print(node_df.head(2))

/home/wangshu/rag/hier_graph_rag/dataset/WebQ/../../src
local is False


  6%|▌         | 850/14111 [02:46<43:13,  5.11it/s]


KeyboardInterrupt: 

In [68]:

save_path = "/mnt/data/wangshu/hcarag/FB15k/KG"
relation_save_name = os.path.join(save_path, "relationships.csv")
relation_df.to_csv(relation_save_name, sep=",", index=False)

entity_save_name = os.path.join(save_path, "entity.csv")
node_df.to_csv(entity_save_name, sep=",", index=False)


In [4]:
import pandas as pd
import os
save_path = "/mnt/data/wangshu/hcarag/FB15k/KG"
entity_save_name = os.path.join(save_path, "entity.csv")

node_df = pd.read_csv(entity_save_name, sep=",")
node_df['human_readable_id'] = node_df['node_id']
node_df.head(2)
entity_save_name_2 = os.path.join(save_path, "entity_df.csv")

node_df.to_csv(entity_save_name_2, sep=",", index=False)

In [1]:
import sys
import os
import pandas as pd

project_path = "/home/wangshu/rag/hier_graph_rag"
print(project_path)
sys.path.append(os.path.abspath(project_path))

from src.utils import relation_embedding


class Args:
    def __init__(self):
        self.embedding_local = False
        # self.embedding_model_local = "nomic-embed-text-v1"
        self.embedding_api_key = "ollama"
        # self.embedding_api_base = "http://localhost:11434/v1"
        self.embedding_api_base = "http://localhost:5000/forward"
        self.embedding_model = "nomic-embed-text"
        # 其他参数...


args = Args()

path = "/mnt/data/wangshu/hcarag/FB15k/KG/relationships.csv"
relation_df = pd.read_csv(path, sep=",")

relation_df = relation_embedding(
    relation_df, args, embed_colname="description_embedding", num_workers=28
)

relation_df.head(2)

/home/wangshu/rag/hier_graph_rag
local is False


Computing embeddings: 100%|██████████| 1294/1294 [00:13<00:00, 93.52it/s] 


        head                                        description       tail  \
0   /m/027rn               /location/country/form_of_government   /m/06cx9   
1  /m/017dcd  /tv/tv_program/regular_cast./tv/regular_tv_app...  /m/06v8s0   

   head_id  tail_id                        source      target  \
0      345    10667            Dominican Republic    Republic   
1     9796     1985  Mighty Morphin Power Rangers  Wendee Lee   

   human_readable_id  source_degree  target_degree  \
0                  1             88            158   
1                  2             18             30   

                               description_embedding  
0  [0.00531036, 0.05597023, -0.15895928, -0.03016...  
1  [-0.05882332, 0.0016993042, -0.1597247, -0.050...  


In [2]:
relation_df.to_csv(path, sep=",", index=False)

In [7]:
# 获取每个 description 首次出现的行
first_occurrences = relation_df.drop_duplicates(subset='description', keep='first')
# 选择需要的列并创建 unique_descriptions
unique_descriptions = first_occurrences[['description', 'description_embedding']].copy()

unique_descriptions['idx'] = range(len(unique_descriptions))

# 重置索引
unique_descriptions.reset_index(drop=True, inplace=True)
print(unique_descriptions.shape)
unique_descriptions

(1294, 3)


Unnamed: 0,description,description_embedding,idx
0,/location/country/form_of_government,"[0.00531036, 0.05597023, -0.15895928, -0.03016...",0
1,/tv/tv_program/regular_cast./tv/regular_tv_app...,"[-0.05882332, 0.0016993042, -0.1597247, -0.050...",1
2,/media_common/netflix_genre/titles,"[-0.010580016, 0.0088471165, -0.16270894, 0.03...",2
3,/award/award_winner/awards_won./award/award_ho...,"[0.01724843, -0.011446378, -0.10684617, 0.0080...",3
4,/soccer/football_team/current_roster./sports/s...,"[-0.021292571, -0.012995346, -0.18861972, -0.0...",4
...,...,...,...
1289,/sports/multi_event_tournament/sports,"[0.014797877, 0.07079412, -0.16517444, 0.03491...",1289
1290,/sports/sport/related_equipment,"[-0.005286605, 0.03573564, -0.16285726, 0.0294...",1290
1291,/base/crime/crime_victim/crime_type,"[0.072939, 0.036194876, -0.17190062, -0.016580...",1291
1292,/user/radiusrs/default_domain/astrology/relate...,"[0.029142885, 0.046695385, -0.14149785, -0.044...",1292


In [15]:
# 检查是否已存在 'embedding_idx' 列
if 'embedding_idx' not in relation_df.columns:
    relation_df = relation_df.merge(unique_descriptions[['description', 'idx']], on='description', how='left')
    relation_df.rename(columns={'idx': 'embedding_idx'}, inplace=True)
    
# 找到所有以 'embedding_idx' 开头的列
embedding_idx_cols = [col for col in relation_df.columns if 'embedding_idx' in col]

# 删除多余的 'embedding_idx' 列，保留第一个
if len(embedding_idx_cols) > 1:
    relation_df.drop(columns=embedding_idx_cols[1:], inplace=True)
relation_df
    

Unnamed: 0,head,description,tail,head_id,tail_id,source,target,human_readable_id,source_degree,target_degree,description_embedding,embedding_idx
0,/m/027rn,/location/country/form_of_government,/m/06cx9,345,10667,Dominican Republic,Republic,1,88,158,"[0.00531036, 0.05597023, -0.15895928, -0.03016...",0
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0,9796,1985,Mighty Morphin Power Rangers,Wendee Lee,2,18,30,"[-0.05882332, 0.0016993042, -0.1597247, -0.050...",1
2,/m/07s9rl0,/media_common/netflix_genre/titles,/m/0170z3,848,6425,Drama (film and television),American History X,3,3496,58,"[-0.010580016, 0.0088471165, -0.16270894, 0.03...",2
3,/m/01sl1q,/award/award_winner/awards_won./award/award_ho...,/m/044mz_,5079,11653,Michelle Rodriguez,Naveen Andrews,4,110,114,"[0.01724843, -0.011446378, -0.10684617, 0.0080...",3
4,/m/0cnk2q,/soccer/football_team/current_roster./sports/s...,/m/02nzb8,4482,11293,Australia national soccer team,Midfielder,5,102,4546,"[-0.021292571, -0.012995346, -0.18861972, -0.0...",4
...,...,...,...,...,...,...,...,...,...,...,...,...
511022,/m/02x4x18,/award/award_category/nominees./award/award_no...,/m/0dgst_d,2914,8202,Independent Spirit Award for Best Female Lead,My Week with Marilyn,511023,280,131,"[0.008671421, 0.0016644014, -0.09234244, -0.03...",11
511023,/m/0bw20,/film/film/other_crew./film/film_crew_gig/film...,/m/09vw2b7,6536,14465,K-19: The Widowmaker,Special effects supervisor,511024,76,1594,"[-0.054025047, 0.0074842703, -0.18590993, 0.00...",40
511024,/m/0gs6vr,/film/actor/film./film/performance/film,/m/0gj96ln,5338,12077,Selena Gomez,Hotel Transylvania,511025,84,89,"[-0.040637504, -0.0069618616, -0.13999099, 0.0...",9
511025,/m/02lp0w,/award/award_category/nominees./award/award_no...,/m/015882,4277,11101,Tony Award for Best Actress in a Musical,Linda Ronstadt,511026,68,127,"[0.013273816, 0.008030098, -0.103796266, -0.03...",24


In [18]:
save_relation_df = relation_df.copy()

save_relation_df.drop(columns = ["description_embedding"], inplace=True)
save_relation_df.to_csv(path, sep=",", index=False)
ude_path = "/mnt/data/wangshu/hcarag/FB15k/KG/relationships_embedding.csv"
unique_descriptions.to_csv(ude_path, sep=",", index=False)
save_relation_df

Unnamed: 0,head,description,tail,head_id,tail_id,source,target,human_readable_id,source_degree,target_degree,embedding_idx
0,/m/027rn,/location/country/form_of_government,/m/06cx9,345,10667,Dominican Republic,Republic,1,88,158,0
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0,9796,1985,Mighty Morphin Power Rangers,Wendee Lee,2,18,30,1
2,/m/07s9rl0,/media_common/netflix_genre/titles,/m/0170z3,848,6425,Drama (film and television),American History X,3,3496,58,2
3,/m/01sl1q,/award/award_winner/awards_won./award/award_ho...,/m/044mz_,5079,11653,Michelle Rodriguez,Naveen Andrews,4,110,114,3
4,/m/0cnk2q,/soccer/football_team/current_roster./sports/s...,/m/02nzb8,4482,11293,Australia national soccer team,Midfielder,5,102,4546,4
...,...,...,...,...,...,...,...,...,...,...,...
511022,/m/02x4x18,/award/award_category/nominees./award/award_no...,/m/0dgst_d,2914,8202,Independent Spirit Award for Best Female Lead,My Week with Marilyn,511023,280,131,11
511023,/m/0bw20,/film/film/other_crew./film/film_crew_gig/film...,/m/09vw2b7,6536,14465,K-19: The Widowmaker,Special effects supervisor,511024,76,1594,40
511024,/m/0gs6vr,/film/actor/film./film/performance/film,/m/0gj96ln,5338,12077,Selena Gomez,Hotel Transylvania,511025,84,89,9
511025,/m/02lp0w,/award/award_category/nominees./award/award_no...,/m/015882,4277,11101,Tony Award for Best Actress in a Musical,Linda Ronstadt,511026,68,127,24


In [20]:
path_ird = "/mnt/data/wangshu/hcarag/FB15k/hc_index_8b/relationship_df_index.csv"
index_relation_df = pd.read_csv(path_ird)
# 检查是否已存在 'embedding_idx' 列
if 'embedding_idx' not in index_relation_df.columns:
    index_relation_df = index_relation_df.merge(unique_descriptions[['description', 'idx']], on='description', how='left')
    index_relation_df.rename(columns={'idx': 'embedding_idx'}, inplace=True)
    
index_relation_df.to_csv(path_ird, sep=',', index=False)
index_relation_df

Unnamed: 0,head,description,tail,head_id,tail_id,source,target,human_readable_id,source_degree,target_degree,source_index_id,target_index_id,embedding_idx
0,/m/027rn,/location/country/form_of_government,/m/06cx9,345,10667,Dominican Republic,Republic,1,88,158,2369,12115,0
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0,9796,1985,Mighty Morphin Power Rangers,Wendee Lee,2,18,30,11287,3911,1
2,/m/07s9rl0,/media_common/netflix_genre/titles,/m/0170z3,848,6425,Drama (film and television),American History X,3,3496,58,2840,8103,2
3,/m/01sl1q,/award/award_winner/awards_won./award/award_ho...,/m/044mz_,5079,11653,Michelle Rodriguez,Naveen Andrews,4,110,114,6842,13054,3
4,/m/0cnk2q,/soccer/football_team/current_roster./sports/s...,/m/02nzb8,4482,11293,Australia national soccer team,Midfielder,5,102,4546,6274,12714,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
511022,/m/02x4x18,/award/award_category/nominees./award/award_no...,/m/0dgst_d,2914,8202,Independent Spirit Award for Best Female Lead,My Week with Marilyn,511023,280,131,4790,9786,11
511023,/m/0bw20,/film/film/other_crew./film/film_crew_gig/film...,/m/09vw2b7,6536,14465,K-19: The Widowmaker,Special effects supervisor,511024,76,1594,8206,15701,40
511024,/m/0gs6vr,/film/actor/film./film/performance/film,/m/0gj96ln,5338,12077,Selena Gomez,Hotel Transylvania,511025,84,89,7087,13458,9
511025,/m/02lp0w,/award/award_category/nominees./award/award_no...,/m/015882,4277,11101,Tony Award for Best Actress in a Musical,Linda Ronstadt,511026,68,127,6084,12528,24
