In [1]:
import pandas as pd
import os

base_path = "/mnt/data/wangshu/hcarag/mintaka/data"

node_df_raw = pd.read_csv(os.path.join(base_path, "node_summary.csv"))
relation_df_raw = pd.read_csv(os.path.join(base_path, "relation_ori.csv"))


print(node_df_raw.head(2))
relation_df_raw.head(2)

        entity  nodeid                                            summary  \
0   Diana Ross       0  Diana Ross (born March 26, 1944) is an America...   
1  Mark Antony       1  Marcus Antonius (14 January 83 BC – 1 August 3...   

                                    all_summary_text  
0  Diana Ross (born March 26, 1944) is an America...  
1  Marcus Antonius (14 January 83 BC – 1 August 3...  


Unnamed: 0,related_entity,triple,source,relation,tail
0,Diana Ross,"(Costume Institute, parent organization, Metro...",Costume Institute,parent organization,Metropolitan Museum of Art
1,Mark Antony,"(Hirtius, position held, consul)",Hirtius,position held,consul


In [2]:
node_df = node_df_raw.copy()
node_df.rename(columns={"summary": "description", "entity": "name", "nodeid":"node_id"}, inplace=True)

node_df = node_df[node_df["description"].notnull()]
node_df = node_df[node_df["name"].notnull()]

print(node_df.shape)
node_df["name"] = node_df["name"].replace("", None)
node_df["description"] = node_df["description"].replace("", None)

# 统计name和description字段中空缺的数量
name_missing_count = node_df["name"].isnull().sum()
description_missing_count = node_df["description"].isnull().sum()

# 打印结果
print(node_df["description"].isnull().sum())
print(f"name字段空缺的数量: {name_missing_count}")
print(f"name unique: {len(node_df['name'].unique())}")
print(f"description字段空缺的数量: {description_missing_count}")

node_df["node_id"] = range(len(node_df))

node_df.head(2)

(103420, 4)
0
name字段空缺的数量: 0
name unique: 103420
description字段空缺的数量: 0


Unnamed: 0,name,node_id,description,all_summary_text
0,Diana Ross,0,"Diana Ross (born March 26, 1944) is an America...","Diana Ross (born March 26, 1944) is an America..."
1,Mark Antony,1,Marcus Antonius (14 January 83 BC – 1 August 3...,Marcus Antonius (14 January 83 BC – 1 August 3...


In [3]:
relation_df = relation_df_raw.copy()
relation_df.rename(columns={"tail": "target"}, inplace=True)

print(relation_df.shape)

name_to_id = dict(zip(node_df["name"], node_df["node_id"]))

# 使用 map 添加 head_id 和 tail_id
relation_df["head_id"] = relation_df["source"].map(name_to_id)
relation_df["tail_id"] = relation_df["target"].map(name_to_id)

# 将 head_id 和 tail_id 转换为整数类型
relation_df["head_id"] = relation_df["head_id"].astype('Int64')  # 使用 Int64 以处理 NaN
relation_df["tail_id"] = relation_df["tail_id"].astype('Int64')


# 统计未找到的 head 或 tail 的行数
missing_heads = relation_df["head_id"].isnull().sum()
missing_tails = relation_df["tail_id"].isnull().sum()

# 统计 head_id 或 tail_id 为 NaN 的行数
missing_head_or_tail = (
    relation_df["head_id"].isna().sum() + relation_df["tail_id"].isna().sum()
)

# 输出结果
print(f"Number of rows with missing head: {missing_heads}")
print(f"Number of rows with missing tail: {missing_tails}")
print(f"有 {missing_head_or_tail} 行的 head 或 tail 没有找到对应的 id")

relation_df = relation_df[relation_df["head_id"].notnull()]
relation_df = relation_df[relation_df["tail_id"].notnull()]
print(relation_df.shape)

# 添加编号为 human_readable_id 列
relation_df['human_readable_id'] = relation_df.reset_index().index + 1  # 从1开始编号

relation_df.head(2)


(421596, 5)
Number of rows with missing head: 133402
Number of rows with missing tail: 120514
有 253916 行的 head 或 tail 没有找到对应的 id
(215701, 7)


Unnamed: 0,related_entity,triple,source,relation,target,head_id,tail_id,human_readable_id
2,Mao Zedong,"(Mao Xinyu, mother, Kong Dongmei)",Mao Xinyu,mother,Kong Dongmei,2686,13657,1
3,Pink,"(Irish, part of, Celtic languages)",Irish,part of,Celtic languages,1156,86403,2


In [4]:
out_degree_counts = relation_df['head_id'].value_counts()

in_degree_counts = relation_df['tail_id'].value_counts()

# 合并两个计数
degree_counts = out_degree_counts.add(in_degree_counts, fill_value=0)

node_df['out_degree'] = node_df['node_id'].map(out_degree_counts).fillna(0).astype(int)
node_df['in_degree'] = node_df['node_id'].map(in_degree_counts).fillna(0).astype(int)

# 将结果添加到 node_df
node_df['degree'] = node_df['node_id'].map(degree_counts).fillna(0).astype(int)

print(node_df.head(2))
# 创建 mid 到 node_id 的映射字典
id_to_degree = dict(zip(node_df["node_id"], node_df["degree"]))

relation_df["source_degree"] = relation_df["head_id"].map(id_to_degree)
relation_df["target_degree"] = relation_df["tail_id"].map(id_to_degree)
relation_df.head(2)


          name  node_id                                        description  \
0   Diana Ross        0  Diana Ross (born March 26, 1944) is an America...   
1  Mark Antony        1  Marcus Antonius (14 January 83 BC – 1 August 3...   

                                    all_summary_text  out_degree  in_degree  \
0  Diana Ross (born March 26, 1944) is an America...           6         15   
1  Marcus Antonius (14 January 83 BC – 1 August 3...          23         16   

   degree  
0      21  
1      39  


Unnamed: 0,related_entity,triple,source,relation,target,head_id,tail_id,human_readable_id,source_degree,target_degree
2,Mao Zedong,"(Mao Xinyu, mother, Kong Dongmei)",Mao Xinyu,mother,Kong Dongmei,2686,13657,1,2,2
3,Pink,"(Irish, part of, Celtic languages)",Irish,part of,Celtic languages,1156,86403,2,15,4


In [5]:
import sys
import os

# 添加 src 文件夹到 sys.path
src_path = "/home/wangshu/rag/hier_graph_rag"
sys.path.append(os.path.abspath(src_path))

from src.utils import entity_embedding, relation_embedding


In [6]:
class Args:
    def __init__(self):
        self.embedding_local = False
        # self.embedding_model_local = "nomic-embed-text-v1"
        self.embedding_api_key = "ollama"
        self.embedding_api_base = "http://localhost:5000/forward"
        self.embedding_model = "nomic-embed-text"
        # 其他参数...


args = Args()
# 处理 node_df 中的 description 字段
node_df["description"] = node_df["description"].replace(
    [None, "", "None"], "."
)  # 替换 None 和空字符串为 “.”

node_df = entity_embedding(
    entity_df=node_df, args=args, embed_colname="description_embedding", num_workers=56
)

print(node_df.head(2))

local is False


Computing embeddings: 100%|██████████| 103420/103420 [36:55<00:00, 46.67it/s] 


          name  node_id                                        description  \
0   Diana Ross        0  Diana Ross (born March 26, 1944) is an America...   
1  Mark Antony        1  Marcus Antonius (14 January 83 BC – 1 August 3...   

                                    all_summary_text  out_degree  in_degree  \
0  Diana Ross (born March 26, 1944) is an America...           6         15   
1  Marcus Antonius (14 January 83 BC – 1 August 3...          23         16   

   degree                              description_embedding  
0      21  [0.022161048, 0.033234067, -0.20706187, 0.0352...  
1      39  [0.039200757, 0.029146517, -0.16816428, -0.048...  


In [1]:
save_path = "/mnt/data/wangshu/hcarag/mintaka/KG"

entity_save_name = os.path.join(save_path, "entity_df.csv")
node_df.to_csv(entity_save_name, sep=",", index=False)
node_df.head(2)

NameError: name 'os' is not defined

In [4]:
import pandas as pd
import os
save_path = "/mnt/data/wangshu/hcarag/mintaka/KG"

entity_save_name = os.path.join(save_path, "entity_df.csv")
node_df = pd.read_csv(entity_save_name)
node_df['human_readable_id'] = node_df['node_id']
node_df.head(2)


Unnamed: 0,name,node_id,description,all_summary_text,out_degree,in_degree,degree,description_embedding,human_readable_id
0,Diana Ross,0,"Diana Ross (born March 26, 1944) is an America...","Diana Ross (born March 26, 1944) is an America...",6,15,21,"[0.022161048, 0.033234067, -0.20706187, 0.0352...",0
1,Mark Antony,1,Marcus Antonius (14 January 83 BC – 1 August 3...,Marcus Antonius (14 January 83 BC – 1 August 3...,23,16,39,"[0.039200757, 0.029146517, -0.16816428, -0.048...",1


In [7]:
relation_df = pd.read_csv(os.path.join(save_path, "relation_df.csv"))
relation_df.head(2)


Unnamed: 0,related_entity,triple,source,description,target,head_id,tail_id,human_readable_id,source_degree,target_degree,embedding_idx
0,Mao Zedong,"(Mao Xinyu, mother, Kong Dongmei)",Mao Xinyu,mother,Kong Dongmei,2686,13657,1,2,2,0
1,Pink,"(Irish, part of, Celtic languages)",Irish,part of,Celtic languages,1156,86403,2,15,4,1


In [11]:
relation_entity = list(set(relation_df["head_id"].tolist() + relation_df["tail_id"].tolist()))
print(len(relation_entity))

node_in_relation = node_df[node_df["node_id"].isin(relation_entity)]
print(node_in_relation.shape)
node_df_save = node_df[node_df["node_id"].isin(relation_entity)]
entity_save_name_2 = os.path.join(save_path, "entity_df_2.csv")
node_df_save.to_csv(entity_save_name_2, sep=",", index=False)
node_df_save.head(2)

83284
(83284, 9)


Unnamed: 0,name,node_id,description,all_summary_text,out_degree,in_degree,degree,description_embedding,human_readable_id
0,Diana Ross,0,"Diana Ross (born March 26, 1944) is an America...","Diana Ross (born March 26, 1944) is an America...",6,15,21,"[0.022161048, 0.033234067, -0.20706187, 0.0352...",0
1,Mark Antony,1,Marcus Antonius (14 January 83 BC – 1 August 3...,Marcus Antonius (14 January 83 BC – 1 August 3...,23,16,39,"[0.039200757, 0.029146517, -0.16816428, -0.048...",1


In [5]:
node_df.to_csv(entity_save_name, sep=",", index=False)

In [8]:
class Args:
    def __init__(self):
        self.embedding_local = False
        # self.embedding_model_local = "nomic-embed-text-v1"
        self.embedding_api_key = "ollama"
        self.embedding_api_base = "http://localhost:5000/forward"
        self.embedding_model = "nomic-embed-text"
        # 其他参数...


args = Args()

relation_df = relation_embedding(
    relation_df,
    args,
    e_colname="relation",
    embed_colname="description_embedding",
    num_workers=28,
)

relation_df.head(2)

local is False
the number of unique relation is 305


Computing embeddings: 100%|██████████| 305/305 [00:04<00:00, 69.42it/s] 


Unnamed: 0,related_entity,triple,source,relation,target,head_id,tail_id,human_readable_id,source_degree,target_degree,description_embedding
2,Mao Zedong,"(Mao Xinyu, mother, Kong Dongmei)",Mao Xinyu,mother,Kong Dongmei,2686,13657,1,2,2,"[0.07709218, 0.021358969, -0.12001489, 0.01746..."
3,Pink,"(Irish, part of, Celtic languages)",Irish,part of,Celtic languages,1156,86403,2,15,4,"[0.059459053, 0.040997144, -0.17339545, -0.024..."


In [11]:
# 获取每个 description 首次出现的行
first_occurrences = relation_df.drop_duplicates(subset='relation', keep='first')
# 选择需要的列并创建 unique_descriptions
unique_descriptions = first_occurrences[['relation', 'description_embedding']].copy()

unique_descriptions['idx'] = range(len(unique_descriptions))

# 重置索引
unique_descriptions.reset_index(drop=True, inplace=True)
print(unique_descriptions.shape)
ude_path = os.path.join(save_path, "relationships_embedding.csv")
unique_descriptions.to_csv(ude_path, sep=",", index=False)

unique_descriptions

(305, 3)


Unnamed: 0,relation,description_embedding,idx
0,mother,"[0.07709218, 0.021358969, -0.12001489, 0.01746...",0
1,part of,"[0.059459053, 0.040997144, -0.17339545, -0.024...",1
2,country,"[0.027936252, 0.018972259, -0.19067658, 0.0015...",2
3,time period,"[0.027292086, 0.0059205946, -0.19963102, -0.03...",3
4,relative,"[0.013979959, -0.016103141, -0.14521341, -0.05...",4
...,...,...,...
300,candidacy in,"[-0.029488137, 0.0102599505, -0.16553228, 0.00...",300
301,endemic,"[0.07727416, -0.018875534, -0.17241938, -0.002...",301
302,member of political,"[0.0042275824, 0.03479208, -0.14877744, -0.048...",302
303,owned,"[0.015508807, -0.015527339, -0.18126287, 0.012...",303


In [12]:
# 检查是否已存在 'embedding_idx' 列
if "embedding_idx" not in relation_df.columns:
    relation_df = relation_df.merge(
        unique_descriptions[["relation", "idx"]], on="relation", how="left"
    )
    relation_df.rename(
        columns={"idx": "embedding_idx", "relation": "description"}, inplace=True
    )

# 找到所有以 'embedding_idx' 开头的列
embedding_idx_cols = [col for col in relation_df.columns if "embedding_idx" in col]

# 删除多余的 'embedding_idx' 列，保留第一个
if len(embedding_idx_cols) > 1:
    relation_df.drop(columns=embedding_idx_cols[1:], inplace=True)

relation_df

Unnamed: 0,related_entity,triple,source,description,target,head_id,tail_id,human_readable_id,source_degree,target_degree,description_embedding,embedding_idx
0,Mao Zedong,"(Mao Xinyu, mother, Kong Dongmei)",Mao Xinyu,mother,Kong Dongmei,2686,13657,1,2,2,"[0.07709218, 0.021358969, -0.12001489, 0.01746...",0
1,Pink,"(Irish, part of, Celtic languages)",Irish,part of,Celtic languages,1156,86403,2,15,4,"[0.059459053, 0.040997144, -0.17339545, -0.024...",1
2,Nascar,"(Autódromo Hermanos Rodríguez, country, Mexican)",Autódromo Hermanos Rodríguez,country,Mexican,2687,3278,3,4,60,"[0.027936252, 0.018972259, -0.19067658, 0.0015...",2
3,Mao Zedong,"(Li Bai, time period, Tang dynasty)",Li Bai,time period,Tang dynasty,2689,28471,4,2,14,"[0.027292086, 0.0059205946, -0.19963102, -0.03...",3
4,Pizarro,"(Gonzalo Pizarro, relative, Pedro Pizarro)",Gonzalo Pizarro,relative,Pedro Pizarro,2691,86404,5,5,2,"[0.013979959, -0.016103141, -0.14521341, -0.05...",4
...,...,...,...,...,...,...,...,...,...,...,...,...
215696,one million,"(10,000,000, instance of, prime number)",10000000,instance of,prime number,85208,7482,215697,2,36,"[0.041665725, 0.013023066, -0.17593572, -0.069...",57
215697,one million,"(100, said to be the same as, 1,000,000)",100,said to be the same as,1000000,1191,85002,215698,12,15,"[0.02253822, -0.020812616, -0.12587284, -0.006...",7
215698,one million,"(1,000,000, instance of, prime number)",1000000,instance of,prime number,85002,7482,215699,15,36,"[0.041665725, 0.013023066, -0.17593572, -0.069...",57
215699,one million,"(1000, said to be the same as, 1,000,000)",1000,said to be the same as,1000000,48056,85002,215700,12,15,"[0.02253822, -0.020812616, -0.12587284, -0.006...",7


In [13]:
save_relation_df = relation_df.copy()
relation_df_save_path = os.path.join(save_path, "relation_df.csv")
save_relation_df.drop(columns = ["description_embedding"], inplace=True)
save_relation_df.to_csv(relation_df_save_path, sep=",", index=False)