In [1]:
from datasets import load_dataset
import os
from tqdm import tqdm
dataset_path = "/mnt/data/wangshu/hcarag/narrativeqa/ori_dataset"
cache_dir = os.path.join(dataset_path, "cache")
os.makedirs(cache_dir, exist_ok=True)

dataset = load_dataset(dataset_path, cache_dir=cache_dir)

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [2]:
print(dataset["train"])


def compute_doc_ids(split_col):
    document_id_list = []
    print(f"Number of data in {split_col} split: {len(dataset[split_col])}")
    
    for data in dataset[split_col]:
        doc = data["document"]
        document_id_list.append(doc["id"])
    print(f"Number of documents in {split_col} split: {len(document_id_list)}")
    document_id_set = set(document_id_list)
    print(f"Number of documents in {split_col} split: {len(document_id_set)}")
    return document_id_set


compute_doc_ids("train")
compute_doc_ids("test")
print("Done")

Dataset({
    features: ['document', 'question', 'answers'],
    num_rows: 32747
})
Number of data in train split: 32747
Number of documents in train split: 32747
Number of documents in train split: 1102
Number of data in test split: 10557
Number of documents in test split: 10557
Number of documents in test split: 355
Done


In [14]:
data_1 = dataset["train"][0]
for k, v in data_1["document"].items():
    if k == "text":
        continue
    # print(f"{k} -- {v}")

print(data_1["document"]["summary"].keys())

data_1 = dataset["train"][1]
for k, v in data_1["document"].items():
    if k == "text":
        continue
    # print(f"{k} -- {v}")

dict_keys(['text', 'tokens', 'url', 'title'])


In [6]:
from tqdm import tqdm
import os

In [3]:
def get_document(split_col):
    doc_list = []
    doc_id_dict = {}
    doc_cnt = 0
    for data in tqdm(dataset[split_col], total=len(dataset[split_col])):
        doc = data["document"]
        doc_id = doc["id"]
        if doc_id in doc_id_dict:
            continue
        doc_id_dict[doc_id] = doc_cnt
        doc_cnt += 1
        summary_str = doc["summary"]["title"] + "\n" + doc["summary"]["text"]
        doc_text = doc["text"]
        doc_str = summary_str + "\n" + doc_text
        doc_list.append(doc_str)
    print(len(doc_list))
    print(len(doc_id_dict))
    return doc_list, doc_id_dict

train_doc_list, train_doc_id_dict = get_document("train")
test_doc_list, test_doc_id_dict = get_document("test")

100%|██████████| 32747/32747 [00:29<00:00, 1096.23it/s]


1102
1102


100%|██████████| 10557/10557 [00:07<00:00, 1359.44it/s]

355
355





In [5]:
import pandas as pd

def get_all_document():
    doc_id_dict = {}
    doc_cnt = 0
    save_base_dir = "/mnt/data/wangshu/hcarag/ids_data/narrative/"
    for data in tqdm(dataset["train"], total=len(dataset["train"])):
        doc = data["document"]
        doc_id = doc["id"]
        if doc_id in doc_id_dict:
            continue
        doc_id_dict[doc_id] = doc_cnt
        summary_str = doc["summary"]["title"] + "\n" + doc["summary"]["text"]
        doc_text = doc["text"]
        doc_str = summary_str + "\n" + doc_text
        tmp_dict = {
            "ori_id": doc_id,
            "title": doc["summary"]["title"],
            "context": doc_str,
            "id": doc_id_dict[doc_id],
            "split": "train",
        }
        tmp_df = pd.DataFrame([tmp_dict])
        
        save_dir = os.path.join(save_base_dir, str(doc_id_dict[doc_id]))
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, "corpus.json")
        tmp_df.to_json(save_path, orient="records", lines=True)
        doc_cnt += 1
    
    for data in tqdm(dataset["test"], total=len(dataset["test"])):
        doc = data["document"]
        doc_id = doc["id"]
        if doc_id in doc_id_dict:
            continue
        doc_id_dict[doc_id] = doc_cnt
        summary_str = doc["summary"]["title"] + "\n" + doc["summary"]["text"]
        doc_text = doc["text"]
        doc_str = summary_str + "\n" + doc_text
        tmp_dict = {
            "ori_id": doc_id,
            "title": doc["summary"]["title"],
            "context": doc_str,
            "id": doc_id_dict[doc_id],
            "split": "test",
        }
        tmp_df = pd.DataFrame([tmp_dict])
        
        save_dir = os.path.join(save_base_dir, str(doc_id_dict[doc_id]))
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, "corpus.json")
        tmp_df.to_json(save_path, orient="records", lines=True)
        doc_cnt += 1
        
        
    print(len(doc_id_dict))
    return doc_id_dict

doc_id_dict_all = get_all_document()


100%|██████████| 32747/32747 [00:36<00:00, 904.69it/s] 
100%|██████████| 10557/10557 [00:10<00:00, 1041.78it/s]

1457





In [11]:
qa_all = pd.concat([qa_train, qa_test], axis=0)
qa_all["doc_idx"] = qa_all["document_id"].map(doc_id_dict_all)
print(qa_all.shape)

doc_list = [v for v in doc_id_dict_all.values()]
print(len(doc_list))
print(doc_list[:10])


(43304, 4)
1457
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [12]:
save_base_dir = "/mnt/data/wangshu/hcarag/ids_data/narrative/"

for doc_idx in tqdm(doc_list):
    save_dir = os.path.join(save_base_dir, str(doc_idx))
    qa_save_path = os.path.join(save_dir, "narrativeqa.json") 
    
    doc_question_df = qa_all[qa_all["doc_idx"] == doc_idx]
    doc_question_df.to_json(qa_save_path, orient='records', lines=True)
print("Done")

100%|██████████| 1457/1457 [00:01<00:00, 1401.54it/s]

Done





In [9]:
base_path = "/mnt/data/wangshu/hcarag/narrativeqa/data"

train_base = base_path + "/train"
os.makedirs(train_base, exist_ok=True)
with open(train_base + "/0_doc_id_dict.txt", "w") as f:
    for k, v in train_doc_id_dict.items():
        f.write(f"{k}\t{v}\n")

test_base = base_path + "/test"
os.makedirs(test_base, exist_ok=True)
with open(test_base + "/0_doc_id_dict.txt", "w") as f:
    for k, v in test_doc_id_dict.items():
        f.write(f"{k}\t{v}\n")

for i, doc in enumerate(train_doc_list):
    doc_i_dir = train_base + f"/{i}"
    os.makedirs(doc_i_dir, exist_ok=True)
    
    graphrag_input_dir = doc_i_dir + "/input"
    os.makedirs(graphrag_input_dir, exist_ok=True)
    
    hcarag_dir = doc_i_dir + "/hcarag"
    os.makedirs(hcarag_dir, exist_ok=True)
    with open(graphrag_input_dir + "/doc.txt", "w") as f:
        f.write(doc)

for i, doc in enumerate(test_doc_list):
    doc_i_dir = test_base + f"/{i}"
    os.makedirs(doc_i_dir, exist_ok=True)
    
    graphrag_input_dir = doc_i_dir + "/input"
    os.makedirs(graphrag_input_dir, exist_ok=True)
    
    hcarag_dir = doc_i_dir + "/hcarag"
    os.makedirs(hcarag_dir, exist_ok=True)
    with open(graphrag_input_dir + "/doc.txt", "w") as f:
        f.write(doc)

print("Done")

Done


In [37]:
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath("/home/wangshu/rag/hier_graph_rag/"))
from src.utils import num_tokens

len_tokens_list = []
for doc in tqdm(train_doc_list):
    len_tokens_list.append(num_tokens(doc))

print(f"average number of tokens in train: {sum(len_tokens_list) / len(len_tokens_list)}")
train_info = pd.DataFrame(train_doc_id_dict.items(), columns=["doc_id", "doc_idx"])
train_info["len_tokens"] = len_tokens_list

  0%|          | 0/1102 [00:00<?, ?it/s]


TypeError: cannot use a string pattern on a bytes-like object

In [18]:
len_tokens_list = []
for doc in tqdm(test_doc_list):
    len_tokens_list.append(num_tokens(doc))

print(f"average number of tokens in test: {sum(len_tokens_list) / len(len_tokens_list)}")
test_info = pd.DataFrame(test_doc_id_dict.items(), columns=["doc_id", "doc_idx"])
test_info["len_tokens"] = len_tokens_list

train_info.to_csv(train_base + "/doc_info.csv", index=False)
test_info.to_csv(test_base + "/doc_info.csv", index=False)

100%|██████████| 355/355 [00:19<00:00, 18.50it/s]

average number of tokens in test: 79503.3323943662





In [19]:
train_info

Unnamed: 0,doc_id,doc_idx,len_tokens
0,0029bdbe75423337b551e42bb31f9a102785376f,0,200781
1,00936497f5884881f1df23f4834f6739552cee8b,1,132963
2,00950a3641e6a28b04a6fabf6334140e2deaa9fd,2,21870
3,00ee9e01a0e581e0d8cbf7e865a895147c480c5e,3,68636
4,00f9dbb0a851bc6099d5216e5fa8719b2ac3b82b,4,52307
...,...,...,...
1097,fea54b235b1c054d1c90e87f57e3bfb64cbf3a5b,1097,48168
1098,febd3002298e75a9e9b5569500989766137608f8,1098,51218
1099,fee64a7ee5b0427d78d666770c064213f245884a,1099,27441
1100,ff53fd53a94f343b8365915645b79d7ad5b1528e,1100,62287


In [12]:
print(f"max number of tokens in train: {max(len_tokens_list)}")
print(f"min number of tokens in train: {min(len_tokens_list)}")

max number of tokens in train: 542289
min number of tokens in train: 5146


In [13]:
temp_path = "/home/wangshu/rag/graphrag/ragtest/input/thinkos.txt"
with open(temp_path, "r") as f:
    str_list = f.readlines()
    print(num_tokens("".join(str_list)))

33750


In [8]:
save_dir = "/mnt/data/wangshu/hcarag/narrativeqa/docs_graphrag/input"
for i in range(len(train_doc_list)):
    with open(f"{save_dir}/train_{i}.txt", "w") as f:
        f.write(train_doc_list[i])

In [None]:
corpus_save_path = "/mnt/data/wangshu/hcarag/narrativeqa/dataset/corpus.txt"
with open(corpus_save_path, "w") as f:
    for doc in train_doc_list:
        f.write(doc + "\n")
print("done")

done


In [6]:
import pandas as pd



def get_qa(split_col):
    qa_list = []
    for data in tqdm(dataset[split_col], total=len(dataset[split_col])):
        ans = data["answers"]
        ans_list = []
        for ans_text in ans:
            text = ans_text["text"]
            text = text.strip().split(",")
            ans_list.extend(text)
        ans_list = list(set(ans_list))
        question_str = data["question"]["text"]
        document_id = data['document']["id"]
        qa_list.append({"question": question_str, "answers": ans_list, "document_id": document_id})
    qa_df = pd.DataFrame(qa_list)
    return qa_df

qa_train = get_qa('train')
qa_test = get_qa('test')

qa_train.head(2)

100%|██████████| 32747/32747 [00:24<00:00, 1332.39it/s]
100%|██████████| 10557/10557 [00:07<00:00, 1357.06it/s]


Unnamed: 0,question,answers,document_id
0,Who is Miss Delmer?,[the elderly spinster aunt of the Earl de Vers...,0029bdbe75423337b551e42bb31f9a102785376f
1,Who does Arabella Mason wed?,"[ Delmar's valet, Ben Keene]",0029bdbe75423337b551e42bb31f9a102785376f


In [41]:
base_path = "/mnt/data/wangshu/hcarag/narrativeqa/data"

train_base = base_path + "/train"
test_base = base_path + "/test"
train_info = pd.read_csv(train_base + "/doc_info.csv")
test_info = pd.read_csv(test_base + "/doc_info.csv")
train_map = dict(zip(train_info["doc_id"], train_info["doc_idx"]))
test_map = dict(zip(test_info["doc_id"], test_info["doc_idx"]))

print(train_info.head(2))

qa_train["doc_idx"] = qa_train["document_id"].map(train_map)
qa_test["doc_idx"] = qa_test["document_id"].map(test_map)
qa_train.head(2)

                                     doc_id  doc_idx  len_tokens
0  0029bdbe75423337b551e42bb31f9a102785376f        0      200781
1  00936497f5884881f1df23f4834f6739552cee8b        1      132963


Unnamed: 0,question,answers,document_id,doc_idx
0,Who is Miss Delmer?,[the elderly spinster aunt of the Earl de Vers...,0029bdbe75423337b551e42bb31f9a102785376f,0
1,Who does Arabella Mason wed?,"[ Delmar's valet, Ben Keene]",0029bdbe75423337b551e42bb31f9a102785376f,0


In [23]:
def process_gb_answer(x):
        if isinstance(x, list):
            return "|".join(map(str, x))
        elif isinstance(x, str):
            return x
qa_train["label"] = qa_train["answers"].apply(process_gb_answer)
qa_train['q_id'] = qa_train.apply(lambda row: f"train_{row.name}", axis=1)
qa_train.tail(2)

Unnamed: 0,question,answers,document_id,doc_idx,label,q_id
32745,Who is the father of Stacy's baby?,"[Damone, Mike Damone]",ffae045d630abf7e4c282849d16819ceff60c2b0,1101,Damone|Mike Damone,train_32745
32746,How does Brad get promoted to manager at Mi-T-...,"[He stops a robbery, he stopped a robbery in t...",ffae045d630abf7e4c282849d16819ceff60c2b0,1101,He stops a robbery|he stopped a robbery in the...,train_32746


In [24]:
file_path_save_qa = "/mnt/data/wangshu/hcarag/narrativeqa/dataset/narrativeqa_train.json"
qa_train.to_json(file_path_save_qa, orient='records', lines=True )


qa_train.head(2)

Unnamed: 0,question,answers,document_id,doc_idx,label,q_id
0,Who is Miss Delmer?,[the elderly spinster aunt of the Earl de Vers...,0029bdbe75423337b551e42bb31f9a102785376f,0,the elderly spinster aunt of the Earl de Verse...,train_0
1,Who does Arabella Mason wed?,"[ Delmar's valet, Ben Keene]",0029bdbe75423337b551e42bb31f9a102785376f,0,Delmar's valet|Ben Keene,train_1


In [25]:
qa_test["label"] = qa_test["answers"].apply(process_gb_answer)
qa_test['q_id'] = qa_test.apply(lambda row: f"train_{row.name}", axis=1)

file_path_save_qa = "/mnt/data/wangshu/hcarag/narrativeqa/dataset/narrativeqa_test.json"
qa_test.to_json(file_path_save_qa, orient='records', lines=True)


In [26]:
qa_test.head(2)

Unnamed: 0,question,answers,document_id,doc_idx,label,q_id
0,Who is Mark Hunter?,"[He is a high school student in Phoenix., A lo...",0025577043f5090cd603c6aea60f26e236195594,0,He is a high school student in Phoenix.|A lone...,train_0
1,Where does this radio station take place?,"[It takes place in Mark's parents basement., ...",0025577043f5090cd603c6aea60f26e236195594,0,It takes place in Mark's parents basement.| Ar...,train_1


In [27]:
qa_train.shape[0] + qa_test.shape[0]

43304

In [34]:
# combine two part

qa_all = pd.concat([qa_train, qa_test], axis=0)
file_path_save_qa_all = "/mnt/data/wangshu/hcarag/narrativeqa/dataset/narrativeqa_all.json"
qa_all.to_json(file_path_save_qa_all, orient='records', lines=True)
print("done")
qa_all.shape

done


(43304, 6)

## Group each question according to the document  

In [31]:
import shutil
# Train
train_doc_list = qa_train['doc_idx'].unique()

for doc_idx in tqdm(train_doc_list):
    doc_i_dir = train_base + f"/{doc_idx}"
    doc_question_df = qa_train[qa_train["doc_idx"] == doc_idx]
    # create dir
    save_dir = train_base + f"/{doc_idx}/qa_dataset/"
    os.makedirs(save_dir,exist_ok=True)
    qa_save_path = save_dir + "narrativeqa.json"
    doc_question_df.to_json(qa_save_path, orient='records', lines=True)
    # copy corpus file
    ori_corpus_file = train_base+f"/{doc_idx}/input/doc.txt"
    target_corpus_file = save_dir+"corpus.txt"
    if os.path.exists(ori_corpus_file):
        shutil.copyfile(ori_corpus_file, target_corpus_file)
print("Done")
        

100%|██████████| 1102/1102 [00:03<00:00, 280.28it/s]

Done





In [32]:
# Test
test_doc_list = qa_test['doc_idx'].unique()

for doc_idx in tqdm(test_doc_list):
    doc_i_dir = test_base + f"/{doc_idx}"
    doc_question_df = qa_test[qa_test["doc_idx"] == doc_idx]
    # create dir
    save_dir = test_base + f"/{doc_idx}/qa_dataset/"
    os.makedirs(save_dir,exist_ok=True)
    qa_save_path = save_dir + "narrativeqa.json"
    doc_question_df.to_json(qa_save_path, orient='records', lines=True)
    # copy corpus file
    ori_corpus_file = test_base+f"/{doc_idx}/input/doc.txt"
    target_corpus_file = save_dir+"corpus.txt"
    if os.path.exists(ori_corpus_file):
        shutil.copyfile(ori_corpus_file, target_corpus_file)
print("Done")

100%|██████████| 355/355 [00:01<00:00, 250.69it/s]

Done





In [9]:
corpus_path = "/mnt/data/wangshu/hcarag/narrativeqa/graphrag_io/input/corpus.txt"
with open(corpus_path, "r") as f:
    corpus_data = f.readlines()
    print(len(corpus_data))
import sys
import os

sys.path.append(os.path.abspath("/home/wangshu/rag/hier_graph_rag/"))
from src.utils import num_tokens
from tqdm import tqdm


8870500


In [10]:
str_1 = """
-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
 
-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)
 
2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
 Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
 
3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 
4. When finished, output {completion_delimiter}
 
######################
-Examples-
######################
Example 1:
Entity_types: ORGANIZATION,PERSON
Text:
The Verdantis's Central Institution is scheduled to meet on Monday and Thursday, with the institution planning to release its latest policy decision on Thursday at 1:30 p.m. PDT, followed by a press conference where Central Institution Chair Martin Smith will take questions. Investors expect the Market Strategy Committee to hold its benchmark interest rate steady in a range of 3.5%-3.75%.
######################
Output:
("entity"{tuple_delimiter}CENTRAL INSTITUTION{tuple_delimiter}ORGANIZATION{tuple_delimiter}The Central Institution is the Federal Reserve of Verdantis, which is setting interest rates on Monday and Thursday)
{record_delimiter}
("entity"{tuple_delimiter}MARTIN SMITH{tuple_delimiter}PERSON{tuple_delimiter}Martin Smith is the chair of the Central Institution)
{record_delimiter}
("entity"{tuple_delimiter}MARKET STRATEGY COMMITTEE{tuple_delimiter}ORGANIZATION{tuple_delimiter}The Central Institution committee makes key decisions about interest rates and the growth of Verdantis's money supply)
{record_delimiter}
("relationship"{tuple_delimiter}MARTIN SMITH{tuple_delimiter}CENTRAL INSTITUTION{tuple_delimiter}Martin Smith is the Chair of the Central Institution and will answer questions at a press conference{tuple_delimiter}9)
{completion_delimiter}

######################
Example 2:
Entity_types: ORGANIZATION
Text:
TechGlobal's (TG) stock skyrocketed in its opening day on the Global Exchange Thursday. But IPO experts warn that the semiconductor corporation's debut on the public markets isn't indicative of how other newly listed companies may perform.

TechGlobal, a formerly public company, was taken private by Vision Holdings in 2014. The well-established chip designer says it powers 85% of premium smartphones.
######################
Output:
("entity"{tuple_delimiter}TECHGLOBAL{tuple_delimiter}ORGANIZATION{tuple_delimiter}TechGlobal is a stock now listed on the Global Exchange which powers 85% of premium smartphones)
{record_delimiter}
("entity"{tuple_delimiter}VISION HOLDINGS{tuple_delimiter}ORGANIZATION{tuple_delimiter}Vision Holdings is a firm that previously owned TechGlobal)
{record_delimiter}
("relationship"{tuple_delimiter}TECHGLOBAL{tuple_delimiter}VISION HOLDINGS{tuple_delimiter}Vision Holdings formerly owned TechGlobal from 2014 until present{tuple_delimiter}5)
{completion_delimiter}

######################
Example 3:
Entity_types: ORGANIZATION,GEO,PERSON
Text:
Five Aurelians jailed for 8 years in Firuzabad and widely regarded as hostages are on their way home to Aurelia.

The swap orchestrated by Quintara was finalized when $8bn of Firuzi funds were transferred to financial institutions in Krohaara, the capital of Quintara.

The exchange initiated in Firuzabad's capital, Tiruzia, led to the four men and one woman, who are also Firuzi nationals, boarding a chartered flight to Krohaara.

They were welcomed by senior Aurelian officials and are now on their way to Aurelia's capital, Cashion.

The Aurelians include 39-year-old businessman Samuel Namara, who has been held in Tiruzia's Alhamia Prison, as well as journalist Durke Bataglani, 59, and environmentalist Meggie Tazbah, 53, who also holds Bratinas nationality.
######################
Output:
("entity"{tuple_delimiter}FIRUZABAD{tuple_delimiter}GEO{tuple_delimiter}Firuzabad held Aurelians as hostages)
{record_delimiter}
("entity"{tuple_delimiter}AURELIA{tuple_delimiter}GEO{tuple_delimiter}Country seeking to release hostages)
{record_delimiter}
("entity"{tuple_delimiter}QUINTARA{tuple_delimiter}GEO{tuple_delimiter}Country that negotiated a swap of money in exchange for hostages)
{record_delimiter}
{record_delimiter}
("entity"{tuple_delimiter}TIRUZIA{tuple_delimiter}GEO{tuple_delimiter}Capital of Firuzabad where the Aurelians were being held)
{record_delimiter}
("entity"{tuple_delimiter}KROHAARA{tuple_delimiter}GEO{tuple_delimiter}Capital city in Quintara)
{record_delimiter}
("entity"{tuple_delimiter}CASHION{tuple_delimiter}GEO{tuple_delimiter}Capital city in Aurelia)
{record_delimiter}
("entity"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}PERSON{tuple_delimiter}Aurelian who spent time in Tiruzia's Alhamia Prison)
{record_delimiter}
("entity"{tuple_delimiter}ALHAMIA PRISON{tuple_delimiter}GEO{tuple_delimiter}Prison in Tiruzia)
{record_delimiter}
("entity"{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}PERSON{tuple_delimiter}Aurelian journalist who was held hostage)
{record_delimiter}
("entity"{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}PERSON{tuple_delimiter}Bratinas national and environmentalist who was held hostage)
{record_delimiter}
("relationship"{tuple_delimiter}FIRUZABAD{tuple_delimiter}AURELIA{tuple_delimiter}Firuzabad negotiated a hostage exchange with Aurelia{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}QUINTARA{tuple_delimiter}AURELIA{tuple_delimiter}Quintara brokered the hostage exchange between Firuzabad and Aurelia{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}QUINTARA{tuple_delimiter}FIRUZABAD{tuple_delimiter}Quintara brokered the hostage exchange between Firuzabad and Aurelia{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}ALHAMIA PRISON{tuple_delimiter}Samuel Namara was a prisoner at Alhamia prison{tuple_delimiter}8)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}Samuel Namara and Meggie Tazbah were exchanged in the same hostage release{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}Samuel Namara and Durke Bataglani were exchanged in the same hostage release{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}Meggie Tazbah and Durke Bataglani were exchanged in the same hostage release{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}SAMUEL NAMARA{tuple_delimiter}FIRUZABAD{tuple_delimiter}Samuel Namara was a hostage in Firuzabad{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}MEGGIE TAZBAH{tuple_delimiter}FIRUZABAD{tuple_delimiter}Meggie Tazbah was a hostage in Firuzabad{tuple_delimiter}2)
{record_delimiter}
("relationship"{tuple_delimiter}DURKE BATAGLANI{tuple_delimiter}FIRUZABAD{tuple_delimiter}Durke Bataglani was a hostage in Firuzabad{tuple_delimiter}2)
{completion_delimiter}

######################
-Real Data-
######################
Entity_types: {entity_types}
Text: {input_text}
######################
Output:"""

num_tokens(str_1)

2037

In [None]:

all_token = 0
for doc in tqdm(corpus_data):
    all_token += num_tokens(doc)

print(all_token)


In [43]:
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath("/home/wangshu/rag/hier_graph_rag/"))
from src.utils import num_tokens
base_path = "/mnt/data/wangshu/hcarag/narrativeqa/data"

train_base = base_path + "/train"
test_base = base_path + "/test"
train_doc_list = qa_train['doc_idx'].unique()
# Test
test_doc_list = qa_test['doc_idx'].unique()

len(train_doc_list)

1102

In [47]:
from concurrent.futures import ThreadPoolExecutor
chunk_size = 1200


def get_chunk_content(content, chunk_size):
    tmp_list = []
    last_string = ""
    for i in range(len(content)):
        new_string = last_string + content[i]
        if num_tokens(new_string) > chunk_size:
            tmp_list.append(last_string)
            last_string = content[i]
        else:
            last_string = new_string

    return tmp_list


def process_doc(doc_idx, base_path, chunk_size):
    save_dir = base_path + f"/{doc_idx}/qa_dataset/"
    target_corpus_file = save_dir + "corpus.txt"
    tmp_list = []
    with open(target_corpus_file, "r") as f:
        corpus_data = f.readlines()
        title = corpus_data[0]
        tmp_list = get_chunk_content(corpus_data[1:], chunk_size)
        title_list = [title] * len(tmp_list)
    chunk_df = pd.DataFrame({"title": title_list, "content": tmp_list})
    chunk_df['id'] = range(len(chunk_df))
    chunk_save_path = save_dir + "corpus_chunk.json"
    chunk_df.to_json(chunk_save_path, orient='records', lines=True)

def split_all_chunk(base_path, doc_list, chunk_size):
    with ThreadPoolExecutor(max_workers=32) as executor:
        futures = [executor.submit(process_doc, doc_idx, base_path, chunk_size) for doc_idx in doc_list]
        for future in tqdm(futures, desc="Processing documents"):
            future.result()


# def split_all_chunk(base_path, doc_list, chunk_size):
#     for doc_idx in tqdm(doc_list):
#         save_dir = base_path + f"/{doc_idx}/qa_dataset/"
#         target_corpus_file = save_dir+"corpus.txt"
#         tmp_list = []
#         with open(target_corpus_file, "r") as f:
#             corpus_data = f.readlines()
#             title = corpus_data[0]
#             tmp_list = get_chunk_content(corpus_data[1:], chunk_size)
#             title_list = [title] * len(tmp_list)
#         chunk_df = pd.DataFrame({"title": title_list, "content": tmp_list})
#         chunk_df['id'] = range(len(chunk_df))
#         chunk_save_path = save_dir + "corpus_chunk.json"
#         chunk_df.to_json(chunk_save_path, orient='records', lines=True)

split_all_chunk(train_base, train_doc_list, chunk_size)
split_all_chunk(test_base, test_doc_list, chunk_size)

Processing documents: 100%|██████████| 1102/1102 [09:01<00:00,  2.03it/s]
Processing documents: 100%|██████████| 355/355 [02:53<00:00,  2.04it/s]
