# QA pair extraction from a given text

In [10]:
from pathlib import Path

import nest_asyncio
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import MetadataMode
from llama_index.llms.openai import OpenAI

# Patch asyncio so an already-running event loop can be re-entered.
# Presumably needed because the notebook kernel runs its own loop while the
# ingestion pipeline below drives async LLM calls — confirm against the
# nest_asyncio documentation.
nest_asyncio.apply()

# Load variables from a .env file into the process environment (python-dotenv).
# Presumably this is where the OpenAI API key comes from — verify locally.
# Kept as the last expression of the cell, so its return value is echoed as the
# cell output.
load_dotenv()

True

In [11]:
# Chat model handed to every metadata extractor in this notebook.
# temperature=0.0 keeps generations as repeatable as the API allows.
# To cap the response length, additionally pass max_tokens=512.
llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
)

In [12]:
# Load the source files found in data/txt/full_text (path is relative to the
# notebook's working directory).
documents_txt = SimpleDirectoryReader(
    input_dir="data/txt/full_text",
).load_data()

In [13]:
# Prompt passed as `prompt_template` to QuestionsAnsweredExtractor.
# `{context_str}` and `{num_questions}` are format placeholders; presumably the
# extractor fills them with the node text and the requested question count.
# Each literal below is one fragment of the final text; "\n" marks the only
# places where the prompt actually breaks onto a new line.
custom_prompt_template = (
    "Here is the context:\n"
    "{context_str}\n"
    "\n"
    "Given the contextual information, "
    "generate {num_questions} questions this context can provide "
    "specific answers to which are unlikely to be found elsewhere.\n"
    "\n"
    "Higher-level summaries of surrounding context may be provided "
    "as well. Try using these summaries to generate better questions "
    "that this context can answer.\n"
    "\n"
    "Please return the question-answer pairs, summary, key topics, entities, and findings in json format.\n"
)

In [14]:
# Settings shared by every question extractor built in this cell: three
# questions per node, the shared LLM, embed-mode metadata, and the custom prompt.
_qa_extractor_kwargs = dict(
    questions=3,
    llm=llm,
    metadata_mode=MetadataMode.EMBED,
    prompt_template=custom_prompt_template,
)

# Variant 1: question extraction only.
extractors_1 = [QuestionsAnsweredExtractor(**_qa_extractor_kwargs)]

# Variant 2: summaries of the previous / current / next node first, followed by
# question extraction (a separate extractor instance from variant 1).
extractors_2 = [
    SummaryExtractor(llm=llm, summaries=["prev", "self", "next"]),
    QuestionsAnsweredExtractor(**_qa_extractor_kwargs),
]

In [15]:

# Split the loaded documents into chunks on spaces, configured with a chunk
# size of 512 and an overlap of 128 between neighbouring chunks.
node_parser = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)
nodes = node_parser.get_nodes_from_documents(documents_txt)

# Enrich the chunks with LLM-generated metadata.
# `nodes` is already chunked, so the pipeline contains only the extractors.
# Listing `node_parser` here as well would send the chunks through the splitter
# a second time, re-parenting them onto the intermediate chunk nodes instead of
# the source documents.
# Swap in `extractors_1` to generate questions without section summaries.
pipeline = IngestionPipeline(transformations=[*extractors_2])

# in_place=False is requested so the result is returned as `nodes_1` rather
# than written back into `nodes`.
# NOTE(review): confirm the copy semantics against the installed llama_index.
nodes_1 = pipeline.run(nodes=nodes, in_place=False)

100%|██████████| 29/29 [00:28<00:00,  1.01it/s]
100%|██████████| 29/29 [00:37<00:00,  1.31s/it]


In [16]:
# Inspect the first enriched node: its text together with every metadata field.
print(
    nodes_1[0].get_content(metadata_mode=MetadataMode.ALL)
)

[Excerpt from document]
file_path: c:\Users\ray\Desktop\src\leetcode\rag\data\txt\full_text\medical_full_text.txt
file_name: medical_full_text.txt
file_type: text/plain
file_size: 37452
creation_date: 2024-09-17
last_modified_date: 2024-09-17
next_section_summary: The section discusses a clinical trial evaluating the efficacy of dapagliflozin in patients with chronic kidney disease (CKD). Key findings include:

- **Trial Results**: Over a median follow-up of 2.4 years, dapagliflozin significantly reduced the occurrence of primary outcome events compared to placebo, with a hazard ratio of 0.61. The number needed to treat to prevent one primary outcome event was 19.
- **Composite Outcomes**: Dapagliflozin showed a lower risk for a sustained decline in estimated glomerular filtration rate (GFR), end-stage kidney disease, and death from renal causes (hazard ratio 0.56). It also reduced the risk of death from cardiovascular causes or hospitalization for heart failure (hazard ratio 0.71).
- 

In [17]:
# Inspect the second enriched node: its text together with every metadata field.
print(
    nodes_1[1].get_content(metadata_mode=MetadataMode.ALL)
)

[Excerpt from document]
file_path: c:\Users\ray\Desktop\src\leetcode\rag\data\txt\full_text\medical_full_text.txt
file_name: medical_full_text.txt
file_type: text/plain
file_size: 37452
creation_date: 2024-09-17
last_modified_date: 2024-09-17
prev_section_summary: **Summary:**

The section discusses a clinical trial investigating the effects of dapagliflozin on patients with chronic kidney disease (CKD). Key topics and entities include:

- **Study Title:** Dapagliflozin in Patients with Chronic Kidney Disease
- **Authors:** A group of medical professionals including Hiddo J.L. Heerspink, M.D., and others involved in the DAPA-CKD Trial.
- **Background:** Chronic kidney disease patients face high risks of adverse kidney and cardiovascular outcomes, and the impact of dapagliflozin in this population is unclear.
- **Methods:** The trial involved 4304 participants with an estimated glomerular filtration rate (GFR) of 25 to 75 ml/min/1.73 m² and a urinary albumin-to-creatinine ratio of 200 t

In [19]:
# Write each enriched node (text plus all metadata) to output/<index>.txt.
output_dir = Path("output")

# Create the target folder up front: writing a file does not create missing
# parent directories, so without this a fresh checkout fails with
# FileNotFoundError on the first node.
output_dir.mkdir(parents=True, exist_ok=True)

for i, node in enumerate(nodes_1):
    # Full text with metadata, the same view the inspection cells print.
    node_text = node.get_content(metadata_mode=MetadataMode.ALL)
    (output_dir / f"{i}.txt").write_text(node_text, encoding="utf-8")
        