In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# The `True` shown as this cell's output is load_dotenv()'s return value.
# NOTE(review): presumably this supplies the OpenAI credentials needed by the
# cells below -- confirm against the project's .env file.
load_dotenv()

True

In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Read the contents of the "data" directory and build a vector index over them.
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Expose the index through a query engine; the cells below reuse it.
query_engine = index.as_query_engine()

# First sanity-check question against the indexed document.
title_question = "What is the title of this document?"
response = query_engine.query(title_question)
print(response)

The title of this document is "The New England Journal of Medicine."


In [3]:
# Ask the query engine for the author list and show its answer.
authors_question = "What are the authors of this document?"
response = query_engine.query(authors_question)
print(response)

H.J.L.H., D.C.W., B.V.S., M.L., C.D.S., A.-M.L., R.C.-R., G.M.C., T.G., F.-F.H., J.F.E.M., J.J.V.M., P.R., R.D.T.


In [4]:
# Content question: which patients are at high risk of adverse kidney outcomes.
risk_question = "Who have a high risk of adverse kidney?"
response = query_engine.query(risk_question)
print(response)

Patients with chronic kidney disease, regardless of the presence or absence of diabetes, have a high risk of adverse kidney outcomes.


In [5]:
# Content question about the number of randomized participants.
participants_question = "How many participants are randomly assigned?"
response = query_engine.query(participants_question)
print(response)

All participants who had undergone randomization and received at least one dose of dapagliflozin or placebo are included in the analysis.


# QA pair extraction from a given text

In [6]:
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode

from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)

# LLM handed to the metadata extractors defined further down; temperature 0.0
# is used for the least random output.
llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
)
# Alternative configuration: additionally pass max_tokens=512.


In [7]:
# Load the documents under data/txt; these feed the extraction pipeline.
txt_reader = SimpleDirectoryReader("data/txt")
documents_txt = txt_reader.load_data()

In [68]:
# Prompt passed to QuestionsAnsweredExtractor. It keeps the {context_str} and
# {num_questions} placeholders and ends with an extra instruction asking for
# QA pairs, summary, key topics, entities and findings as JSON.
custom_prompt_template = """\
Here is the context:
{context_str}

Given the contextual information, \
generate {num_questions} questions this context can provide \
specific answers to which are unlikely to be found elsewhere.

Higher-level summaries of surrounding context may be provided \
as well. Try using these summaries to generate better questions \
that this context can answer.

Please return the question-answer pairs, summary, key topics, entities, and findings in json format.
"""


def _build_question_extractor():
    """Return a new QuestionsAnsweredExtractor using the custom prompt and shared llm."""
    return QuestionsAnsweredExtractor(
        prompt_template=custom_prompt_template,
        metadata_mode=MetadataMode.EMBED,
        llm=llm,
        questions=3,
    )


# Variant 1: question extraction only.
extractors_1 = [_build_question_extractor()]

# Variant 2: summaries of the previous / current / next section first, then
# question extraction (each variant gets its own extractor instance).
extractors_2 = [
    SummaryExtractor(llm=llm, summaries=["prev", "self", "next"]),
    _build_question_extractor(),
]

In [69]:
import nest_asyncio

# Permit nested event loops: the notebook kernel already runs one, and the
# ingestion pipeline's extractors drive their LLM calls through asyncio.
nest_asyncio.apply()

from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.ingestion import IngestionPipeline

# Chunk the text documents once: 512-token chunks with a 128-token overlap,
# split on single spaces.
node_parser = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)
nodes = node_parser.get_nodes_from_documents(documents_txt)

# `nodes` is already chunked, so the pipeline contains only the metadata
# extractors. Listing `node_parser` here as well would split every chunk a
# second time and re-parent the results onto the intermediate chunk nodes
# instead of the source documents.
# To run question extraction without the section summaries, pass
# `extractors_1` instead of `extractors_2`.
pipeline = IngestionPipeline(transformations=[*extractors_2])

# in_place=False leaves `nodes` untouched and returns enriched copies.
nodes_1 = pipeline.run(nodes=nodes, in_place=False)

100%|██████████| 2/2 [00:04<00:00,  2.39s/it]
100%|██████████| 2/2 [00:04<00:00,  2.35s/it]


In [70]:
# Show the first node's content with all of its metadata. MetadataMode.ALL is
# the enum form of the "all" string and matches the enum usage elsewhere in
# this notebook.
print(nodes_1[0].get_content(metadata_mode=MetadataMode.ALL))

[Excerpt from document]
file_path: c:\Users\ray\Desktop\src\leetcode\rag\data\txt\medical.txt
file_name: medical.txt
file_type: text/plain
file_size: 2370
creation_date: 2024-09-12
last_modified_date: 2024-09-12
next_section_summary: The section discusses the effects of dapagliflozin, a medication, on patients with chronic kidney disease (CKD). Key findings include:

- **Hazard Ratios**: 
  - The hazard ratio for renal causes was 0.56, indicating a significant reduction in risk (95% CI, 0.45 to 0.68; P<0.001).
  - The hazard ratio for a composite outcome of death from cardiovascular causes or hospitalization for heart failure was 0.71 (95% CI, 0.55 to 0.92; P=0.009).
  
- **Mortality Rates**: 
  - Death occurred in 4.7% of participants in the dapagliflozin group compared to 6.8% in the placebo group, with a hazard ratio of 0.69 (95% CI, 0.53 to 0.88; P=0.004).

- **Patient Demographics**: 
  - The effects of dapagliflozin were consistent in both participants with type 2 diabetes and th

In [71]:
# Show the second node's content with all of its metadata. MetadataMode.ALL is
# the enum form of the "all" string and matches the enum usage elsewhere in
# this notebook.
print(nodes_1[1].get_content(metadata_mode=MetadataMode.ALL))

[Excerpt from document]
file_path: c:\Users\ray\Desktop\src\leetcode\rag\data\txt\medical.txt
file_name: medical.txt
file_type: text/plain
file_size: 2370
creation_date: 2024-09-12
last_modified_date: 2024-09-12
prev_section_summary: **Summary:**

The section discusses a clinical trial investigating the effects of dapagliflozin on patients with chronic kidney disease (CKD). Key topics and entities include:

1. **Condition**: Chronic Kidney Disease (CKD) - Patients with CKD are at high risk for adverse kidney and cardiovascular outcomes.

2. **Intervention**: Dapagliflozin - A medication administered at a dosage of 10 mg once daily.

3. **Study Design**: 
   - Participants: 4304 individuals with an estimated glomerular filtration rate (GFR) of 25 to 75 ml/min/1.73 m² and a urinary albumin-to-creatinine ratio of 200 to 5000.
   - Randomized assignment to either dapagliflozin or placebo.

4. **Primary Outcome**: A composite measure including a sustained decline in GFR of at least 50%, end

In [72]:
# Number of nodes returned by the ingestion pipeline run.
len(nodes_1)

2