In [2]:
%load_ext autoreload
%autoreload 2

from outlines import generate, models
from curverag import utils
from curverag.transformations import chunk_text
from curverag.graph import create_graph, KnowledgeGraph, Node, Edge, generate_prompt, create_graph_dataset
from curverag.utils import save_kg_dataset, split_triples, generate_to_skip, create_atth_dataset

import llama_cpp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Create dataset

In [3]:
# Token budgets shared by the model-loading and graph-building cells.
# Both are forwarded to `utils.load_model`; `max_tokens` is also passed to `create_graph`.
# NOTE(review): the generation cap equals the context window, so a long prompt
# presumably leaves less room than `max_tokens` suggests — confirm this is intended.
n_ctx = 10_000
max_tokens = 10_000

# Ten short clinical-style sentences used as the input corpus for graph extraction.
docs = [
    # Diagnoses and prescriptions
    "The patient was diagnosed with type 2 diabetes mellitus and prescribed metformin 500mg twice daily.",
    "MRI scan revealed a small lesion in the left temporal lobe suggestive of low-grade glioma.",
    "Administer 5mg of lorazepam intravenously for acute seizure management.",
    # Lab work, symptoms and imaging
    "Blood tests showed elevated ALT and AST levels, indicating possible liver inflammation.",
    "The subject reported chronic lower back pain, managed with physical therapy and NSAIDs.",
    "CT angiography confirmed the presence of a pulmonary embolism in the right lower lobe.",
    # Procedures, immunizations and pathology
    "The patient underwent coronary artery bypass graft surgery without complications.",
    "Routine vaccination included MMR, tetanus, and influenza immunizations.",
    "Histopathology indicated ductal carcinoma in situ (DCIS) in the breast biopsy sample.",
    "The child presented with a persistent cough and fever, diagnosed as streptococcal pharyngitis.",
]


# Load LLM

In [4]:
# Hugging Face tokenizer wrapper handed to the llama.cpp-backed model.
# NOTE(review): the tokenizer comes from a Llama-3.2-1B repo while the weights are
# Meta-Llama-3-8B-Instruct — confirm the two vocabularies/chat templates match.
llm_tokenizer = llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
    "unsloth/Llama-3.2-1B-Instruct"
)

# Load the local GGUF weights with the token budgets defined in the config cell.
model = utils.load_model(
    llm_model_path="./models/Meta-Llama-3-8B-Instruct.Q6_K.gguf",
    tokenizer=llm_tokenizer,
    n_ctx=n_ctx,
    max_tokens=max_tokens,
)

llama_model_load_from_file_impl: using device Metal (Apple M4 Pro) - 16383 MiB free
llama_model_loader: loaded meta data with 27 key-value pairs and 291 tensors from ./models/Meta-Llama-3-8B-Instruct.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Models
llama_model_loader: - kv   3:                         general.size_label str              = 8.0B
llama_model_loader: - kv   4:                            general.license str              = llama3
llama_model_loader: - kv   5:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv   6:            

# Create and save graph

In [5]:
# Extract nodes and edges from `docs` with the loaded LLM and collect them in `graph`.
# NOTE(review): the recorded run below took ~2h21m in total, with one iteration
# reporting ~7.25e6 ms of eval time; consider caching the result so a full
# Restart & Run All stays practical.
graph = create_graph(
    model,
    docs,
    chunk_size=3000,
    max_tokens=max_tokens,
)



"<|begin_of_text|><|start_header_id|>system<|end_header_id|>
       You are a world class AI model who extracts nodes and entities from documents to add to an exiting Knowledge Graph creation task. Put yur reply in JSON<|eot_id|>
        <|start_header_id|>user<|end_header_id|>
        Here's the json schema you must adhere to: <schema>{'$defs': {'Edge': {'properties': {'source': {'description': 'ID of the source node edge', 'title': 'Source', 'type': 'integer'}, 'target': {'description': 'ID of the target node edge', 'title': 'Target', 'type': 'integer'}, 'name': {'description': 'Name of the relationship for the edge. Maximum of 20 characters.', 'title': 'Name', 'type': 'string'}, 'is_directed': {'description': 'If true its a directed edge', 'title': 'Is Directed', 'type': 'boolean'}, 'description': {'description': 'A short description of the edge. Maximum of 50 characters.', 'title': 'Description', 'type': 'string'}}, 'required': ['source', 'target', 'name', 'is_directed', 'descripti

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =    5219.89 ms /   613 tokens (    8.52 ms per token,   117.44 tokens per second)
llama_perf_context_print:        eval time =   24346.94 ms /   521 runs   (   46.73 ms per token,    21.40 tokens per second)
llama_perf_context_print:       total time =   32783.23 ms /  1134 tokens
 10%|██████▉                                                              | 1/10 [00:32<04:55, 32.79s/it]

nodes=[Node(id=1, name='Patient', description='A person who is receiving medical treatment', alias=['individual', 'person']), Node(id=2, name='Diabetes Mellitus', description='A group of metabolic disorders characterized by high blood sugar levels', alias=['diabetes', 'DM']), Node(id=3, name='Metformin', description='A medication used to treat type 2 diabetes', alias=['glucophage', 'metformin hydrochloride']), Node(id=4, name='Type 2 Diabetes', description='A type of diabetes that is caused by insulin resistance and impaired insulin secretion', alias=['T2D', 'non-insulin-dependent diabetes']), Node(id=5, name='Daily', description='A unit of time', alias=['day', '24 hours']), Node(id=6, name='500mg', description='A dosage of medication', alias=['half a gram', '0.5 grams']), Node(id=7, name='Twice', description='A frequency of medication administration', alias=['every 12 hours', 'b.i.d.'])] edges=[Edge(source=1, target=2, name='has_disease', is_directed=True, description='The patient has

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =    7152.21 ms /  1005 tokens (    7.12 ms per token,   140.52 tokens per second)
llama_perf_context_print:        eval time =   19420.87 ms /   374 runs   (   51.93 ms per token,    19.26 tokens per second)
llama_perf_context_print:       total time =   29628.57 ms /  1379 tokens
 20%|█████████████▊                                                       | 2/10 [01:02<04:07, 30.93s/it]

nodes=[Node(id=8, name='MRI', description='A medical imaging technique', alias=['magnetic resonance imaging']), Node(id=9, name='Lesion', description='An abnormality in the tissue', alias=['lesion', 'abnormality']), Node(id=10, name='Temporal lobe', description='A part of the brain', alias=['temporal', 'lobe']), Node(id=11, name='Glioma', description='A type of brain tumor', alias=['glioma', 'brain tumor']), Node(id=12, name='Low-grade', description='A classification of glioma', alias=['low grade', 'grade 1'])] edges=[Edge(source=1, target=8, name='uses', is_directed=True, description='The MRI uses a specific technique'), Edge(source=8, target=9, name='reveals', is_directed=True, description='The MRI reveals a lesion'), Edge(source=9, target=10, name='located_in', is_directed=True, description='The lesion is located in the temporal lobe'), Edge(source=10, target=11, name='affected_by', is_directed=True, description='The temporal lobe is affected by the glioma'), Edge(source=11, target=

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =    9545.04 ms /  1280 tokens (    7.46 ms per token,   134.10 tokens per second)
llama_perf_context_print:        eval time =   22509.16 ms /   420 runs   (   53.59 ms per token,    18.66 tokens per second)
llama_perf_context_print:       total time =   36099.58 ms /  1700 tokens
 30%|████████████████████▋                                                | 3/10 [01:38<03:53, 33.29s/it]

nodes=[Node(id=13, name='Lorazepam', description='A medication used to treat anxiety and seizures', alias=['lorazepam', 'ativan']), Node(id=14, name='Seizure', description='A sudden, uncontrolled electrical discharge of the brain', alias=['seizure', 'convulsion']), Node(id=15, name='Acute', description='A sudden and severe condition', alias=['acute', 'sudden']), Node(id=16, name='Intravenous', description='A method of administering medication', alias=['iv', 'intravenous']), Node(id=17, name='Management', description='The process of controlling or treating a condition', alias=['management', 'treatment'])] edges=[Edge(source=1, target=13, name='administers', is_directed=True, description='The patient is administered the medication'), Edge(source=1, target=14, name='has_condition', is_directed=True, description='The patient has the condition'), Edge(source=13, target=14, name='treats', is_directed=True, description='The medication treats the condition'), Edge(source=1, target=15, name='ha

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =   12497.22 ms /  1591 tokens (    7.85 ms per token,   127.31 tokens per second)
llama_perf_context_print:        eval time =   18643.38 ms /   345 runs   (   54.04 ms per token,    18.51 tokens per second)
llama_perf_context_print:       total time =   34889.67 ms /  1936 tokens
 40%|███████████████████████████▌                                         | 4/10 [02:13<03:23, 33.93s/it]

nodes=[Node(id=18, name='Blood tests', description='A medical examination of the blood', alias=['blood test', 'blood examination']), Node(id=19, name='ALT', description='An enzyme in the liver', alias=['alanine aminotransferase', 'ALT enzyme']), Node(id=20, name='AST', description='An enzyme in the liver', alias=['aspartate aminotransferase', 'AST enzyme']), Node(id=21, name='Liver inflammation', description='An inflammation of the liver', alias=['liver inflammation', 'hepatitis'])] edges=[Edge(source=1, target=18, name='performed', is_directed=True, description='The blood tests were performed'), Edge(source=18, target=19, name='measured', is_directed=True, description='The ALT levels were measured'), Edge(source=18, target=20, name='measured', is_directed=True, description='The AST levels were measured'), Edge(source=19, target=21, name='indicates', is_directed=True, description='The elevated ALT levels indicate liver inflammation'), Edge(source=20, target=21, name='indicates', is_dir

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =   14257.80 ms /  1850 tokens (    7.71 ms per token,   129.75 tokens per second)
llama_perf_context_print:        eval time = 7254927.59 ms /   367 runs   (19768.20 ms per token,     0.05 tokens per second)
llama_perf_context_print:       total time = 7275654.45 ms /  2217 tokens
 50%|███████████████████████████████▌                               | 5/10 [2:03:29<3:40:26, 2645.35s/it]

nodes=[Node(id=22, name='Subject', description='A person who is receiving medical treatment', alias=['patient', 'individual']), Node(id=23, name='Chronic', description='A long-term condition', alias=['long-term', 'persistent']), Node(id=24, name='Lower back pain', description='A type of pain', alias=['back pain', 'lumbar pain']), Node(id=25, name='Physical therapy', description='A type of treatment', alias=['PT', 'physical therapy']), Node(id=26, name='NSAIDs', description='A type of medication', alias=['nonsteroidal anti-inflammatory drugs', 'pain relievers'])] edges=[Edge(source=1, target=22, name='has_condition', is_directed=True, description='The patient has the condition'), Edge(source=22, target=23, name='has_characteristic', is_directed=True, description='The condition has a characteristic'), Edge(source=22, target=24, name='experiences', is_directed=True, description='The patient experiences the condition'), Edge(source=22, target=25, name='undergoes', is_directed=True, descrip

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =  862717.19 ms /  2123 tokens (  406.37 ms per token,     2.46 tokens per second)
llama_perf_context_print:        eval time =   13265.85 ms /   258 runs   (   51.42 ms per token,    19.45 tokens per second)
llama_perf_context_print:       total time =  879042.21 ms /  2381 tokens
 60%|█████████████████████████████████████▊                         | 6/10 [2:18:08<2:16:19, 2044.80s/it]

nodes=[Node(id=27, name='CT angiography', description='A medical imaging technique', alias=['CTA', 'computed tomography angiography']), Node(id=28, name='Pulmonary embolism', description='A blockage of the pulmonary artery', alias=['PE', 'pulmonary thromboembolism']), Node(id=29, name='Right lower lobe', description='A part of the lung', alias=['RLL', 'right lower lobe of the lung'])] edges=[Edge(source=22, target=27, name='uses', is_directed=True, description='The patient uses the medical imaging technique'), Edge(source=27, target=28, name='reveals', is_directed=True, description='The medical imaging technique reveals the presence of a pulmonary embolism'), Edge(source=28, target=29, name='located_in', is_directed=True, description='The pulmonary embolism is located in the right lower lobe')]
"<|begin_of_text|><|start_header_id|>system<|end_header_id|>
       You are a world class AI model who extracts nodes and entities from documents to add to an exiting Knowledge Graph creation ta

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =   21671.21 ms /  2312 tokens (    9.37 ms per token,   106.69 tokens per second)
llama_perf_context_print:        eval time =    8649.92 ms /   172 runs   (   50.29 ms per token,    19.88 tokens per second)
llama_perf_context_print:       total time =   32559.54 ms /  2484 tokens
 70%|████████████████████████████████████████████                   | 7/10 [2:18:40<1:09:20, 1386.96s/it]

nodes=[Node(id=30, name='Coronary Artery Bypass Graft Surgery', description='A surgical procedure to bypass blocked coronary arteries', alias=['CABG', 'coronary bypass surgery']), Node(id=31, name='Patient', description='A person who is receiving medical treatment', alias=['individual', 'person']), Node(id=32, name='Complication', description='An adverse event that occurs during or after a medical treatment', alias=['adverse event', 'complication'])] edges=[Edge(source=31, target=30, name='undergoes', is_directed=True, description='The patient undergoes the surgical procedure')]
"<|begin_of_text|><|start_header_id|>system<|end_header_id|>
       You are a world class AI model who extracts nodes and entities from documents to add to an exiting Knowledge Graph creation task. Put yur reply in JSON<|eot_id|>
        <|start_header_id|>user<|end_header_id|>
        Here's the json schema you must adhere to: <schema>{'$defs': {'Edge': {'properties': {'source': {'description': 'ID of the sour

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =   20744.05 ms /  2419 tokens (    8.58 ms per token,   116.61 tokens per second)
llama_perf_context_print:        eval time =   15224.73 ms /   281 runs   (   54.18 ms per token,    18.46 tokens per second)
llama_perf_context_print:       total time =   40078.35 ms /  2700 tokens
 80%|████████████████████████████████████████████████████▊             | 8/10 [2:19:20<31:56, 958.18s/it]

nodes=[Node(id=31, name='Routine vaccination', description='A regular vaccination schedule', alias=['routine vaccination', 'immunization']), Node(id=33, name='MMR', description='A type of vaccine', alias=['measles, mumps, rubella', 'measles, mumps, rubella vaccine']), Node(id=34, name='Tetanus', description='A type of vaccine', alias=['tetanus vaccine', 'tetanus toxoid']), Node(id=35, name='Influenza', description='A type of vaccine', alias=['influenza vaccine', 'flu vaccine'])] edges=[Edge(source=31, target=33, name='includes', is_directed=True, description='The routine vaccination includes the MMR vaccine'), Edge(source=31, target=34, name='includes', is_directed=True, description='The routine vaccination includes the tetanus vaccine'), Edge(source=31, target=35, name='includes', is_directed=True, description='The routine vaccination includes the influenza vaccine')]
"<|begin_of_text|><|start_header_id|>system<|end_header_id|>
       You are a world class AI model who extracts nodes 

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =   23472.20 ms /  2635 tokens (    8.91 ms per token,   112.26 tokens per second)
llama_perf_context_print:        eval time =   16867.76 ms /   299 runs   (   56.41 ms per token,    17.73 tokens per second)
llama_perf_context_print:       total time =   45167.25 ms /  2934 tokens
 90%|███████████████████████████████████████████████████████████▍      | 9/10 [2:20:05<11:12, 672.76s/it]

nodes=[Node(id=36, name='Histopathology', description='The study of the microscopic structure of tissues', alias=['histopathology', 'microscopic examination']), Node(id=37, name='Ductal Carcinoma In Situ', description='A type of breast cancer', alias=['DCIS', 'ductal carcinoma in situ', 'breast cancer']), Node(id=38, name='Breast', description='The organ responsible for milk production in females', alias=['breast', 'mammary gland']), Node(id=39, name='Biopsy', description='A medical procedure to remove a sample of tissue for examination', alias=['biopsy', 'tissue sample'])] edges=[Edge(source=36, target=37, name='indicates', is_directed=True, description='The histopathology indicates the presence of DCIS'), Edge(source=37, target=38, name='located_in', is_directed=True, description='The DCIS is located in the breast'), Edge(source=38, target=39, name='sample_of', is_directed=True, description='The breast is a sample of tissue for examination')]
"<|begin_of_text|><|start_header_id|>syst

llama_perf_context_print:        load time =    5220.97 ms
llama_perf_context_print: prompt eval time =   24945.40 ms /  2866 tokens (    8.70 ms per token,   114.89 tokens per second)
llama_perf_context_print:        eval time =   26930.68 ms /   466 runs   (   57.79 ms per token,    17.30 tokens per second)
llama_perf_context_print:       total time =   60078.26 ms /  3332 tokens
100%|█████████████████████████████████████████████████████████████████| 10/10 [2:21:06<00:00, 846.60s/it]

nodes=[Node(id=40, name='Child', description='A young human being', alias=['young person', 'juvenile']), Node(id=41, name='Cough', description='A sudden and involuntary expulsion of air from the lungs', alias=['coughing', 'coughs']), Node(id=42, name='Fever', description='An elevated body temperature', alias=['feverish', 'febrile']), Node(id=43, name='Streptococcal Pharyngitis', description='An infection of the pharynx caused by streptococcal bacteria', alias=['strep throat', 'pharyngitis']), Node(id=44, name='Pharynx', description='The part of the throat that connects the nose and mouth', alias=['throat', 'pharynx']), Node(id=45, name='Streptococcus', description='A type of bacteria', alias=['strep', 'streptococcus'])] edges=[Edge(source=40, target=41, name='experiences', is_directed=True, description='The child experiences a cough'), Edge(source=40, target=42, name='experiences', is_directed=True, description='The child experiences a fever'), Edge(source=40, target=43, name='has_cond




In [6]:
# Preview the leading slice ([:5]) of the extracted nodes.
# NOTE(review): the saved output below lists more than five nodes, so it appears to
# predate this slice — re-run the cell to refresh it. Also, the build log above
# shows id=31 emitted for both 'Patient' and 'Routine vaccination'; verify that node
# ids are unique before relying on edge endpoints.
graph.nodes[:5]

[Node(id=1, name='Patient', description='A person who is receiving medical treatment', alias=['individual', 'person']),
 Node(id=2, name='Diabetes Mellitus', description='A group of metabolic disorders characterized by high blood sugar levels', alias=['diabetes', 'DM']),
 Node(id=3, name='Metformin', description='A medication used to treat type 2 diabetes', alias=['glucophage', 'metformin hydrochloride']),
 Node(id=4, name='Type 2 Diabetes', description='A type of diabetes that is caused by insulin resistance and impaired insulin secretion', alias=['T2D', 'non-insulin-dependent diabetes']),
 Node(id=5, name='Daily', description='A unit of time', alias=['day', '24 hours']),
 Node(id=6, name='500mg', description='A dosage of medication', alias=['half a gram', '0.5 grams']),
 Node(id=7, name='Twice', description='A frequency of medication administration', alias=['every 12 hours', 'b.i.d.']),
 Node(id=8, name='MRI', description='A medical imaging technique', alias=['magnetic resonance imag

In [7]:
# Preview the leading slice ([:5]) of the extracted edges.
# NOTE(review): the saved output below lists more than five edges, so it appears to
# predate this slice — re-run the cell to refresh it.
graph.edges[:5]

[Edge(source=1, target=2, name='has_disease', is_directed=True, description='The patient has the disease'),
 Edge(source=1, target=3, name='takes_medication', is_directed=True, description='The patient is taking the medication'),
 Edge(source=3, target=4, name='treats', is_directed=True, description='The medication treats the disease'),
 Edge(source=1, target=5, name='takes_medication_at', is_directed=True, description='The patient takes the medication at a specific time'),
 Edge(source=3, target=6, name='has_dosage', is_directed=True, description='The medication has a specific dosage'),
 Edge(source=1, target=7, name='takes_medication_frequency', is_directed=True, description='The patient takes the medication at a specific frequency'),
 Edge(source=1, target=8, name='uses', is_directed=True, description='The MRI uses a specific technique'),
 Edge(source=8, target=9, name='reveals', is_directed=True, description='The MRI reveals a lesion'),
 Edge(source=9, target=10, name='located_in',

In [10]:
# Convert the extracted graph into a dataset registered under the name "medical_docs".
# The result is bound to a name so later cells can use it without relying on
# IPython's `_` / `Out[N]` output cache, which does not survive a fresh run.
# NOTE(review): the execution count jumps from 7 to 10 here — confirm the notebook
# still works under Restart Kernel -> Run All.
kg_dataset = create_graph_dataset(graph, dataset_name="medical_docs")

# Keep the dataset as the cell's last expression so its repr is still displayed.
kg_dataset

<curverag.atth.kg_dataset.KGDataset at 0x176e7a770>