In [None]:
# Display the value of every top-level expression in a cell (not only the last
# one), so intermediate results are shown without explicit print() calls.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os
from nn_rag import Knowledge, Controller

### Set parameters as environment variables

In [None]:
# Publish every notebook setting as an environment variable in a single call;
# later cells refer to these names through '${NAME}' placeholders.
os.environ.update({
    # Data
    'HADRON_KNOWLEDGE_SOURCE_URI': './hadron/source/euaiact.pdf',
    'HADRON_KNOWLEDGE_EMBED_URI': './hadron/data/eu_ai_act_2024.embedding',
    'HADRON_KNOWLEDGE_CHUNK_URI': './hadron/data/eu_ai_act_2024.parquet',
    # Parameters
    'HADRON_NUM_SENTENCE_CHUNK_SIZE': '5',
    'HADRON_EMBEDDING_BATCH_SIZE': '32',
    'HADRON_EMBEDDING_MODEL_NAME': 'all-mpnet-base-v2',
    'HADRON_EMBEDDING_DEVICE': 'cpu',
})

### Instantiate capability

In [None]:
# Create the Knowledge component under the name 'demo' (the same name is later
# registered with the controller as task_name).
# NOTE(review): has_contract=False presumably means no pre-existing contract is
# required to instantiate the component — confirm against the nn_rag docs.
kn = Knowledge.from_env('demo', has_contract=False)

In [None]:
# Attach a human-readable description to the component.
kn.set_description('a reusable component to embed documentation to build an AI knowledge base for a RAG')

In [None]:
# Wire up the connectors: the source document, the persisted output (the
# embedding URI) and an extra connector named 'chunks'.
# The URIs are given as '${NAME}' placeholders naming the environment variables
# set earlier; presumably they are resolved by the library at run time — confirm.
kn.set_source_uri("${HADRON_KNOWLEDGE_SOURCE_URI}")
kn.set_persist_uri('${HADRON_KNOWLEDGE_EMBED_URI}')
kn.add_connector_uri('chunks', '${HADRON_KNOWLEDGE_CHUNK_URI}')

### Document

In [None]:
# Load the source document through the source connector.
# NOTE(review): the result presumably has a 'text' column, since the tidy step
# below operates on 'text' — confirm.
doc = kn.load_source_canonical()

In [None]:
# tidy the text: replace newlines with a space, then double spaces with a
# single space, applying the replacements to 'text' in that order
for pattern, replacement in (('\n', ' '), ('  ', ' ')):
    doc = kn.tools.pattern_replace(doc, 'text', pattern, replacement, intent_order=-1)

### Sentences

In [None]:
# Run the text profiler over the tidied document to produce `sentences`.
# NOTE(review): presumably splits the text into one record per sentence — confirm.
sentences = kn.tools.text_profiler(doc, intent_order=-1)

### Chunking

In [None]:
# Group the sentences into chunks. The chunk size is taken from the
# HADRON_NUM_SENTENCE_CHUNK_SIZE environment variable (set with the other
# parameters) instead of being hard-coded, in the same way the embedding
# parameters are passed as '${NAME}' placeholders.
# NOTE(review): this was previously hard-coded to 10 while the environment
# variable is '5' — confirm which size is intended. Assumes sentence_chunks
# resolves '${NAME}' placeholders like chunk_embedding does — TODO confirm.
chunks = kn.tools.sentence_chunks(sentences, num_sentence_chunk_size='${HADRON_NUM_SENTENCE_CHUNK_SIZE}', intent_order=-1)

In [None]:
# Save the chunks through the 'chunks' connector, whose URI is
# '${HADRON_KNOWLEDGE_CHUNK_URI}'.
kn.save_canonical('chunks', chunks)

### Embedding

In [None]:
# Embed the chunks. Batch size, model name and device are all passed as
# '${NAME}' placeholders naming the environment variables set with the
# other parameters.
embedding = kn.tools.chunk_embedding(
    chunks,
    batch_size='${HADRON_EMBEDDING_BATCH_SIZE}',
    embedding_name='${HADRON_EMBEDDING_MODEL_NAME}',
    device='${HADRON_EMBEDDING_DEVICE}',
    intent_order=-1,
)

### Run capability

In [None]:
# Run the component's pipeline end to end.
# NOTE(review): presumably replays the intent recorded by the tool calls above,
# reading from the source connector and writing to the persist connector — confirm.
kn.run_component_pipeline()

In [None]:
# Remove the canonical held by the persist connector.
# NOTE(review): presumably this deletes the embedding written by the pipeline
# run so that a later run starts clean — confirm.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

### Controller

In [None]:
# Create the controller and record the use case it serves.
# NOTE(review): has_contract=False presumably means no pre-existing contract is
# required — confirm against the nn_rag docs.
ctr = Controller.from_env(has_contract=False)
ctr.set_use_case(
    title='Rag Demo',
    domain='General',
    overview='A pipeline that allows the collection of documentation to embed for a RAG catalog.',
    situation='HUB requirement for better access to documentation',
    opportunity='Improve accessibility through a RAG',
    actions='build a catalog of embedded documents',
)

In [None]:
# Register the 'demo' Knowledge task with the controller under the intent
# level 'knowledge_demo'.
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

In [None]:
# Remove the canonical held by the persist connector before the controller runs.
# NOTE(review): presumably ensures the controller run regenerates the output
# rather than reusing an earlier one — confirm.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [None]:
# Run the controller.
# NOTE(review): presumably executes the registered 'demo' knowledge task — confirm.
ctr.run_controller()

### Reports

#### Controller

In [None]:
# Show the controller's use-case report (the use case was set via set_use_case).
ctr.report_use_case()

In [None]:
# Show the controller's intent report.
ctr.report_intent()

#### Knowledge

In [None]:
# Show the Knowledge component's task report.
kn.report_task()

In [None]:
# Show the Knowledge component's intent report.
kn.report_intent()

In [None]:
# Show the Knowledge component's connectors report.
kn.report_connectors()