In [1]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
from nn_rag import Knowledge, Controller

### Set parameters as environment variables

In [3]:
# Data
os.environ['HADRON_KNOWLEDGE_SOURCE_URI'] = './hadron/source/euaiact.pdf'
os.environ['HADRON_KNOWLEDGE_EMBED_URI'] = './hadron/data/eu_ai_act_2024.embedding'
os.environ['HADRON_KNOWLEDGE_CHUNK_URI'] = './hadron/data/eu_ai_act_2024.parquet'

# Parameters
os.environ['HADRON_NUM_SENTENCE_CHUNK_SIZE'] = '5'
os.environ['HADRON_EMBEDDING_BATCH_SIZE'] = '32'
os.environ['HADRON_EMBEDDING_MODEL_NAME'] = 'all-mpnet-base-v2'
os.environ['HADRON_EMBEDDING_DEVICE'] = 'cpu'

### Instantiate capability

In [4]:
kn = Knowledge.from_env('demo', has_contract=False)

In [5]:
kn.set_description('a reusable component to embed documentation to build an AI knowledge base for a RAG')

In [6]:
kn.set_source_uri("${HADRON_KNOWLEDGE_SOURCE_URI}")
kn.set_persist_uri('${HADRON_KNOWLEDGE_EMBED_URI}')
kn.add_connector_uri('chunks', '${HADRON_KNOWLEDGE_CHUNK_URI}')

<nn_rag.components.knowledge.Knowledge at 0x7fac9f0eee30>

<nn_rag.components.knowledge.Knowledge at 0x7fac9f0eee30>

### Document

In [7]:
doc = kn.load_source_canonical()

In [8]:
# tidy the text
doc = kn.tools.pattern_replace(doc, 'text', '\n', ' ', intent_order=-1)
doc = kn.tools.pattern_replace(doc, 'text', '  ', ' ', intent_order=-1)

### Sentences

In [9]:
sentences = kn.tools.text_profiler(doc, intent_order=-1)

### Chunking

In [10]:
chunks = kn.tools.sentence_chunks(sentences, num_sentence_chunk_size=10, intent_order=-1)

In [11]:
kn.save_canonical('chunks', chunks)

### Embedding

In [12]:
embedding = kn.tools.chunk_embedding(chunks, batch_size='${HADRON_EMBEDDING_BATCH_SIZE}', 
                                     embedding_name='${HADRON_EMBEDDING_MODEL_NAME}', 
                                     device='${HADRON_EMBEDDING_DEVICE}', 
                                     intent_order=-1)



### Run capability

In [13]:
kn.run_component_pipeline()



In [14]:
kn.remove_canonical(kn.CONNECTOR_PERSIST)

### Controller

In [15]:
ctr = Controller.from_env(has_contract=False)
ctr.set_use_case(title='Rag Demo', domain='General', 
                 overview='A pipeline that allows the collection of documentation to embed for a RAG catalog.', 
                 situation='HUB requirement for better access to documentation', 
                 opportunity='Improve accessibility through a RAG', 
                 actions='build a catalog of embedded documents')

In [16]:
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

(248, 768)

In [17]:
kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [18]:
ctr.run_controller()



### Reports

#### controller

In [19]:
ctr.report_use_case()

Unnamed: 0,use_case,values
0,title,Rag Demo
1,domain,General
2,situation,HUB requirement for better access to documentation
3,opportunity,Improve accessibility through a RAG
4,actions,build a catalog of embedded documents


In [20]:
ctr.report_intent()

Unnamed: 0,level,order,intent,parameters,creator
0,knowledge_demo,0,knowledge,"[""task_name='demo'"", 'save_intent=False']",default


#### knowledge

In [21]:
kn.report_task()

Unnamed: 0,name,value
0,contract,knowledge
1,task,demo
2,description,a reusable component to embed documentation to build an AI knowledge base for a RAG
3,status,discovery
4,version,0.0.1


In [22]:
kn.report_intent()

Unnamed: 0,level,order,intent,parameters,creator
0,primary,0,pattern_replace,"[""header='text'"", ""pattern='\n'"", ""replacement=' '"", 'save_intent=False']",default
1,,1,pattern_replace,"[""header='text'"", ""pattern=' '"", ""replacement=' '"", 'save_intent=False']",default
2,,2,text_profiler,['save_intent=False'],default
3,,3,sentence_chunks,"['num_sentence_chunk_size=10', 'save_intent=False']",default
4,,4,chunk_embedding,"[""batch_size='${HADRON_EMBEDDING_BATCH_SIZE}'"", ""embedding_name='${HADRON_EMBEDDING_MODEL_NAME}'"", ""device='${HADRON_EMBEDDING_DEVICE}'"", 'save_intent=False']",default


In [23]:
kn.report_connectors()

Unnamed: 0,connector_name,uri,module_name,handler,version,kwargs,query,aligned
0,primary_source,{HADRON_KNOWLEDGE_SOURCE_URI},nn_rag.handlers.knowledge_handlers,KnowledgePersistHandler,0.0.1,,,False
1,primary_persist,{HADRON_KNOWLEDGE_EMBED_URI},nn_rag.handlers.knowledge_handlers,KnowledgePersistHandler,0.0.1,,,False
2,chunks,{HADRON_KNOWLEDGE_CHUNK_URI},nn_rag.handlers.knowledge_handlers,KnowledgePersistHandler,0.0.1,,,False
