In [1]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
from nn_rag import Knowledge, Controller

### Set parameters as environment variables

In [3]:
# Remote recipie
os.environ['HADRON_DOMAIN_REPO_PATH'] = 'https://raw.githubusercontent.com/project-hadron/neural_rag/main/jupyter/knowledge/hadron/contracts/'

# Data
os.environ['HADRON_KNOWLEDGE_SOURCE_URI'] = './source/Gen AI Best Practices.pdf'
os.environ['HADRON_KNOWLEDGE_PERSIST_URI'] = './hadron/data/gen_ai_best_practice.parquet'

# Parameters
os.environ['HADRON_REFERENCE_TEXT_NAME'] = 'GenAIBestPractice'
os.environ['HADRON_NUM_SENTENCE_CHUNK_SIZE'] = '10'
os.environ['HADRON_EMBEDDING_BATCH_SIZE'] = '32'
os.environ['HADRON_EMBEDDING_MODEL_NAME'] = 'all-mpnet-base-v2'
os.environ['HADRON_EMBEDDING_DEVICE'] = 'cpu'

### Instantiate capability

In [4]:
kn = Knowledge.from_env('demo', has_contract=False)

In [5]:
kn.set_description('Incubator to build a knowledge base for a RAG')

In [6]:
tbl = kn.set_source_uri("${HADRON_KNOWLEDGE_SOURCE_URI}").load_source_canonical()
kn.set_persist_uri('${HADRON_KNOWLEDGE_PERSIST_URI}')

### Tidy the text

In [7]:
tbl = kn.tools.correlate_replace(tbl, 'text', '\n', ' ', intent_order=-1)
tbl = kn.tools.correlate_replace(tbl, 'text', '  ', ' ', intent_order=-1)

In [9]:
profile = kn.tools.text_profiler(tbl, to_drop=[0, (2,4)], as_text=True, intent_order=-1)

ArrowInvalid: cannot mix list and non-list, non-null values

### Chunking

In [None]:
chunks = kn.tools.text_chunk(clean_text, text_name='GenAIBestPractice', num_sentence_chunk_size=10, intent_order=-1)

### Embedding

In [None]:
embedding = kn.tools.chunk_embedding(chunks, batch_size='${HADRON_EMBEDDING_BATCH_SIZE}', 
                                     embedding_name='${HADRON_EMBEDDING_MODEL_NAME}', 
                                     device='${HADRON_EMBEDDING_DEVICE}', 
                                     intent_order=-1)

### Run capability

In [None]:
kn.run_component_pipeline()

In [None]:
kn.remove_canonical(kn.CONNECTOR_PERSIST)

### Controller

In [None]:
ctr = Controller.from_env(has_contract=False)
ctr.set_use_case(title='Rag Demo')

In [None]:
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

In [None]:
kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [None]:
ctr.run_controller()

### Reports
#### knowledge

In [None]:
kn.report_task()

In [None]:
kn.report_intent()

In [None]:
kn.report_connectors()

In [None]:
kn.report_task()

#### controller

In [None]:
ctr.report_task()

In [None]:
ctr.report_