In [1]:
# Display the value of every bare expression in a cell, not only the last one,
# so results can be inspected without wrapping each of them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Controller

### Set parameters as environment variables

In [3]:
# Pipeline parameters, published as environment variables. Later cells refer
# to them through '${NAME}' placeholder strings instead of hard-coded values.
os.environ.update({
    'HADRON_PROFILE_NAME': 'GenAIBestPractice',
    'HADRON_NUM_SENTENCE_CHUNK_SIZE': '10',
    'HADRON_EMBEDDING_BATCH_SIZE': '32',
    'HADRON_EMBEDDING_MODEL_NAME': 'all-mpnet-base-v2',
    'HADRON_EMBEDDING_DEVICE': 'cpu',
})


### Instantiate capability

In [4]:
# Create the Knowledge capability for the task named 'demo'; the same name is
# passed as task_name when the task is registered with the Controller later.
# NOTE(review): has_contract=False presumably means no pre-existing contract
# is required for this task - confirm against the nn_rag documentation.
kn = Knowledge.from_env('demo', has_contract=False)

In [5]:
# Attach a human-readable description to the capability.
kn.set_description('Incubator to build a knowledge base for a RAG')

In [6]:
# Point the capability at the source PDF and load it as the working table `tbl`.
uri = "source/Gen AI Best Practices.pdf"
tbl = kn.set_source_uri(uri).load_source_canonical()
# Set the persist URI: a parquet file under ./hadron/data.
kn.set_persist_uri('./hadron/data/gen_ai_best_practice.parquet')

In [7]:
# Sanity-check the loaded table; the recorded run shows shape (1, 1) with a
# single 'text' column.
tbl.shape
tbl.column_names

(1, 1)

['text']

### Tidy the text

In [8]:
# Tidy the 'text' column: replace newlines with spaces, then replace double
# spaces with a single space.
# NOTE(review): if correlate_replace performs a single literal pass, runs of
# three or more spaces are only partially collapsed - confirm, and repeat the
# call or use a pattern if that matters.
# NOTE(review): intent_order=-1 is passed to every tools call in this
# notebook; its meaning is not visible here - confirm against the nn_rag docs.
tbl = kn.tools.correlate_replace(tbl, 'text', '\n', ' ', intent_order=-1)
tbl = kn.tools.correlate_replace(tbl, 'text', '  ', ' ', intent_order=-1)

### Profiling

In [9]:
# Profile the tidied text. The '${...}' strings name the HADRON_* environment
# variables set earlier in this notebook; presumably the library substitutes
# their values at run time - confirm.
profile = kn.tools.text_profiler(tbl, profile_name='${HADRON_PROFILE_NAME}', 
                                 num_sentence_chunk_size='${HADRON_NUM_SENTENCE_CHUNK_SIZE}', 
                                 intent_order=-1)
# Inspect the result; the recorded run shows one row and ten columns,
# including 'sentences' and 'sentence_chunks'.
profile.shape
profile.column_names

(1, 10)

['text',
 'profile_name',
 'page_number',
 'char_count',
 'word_count',
 'sentence_count_raw',
 'token_count',
 'sentences',
 'sentence_chunks',
 'num_chunks']

### Chunking

In [10]:
# Turn the profile into a table of sentence chunks (40 rows in the recorded run).
chunks = kn.tools.sentence_chunk(profile, intent_order=-1)

In [11]:
# Inspect the chunk table; the recorded run shows 40 rows and five columns:
# page number, chunk text, and the chunk's char, word and token counts.
chunks.shape
chunks.column_names

(40, 5)

['page_number',
 'sentence_chunk',
 'chunk_char_count',
 'chunk_word_count',
 'chunk_token_count']

### Embedding

In [12]:
# Embed the chunks. Batch size, model name and device are given as '${...}'
# placeholders naming the HADRON_EMBEDDING_* environment variables set earlier
# in this notebook.
embedding = kn.tools.chunk_embedding(chunks, batch_size='${HADRON_EMBEDDING_BATCH_SIZE}', 
                                     embedding_name='${HADRON_EMBEDDING_MODEL_NAME}', 
                                     device='${HADRON_EMBEDDING_DEVICE}', 
                                     intent_order=-1)

In [13]:
# Confirm the result type and size; the recorded run shows a pyarrow Tensor of
# shape (40, 768), whose 40 rows match the 40 chunks produced above.
type(embedding)
embedding.shape

pyarrow.lib.Tensor

(40, 768)

### Run capability

In [14]:
# Run the capability's component pipeline.
# NOTE(review): presumably this replays the tool calls made above and writes
# the result to the persist URI - confirm against the nn_rag documentation.
kn.run_component_pipeline()
# Disabled alternative, kept for reference: persist the in-notebook result directly.
# kn.save_persist_canonical(embedding)

### Controller

In [15]:
# Create the Controller and give its use case a title.
ctr = Controller.from_env(has_contract=False)
ctr.set_use_case(title='Rag Demo')

In [16]:
# Register the 'demo' knowledge task (the name given to Knowledge.from_env
# earlier) with the controller under the intent level 'knowledge_demo'.
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

(40, 768)

In [18]:
# Run the controller; presumably this executes the tasks registered with it
# (here the 'demo' knowledge task) - confirm against the nn_rag documentation.
ctr.run_controller()