In [1]:
# Make IPython display the value of every top-level expression in a cell
# (not only the last one), so explicit print() calls are unnecessary.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge

### Instantiate capability

In [3]:
# Create the Knowledge capability for the task named 'knowledge_demo'.
# NOTE(review): has_contract=False presumably allows starting without an
# existing saved contract — confirm against the nn_rag documentation.
kn = Knowledge.from_env('knowledge_demo', has_contract=False)
# Free-text description of the task (it appears in the task report).
kn.set_description('Incubator to build a knowledge base for a RAG')

In [4]:
# Relative path of the PDF to ingest.
uri = "source/Gen AI Best Practices.pdf"
# Register the PDF as the source connector and load it into `tbl`
# (presumably a pyarrow Table: it exposes .shape and .column_names — confirm).
tbl = kn.set_source_uri(uri).load_source_canonical()
# Register the parquet file used as the persist connector.
kn.set_persist_uri('./hadron/data/gen_ai_best_practice.parquet')

In [5]:
# Quick look at what was loaded: dimensions first, then column names.
tbl.shape
tbl.column_names

(1, 1)

['text']

### Tidy the text

In [6]:
# Flatten the 'text' column: turn newlines into spaces, then replace each
# double space with a single space.
# NOTE(review): if the replacement is a single literal pass, runs of three or
# more spaces would not be fully collapsed — confirm how correlate_replace
# matches its pattern.
# NOTE(review): intent_order=-1 is passed on every kn.tools call in this
# notebook; its meaning is defined by nn_rag — confirm.
tbl = kn.tools.correlate_replace(tbl, 'text', '\n', ' ', intent_order=-1)
tbl = kn.tools.correlate_replace(tbl, 'text', '  ', ' ', intent_order=-1)

### Profiling

In [7]:
# Profile the tidied text under the name 'gen_ai_best_practice'.
# NOTE(review): num_sentence_chunk_size=10 presumably groups sentences ten at a
# time into each chunk — confirm against the nn_rag documentation.
profile = kn.tools.text_profiler(tbl, profile_name='gen_ai_best_practice', num_sentence_chunk_size=10, intent_order=-1)
# Inspect the profile: dimensions first, then column names.
profile.shape
profile.column_names

(1, 10)

['text',
 'profile_name',
 'page_number',
 'char_count',
 'word_count',
 'sentence_count_raw',
 'token_count',
 'sentences',
 'sentence_chunks',
 'num_chunks']

### Chunking

In [8]:
# Derive the chunk table from the profile (no options beyond intent_order).
chunks = kn.tools.sentence_chunk(profile, intent_order=-1)

In [9]:
# Inspect the chunk table: dimensions first, then column names.
chunks.shape
chunks.column_names

(40, 5)

['page_number',
 'sentence_chunk',
 'chunk_char_count',
 'chunk_word_count',
 'chunk_token_count']

### Embedding

In [10]:
# Embed the chunks with the 'all-mpnet-base-v2' model on the CPU, using a
# batch size of 32.
embedding = kn.tools.chunk_embedding(chunks, batch_size=32, embedding_name='all-mpnet-base-v2', device='cpu', intent_order=-1)

In [11]:
# Check what chunk_embedding returned: its type, then its dimensions.
type(embedding)
embedding.shape

pyarrow.lib.Tensor

(40, 768)

### Run Capability

In [13]:
# Run the capability's component pipeline.
# NOTE(review): presumably this reads from the source connector, applies the
# registered intents and writes to the persist connector — confirm.
kn.run_component_pipeline()

### Reports

In [14]:
# Show the intent report: each tools call with its order and parameters.
kn.report_intent()

Unnamed: 0,level,order,intent,parameters,creator
0,primary,0,correlate_replace,"[""header='text'"", ""pattern='\n'"", ""replacement=' '"", 'save_intent=False']",default
1,,1,correlate_replace,"[""header='text'"", ""pattern=' '"", ""replacement=' '"", 'save_intent=False']",default
2,,2,text_profiler,"[""profile_name='gen_ai_best_practice'"", 'num_sentence_chunk_size=10', 'save_intent=False']",default
3,,3,sentence_chunk,['save_intent=False'],default
4,,4,chunk_embedding,"['batch_size=32', ""embedding_name='all-mpnet-base-v2'"", ""device='cpu'"", 'save_intent=False']",default


In [15]:
# Show the connector report: the source and persist entries with their handlers.
kn.report_connectors()

Unnamed: 0,connector_name,uri,module_name,handler,version,kwargs,query,aligned
0,primary_source,Gen AI Best Practices.pdf,nn_rag.handlers.knowledge_handlers,KnowledgePersistHandler,0.0.1,,,False
1,primary_persist,gen_ai_best_practice.parquet,nn_rag.handlers.knowledge_handlers,KnowledgePersistHandler,0.0.1,,,False


In [16]:
# Show the task report: contract, task name, description, status and version.
kn.report_task()

Unnamed: 0,name,value
0,contract,knowledge
1,task,knowledge_demo
2,description,Incubator to build a knowledge base for a RAG
3,status,discovery
4,version,0.0.1
