In [1]:
# Make IPython display the result of every top-level expression in a cell
# (not only the last one), so explicit print() calls are not needed.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Silence all warnings for the rest of the session so cell output stays clean.
import warnings
warnings.simplefilter('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Controller

### Vector Params
    MILVUS_EMBEDDING_NAME
    MILVUS_EMBEDDING_DEVICE
    MILVUS_EMBEDDING_BATCH_SIZE
    MILVUS_EMBEDDING_DIM
    MILVUS_RESPONSE_LIMIT
    MILVUS_INDEX_METRIC
    MILVUS_DOC_REF

### Set parameters as environment variables

In [4]:
# Data: the source document to read, and the Milvus endpoints referenced
# further down by the persist connector (EMBED) and the 'query' connector (QUERY).
os.environ['HADRON_KNOWLEDGE_SOURCE_URI'] = './hadron/source/Gen AI Best Practices.pdf'
os.environ['HADRON_KNOWLEDGE_EMBED_URI'] = 'milvus://localhost:19530/rai'
os.environ['HADRON_KNOWLEDGE_QUERY_URI'] = 'milvus://localhost:19530/rai'

# Parameters: referenced by the sentence-chunking step as
# ${HADRON_NUM_SENTENCE_CHUNK_SIZE}. Environment values must be strings.
os.environ['HADRON_NUM_SENTENCE_CHUNK_SIZE'] = '10'

# Vector Db: only the settings this demo overrides; the remaining MILVUS_*
# names listed under "Vector Params" are left unset here.
# The document reference was previously misspelt 'genai_best_prectice'.
# NOTE(review): anything outside this notebook that still uses the old
# spelling must be updated to match.
os.environ['MILVUS_DOC_REF'] = 'genai_best_practice'
os.environ['MILVUS_EMBEDDING_DEVICE'] = 'cpu'
os.environ['MILVUS_RESPONSE_LIMIT'] = '4'


### Instantiate capability

In [5]:
# Create the Knowledge component instance for the task named 'demo' via its
# from_env factory.
# has_contract=False presumably means no previously saved contract is
# required - TODO confirm against the nn_rag documentation.
kn = Knowledge.from_env('demo', has_contract=False)

In [6]:
# Attach a human-readable description of the component's purpose.
kn.set_description('a reusable component to embed documentation to build an AI knowledge base for a RAG')

In [7]:
# Configure the component's connectors. Each URI is passed as a ${NAME}
# placeholder matching an environment variable set earlier in this notebook;
# presumably nn_rag substitutes the value when the connector is used - TODO confirm.
# 'query' is an extra named connector; it is read at the end of the notebook
# with kn.load_canonical('query', ...).
kn.set_source_uri("${HADRON_KNOWLEDGE_SOURCE_URI}")
kn.set_persist_uri('${HADRON_KNOWLEDGE_EMBED_URI}')
kn.add_connector_uri('query', '${HADRON_KNOWLEDGE_QUERY_URI}')

<nn_rag.components.knowledge.Knowledge at 0x7ff0c531ce80>

<nn_rag.components.knowledge.Knowledge at 0x7ff0c531ce80>

In [8]:
# Remove whatever is currently held at the persist connector.
# NOTE(review): the later kn.save_persist_canonical(chunks) call is recorded
# as failing with MilvusException 'collection not found'; check whether this
# removal is what leaves the collection missing.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

### Document

In [9]:
# Load the canonical from the component's source connector into `doc`.
doc = kn.load_source_canonical()

In [10]:
# Normalise whitespace in the 'text' column: first turn each newline into a
# space, then replace each double space with a single one.
# NOTE(review): one pass over '  ' can still leave double spaces where the
# text had runs of three or more - confirm whether that matters downstream.
for pattern in ('\n', '  '):
    doc = kn.tools.pattern_replace(doc, 'text', pattern, ' ', intent_order=-1)

### Sentences

In [11]:
# Profile the tidied document; the resulting table has one 'sentence' column
# plus per-sentence counts (see the column listing in the next cell).
sentences = kn.tools.text_profiler(doc, intent_order=-1)

In [12]:
# Show which columns the sentence profile contains.
sentences.column_names

['sentence', 'sentence_num', 'char_count', 'word_count', 'token_count']

In [13]:
# Smallest, average and largest per-sentence character count, each shown as
# its own cell output (relies on ast_node_interactivity = "all").
sentence_chars = sentences['char_count']
pc.min(sentence_chars).as_py()
pc.mean(sentence_chars).as_py()
pc.max(sentence_chars).as_py()

2

176.3341836734694

1283

### Chunking

In [14]:
# Group the profiled sentences into chunks. The chunk size is passed as a
# ${...} placeholder for the HADRON_NUM_SENTENCE_CHUNK_SIZE environment
# variable (set to '10' earlier in this notebook).
chunks = kn.tools.sentence_chunks(sentences, num_sentence_chunk_size='${HADRON_NUM_SENTENCE_CHUNK_SIZE}', intent_order=-1)

In [15]:
# Show which columns the chunk table contains.
chunks.column_names

['chunk_text', 'chunk_char_count', 'chunk_word_count', 'chunk_token_count']

In [16]:
# Smallest, average and largest value of 'chunk_char_count', each shown as
# its own cell output (relies on ast_node_interactivity = "all").
# NOTE(review): the recorded results (1 / 9.46 / 10) look like sentences per
# chunk rather than characters per chunk - verify what this column holds.
chunk_chars = chunks['chunk_char_count']
pc.min(chunk_chars).as_py()
pc.mean(chunk_chars).as_py()
pc.max(chunk_chars).as_py()

1

9.461679088735762

10

### Embedding

In [17]:
# Write the chunks to the persist connector (configured above from
# HADRON_KNOWLEDGE_EMBED_URI, a Milvus URI).
# NOTE(review): the recorded output shows this call raising
# MilvusException 'collection not found' during load_collection; this must be
# resolved before the notebook can run end to end.
kn.save_persist_canonical(chunks)

RPC error: [load_collection], <MilvusException: (code=100, message=collection not found[collection=450442375261323421])>, <Time:{'RPC start': '2024-06-13 12:55:59.293276', 'RPC error': '2024-06-13 12:55:59.297113'}>


MilvusException: <MilvusException: (code=100, message=collection not found[collection=450442375261323421])>

### Run capability

In [None]:
# Run the component's pipeline in one call - presumably the same
# load / tidy / profile / chunk / persist steps performed manually above
# (TODO confirm).
kn.run_component_pipeline()

In [None]:
# Remove whatever the pipeline run left at the persist connector.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

### Controller

In [None]:
# Create the Controller and describe the use case it serves.
ctr = Controller.from_env(has_contract=False)

# Use-case fields are collected in one mapping and unpacked as keyword arguments.
use_case = {
    'title': 'Rag Demo',
    'domain': 'General',
    'overview': 'A pipeline that allows the collection of documentation to embed for a RAG catalog.',
    'situation': 'HUB requirement for better access to documentation',
    'opportunity': 'Improve accessibility through a RAG',
    'actions': 'build a catalog of embedded documents',
}
ctr.set_use_case(**use_case)

In [None]:
# Register the Knowledge task 'demo' (the task name used when `kn` was
# created) with the controller, under intent level 'knowledge_demo'.
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

In [None]:
# Clear the persist connector before the controller run below.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [None]:
# Run the controller, which presumably executes the registered 'demo'
# Knowledge task (TODO confirm).
ctr.run_controller()

### Query

In [None]:
# The question to put to the embedded document.
query = "What are the key risks?"

In [None]:
# Read from the 'query' connector added earlier, passing the question as the
# `query` keyword; the result is displayed as the cell output.
kn.load_canonical('query', query=query)