In [1]:
# Display the value of every top-level expression in a cell, not only the last one,
# so several results can be shown from one cell without wrapping each in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Suppress all warning messages for the rest of the session (no category or module filter is given).
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the libraries
# used below are hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Controller

### Vector parameters (Milvus environment variables)
    MILVUS_EMBEDDING_NAME
    MILVUS_EMBEDDING_DEVICE
    MILVUS_EMBEDDING_BATCH_SIZE
    MILVUS_EMBEDDING_DIM
    MILVUS_RESPONSE_LIMIT
    MILVUS_INDEX_METRIC
    MILVUS_DOC_REF

### Set parameters as environment variables

In [4]:
# Run configuration, published as environment variables in a single mapping.
# The connector setup further down names these variables through ${NAME} placeholders.
os.environ.update({
    # Data: the source document and the Milvus endpoints used to embed and to query
    'HADRON_KNOWLEDGE_SOURCE_URI': './hadron/source/Gen AI Best Practices.pdf',
    'HADRON_KNOWLEDGE_EMBED_URI': 'milvus://localhost:19530/rai',
    'HADRON_KNOWLEDGE_QUERY_URI': 'milvus://localhost:19530/rai',

    # Parameters
    'HADRON_NUM_SENTENCE_CHUNK_SIZE': '400',

    # Vector Db
    # NOTE(review): 'prectice' looks like a typo for 'practice'. The value is kept as-is
    # because it prefixes the ids returned by the query at the end of the notebook;
    # rename it only together with a fresh embedding run.
    'MILVUS_DOC_REF': 'genai_best_prectice',
    'MILVUS_EMBEDDING_DEVICE': 'cpu',
    'MILVUS_RESPONSE_LIMIT': '4',
})


### Instantiate capability

In [5]:
# Create the Knowledge component for the task named 'demo'.
# NOTE(review): has_contract=False presumably starts without a previously saved
# configuration — confirm against the nn_rag documentation.
kn = Knowledge.from_env('demo', has_contract=False)

In [6]:
# Attach a human-readable description to the component.
kn.set_description('a reusable component to embed documentation to build an AI knowledge base for a RAG')

In [7]:
# Wire up the connectors. Each URI is given as a ${NAME} placeholder naming one of the
# environment variables set earlier, rather than as a literal value.
# NOTE(review): presumably the placeholder is resolved from the environment when the
# connector is used — confirm.
kn.set_source_uri("${HADRON_KNOWLEDGE_SOURCE_URI}")
kn.set_persist_uri('${HADRON_KNOWLEDGE_EMBED_URI}')
# Extra connector registered under the name 'query'; it is read back in the Query section.
kn.add_connector_uri('query', '${HADRON_KNOWLEDGE_QUERY_URI}')

<nn_rag.components.knowledge.Knowledge at 0x7fd80c32c760>

<nn_rag.components.knowledge.Knowledge at 0x7fd80c32c760>

### Document

In [8]:
# Load the document from the source connector; the tidy step that follows works on its 'text' column.
doc = kn.load_source_canonical()

In [9]:
# Tidy the text: turn newlines into spaces, then replace double spaces with a single space.
# NOTE(review): if the pattern is matched literally, one pass over '  ' leaves runs of three
# or more spaces only partly collapsed — confirm whether a regex such as ' {2,}' is accepted.
# NOTE(review): intent_order=-1 is passed to every tools call in this notebook; its meaning
# is not visible here.
doc = kn.tools.pattern_replace(doc, 'text', '\n', ' ', intent_order=-1)
doc = kn.tools.pattern_replace(doc, 'text', '  ', ' ', intent_order=-1)

### Sentences

In [10]:
# Profile the tidied text. The result carries a 'sentence' column plus per-sentence
# character, word and token counts (see the column listing in the next cell).
sentences = kn.tools.text_profiler(doc, intent_order=-1)

In [11]:
# Inspect the columns produced by the profiler.
sentences.column_names

['sentence', 'sentence_num', 'char_count', 'word_count', 'token_count']

In [12]:
# Shortest, average and longest sentence, measured in characters.
pc.min(sentences.column('char_count')).as_py()
pc.mean(sentences.column('char_count')).as_py()
pc.max(sentences.column('char_count')).as_py()

2

176.3341836734694

1283

### Chunking

In [13]:
# Group the sentences into chunks; the size limit is taken from HADRON_NUM_SENTENCE_CHUNK_SIZE ('400').
# NOTE(review): the largest chunk_char_count reported below is exactly 400, so the limit
# appears to be applied to characters despite the parameter name — confirm.
chunks = kn.tools.sentence_chunks(sentences, num_sentence_chunk_size='${HADRON_NUM_SENTENCE_CHUNK_SIZE}', intent_order=-1)

In [14]:
# Inspect the columns produced by the chunking step.
chunks.column_names

['chunk_text', 'chunk_char_count', 'chunk_word_count', 'chunk_token_count']

In [15]:
# Smallest, average and largest chunk, measured in characters.
pc.min(chunks.column('chunk_char_count')).as_py()
pc.mean(chunks.column('chunk_char_count')).as_py()
pc.max(chunks.column('chunk_char_count')).as_py()

2

171.07178217821783

400

### Embedding

In [16]:
# Write the chunks to the persist connector, i.e. the Milvus URI given to set_persist_uri.
# NOTE(review): presumably the embedding itself happens inside the connector — confirm.
kn.save_persist_canonical(chunks)

### Run capability

In [17]:
# Run the component as a whole instead of step by step.
# NOTE(review): presumably this replays the tools calls made above from source to
# persist — confirm.
kn.run_component_pipeline()

In [18]:
# Remove what was written through the persist connector.
# NOTE(review): presumably so that the controller run further down starts from an
# empty store — confirm.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

### Controller

In [19]:
# Create the controller and record the use-case metadata that describes this pipeline.
ctr = Controller.from_env(has_contract=False)
ctr.set_use_case(title='Rag Demo', domain='General', 
                 overview='A pipeline that allows the collection of documentation to embed for a RAG catalog.', 
                 situation='HUB requirement for better access to documentation', 
                 opportunity='Improve accessibility through a RAG', 
                 actions='build a catalog of embedded documents')

In [20]:
# Register the 'demo' knowledge task with the controller.
# NOTE(review): the meaning of intent_level='knowledge_demo' is not visible here — confirm.
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

(404, 4)

In [21]:
# Clear the persist connector once more before the controller run.
# NOTE(review): this repeats the earlier remove_canonical call and no explicit write to the
# connector is visible in between, so it may be redundant — confirm.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [22]:
# Run the controller; presumably this executes the registered 'demo' task (confirm).
ctr.run_controller()

### Query

In [23]:
# Question to look up in the embedded document.
query = "What are the key risks?"

In [24]:
# Send the question through the 'query' connector. The result shown below is a
# pyarrow.Table with id, distance and source columns.
kn.load_canonical('query', query=query)

pyarrow.Table
id: string
distance: float
source: string
----
id: [["genai_best_prectice_325","genai_best_prectice_430","genai_best_prectice_51","genai_best_prectice_3"]]
distance: [[0.7379541,0.81138736,0.8190517,0.83952427]]
source: [["Risk: Deﬁned in ISO standards, for example, as the “combination of the probability of occurrence of harm and the severity of that harm” (ISO IEC Guide 51, ISO 12100, ISO 14971), the RAI Institute evaluates risks in terms of risk, harm, threats, and impacts, identifying if a risk is mitigable or unmitigable, caused by intentional or unintentional action or systemic forces, and categorizing approp","Risk: Deﬁned in ISO standards, for example, as the “combination of the probability of occurrence of harm and the severity of that harm” (ISO IEC Guide 51, ISO 12100, ISO 14971), the RAI Institute ev","These risks extend beyond individual developers, affecting entire organizations and potentially leading to widespread societal impacts.","Without investing effor