In [1]:
# Display the value of every top-level expression in a cell, not only the last
# one, so several results can be inspected without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Suppress every Python warning for the rest of the session to keep cell output
# uncluttered.
# NOTE(review): no category is given, so deprecation and runtime warnings are
# hidden as well — consider narrowing the filter once the noisy source is known.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Controller

### Milvus Vector Params

    URI example
        uri = "milvus://host:port/database?collection=<name>&doc_ref=<name>"

    params:
        collection: The name of the collection
        doc_ref: a name to reference the document

    Environment Hyperparams:
        MILVUS_EMBEDDING_NAME
        MILVUS_EMBEDDING_DEVICE
        MILVUS_EMBEDDING_BATCH_SIZE
        MILVUS_EMBEDDING_DIM
        MILVUS_INDEX_CLUSTERS
        MILVUS_INDEX_SIMILARITY_TYPE
        MILVUS_QUERY_SEARCH_LIMIT
        MILVUS_QUERY_NUM_SIMILARITY
        

### Set parameters as environment variables

In [4]:
# Data
# Source document to embed and the Milvus target that receives the embeddings.
# The target follows the form milvus://host:port/database?collection=<name>&doc_ref=<name>.
# os.environ['HADRON_KNOWLEDGE_SOURCE_URI'] = './hadron/source/Gen AI Best Practices.pdf'
os.environ['HADRON_KNOWLEDGE_SOURCE_URI'] = "./hadron/source/Human-Nutrition-2020-Edition.pdf"
os.environ['HADRON_KNOWLEDGE_EMBED_URI'] = 'milvus://localhost:19530/rai?collection=demo&doc_ref=human_nutrition'

# Embedding model, named once so the knowledge settings and the vector store
# settings cannot drift apart when the model is changed.
EMBEDDING_NAME = 'all-mpnet-base-v2'
# Vector width of the model above (all-mpnet-base-v2 emits 768-dimensional
# vectors); change it together with EMBEDDING_NAME. Kept as a string because
# os.environ only accepts str values.
EMBEDDING_DIM = '768'

# Parameters
os.environ['HADRON_KNOWLEDGE_EMBEDDING_NAME'] = EMBEDDING_NAME
os.environ['HADRON_KNOWLEDGE_CHUNK_SIZE'] = '500'

# Vector Db
os.environ['MILVUS_EMBEDDING_NAME'] = EMBEDDING_NAME
os.environ['MILVUS_EMBEDDING_DIM'] = EMBEDDING_DIM
os.environ['MILVUS_EMBEDDING_DEVICE'] = 'cpu'
os.environ['MILVUS_QUERY_SEARCH_LIMIT'] = '8'


### Instantiate capability

In [5]:
# Create the Knowledge component under the name 'demo' — the same name that is
# later passed as task_name when the component is registered with the controller.
# has_contract=False: presumably starts without loading a stored contract — TODO confirm.
kn = Knowledge.from_env('demo', has_contract=False)

In [6]:
# Attach a human-readable statement of what this component is for.
kn.set_description('a reusable component to embed documentation to build an AI knowledge base for a RAG')

In [7]:
# Wire up the connectors with '${NAME}' placeholders that name the environment
# variables set earlier (presumably expanded by the library at use — confirm).
source_ref = "${HADRON_KNOWLEDGE_SOURCE_URI}"
embed_ref = '${HADRON_KNOWLEDGE_EMBED_URI}'

kn.set_source_uri(source_ref)
# The persist connector and the extra 'query' connector share the same Milvus URI.
kn.set_persist_uri(embed_ref)
kn.add_connector_uri('query', embed_ref)

<nn_rag.components.knowledge.Knowledge at 0x7fc45c8a8190>

<nn_rag.components.knowledge.Knowledge at 0x7fc45c8a8190>

### Delete the collection

In [8]:
# Delete the collection behind the persist connector, presumably so the demo
# starts from an empty store.
# NOTE(review): destructive — anything previously embedded in that collection is
# gone after this cell; confirm that is intended before running.
kn.remove_collection(kn.CONNECTOR_PERSIST)

### Document

In [9]:
# Load the document from the source connector; the result feeds the tidy and
# profiling steps that follow.
doc = kn.load_source_canonical()

In [10]:
# Tidy the 'text' field: turn line breaks into spaces, then squeeze the double
# spaces that this leaves behind.
# NOTE(review): a single pass of '  ' -> ' ' only shortens longer runs (three
# spaces become two) unless the helper repeats the replacement — confirm, or
# use a pattern that matches a whole run if the helper supports one.
doc = kn.tools.str_pattern_replace(doc, 'text', '\n', ' ', intent_order=-1)
doc = kn.tools.str_pattern_replace(doc, 'text', '  ', ' ', intent_order=-1)

### Sentences

In [11]:
# Profile the tidied document into sentences with per-sentence statistics (the
# resulting columns are listed in the next cell).
# Variant that names the embedding model explicitly, kept for reference:
# sentences = kn.tools.text_profiler(doc, embedding_name='${HADRON_KNOWLEDGE_EMBEDDING_NAME}', intent_order=-1)
sentences = kn.tools.text_profiler(doc, intent_order=-1)

In [12]:
# Columns produced by the profiler
sentences.column_names

['sentence',
 'sentence_score',
 'sentence_num',
 'char_count',
 'word_count',
 'token_count']

In [13]:
# Shortest, average and longest sentence, in characters. Each bare expression is
# kept at the top level so all three values are displayed.
sentence_chars = sentences['char_count']
pc.min(sentence_chars).as_py()
pc.mean(sentence_chars).as_py()
pc.max(sentence_chars).as_py()

2

119.34375553587245

3634

In [14]:
# pc.count(pc.filter(sentences['sentence_score'], pc.greater(sentences['sentence_score'], 0.90))).as_py()

### Chunking

In [15]:
# Group the profiled sentences into chunks; the size argument is the
# HADRON_KNOWLEDGE_CHUNK_SIZE placeholder ('500' as set in the parameters cell).
# NOTE(review): the recorded statistics further down show chunks of up to 552
# characters, so the size is presumably a target rather than a hard cap
# (overlap or tolerance?) — confirm against the chunker's documentation.
# Variant with an explicit temperature, kept for reference:
# chunks = kn.tools.text_chunker(sentences, char_chunk_size='${HADRON_KNOWLEDGE_CHUNK_SIZE}', temperature=0.9, intent_order=-1)
chunks = kn.tools.text_chunker(sentences, char_chunk_size='${HADRON_KNOWLEDGE_CHUNK_SIZE}', intent_order=-1)

In [16]:
# Columns produced by the chunker
chunks.column_names

['chunk_text', 'chunk_char_count', 'chunk_word_count', 'chunk_token_count']

In [17]:
# Smallest, average and largest chunk, in characters. Each bare expression is
# kept at the top level so all three values are displayed.
chunk_chars = chunks['chunk_char_count']
pc.min(chunk_chars).as_py()
pc.mean(chunk_chars).as_py()
pc.max(chunk_chars).as_py()

179

549.5802060338484

552

### Embedding
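The embedding step is not run directly from this notebook cell; instead, the controller in the next section runs the pipeline and populates the Milvus vector store, which is why the direct persist call below is left commented out.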

In [18]:
# kn.save_persist_canonical(chunks)

-------------------
### Controller

In [19]:
# Create the controller and record the use case it serves.
ctr = Controller.from_env(has_contract=False)

use_case = {
    'title': 'Rag Demo',
    'domain': 'General',
    'overview': 'A pipeline that allows the collection of documentation to embed for a RAG catalog.',
    'situation': 'HUB requirement for better access to documentation',
    'opportunity': 'Improve accessibility through a RAG',
    'actions': 'build a catalog of embedded documents',
}
ctr.set_use_case(**use_case)

In [20]:
# Register the 'demo' knowledge task with the controller under the intent level
# 'knowledge_demo'. The task name matches the one used to create `kn`.
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

In [21]:
# kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [None]:
# Run the controller. Per the note in the Embedding section, this is the step
# that populates the vector store.
ctr.run_controller()

----------------
### Milvus Vector DB

In [None]:
# Load from the persist connector and show the shape of what comes back, as a
# quick check on the controller run.
result = kn.load_persist_canonical()
result.shape