In [1]:
# Display the value of every top-level expression in a cell, not only the last one,
# so intermediate results do not have to be wrapped in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# suppress warning message
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Controller

### Milvus Vector Params

    URI example
        uri = "milvus://host:port/database?collection=<name>&doc_ref=<name>"

    params:
        collection: The name of the collection
        doc_ref: a name to reference the document

    Environment Hyperparams:
        MILVUS_EMBEDDING_NAME
        MILVUS_EMBEDDING_DEVICE
        MILVUS_EMBEDDING_BATCH_SIZE
        MILVUS_EMBEDDING_DIM
        MILVUS_INDEX_CLUSTERS
        MILVUS_INDEX_SIMILARITY_TYPE
        MILVUS_QUERY_SEARCH_LIMIT
        MILVUS_QUERY_NUM_SIMILARITY
        

### Set parameters as environment variables

In [4]:
# Data
# Source document to ingest and the Milvus target that receives its embeddings.
os.environ['HADRON_KNOWLEDGE_SOURCE_URI'] = './hadron/source/Gen AI Best Practices.pdf'
# doc_ref spelling corrected ('genai_best_prectice' -> 'genai_best_practice'); this is the
# name the document is referenced by, so the typo would otherwise leak into the store.
# NOTE(review): 'collection=' is left empty exactly as before, although the URI format
# documented above expects a collection name — confirm the connector default is intended.
os.environ['HADRON_KNOWLEDGE_EMBED_URI'] = 'milvus://localhost:19530/rai?collection=&doc_ref=genai_best_practice'

# Parameters
# Read back through ${...} placeholders by the text profiler and the sentence chunker.
os.environ['HADRON_SENTENCE_SCORE_NAME'] = 'all-mpnet-base-v2'
os.environ['HADRON_SENTENCE_CHUNK_SIZE'] = '500'

# Vector Db
# Milvus hyperparameters; environment values must be strings, hence the quoted numbers.
os.environ['MILVUS_EMBEDDING_NAME'] = 'all-mpnet-base-v2'
os.environ['MILVUS_EMBEDDING_DIM'] = '768'
os.environ['MILVUS_EMBEDDING_DEVICE'] = 'cpu'
os.environ['MILVUS_QUERY_SEARCH_LIMIT'] = '5'


### Instantiate capability

In [5]:
# Create the Knowledge component under the name 'demo' (the same name is later
# passed as task_name when registering with the controller).
# NOTE(review): has_contract=False presumably allows starting without a saved contract — confirm.
kn = Knowledge.from_env('demo', has_contract=False)

In [6]:
# Attach a human-readable description to the component.
kn.set_description('a reusable component to embed documentation to build an AI knowledge base for a RAG')

In [7]:
# Wire the connectors from the environment variables set above.
# The persist connector and the extra 'query' connector share the same embed URI.
# NOTE(review): the ${...} placeholders are presumably resolved from os.environ by nn_rag — confirm.
kn.set_source_uri("${HADRON_KNOWLEDGE_SOURCE_URI}")
kn.set_persist_uri('${HADRON_KNOWLEDGE_EMBED_URI}')
kn.add_connector_uri('query', '${HADRON_KNOWLEDGE_EMBED_URI}')

<nn_rag.components.knowledge.Knowledge at 0x7ff67587fc70>

<nn_rag.components.knowledge.Knowledge at 0x7ff67587fc70>

In [8]:
# Remove what the persist connector currently holds.
# NOTE(review): presumably done so the embedding below starts from a clean state — confirm.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

### Document

In [9]:
# Load the source document through the source connector configured above.
doc = kn.load_source_canonical()

In [10]:
# Normalise whitespace in 'text': line breaks become spaces, then doubled spaces collapse.
# Order matters — the first pass can itself create doubled spaces.
# NOTE(review): a single '  ' -> ' ' pass may leave residue for runs of three or more
# spaces, depending on how pattern_replace matches — confirm.
for old, new in (('\n', ' '), ('  ', ' ')):
    doc = kn.tools.pattern_replace(doc, 'text', old, new, intent_order=-1)

### Sentences

In [11]:
# Profile the tidied document into sentences; the model name comes from the
# HADRON_SENTENCE_SCORE_NAME environment placeholder.
sentences = kn.tools.text_profiler(doc, embedding_name='${HADRON_SENTENCE_SCORE_NAME}', intent_order=-1)

In [12]:
# List the columns produced by the text profiler.
sentences.column_names

['sentence',
 'sentence_score',
 'sentence_num',
 'char_count',
 'word_count',
 'token_count']

In [13]:
# Spread of sentence lengths in characters: smallest, mean and largest.
# All three values are shown because ast_node_interactivity is set to "all".
pc.min(sentences['char_count']).as_py()
pc.mean(sentences['char_count']).as_py()
pc.max(sentences['char_count']).as_py()

2

176.3341836734694

1283

In [14]:
# Count the sentences whose sentence_score is strictly above 0.85.
scores = sentences['sentence_score']
above_threshold = pc.filter(scores, pc.greater(scores, 0.85))
pc.count(above_threshold).as_py()

5

### Chunking

In [15]:
# Group the profiled sentences into chunks sized by HADRON_SENTENCE_CHUNK_SIZE.
# NOTE(review): the recorded max chunk_char_count equals the configured 500, which
# suggests the size is measured in characters rather than sentences — confirm.
chunks = kn.tools.sentence_chunks(sentences, num_sentence_chunk_size='${HADRON_SENTENCE_CHUNK_SIZE}', intent_order=-1)

In [16]:
# List the columns produced by the chunker.
chunks.column_names

['chunk_text', 'chunk_char_count', 'chunk_word_count', 'chunk_token_count']

In [17]:
# Spread of chunk lengths in characters: smallest, mean and largest.
# All three values are shown because ast_node_interactivity is set to "all".
pc.min(chunks['chunk_char_count']).as_py()
pc.mean(chunks['chunk_char_count']).as_py()
pc.max(chunks['chunk_char_count']).as_py()

2

174.97215189873418

500

### Embedding

In [18]:
# Write the chunks through the persist connector, i.e. to the Milvus embed URI set above.
kn.save_persist_canonical(chunks)

### Controller

In [19]:
# Build the controller and record the use case it serves.
ctr = Controller.from_env(has_contract=False)
use_case = {
    'title': 'Rag Demo',
    'domain': 'General',
    'overview': 'A pipeline that allows the collection of documentation to embed for a RAG catalog.',
    'situation': 'HUB requirement for better access to documentation',
    'opportunity': 'Improve accessibility through a RAG',
    'actions': 'build a catalog of embedded documents',
}
ctr.set_use_case(**use_case)

In [20]:
# Register the 'demo' Knowledge task with the controller under intent level 'knowledge_demo'.
# NOTE(review): the tool calls above were recorded with intent_order=-1 and no explicit
# intent level — confirm 'knowledge_demo' is the level the controller should replay.
ctr.register.knowledge(task_name='demo', intent_level='knowledge_demo')

In [21]:
# Clear the persist connector again before the controller run.
# NOTE(review): presumably so the run does not duplicate the chunks saved manually above — confirm.
kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [22]:
# Run the registered tasks through the controller.
# NOTE(review): the recorded run of this cell ended with "NameError: name 'seed' is not defined".
# No 'seed' is referenced anywhere in this notebook, so the error presumably originates
# inside nn_rag — investigate before relying on the pipeline.
ctr.run_controller()

NameError: name 'seed' is not defined

### Query

In [None]:
# Example question to put to the embedded knowledge base.
query = "What are the key risks?"

In [None]:
# Send the question through the 'query' connector, which was registered against the embed URI.
# NOTE(review): this cell has no recorded execution (In [None]), so its output is unverified.
kn.load_canonical('query', query=query)