In [1]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# suppress warning message
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Controller

### Milvus Vector Params

    URI example
        uri = "milvus://host:port/database?collection=<name>&partition=<name>"

    params:
        collection: The name of the collection
        partition: The name of the document partition

    Environment Hyperparams:
        MILVUS_EMBEDDING_NAME
        MILVUS_EMBEDDING_DEVICE
        MILVUS_EMBEDDING_BATCH_SIZE
        MILVUS_EMBEDDING_DIM
        MILVUS_INDEX_CLUSTERS
        MILVUS_INDEX_SIMILARITY_TYPE
        MILVUS_QUERY_SEARCH_LIMIT
        MILVUS_QUERY_NUM_SIMILARITY


### Set parameters as environment variables

In [4]:
# Data
os.environ['HADRON_KNOWLEDGE_SOURCE_URI'] = './hadron/source/Gen AI Best Practices.pdf'
os.environ['HADRON_KNOWLEDGE_EMBED_URI'] = 'milvus://localhost:19530/rai?collection=demo&partition=gen_ai_best_practices'


### Instantiate capability

In [5]:
kn = Knowledge.from_env('base', has_contract=False)

In [6]:
kn.set_description('a reusable component to embed documentation to build an AI knowledge base for a RAG')

In [7]:
kn.set_source_uri("${HADRON_KNOWLEDGE_SOURCE_URI}")
kn.set_persist_uri('${HADRON_KNOWLEDGE_EMBED_URI}')
kn.add_connector_uri('query', '${HADRON_KNOWLEDGE_EMBED_URI}')

<nn_rag.components.knowledge.Knowledge at 0x7f92bc68b730>

<nn_rag.components.knowledge.Knowledge at 0x7f92bc68b730>

### Delete the collection

In [8]:
kn.remove_embedding()

### Document

In [9]:
doc = kn.load_source_canonical()

In [10]:
# tidy the text
# doc = kn.tools.str_pattern_replace(doc, 'text', '\n', ' ', intent_order=-1)
# doc = kn.tools.str_pattern_replace(doc, 'text', '  ', ' ', intent_order=-1)

### Sentences

In [11]:
# sentences = kn.tools.text_profiler(doc, embedding_name='${HADRON_KNOWLEDGE_EMBEDDING_NAME}', intent_order=-1)
sentences = kn.tools.text_(doc, intent_order=-1)

AttributeError: 'KnowledgeIntent' object has no attribute 'text_'

In [None]:
sentences.column_names

In [None]:
pc.min(sentences['char_count']).as_py()
pc.mean(sentences['char_count']).as_py()
pc.max(sentences['char_count']).as_py()

In [None]:
# pc.count(pc.filter(sentences['sentence_score'], pc.greater(sentences['sentence_score'], 0.90))).as_py()

### Chunking

In [None]:
# chunks = kn.tools.text_chunker(sentences, char_chunk_size='${HADRON_KNOWLEDGE_CHUNK_SIZE}', temperature=0.9, intent_order=-1)
chunks = kn.tools.text_chunker(sentences, char_chunk_size='${HADRON_KNOWLEDGE_CHUNK_SIZE}', intent_order=-1)

In [None]:
chunks.column_names

In [None]:
pc.min(chunks['chunk_char_count']).as_py()
pc.mean(chunks['chunk_char_count']).as_py()
pc.max(chunks['chunk_char_count']).as_py()

### Embedding
We are using the controller to populate the Vector RAG

In [None]:
# kn.save_persist_canonical(chunks)

-------------------
### Controller

In [None]:
ctr = Controller.from_env(has_contract=False)
ctr.set_use_case(title='Knowledge Base', domain='General', 
                 overview='A pipeline that allows the collection of documentation to embed for a RAG catalog.', 
                 situation='HUB requirement for better access to documentation', 
                 opportunity='Improve accessibility through a RAG', 
                 actions='build a catalog of embedded documents')

In [None]:
ctr.register.knowledge(task_name='base', intent_level='knowledge_builder')

In [None]:
# kn.remove_canonical(kn.CONNECTOR_PERSIST)

In [None]:
ctr.run_controller()

----------------
## Milvus Vector DB

### Query

In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    return wrapped_text

In [None]:
import random

# Nutrition-style questions generated with GPT4
gpt4_questions = [
    "What are the main risks associated with generative AI?",
    "How can businesses mitigate the risks of using generative AI?",
    "What is the role of a cross-functional team in GenAI strategy?",
    "What are the legal considerations for implementing generative AI?",
    "How should organizations ensure data quality in GenAI models?",
    "What is the importance of ongoing enhancement and monitoring in GenAI practices?"
]

# Manually created question list
manual_questions = []

query = random.choice(gpt4_questions + manual_questions)

### Model Answers
#### What are the main risks associated with generative AI?
Generative AI poses several risks including the creation of misleading content such as deepfakes, biased outputs due to biased training data, factually inaccurate outputs (hallucinations), security vulnerabilities (data breaches, model theft), and a lack of transparency in decision-making processes. These risks can lead to significant harm including privacy violations, intellectual property concerns, and increased carbon footprints.

#### How can businesses mitigate the risks of using generative AI?
Businesses can mitigate these risks by implementing Responsible AI (RAI) frameworks, which include gathering cross-functional teams, tracking legal requirements, ensuring high-quality and diverse training data, maintaining transparency, and engaging in continuous monitoring and enhancement of AI systems. Establishing governance structures and upholding legal and ethical standards are also crucial.

#### What is the role of a cross-functional team in GenAI strategy?
A cross-functional team centralizes AI expertise and capabilities across different departments, facilitating knowledge sharing, collaboration, and standardization. This team should oversee AI governance, resource pooling, and upskilling initiatives. They also play a key role in developing a long-term AI roadmap, promoting innovation, and ensuring alignment with company objectives and Responsible AI standards.

#### What are the legal considerations for implementing generative AI?
Legal considerations for implementing generative AI include ensuring compliance with intellectual property laws, privacy regulations, and monitoring the evolving legal landscape for new AI-specific rulings. Organizations must document and manage legal requirements, maintain robust data privacy controls, and continuously review contracts and licensing agreements related to AI models and their outputs.

#### How should organizations ensure data quality in GenAI models?
Organizations should ensure data quality by implementing stringent data acquisition, selection, and management processes. This includes verifying the provenance of training data, ensuring it is relevant to defined use cases, and maintaining high standards for data quality and diversity. Tracking metrics related to fairness, bias, and interpretability throughout the data lifecycle is also essential.

#### What is the importance of ongoing enhancement and monitoring in GenAI practices?
Ongoing enhancement and monitoring are crucial for adapting to the rapid evolution of GenAI technology. Continuous improvement ensures that AI systems remain effective, secure, and aligned with Responsible AI principles. Regular monitoring helps identify and mitigate new risks, maintain compliance with regulations, and uphold the quality and reliability of AI outputs.


In [None]:
print(f"Query: {query}\n")

answer = kn.load_canonical('query', query=query)

for i in range(answer.num_rows):
    s = answer.slice(i,1)
    print(f"Id: {s.column('id')[0]}.as_py()")
    print(f"Distance: {s.column('distance')[0].as_py()}")
    print(f"Answer: {print_wrapped(s.column('source')[0].as_py())}\n")

