In [1]:
# Display the value of every top-level expression in a cell (not only the last
# one), so intermediate results show up without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Silence all Python warnings for the rest of the session to keep cell output
# uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below; remove it when debugging.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Retrieval

### Chroma Vector Params

        URI example in-memory
            uri = "chromadb:///<collection>?reference=<name>"
        URI example to file
            uri = "chromadb:///<path>/<collection>?reference=<name>"

        params:
            collection: The name of the collection
            reference: a prefix name to reference the document vector

        Environment:
            CHROMA_EMBEDDING_QUANTIZE
            CHROMA_QUERY_SEARCH_LIMIT



### Instantiate capability

In [4]:
# Create the Knowledge capability used by every preparation step below.
# NOTE(review): `from_memory` presumably keeps the component's configuration
# in memory rather than on disk - confirm against the nn_rag documentation.
kn = Knowledge.from_memory()

In [16]:
# Alternative source: read the guide from a local copy instead of downloading it.
# tbl = kn.set_source_uri("./hadron/source/llama-Responsible-Use-Guide.pdf").load_source_canonical()
# Point the component at the published PDF and load it as the source table.
tbl = kn.set_source_uri("https://raw.githubusercontent.com/meta-llama/llama/main/Responsible-Use-Guide.pdf").load_source_canonical()
# Register where the embeddings will be persisted (used by the save step later on).
# NOTE(review): the recorded run of that save step failed with
# "OSError: [Errno 30] Read-only file system: '/./hadron'", so this URI appears
# to resolve under the filesystem root - confirm the intended path and scheme.
kn.set_persist_uri('chroma:///hadron/data/')

### Clean text

In [6]:
# Clean the loaded text using the tool's default pattern settings (no explicit
# pattern is passed here).
# NOTE(review): which patterns are replaced by default is not visible in this
# notebook - check the nn_rag documentation.
doc = kn.tools.replace_on_pattern(tbl)

### Sentences

In [7]:
# Split the cleaned text into sentences. The cells below read the 'char_count',
# 'score' and 'index' columns from the result.
# NOTE(review): 'score' is presumably enabled by include_score=True - confirm.
sentences = kn.tools.text_to_sentences(doc, include_score=True)

building sentences:   0%|          | 0/6709 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

In [8]:
# Profile of sentence lengths: maximum, selected percentiles (rounded to whole
# characters) and minimum.
char_counts = sentences['char_count']
size_labels = ['1%', '25%', '50%', '75%', '99%']
print(f"Max Sentence Size {pc.max(char_counts)}")
percentile_values = pc.quantile(char_counts, q=[0.01, 0.25, 0.5, 0.75, 0.99])
size_pairs = list(zip(size_labels, pc.round(percentile_values, 0).to_pylist()))
print(f"Percentile Sentence Size {size_pairs}")
print(f"Min Sentence Size {pc.min(char_counts)}")
print('')

# Upper tail of the score column, used to choose the similarity threshold
# applied in the next section.
score_values = sentences['score']
score_labels = ['99%', '99.5%', '99.9%', '99.98%']
percentile_values = pc.quantile(score_values, q=[0.99, 0.995, 0.999, 0.9998])
score_pairs = list(zip(score_labels, pc.round(percentile_values, 3).to_pylist()))
print(f"Percentile Similarity Score {score_pairs}")
print(f"Max Similarity Score {pc.round(pc.max(score_values), 3)}")

Max Sentence Size 777
Percentile Sentence Size [('1%', 1.0), ('25%', 97.0), ('50%', 137.0), ('75%', 192.0), ('99%', 502.0)]
Min Sentence Size 1

Percentile Similarity Score [('99%', 0.705), ('99.5%', 0.754), ('99.9%', 0.829), ('99.98%', 0.845)]
Max Similarity Score 0.849


### Threshold Similarity Scores

In [9]:
# Collect the 'index' value of every sentence whose score is above 0.80.
high_similarity = pc.filter(sentences, pc.greater(sentences['score'], 0.80)).column('index').to_pylist()
print(f"Threshold similarity for {high_similarity}")

# Hand those indices to the join tool, then re-check the upper percentiles of
# sentence length on the result.
# NOTE(review): filter_on_join presumably merges each flagged sentence with a
# neighbour - confirm against the nn_rag documentation.
sentences_sim = kn.tools.filter_on_join(sentences, indices=high_similarity)
percentile_values = pc.quantile(sentences_sim['char_count'], q=[0.9, 0.95, 0.98, 0.99])
print(f"Percentile Sentence size {list(zip(['90%', '95%', '98%', '99%'], pc.round(percentile_values,0).to_pylist()))}")


Threshold silimarity for [94]
Percentile Sentence size [('90%', 250.0), ('95%', 290.0), ('98%', 439.0), ('99%', 502.0)]


### Remove Short Sentences

In [10]:
# Filter the joined sentences on their 'char_count' column to remove very short
# entries (the recorded minimum length rises from 1 to 10 after this step).
# NOTE(review): the condition tuple is presumably (value, operator, logic), i.e.
# "char_count less than 5" - confirm the exact semantics in nn_rag.
sentences_trim = kn.tools.filter_on_condition(sentences_sim, header='char_count', condition=[(5, 'less', None)])

In [11]:
# Re-profile sentence lengths after the short-sentence filter: maximum,
# selected percentiles (three decimals) and minimum.
trim_counts = sentences_trim['char_count']
trim_labels = ['1%', '25%', '50%', '75%', '99%']
print(f"Max Sentence Size {pc.max(trim_counts)}")
percentile_values = pc.quantile(trim_counts, q=[0.01, 0.25, 0.5, 0.75, 0.99])
trim_pairs = list(zip(trim_labels, pc.round(percentile_values, 3).to_pylist()))
print(f"Percentile Sentence Size {trim_pairs}")
print(f"Min Sentence Size {pc.min(trim_counts)}")

Max Sentence Size 777
Percentile Sentence Size [('1%', 19.32), ('25%', 100.0), ('50%', 144.0), ('75%', 193.0), ('99%', 508.44)]
Min Sentence Size 10


### Sentence Chunking

In [12]:
# Break the trimmed sentences into chunks with chunk_size=768; the recorded
# maximum 'char_count' after this step is 768.
# NOTE(review): chunk_size is presumably a character limit - confirm in nn_rag.
sentence_chunks = kn.tools.text_to_chunks(sentences_trim, chunk_size=768)

building chunks:   0%|          | 0/245 [00:00<?, ?it/s]

In [13]:
# Length summary of the chunked text: maximum, mean (rounded to a whole number)
# and minimum of the 'char_count' column.
chunk_counts = sentence_chunks['char_count']
chunk_mean = pc.round(pc.mean(chunk_counts), 0)
print(f"Max Sentence size {pc.max(chunk_counts)}")
print(f"Mean Sentence size {chunk_mean}")
print(f"Min Sentence size {pc.min(chunk_counts)}")

Max Sentence size 768
Mean Sentence size 160.0
Min Sentence size 10


### Sentence Join

In [14]:
# Join the chunks using the same 768 chunk size, then report the lower
# percentiles of the resulting lengths.
sentence_join = kn.tools.filter_on_join(sentence_chunks, chunk_size=768)
join_labels = ['0.1%', '1%', '25%', '50%']
percentile_values = pc.quantile(sentence_join['char_count'], q=[0.001, 0.01, 0.25, 0.50])
join_pairs = list(zip(join_labels, pc.round(percentile_values, 0).to_pylist()))
print(f"Percentile Sentence size {join_pairs}")

Percentile Sentence size [('0.1%', 407.0), ('1%', 426.0), ('25%', 630.0), ('50%', 690.0)]


### Embedding

In [17]:
# Persist the joined text to the location registered with set_persist_uri.
# NOTE(review): the recorded run of this cell raised
# "OSError: [Errno 30] Read-only file system: '/./hadron'"; the persist path
# needs to point at a writable location before the query cells can work.
kn.save_persist_canonical(sentence_join)

OSError: [Errno 30] Read-only file system: '/./hadron'

----------------
## Chroma Vector DB

### Query

In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """Return ``text`` re-flowed so that no line exceeds ``wrap_length`` columns.

    Despite its name this helper prints nothing: it hands back the wrapped
    string and leaves displaying it to the caller.

    Args:
        text: The string to wrap.
        wrap_length: Maximum line width in characters (default 80).

    Returns:
        The wrapped text as a single newline-joined string.
    """
    return "\n".join(textwrap.wrap(text, width=wrap_length))

In [None]:
import random

# Sample questions about the Responsible Use Guide. The leading number ties
# each question to the matching entry in the "Model Answers" section below and
# is part of the string that gets sent as the query.
questions = [
    "1. What are the core principles of responsible AI mentioned in the guide?",
    "2. How does Meta's open approach contribute to AI innovation?",
    "3. What are the stages of responsible LLM product development according to the guide?",
    "4. What are some examples of product-specific fine-tuning for LLMs?",
    "5. What considerations should be taken into account when defining content policies for LLMs?",
    "6. What are the benefits of democratizing access to large language models, as stated in the guide?"
]

# Pick one question for the query cells below.
# NOTE(review): no seed is set, so a re-run may select a different question.
query = random.choice(questions)

### Model Answers
1. **Core principles of responsible AI:**
   The guide outlines core principles of responsible AI, which include fairness and inclusion, robustness and safety, privacy and security, and transparency and control. Additionally, it emphasizes the importance of governance and accountability mechanisms to ensure these principles are upheld throughout the development and deployment of AI systems.

2. **Meta's open approach and AI innovation:**
   Meta's open approach to AI innovation involves open-sourcing code and datasets, contributing to the AI community's infrastructure, and making large language models available for research. This approach fosters a vibrant AI-innovation ecosystem, driving breakthroughs in various sectors and enabling exploratory research and large-scale production deployment. It also draws upon the collective wisdom and diversity of the AI community to improve and democratize AI technology.

3. **Stages of responsible LLM product development:**
   The guide identifies four stages of responsible LLM product development: determining the use case, fine-tuning for the product, addressing input- and output-level risks, and building transparency and reporting mechanisms in user interactions. Each stage involves specific considerations and mitigation strategies to ensure the safe and effective deployment of LLM-powered products.

4. **Examples of product-specific fine-tuning:**
   Product-specific fine-tuning examples provided in the guide include text summarization, question answering, and sentiment analysis. For instance, a pretrained language model can be fine-tuned on a dataset of long-form documents and summaries for text summarization, on a Q&A dataset for answering questions, and on labeled text reviews for sentiment analysis. These examples demonstrate how fine-tuning can tailor a model's capabilities to specific use cases, enhancing performance and applicability.

5. **Considerations for defining content policies:**
   When defining content policies for LLMs, developers should consider the intended use and audience of their product, legal and safety limitations, and the needs of specific user communities. Content policies should outline allowable content and safety limitations, which will guide data annotation and safety fine-tuning. It is also important to address potential biases in human feedback and data annotation processes to ensure fairness and objectivity.

6. **Benefits of democratizing access to large language models:**
   Democratizing access to large language models, as discussed in the guide, reduces barriers to entry for small businesses and fosters innovation across various sectors. By making these models widely available, small organizations can leverage advanced AI technology without incurring prohibitive costs, leading to economic growth and a more level playing field. This approach also promotes collaboration and collective improvement of AI models, ensuring that advancements benefit a broader range of users and applications.


In [None]:
# Create the Retrieval capability and point it at the same URI that the
# Knowledge component was given as its persist location.
# NOTE(review): the recorded persist step failed with an OSError on this path,
# so the collection may not exist - confirm before running the queries.
rag = Retrieval.from_memory()
rag.set_source_uri('chroma:///hadron/data/')

In [None]:
# Show the selected question, run a similarity query for it and display the
# result as a report.
print(f"Query: {query}\n")

# NOTE(review): limit=5 presumably caps the number of returned matches - confirm.
answer = rag.tools.query_similarity(query, limit=5)
rag.table_report(answer)


In [None]:
# Show the selected question, run the reranker query for it and display the
# result as a report.
print(f"Query: {query}\n")

answer = rag.tools.query_reranker(query)
# NOTE(review): headers='distance' with drop=True presumably hides the
# 'distance' column from the report - confirm against nn_rag.
rag.table_report(answer, headers='distance', drop=True)


### Tidy up

In [None]:
# Clean up after the demo by removing the embedding from the vector store.
# NOTE(review): presumably this deletes the persisted collection, so the
# embedding step must be re-run before querying again - confirm.
rag.remove_embedding()

###### 