In [1]:
# Display the value of every top-level expression in a cell, not only the
# last one, so intermediate results show up without explicit print() calls.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Suppress every Python warning for the rest of the session: 'ignore' is
# given with no category or module filter, so it applies to all warnings.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Retrieval

### Chroma Vector Params

        URI example in-memory
            uri = "chromadb:///<collection>?reference=<name>"
        URI example to file
            uri = "chromadb:///<path>/<collection>?reference=<name>"

        params:
            collection: The name of the collection
            reference: a prefix name to reference the document vector

        Environment:
            CHROMA_EMBEDDING_QUANTIZE
            CHROMA_QUERY_SEARCH_LIMIT



### Instantiate capability

In [4]:
# Create the Knowledge component that every ingestion step below goes through.
# NOTE(review): from_memory() presumably keeps the component's configuration
# in memory only (nothing written to disk) — confirm against nn_rag docs.
kn = Knowledge.from_memory()

In [5]:
# Local-file alternative to the remote download below (kept for offline use):
# tbl = kn.set_source_uri("./hadron/source/llama-Responsible-Use-Guide.pdf").load_source_canonical()
# Point the source at the Responsible Use Guide PDF on GitHub and load it as `tbl`.
tbl = kn.set_source_uri("https://raw.githubusercontent.com/meta-llama/llama/main/Responsible-Use-Guide.pdf").load_source_canonical()
# Register where the processed result will be persisted; the write itself
# happens later via kn.save_persist_canonical(...).
# NOTE(review): the parameter notes in this notebook show the scheme as
# "chromadb:///" while this call uses "chroma:///" — confirm both are accepted.
kn.set_persist_uri('chroma:///hadron/data/')

### Clean text

In [6]:
# Clean the loaded text before it is split into sentences.
# NOTE(review): no pattern arguments are passed, so the tool's defaults decide
# what gets replaced — check them against the nn_rag documentation.
doc = kn.tools.replace_on_pattern(tbl)

### Sentences

In [7]:
# Split the cleaned text into sentences.
# NOTE(review): with include_score=False the 'score' column read by the later
# cells is all zeros in the recorded output, so the 0.80 similarity threshold
# selects nothing. Presumably scoring is skipped by this flag — set it to True
# if the similarity step is meant to have an effect.
sentences = kn.tools.text_to_sentences(doc, include_score=False)

building sentences:   0%|          | 0/6709 [00:00<?, ?it/s]

In [8]:
# Character-length distribution of the extracted sentences.
char_counts = sentences['char_count']
print(f"Max Sentence Size {pc.max(char_counts)}")
percentile_values = pc.quantile(char_counts, q=[0.01, 0.25, 0.5, 0.75, 0.99])
size_pairs = list(zip(['1%', '25%', '50%', '75%', '99%'], pc.round(percentile_values, 0).to_pylist()))
print(f"Percentile Sentence Size {size_pairs}")
print(f"Min Sentence Size {pc.min(char_counts)}")
print('')

# Upper tail of the similarity scores attached to each sentence.
scores = sentences['score']
percentile_values = pc.quantile(scores, q=[0.99, 0.995, 0.999, 0.9998])
score_pairs = list(zip(['99%', '99.5%', '99.9%', '99.98%'], pc.round(percentile_values, 3).to_pylist()))
print(f"Percentile Similarity Score {score_pairs}")
print(f"Max Similarity Score {pc.round(pc.max(scores), 3)}")

Max Sentence Size 777
Percentile Sentence Size [('1%', 1.0), ('25%', 97.0), ('50%', 137.0), ('75%', 192.0), ('99%', 502.0)]
Min Sentence Size 1

Percentile Similarity Score [('99%', 0.0), ('99.5%', 0.0), ('99.9%', 0.0), ('99.98%', 0.0)]
Max Similarity Score 0


### Threshold Similarity Scores

In [9]:
# Score above which a sentence counts as highly similar. Named so the cut-off
# is visible and tunable in one place instead of being buried in the filter.
SIMILARITY_THRESHOLD = 0.80

# Collect the 'index' values of the rows whose score exceeds the threshold;
# the list is passed to filter_on_join below as `indices`.
high_similarity = pc.filter(sentences, pc.greater(sentences['score'], SIMILARITY_THRESHOLD)).column('index').to_pylist()
print(f"Threshold similarity for {high_similarity}")

# NOTE(review): filter_on_join presumably merges each listed sentence with a
# neighbouring one — confirm against the nn_rag documentation.
sentences_sim = kn.tools.filter_on_join(sentences, indices=high_similarity)
percentile_values = pc.quantile(sentences_sim['char_count'], q=[0.9, 0.95, 0.98, 0.99])
print(f"Percentile Sentence size {list(zip(['90%', '95%', '98%', '99%'], pc.round(percentile_values,0).to_pylist()))}")


Threshold silimarity for []
Percentile Sentence size [('90%', 249.0), ('95%', 285.0), ('98%', 438.0), ('99%', 502.0)]


### Remove Short Sentences

In [10]:
# Filter rows on sentence length using the 'char_count' column.
# NOTE(review): the (5, 'less', None) condition presumably removes sentences
# shorter than 5 characters — confirm the tuple semantics. In the recorded
# output the minimum char_count goes from 1 before this step to 10 after it.
sentences_trim = kn.tools.filter_on_condition(sentences_sim, header='char_count', condition=[(5, 'less', None)])

In [11]:
# Re-check the length distribution now that short sentences have been filtered.
trimmed_counts = sentences_trim['char_count']
print(f"Max Sentence Size {pc.max(trimmed_counts)}")
percentile_values = pc.quantile(trimmed_counts, q=[0.01, 0.25, 0.5, 0.75, 0.99])
trimmed_pairs = list(zip(['1%', '25%', '50%', '75%', '99%'], pc.round(percentile_values, 3).to_pylist()))
print(f"Percentile Sentence Size {trimmed_pairs}")
print(f"Min Sentence Size {pc.min(trimmed_counts)}")

Max Sentence Size 777
Percentile Sentence Size [('1%', 19.35), ('25%', 100.25), ('50%', 142.0), ('75%', 192.75), ('99%', 507.95)]
Min Sentence Size 10


### Sentence Chunking

In [12]:
# Build chunks from the trimmed sentences.
# NOTE(review): chunk_size=768 presumably caps the characters per chunk — the
# recorded maximum char_count after this step is exactly 768; confirm.
sentence_chunks = kn.tools.text_to_chunks(sentences_trim, chunk_size=768)

building chunks:   0%|          | 0/246 [00:00<?, ?it/s]

In [13]:
# Size summary (in characters) of the rows produced by the chunking step.
chunk_sizes = sentence_chunks['char_count']
largest, smallest = pc.max(chunk_sizes), pc.min(chunk_sizes)
average = pc.round(pc.mean(chunk_sizes), 0)
print(f"Max Sentence size {largest}")
print(f"Mean Sentence size {average}")
print(f"Min Sentence size {smallest}")

Max Sentence size 768
Mean Sentence size 159.0
Min Sentence size 10


### Sentence Join

In [14]:
# Join the chunks using the same 768 size as the chunking step, then report
# the lower end of the resulting character-length distribution.
sentence_join = kn.tools.filter_on_join(sentence_chunks, chunk_size=768)
joined_sizes = sentence_join['char_count']
percentile_values = pc.quantile(joined_sizes, q=[0.001, 0.01, 0.25, 0.50])
joined_pairs = list(zip(['0.1%', '1%', '25%', '50%'], pc.round(percentile_values, 0).to_pylist()))
print(f"Percentile Sentence size {joined_pairs}")

Percentile Sentence size [('0.1%', 407.0), ('1%', 426.0), ('25%', 630.0), ('50%', 690.0)]


### Embedding

In [15]:
# Persist the joined chunks to the URI registered earlier with set_persist_uri.
# NOTE(review): for the Chroma target this is presumably the point where the
# text is embedded and stored in the collection — confirm.
kn.save_persist_canonical(sentence_join)

----------------
## Chroma Vector DB

### Query

In [16]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """Return ``text`` re-flowed so no line is longer than ``wrap_length``.

    Despite the name, nothing is printed: the wrapped text is handed back to
    the caller as a single newline-joined string.

    Args:
        text: The string to wrap.
        wrap_length: Maximum line width in characters (default 80).

    Returns:
        The wrapped text, with lines separated by ``\\n``.
    """
    lines = textwrap.wrap(text, wrap_length)
    return "\n".join(lines)

In [17]:
import random

# Candidate questions about the Responsible Use Guide. The numeric prefix
# ties each question to the matching entry in the model answers section.
questions = [
    "1. What are the core principles of responsible AI mentioned in the guide?",
    "2. How does Meta's open approach contribute to AI innovation?",
    "3. What are the stages of responsible LLM product development according to the guide?",
    "4. What are some examples of product-specific fine-tuning for LLMs?",
    "5. What considerations should be taken into account when defining content policies for LLMs?",
    "6. What are the benefits of democratizing access to large language models, as stated in the guide?"
]

# Pick the question with a seeded, private RNG so that Restart & Run All
# always selects the same one and the retrieval outputs are reproducible.
# A private random.Random instance also leaves the global random state alone.
# Change QUERY_SEED (or index `questions` directly) to try another question.
QUERY_SEED = 42
query = random.Random(QUERY_SEED).choice(questions)

### Model Answers
1. **Core principles of responsible AI:**
   The guide outlines core principles of responsible AI, which include fairness and inclusion, robustness and safety, privacy and security, and transparency and control. Additionally, it emphasizes the importance of governance and accountability mechanisms to ensure these principles are upheld throughout the development and deployment of AI systems.

2. **Meta's open approach and AI innovation:**
   Meta's open approach to AI innovation involves open-sourcing code and datasets, contributing to the AI community's infrastructure, and making large language models available for research. This approach fosters a vibrant AI-innovation ecosystem, driving breakthroughs in various sectors and enabling exploratory research and large-scale production deployment. It also draws upon the collective wisdom and diversity of the AI community to improve and democratize AI technology.

3. **Stages of responsible LLM product development:**
   The guide identifies four stages of responsible LLM product development: determining the use case, fine-tuning for the product, addressing input- and output-level risks, and building transparency and reporting mechanisms in user interactions. Each stage involves specific considerations and mitigation strategies to ensure the safe and effective deployment of LLM-powered products.

4. **Examples of product-specific fine-tuning:**
   Product-specific fine-tuning examples provided in the guide include text summarization, question answering, and sentiment analysis. For instance, a pretrained language model can be fine-tuned on a dataset of long-form documents and summaries for text summarization, on a Q&A dataset for answering questions, and on labeled text reviews for sentiment analysis. These examples demonstrate how fine-tuning can tailor a model's capabilities to specific use cases, enhancing performance and applicability.

5. **Considerations for defining content policies:**
   When defining content policies for LLMs, developers should consider the intended use and audience of their product, legal and safety limitations, and the needs of specific user communities. Content policies should outline allowable content and safety limitations, which will guide data annotation and safety fine-tuning. It is also important to address potential biases in human feedback and data annotation processes to ensure fairness and objectivity.

6. **Benefits of democratizing access to large language models:**
   Democratizing access to large language models, as discussed in the guide, reduces barriers to entry for small businesses and fosters innovation across various sectors. By making these models widely available, small organizations can leverage advanced AI technology without incurring prohibitive costs, leading to economic growth and a more level playing field. This approach also promotes collaboration and collective improvement of AI models, ensuring that advancements benefit a broader range of users and applications.


In [18]:
# Create the Retrieval component and point its source at the same Chroma URI
# that the Knowledge component was given as its persist location.
rag = Retrieval.from_memory()
rag.set_source_uri('chroma:///hadron/data/')

<nn_rag.components.retrieval.Retrieval at 0x7fc1a10eabc0>

In [19]:
# Echo the selected question so the table below can be read in context.
print(f"Query: {query}\n")

# Run the similarity query, asking for at most 5 results, and render them as
# a report table (the recorded output has id, distance and source columns).
answer = rag.tools.query_similarity(query, limit=5)
rag.table_report(answer)


Query: 3. What are the stages of responsible LLM product development according to the guide?



Unnamed: 0,id,distance,source
0,general_15,1.326,"In addition to performing a variety of pretraining data-level investigations to help understand the potential capabilities and limitations of our models, we applied considerable safety mitigations to the fine-tuned versions of the model through supervised fine-tuning, reinforcement learning from human feedback (RLHF), and iterative red teaming (these steps are covered further in the section - Fine-tune for product). Information on pretraining data, model architecture and parameters, and pretrained evaluations are contained in the Llama 2 research paper. The paper also describes in further detail the steps to develop the fine-tuned versions, including detailed safety alignment efforts and evaluation results."
1,general_56,1.3865,"Here are some key considerations for implementing these components in unison: • Holistic optimization. Although each component has a specific role and optimization goal, components are not isolated entities. Over- optimization of one component without considering its interaction with others can lead to suboptimal outcomes. For instance, over- filtering training data for safety might make later fine-tuning less effective, as the model may not recognize and handle unsafe content appropriately. This is why different layers of safety mitigations throughout the development lifecycle are critical for creating high-performing, responsible products. • Alignment of objectives at each stage of development."
2,general_34,1.3887,"There are many complementary types of evaluations that are useful for measuring risks in models, including automatic benchmarks, manual annotations by human raters, and evaluations using an LLM itself as a rater. The Holistic Evaluation of Language Models discusses some of the commonly used automatic benchmarks. Evaluation strategies and processes to improve performance can include: • Automatic evaluation leverages automatic benchmarks and classifiers to judge the output with respect to a specific category of risk. Manual evaluation leverages human annotators or subject matter experts to judge the model’s output."
3,general_27,1.3968,"To mitigate these risks, carefully design the fine-tuning process by curating a high-quality dataset that is representative of your use case, conduct rigorous evaluations, and test your fine-tuned model’s potential use via red teaming (covered in step four - Evaluate and improve performance). STEP 3: TRAIN THE MODEL Fine-tuning involves training the model for a limited number of iterations. Once a pretrained model is loaded in the environment for fine-tuning, the training process involves setting up hyperparameters like epochs, batch size, and learning rate. The data are passed through the model, loss is computed, and weights are updated through backpropagation."
4,general_20,1.3993,"If you’re a developer who is not certain of a particular use case for which you would want to use the model, consider focusing on use cases that improve the lives of people and society, taking into consideration different ethical principles and values. Developing or adopting an internal risk assessment process can help identify potential risks for a specific use case and should focus on how your product’s end users and others could be affected. This understanding is critical for evaluating in-context safety for your product deployment, and can take forms such as surveys and interviews of potential users or market analysis of similar product applications."


In [20]:
# Echo the selected question so the table below can be read in context.
print(f"Query: {query}\n")

# Run the same question through the reranking query. The report is asked to
# drop the 'distance' header, so the recorded table shows cross-encoder_score,
# id and source only.
answer = rag.tools.query_reranker(query)
rag.table_report(answer, headers='distance', drop=True)


Query: 3. What are the stages of responsible LLM product development according to the guide?



Unnamed: 0,cross-encoder_score,id,source
0,0.5686,general_34,"There are many complementary types of evaluations that are useful for measuring risks in models, including automatic benchmarks, manual annotations by human raters, and evaluations using an LLM itself as a rater. The Holistic Evaluation of Language Models discusses some of the commonly used automatic benchmarks. Evaluation strategies and processes to improve performance can include: • Automatic evaluation leverages automatic benchmarks and classifiers to judge the output with respect to a specific category of risk. Manual evaluation leverages human annotators or subject matter experts to judge the model’s output."
1,0.5665,general_14,"It is critical that developers examine each layer of the product to determine which potential risks may arise based on the product objectives and design, and implement mitigation strategies accordingly. The following section presents responsible AI considerations for the different stages of LLM product development. At each of these levels, we highlight best practices for mitigating potential risks. 5 JULY 2023 Llama 2 is a new version of the Llama 1 model, which was made available previously for research. The new pretrained and fine-tuned versions of the model have been updated for commercial release."
2,0.5249,general_18,"If you have terms of service or other relevant policies that apply to how individuals may interact with your LLM, you may wish to fine-tune your model to be aligned with those policies. It may also be necessary to establish new terms of service and policies specific to LLMs, or notify users about how their data or feedback provided will be used in fine-tuning. Development of the foundation model 6 JULY 2023 Developers will identify a specific product use case for the released model, and are responsible for assessing risks associated with that use case and applying best practices to ensure safety. This section outlines the considerations and mitigation strategies available at each stage of product development and deployment."
3,0.5078,general_8,Decisions to implement best practices should be evaluated based on the jurisdiction where your products will be deployed and should follow your company’s internal legal and risk management processes. How to use this guide This guide is a resource for developers that outlines common approaches to building responsibly at each level of an LLM-powered product. It covers best practices and considerations that developers should evaluate in the context of their specific use case and market. It also highlights some mitigation strategies and resources available to developers to address risks at various points in the system. These best practices should be considered holistically because strategies adopted at one level can impact the entire system.
4,0.4734,general_32,"The synthetic reward modeling data are then used to augment the reward model’s training data. THE RESPONSIBLE FINE-TUNING FLOW training progress is monitored using a validation set, and hyperparameters are adjusted as necessary. Fine-tuning an LLM for safety can involve a number of techniques, many of which the research paper on Llama 2 describes in greater depth. These techniques can include: • Supervised Fine-Tuning (SFT): Supervised fine- tuning using data annotated across helpfulness and safety. • Reinforcement Learning from Human Feedback (RLHF) or AI Feedback (RLAIF): Training safety and helpfulness reward models to support RLHF techniques iteratively improves models and makes them more robust to jailbreaking techniques."


### Tidy up

In [21]:
# Optional clean-up, intentionally left disabled; uncomment to run it.
# NOTE(review): presumably deletes the embeddings stored at the Retrieval
# source URI — confirm before running, as it may not be reversible.
# rag.remove_embedding()

###### 