In [1]:
# Display the value of every top-level expression in a cell, not only the last
# one, so intermediate results show up without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Suppress every Python warning for the rest of the kernel session so library
# noise does not clutter the cell outputs.
# NOTE(review): this blanket filter also hides deprecation and runtime warnings
# that may matter; consider narrowing it by category or module.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pyarrow as pa
import pyarrow.compute as pc
from nn_rag import Knowledge, Retrieval

### Chroma Vector Params

        URI example in-memory
            uri = "chromadb:///<collection>?reference=<name>"

        params:
            collection: The name of the collection
            reference: a prefix name to reference the document vector

        Environment:
            CHROMA_EMBEDDING_QUANTIZE
            CHROMA_QUERY_SEARCH_LIMIT



### Instantiate capability

In [4]:
# Create the Knowledge component through its `from_memory` factory.
# NOTE(review): presumably this keeps the component's configuration in memory
# rather than on disk — confirm against the nn_rag documentation.
kn = Knowledge.from_memory()

In [5]:
# Point the component at the source PDF and load it. `tbl` holds whatever
# `load_source_canonical()` returns (presumably a pyarrow Table — TODO confirm).
tbl = kn.set_source_uri("./hadron/source/llama-Responsible-Use-Guide.pdf").load_source_canonical()
# Register the persist target for the component.
# NOTE(review): the parameter notes above document the scheme as
# "chromadb:///<collection>?reference=<name>", while this call uses 'chroma:///'
# with no collection or reference — confirm which form is intended.
kn.set_persist_uri('chroma:///')

### Clean text

In [6]:
# Clean the loaded text with the pattern-replacement tool. No pattern arguments
# are passed, so whatever defaults the tool defines are applied.
doc = kn.tools.replace_on_pattern(tbl)

### Sentences

In [7]:
# Split the cleaned document into sentences, then summarise the `char_count`
# column (max / mean / min) as a quick sanity check on the split.
sentences = kn.tools.text_to_sentences(doc, disable_progress_bar=True)
sentence_char_counts = sentences['char_count']
print(
    f"Max Sentence size {pc.max(sentence_char_counts)}",
    f"Mean Sentence size {pc.mean(sentence_char_counts)}",
    f"Min Sentence size {pc.min(sentence_char_counts)}",
    sep="\n",
)

Max Sentence size 872
Mean Sentence size 164.91020408163266
Min Sentence size 13


### Sentence Chunking

In [8]:
# Break the sentences into chunks with `chunk_size=500`, then summarise the
# resulting `char_count` column (max / mean / min).
sentence_chunks = kn.tools.text_to_chunks(sentences, chunk_size=500, disable_progress_bar=True)
chunk_char_counts = sentence_chunks['char_count']
print(
    f"Max Sentence size {pc.max(chunk_char_counts)}",
    f"Mean Sentence size {pc.mean(chunk_char_counts)}",
    f"Min Sentence size {pc.min(chunk_char_counts)}",
    sep="\n",
)

Max Sentence size 500
Mean Sentence size 162.99203187250995
Min Sentence size 13


### Sentence Join

In [9]:
# Join the chunks back together under the same `chunk_size=500` setting, then
# summarise the resulting `char_count` column (max / mean / min).
sentence_join = kn.tools.filter_on_join(sentence_chunks, chunk_size=500, disable_progress_bar=True)
joined_char_counts = sentence_join['char_count']
print(
    f"Max Sentence size {pc.max(joined_char_counts)}",
    f"Mean Sentence size {pc.mean(joined_char_counts)}",
    f"Min Sentence size {pc.min(joined_char_counts)}",
    sep="\n",
)

Max Sentence size 500
Mean Sentence size 410.62
Min Sentence size 181


### Embedding

In [10]:
# Persist the joined chunks through the Knowledge component, whose persist URI
# was set to 'chroma:///' earlier in the notebook.
# NOTE(review): presumably this is the step that computes and stores the
# embeddings — confirm against the nn_rag documentation.
kn.save_persist_canonical(sentence_join)

----------------
## Chroma Vector DB

### Query

In [11]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """Return ``text`` re-flowed so no line is longer than ``wrap_length``.

    Despite its name, this function prints nothing: it hands back the wrapped
    string and leaves displaying it to the caller.

    Args:
        text: The string to wrap.
        wrap_length: Maximum line width in characters (default 80).

    Returns:
        The wrapped text as a single string with lines separated by newlines.
    """
    # Joining the wrapped lines with "\n" is what textwrap.fill() does internally.
    return "\n".join(textwrap.wrap(text, width=wrap_length))

In [12]:
import random

# Candidate questions about the guide. The numeric prefixes line up with the
# numbered "Model Answers" in the markdown cell that follows, so a retrieved
# result can be compared against the expected answer.
questions = [
    "1. What are the core principles of responsible AI mentioned in the guide?",
    "2. How does Meta's open approach contribute to AI innovation?",
    "3. What are the stages of responsible LLM product development according to the guide?",
    "4. What are some examples of product-specific fine-tuning for LLMs?",
    "5. What considerations should be taken into account when defining content policies for LLMs?",
    "6. What are the benefits of democratizing access to large language models, as stated in the guide?"
]

# Pick one question at random for the retrieval cells below.
# NOTE(review): the generator is not seeded, so a fresh run may select a
# different question from the one shown in the saved outputs.
query = random.choice(questions)

### Model Answers
1. **Core principles of responsible AI:**
   The guide outlines core principles of responsible AI, which include fairness and inclusion, robustness and safety, privacy and security, and transparency and control. Additionally, it emphasizes the importance of governance and accountability mechanisms to ensure these principles are upheld throughout the development and deployment of AI systems.

2. **Meta's open approach and AI innovation:**
   Meta's open approach to AI innovation involves open-sourcing code and datasets, contributing to the AI community's infrastructure, and making large language models available for research. This approach fosters a vibrant AI-innovation ecosystem, driving breakthroughs in various sectors and enabling exploratory research and large-scale production deployment. It also draws upon the collective wisdom and diversity of the AI community to improve and democratize AI technology.

3. **Stages of responsible LLM product development:**
   The guide identifies four stages of responsible LLM product development: determining the use case, fine-tuning for the product, addressing input- and output-level risks, and building transparency and reporting mechanisms in user interactions. Each stage involves specific considerations and mitigation strategies to ensure the safe and effective deployment of LLM-powered products.

4. **Examples of product-specific fine-tuning:**
   Product-specific fine-tuning examples provided in the guide include text summarization, question answering, and sentiment analysis. For instance, a pretrained language model can be fine-tuned on a dataset of long-form documents and summaries for text summarization, on a Q&A dataset for answering questions, and on labeled text reviews for sentiment analysis. These examples demonstrate how fine-tuning can tailor a model's capabilities to specific use cases, enhancing performance and applicability.

5. **Considerations for defining content policies:**
   When defining content policies for LLMs, developers should consider the intended use and audience of their product, legal and safety limitations, and the needs of specific user communities. Content policies should outline allowable content and safety limitations, which will guide data annotation and safety fine-tuning. It is also important to address potential biases in human feedback and data annotation processes to ensure fairness and objectivity.

6. **Benefits of democratizing access to large language models:**
   Democratizing access to large language models, as discussed in the guide, reduces barriers to entry for small businesses and fosters innovation across various sectors. By making these models widely available, small organizations can leverage advanced AI technology without incurring prohibitive costs, leading to economic growth and a more level playing field. This approach also promotes collaboration and collective improvement of AI models, ensuring that advancements benefit a broader range of users and applications.


In [13]:
# Create the Retrieval component and point it at the same 'chroma:///' URI that
# the Knowledge component was given as its persist target.
rag = Retrieval.from_memory()
# Left as a bare expression, so the returned object's repr is displayed as the
# cell output.
rag.set_source_uri('chroma:///')

<nn_rag.components.retrieval.Retrieval at 0x7fe8b9d42cb0>

In [14]:
# Echo the selected question so the saved output is self-describing.
print(f"Query: {query}\n")

# Fetch up to 5 similarity matches for the query and render them as a table.
# NOTE(review): the `distance` column looks like a vector distance where lower
# means closer — confirm the metric against the nn_rag / Chroma configuration.
answer = rag.tools.query_similarity(query, limit=5)
rag.table_report(answer)


Query: 3. What are the stages of responsible LLM product development according to the guide?



Unnamed: 0,id,distance,source
0,general_24,1.3252,"In addition to performing a variety of pretraining data-level investigations to help understand the potential capabilities and limitations of our models, we applied considerable safety mitigations to the fine-tuned versions of the model through supervised fine-tuning, reinforcement learning from human feedback (RLHF), and iterative red teaming (these steps are covered further in the section - Fine-tune for product)."
1,general_57,1.3318,"The final stage is to evaluate the fine-tuned model on a test set to measure its performance on the specific task and against safety benchmarks, according to the use case. This includes analyzing the model’s strengths and weaknesses based on evaluation results, gathering more data to further enhance performance and safety, and iterating until satisfied with the model’s performance using holdout test datasets."
2,general_45,1.3493,"To mitigate these risks, carefully design the fine-tuning process by curating a high-quality dataset that is representative of your use case, conduct rigorous evaluations, and test your fine-tuned model’s potential use via red teaming (covered in step four - Evaluate and improve performance). STEP 3: TRAIN THE MODEL Fine-tuning involves training the model for a limited number of iterations."
3,general_33,1.3679,"Developing or adopting an internal risk assessment process can help identify potential risks for a specific use case and should focus on how your product’s end users and others could be affected. This understanding is critical for evaluating in-context safety for your product deployment, and can take forms such as surveys and interviews of potential users or market analysis of similar product applications."
4,general_96,1.3709,"This is why different layers of safety mitigations throughout the development lifecycle are critical for creating high-performing, responsible products. • Alignment of objectives at each stage of development. To yield a product that is optimized for your target use cases, it’s essential to have a consistent set of goals and outcomes that guide each stage of the process. From the data-collection stage to user feedback, be sure to keep your overall goal in mind."


In [15]:
# Echo the selected question so the saved output is self-describing.
print(f"Query: {query}\n")

# Run the same query through the reranker; the saved output carries a
# `cross-encoder_score` column instead of `distance`.
answer = rag.tools.query_reranker(query)
# NOTE(review): `headers='distance', drop=True` presumably removes the distance
# column from the report — confirm against table_report's signature.
rag.table_report(answer, headers='distance', drop=True)


Query: 3. What are the stages of responsible LLM product development according to the guide?



Unnamed: 0,source,id,cross-encoder_score
0,"At various points in the product development lifecycle, developers make decisions that shape the objectives and functionality of the feature, which can introduce potential risks. These decision points also provide opportunities to mitigate potential risks. It is critical that developers examine each layer of the product to determine which potential risks may arise based on the product objectives and design, and implement mitigation strategies accordingly.",general_22,0.5351
1,"This is why different layers of safety mitigations throughout the development lifecycle are critical for creating high-performing, responsible products. • Alignment of objectives at each stage of development. To yield a product that is optimized for your target use cases, it’s essential to have a consistent set of goals and outcomes that guide each stage of the process. From the data-collection stage to user feedback, be sure to keep your overall goal in mind.",general_96,0.4827
2,"There are many complementary types of evaluations that are useful for measuring risks in models, including automatic benchmarks, manual annotations by human raters, and evaluations using an LLM itself as a rater. The Holistic Evaluation of Language Models discusses some of the commonly used automatic benchmarks.",general_58,0.4608
3,ent stages 7 Determine use case 7 Fine-tune for product 8 The responsible fine-tuning flow 9 Step 1: Define content policies & mitigations 9 Step 2: Prepare data 10 Step 3: Train the model 10 Reinforcement Learning from Human Feedback (RLHF) 11 Reinforcement Learning from AI Feedback (RLAIF) 11 Step 4: Evaluate and improve performance 12,general_1,0.4256
4,"Information on pretraining data, model architecture and parameters, and pretrained evaluations are contained in the Llama 2 research paper. The paper also describes in further detail the steps to develop the fine-tuned versions, including detailed safety alignment efforts and evaluation results. Additional information is included in the model card accompanying the release.",general_25,0.4162


### Tidy up

In [16]:
# Tidy up: ask the Retrieval component to remove the stored embedding.
# NOTE(review): exactly what is removed is defined by nn_rag — confirm the
# scope before running this against a shared store.
rag.remove_embedding()

###### 