In [None]:
!pip install transformers
!pip install faiss-gpu
!pip install load_dotenv
!pip install tiktoken
!pip install langchain
!pip install sentence-transformers
!pip install farm-haystack[preprocessing]

In [None]:
%load_ext autoreload
%autoreload 2

import dotenv
import os
import numpy as np
import json
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import HumanMessage, SystemMessage
from huggingface_hub.inference_api import InferenceApi
from transformers import AutoTokenizer
from tqdm.notebook import tqdm
from scripts import result_exists, evaluate_response, generate_context
from embeddings import retrieve_relevant_excerpts_quickly

dotenv.load_dotenv()

In [None]:
# Each couple is (needle, question): the needle is the sentence that gets
# inserted into the haystack, the question is what the model is asked afterwards.
needle_question_couples = [
    (
        "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
        "What is the most fun thing to do in San Francisco?",
    ),
    (
        "\nThe most inspiring monument near the Hugging Face office in Paris is certainly the Louvre museum.\n",
        "What is the most inspiring monument near the Hugging Face office in Paris?",
    ),
]

# Couple used for this run: the Paris / Louvre one (index 1).
needle = needle_question_couples[1][0]
question = needle_question_couples[1][1]

### Choose evaluation model and RAG embeddings

In [None]:
# Model handed to `evaluate_response` to grade the answers (temperature 0).
# The key comes from OPENAI_API_KEY; when that variable is unset the
# placeholder string 'YourAPIKey' is passed instead.
evaluation_model = ChatOpenAI(
    model="gpt-4",
    temperature=0,
    openai_api_key=os.getenv('OPENAI_API_KEY', 'YourAPIKey'),
)

# Embedding model handed to the RAG retrieval step.
# It is placed on the 'cuda' device and embeddings are left un-normalized.
rag_embedding = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs=dict(device='cuda'),
    encode_kwargs=dict(normalize_embeddings=False),
)

### Choose LLMs

In [None]:
# OpenAI candidate model (temperature 0). The key comes from OPENAI_API_KEY,
# with the placeholder 'YourAPIKey' as fallback when the variable is unset.
openai_model_to_test = ChatOpenAI(
    model='gpt-4',
    temperature=0,
    openai_api_key=os.getenv('OPENAI_API_KEY', 'YourAPIKey'),
)

# Hugging Face candidate model: the tokenizer and the hosted inference client
# are both created for the same repository id.
hf_model_id = "HuggingFaceH4/zephyr-7b-beta"
hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
hf_client = InferenceApi(
    token=os.getenv('HUGGINGFACEHUB_API_TOKEN', 'YourHuggingFaceToken'),
    repo_id=hf_model_id,
)

In [None]:
def get_answer_hf(context: str, question: str, hf_tokenizer, hf_client) -> str:
    """Ask a Hugging Face hosted chat model to answer `question` from `context`.

    The prompt consists of a system message followed by two user messages (the
    context, then the question). It is rendered with the tokenizer's chat
    template and sent to `hf_client` as a single prompt string.

    Args:
        context: Text the answer must be drawn from.
        question: Question to ask about the context.
        hf_tokenizer: Object exposing `apply_chat_template(messages, tokenize=...,
            add_generation_prompt=...)` and `encode(text)`.
        hf_client: Callable that takes the prompt string and returns a non-empty
            list whose first item has a 'generated_text' entry.

    Returns:
        The generated text, with the echoed prompt removed when the client
        returned it in front of the completion.

    Raises:
        RuntimeError: If the client returns anything other than a non-empty list.
    """
    # The placeholders are filled in BEFORE the chat template is applied: calling
    # `.format` on the rendered template would fail (or substitute the wrong
    # thing) as soon as the template itself emits a literal '{' or '}'.
    messages = [
        {
            "role": "system",
            "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct.",
        },
        {"role": "user", "content": """
        You will have to answer this question based only on the context: {question}
        Here is the context: {context}
        """.format(question=question, context=context)},
        {"role": "user", "content": """
        Answer the following question in only one sentence: {question}
        Don't give information outside the document or repeat your findings.
        """.format(question=question)},
    ]

    full_prompt = hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Debug aid: prompt size in tokens and its first token ids (encoded only once).
    prompt_token_ids = hf_tokenizer.encode(full_prompt)
    print(len(prompt_token_ids), prompt_token_ids[:10])

    raw_response = hf_client(full_prompt)
    if not isinstance(raw_response, list) or not raw_response:
        # Anything else cannot be indexed as `[0]['generated_text']`; surface the
        # payload itself instead of an opaque KeyError/IndexError.
        raise RuntimeError(f"Unexpected response from the inference endpoint: {raw_response!r}")

    generated_text = raw_response[0]['generated_text']
    # Strip the prompt only when it really is echoed at the start of the output;
    # blindly slicing by length would cut into the answer when it is not.
    if generated_text.startswith(full_prompt):
        return generated_text[len(full_prompt):]
    return generated_text

def get_answer_openai(context, question, model):
    """Query a chat model with the haystack and the question.

    Three messages are sent, in this order: a system instruction, the context
    (the haystack with the needle placed in it), and the question followed by
    an instruction to stay within the document.

    Returns whatever `model(messages)` returns.
    """
    system_instruction = SystemMessage(
        content="You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
    )

    # The haystack, with the needle / random statement already placed in it.
    haystack = HumanMessage(content=f"CONTEXT:\n{context}")

    # The question asked to the model to try and retrieve the needle.
    needle_query = HumanMessage(
        content=f"{question} - Don't give information outside the document or repeat your findings"
    )

    return model([system_instruction, haystack, needle_query])

# Launch tests

In [None]:
# When True, the experiment loop passes the generated context through
# `retrieve_relevant_excerpts_quickly` before it is sent to the model.
COMPRESS_CONTEXT_WITH_RAG = True
# When True, answers come from `get_answer_hf`; otherwise from `get_answer_openai`.
USE_HF_MODEL = False
# Label stored in each result's 'model' field and used in the results file name
# (output/results_<label>.json). It is set by hand, so keep it in sync with the
# two flags above.
model_to_test_description = 'GPT4_RAG'

In [None]:
# Before each run the loop checks whether this context length / depth percent /
# version combination is already in the results and skips it if so.
# Bump the version number to run the whole grid again; leave it unchanged if
# you are just testing.
results_version = 2

# Context lengths (in tokens) to test: 15 evenly spaced values from 1,000 to
# 128,000 inclusive, rounded to integers. Make sure the largest one is within
# your model's limits.
context_lengths = np.rint(np.linspace(1000, 128000, 15)).astype(int)

# Depths (in percent) at which the needle is placed in the document: 15 evenly
# spaced values from 0 to 100 inclusive, rounded to integers.
# Suggestion: try other distributions (like a sigmoid) to test non-evenly spaced intervals.
document_depth_percents = np.rint(np.linspace(0, 100, 15)).astype(int)

In [None]:
# # Go generate the required length context and place your needle statement in

# test_needle, test_question = ("\nThe most inspiring monument near the Hugging Face office in Paris is certainly the Louvre museum.\n", "What is the most inspiring monument near the Hugging Face office in Paris?")

# context = generate_context(test_needle, 46357, 79)

# if COMPRESS_CONTEXT_WITH_RAG:
#     context = await retrieve_relevant_excerpts_quickly(context, test_question, rag_embedding, top_k=20, words_per_chunk=50, flag_mentions_of_paris=True)


In [None]:
# Run through each iteration of context_lengths and depths
for depth_percent in tqdm(document_depth_percents):
    for context_length in context_lengths:
        # Load results from file. 
        try:
            with open(f'output/results_{model_to_test_description}.json', 'r') as f:
                results = json.load(f)
        except FileNotFoundError:
            results = []
            pass

        # Checks to see if you've already checked a length/percent/version.
        # This helps if the program stop running and you want to restart later
        if result_exists(results, context_length, depth_percent, results_version, model_to_test_description):
            continue
        
        # Go generate the required length context and place your needle statement in
        context = generate_context(needle, context_length, depth_percent)

        if COMPRESS_CONTEXT_WITH_RAG:
            context = await retrieve_relevant_excerpts_quickly(context, question, rag_embedding, top_k=20, words_per_chunk=50)

        ### Get your model's answer to the question! Will it find your random fact?
        if USE_HF_MODEL:
            response = get_answer_hf(context, question, hf_tokenizer, hf_client)
        else:
            response = get_answer_openai(context, question, openai_model_to_test).content

        print(response)

        # Compare the reponse to the actual needle you placed
        score = evaluate_response(response, needle, question, evaluation_model)
        result = {
            'model' : model_to_test_description,
            'context_length' : int(context_length),
            'depth_percent' : int(depth_percent),
            'version' : results_version,
            'needle' : needle,
            'model_response' : response,
            'score' : score
        }
        if score < 10:
            result['context'] = context

        print (f"Result #: {len(results)}/{len(context_lengths) * len(document_depth_percents)}")
        print (f"Context: {context_length} tokens")
        print (f"Depth: {depth_percent}%")
        print (f"Context: {context_length} tokens")
        print (f"Depth: {depth_percent}%")
        print (f"Score: {score}")
        print (f"Response: {response}\n")

        results.append(result)

        # Save results to a JSON file each run
        with open(f'output/results_{model_to_test_description}.json', 'w') as f:
            json.dump(results, f)

- After RAG compression the resulting context is about 1,600 tokens. At `gpt-4` pricing (\$0.03 / 1K tokens) that is roughly \$0.05 per query.
- Sending the full long context (128K tokens) to `gpt-4-1106-preview` at \$0.01 / 1K tokens costs about \$1.28 per query.