In [2]:
import os
import glob
import json
import time
import pypdf
import openai
import random
import itertools
import numpy as np
import pandas as pd
from sklearn import svm
from io import StringIO
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.evaluation.qa import QAEvalChain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from text_utils import GRADE_DOCS_PROMPT, GRADE_ANSWER_PROMPT, GRADE_DOCS_PROMPT_FAST, GRADE_ANSWER_PROMPT_FAST

from gpt_index import (
    GPTTreeIndex, 
    GPTSimpleVectorIndex, 
    SimpleDirectoryReader, 
    LLMPredictor, 
    ServiceContext,
    Response
)
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from gpt_index.evaluation import ResponseEvaluator
import pandas as pd
pd.set_option('display.max_colwidth', 0)

`Load docs`

In [3]:
def load_docs(files):

    # Load docs
    # IN: List of upload files (from Streamlit)
    # OUT: str
    # TODO: Support multple docs, Use Langchain loader

    all_text = ""
    for file_path in files:
        file_extension = os.path.splitext(file_path)[1]
        if file_extension == ".pdf":
            pdf_reader = pypdf.PdfReader(file_path)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
            all_text += text
        elif file_extension == ".txt":
            loader = UnstructuredFileLoader(file_path)
            docs = loader.load()
            all_text += docs[0].page_content
        else:
            print('Please provide txt or pdf.')

    return all_text

fis = glob.glob("docs/transformers/*pdf")
# fis = glob.glob("docs/karpathy-lex-pod/*txt")
text = load_docs(fis)

`Split` 

In [4]:
def split_texts(text, chunk_size, overlap, split_method):

    # Split text
    # IN: text, chunk size, overlap
    # OUT: list of str splits
    # TODO: Add parameter for splitter type

    print("`Splitting doc ...`")
    if split_method == "RecursiveTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                       chunk_overlap=overlap)
    elif split_method == "CharacterTextSplitter":
        text_splitter = CharacterTextSplitter(separator=" ",
                                              chunk_size=chunk_size,
                                              chunk_overlap=overlap)
    splits = text_splitter.split_text(text)
    return splits

split_method = "RecursiveTextSplitter" 
overlap = 100
chunk_size = 1000
splits = split_texts(text, chunk_size, overlap, split_method)

`Splitting doc ...`


`Llama Index retriver`

In [30]:
from llama_index import Document
from gpt_index import (GPTSimpleVectorIndex)
documents = [Document(t) for t in splits]
# *** How are chunks used since we've passed in a list of chunked docs? ***
context = ServiceContext.from_defaults(chunk_size_limit=512)
# *** What vector DB implentation? ***
vector_index = GPTSimpleVectorIndex.from_documents(documents, service_context=context)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 823050 tokens


In [42]:
# ***  Can I specify k? ***
query = "What corpus of data was GPT-3 trained on"
answer = vector_index.query(query)

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 289 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens


In [32]:
answer.response

'\nGPT-3 was trained on a large web corpus.'

In [33]:
answer.source_nodes

[NodeWithScore(node=Node(text='achieves strong performance on many NLP datasets, including translation, question-answering, and\ncloze tasks, as well as several tasks that require on-the-ﬂy reasoning or domain adaptation, such as\nunscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same\ntime, we also identify some datasets where GPT-3’s few-shot learning still struggles, as well as some\ndatasets where GPT-3 faces methodological issues related to training on large web corpora. Finally,\nwe ﬁnd that GPT-3 can generate samples of news articles which human evaluators have difﬁculty\ndistinguishing from articles written by humans. We discuss broader societal impacts of this ﬁnding\nand of GPT-3 in general.\n\x03Equal contribution\nyJohns Hopkins University, OpenAI\nAuthor contributions listed at end of paper.arXiv:2005.14165v4  [cs.CL]  22 Jul 2020Contents\n1 Introduction 3\n2 Approach 6', doc_id='a815d4c1-2d1f-43ab-83eb-9c4d1facc672', embedding=

`Llama Index eval`

In [35]:
# This mode of evaluation will return “YES”/”NO” if the synthesized response matches any source context.
evaluator = ResponseEvaluator(service_context=context)
eval_result = evaluator.evaluate(answer)
eval_result

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 505 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens


'YES'

In [37]:
# This mode of evaluation will return “YES”/”NO” for every source node.
eval_result = evaluator.evaluate_source_nodes(answer)
pct = eval_result.count('YES') / len(eval_result)
eval_result

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 505 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens


['YES']

`SVMretriver`

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

class SVMretriver:
    
    def __init__(self, splits, embeddings):
        
        self.embeddings = embeddings
        self.splits = splits 
        self.embedded_splits = np.array([self.embeddings.embed_query(split) for split in self.splits])
                            
    def get_relevant_documents(self, question, k):
        
        query = np.array(self.embeddings.embed_query(question))
        x = np.concatenate([query[None, ...], self.embedded_splits])
        y = np.zeros(x.shape[0])
        y[0] = 1
        
        clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
        clf.fit(x, y)
        
        similarities = clf.decision_function(x)
        sorted_ix = np.argsort(-similarities)
        
        top_k_results = []
        for row in sorted_ix[1:k+1]:
            top_k_results.append(Document(page_content=self.splits[row]))
        return top_k_results
    
embeddings = OpenAIEmbeddings()
embeddings = HuggingFaceEmbeddings()
fis = glob.glob("docs/karpathy-lex-pod/*txt")
text = load_docs(fis)
svm_retriever = SVMretriver(splits,embeddings)
top_k_docs = svm_retriever.get_relevant_documents(question, 3)

`Make retriver`

In [5]:
def make_retriever(splits, retriever_type, embeddings, num_neighbors):

    # Make document retriever
    # IN: list of str splits, retriever type, embedding type, number of neighbors for retrieval
    # OUT: retriever

    print("`Making retriever ...`")
    # Set embeddings
    if embeddings == "OpenAI":
        embd = OpenAIEmbeddings()
    elif embeddings == "HuggingFace":
        embd = HuggingFaceEmbeddings()

    # Select retriever
    if retriever_type == "similarity-search":
        try:
            vectorstore = FAISS.from_texts(splits, embd)
        except ValueError:
            print("`Error using OpenAI embeddings (disallowed TikToken token in the text). Using HuggingFace.`", icon="⚠️")
            vectorstore = FAISS.from_texts(splits, HuggingFaceEmbeddings())
        retriever = vectorstore.as_retriever(k=num_neighbors)
    elif retriever_type == "SVM":
        retriever = SVMRetriever.from_texts(splits,embd)
    elif retriever_type == "TF-IDF":
        retriever = TFIDFRetriever.from_texts(splits)
    return retriever

retriever_type = "similarity-search"
embeddings = "OpenAI"
num_neighbors = 3
retriever = make_retriever(splits, retriever_type, embeddings, num_neighbors)

`Making retriever ...`


`Make chain`

In [9]:
def make_chain(model_version, retriever):

    # Make chain
    # IN: model version, retriever
    # OUT: chain

    if (model_version == "gpt-3.5-turbo") or (model_version == "gpt-4"):
        llm = ChatOpenAI(model_name=model_version, temperature=0)
    elif model_version == "anthropic":
        llm = Anthropic(temperature=0)
    qa = RetrievalQA.from_chain_type(llm,
                                     chain_type="stuff",
                                     retriever=retriever,
                                     input_key="question")
    return qa

model_version = "gpt-3.5-turbo"
qa_chain = make_chain(model_version,retriever)

`Generate eval`

In [19]:
def grade_model_answer(predicted_dataset, predictions, grade_answer_prompt):

    # Grade the distilled answer
    # IN: ground truth, model predictions
    # OUT: list of scores

    print("`Grading model answer ...`")
    if grade_answer_prompt == "Fast":
        prompt = GRADE_ANSWER_PROMPT_FAST
    else:
        prompt = GRADE_ANSWER_PROMPT

    eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 
                                      prompt=prompt)
    graded_outputs = eval_chain.evaluate(predicted_dataset,
                                         predictions,
                                         question_key="question",
                                         prediction_key="result")
    return graded_outputs

def grade_model_retrieval(gt_dataset, predictions, grade_docs_prompt):
    
    # Grade the docs retrieval
    # IN: ground truth, model predictions
    # OUT: list of scores

    print("`Grading relevance of retrived docs ...`")
    if grade_docs_prompt == "Fast":
        prompt = GRADE_DOCS_PROMPT_FAST
    else:
        prompt = GRADE_DOCS_PROMPT

    eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 
                                      prompt=prompt)
    graded_outputs = eval_chain.evaluate(gt_dataset,
                                         predictions,
                                         question_key="question",
                                         prediction_key="result")
    return graded_outputs

def run_eval(chain, retriever, eval_qa_pair, grade_prompt):

    # Compute eval
    # IN: chain, retriever, eval question, flag for docs retrieval prompt
    # OUT: list of scores for answers and retrival, latency, predictions

    print("`Running eval ...`")
    predictions = []
    retrived_docs = []
    gt_dataset = []
    latency = []
    
    # Get answer and log latency
    start_time = time.time()
    predictions.append(chain(eval_qa_pair))
    gt_dataset.append(eval_qa_pair)
    end_time = time.time()
    elapsed_time = end_time - start_time
    latency.append(elapsed_time)

    # Retrive data
    docs=retriever.get_relevant_documents(eval_qa_pair["question"])

    # Extract text from retrived docs
    retrived_doc_text = ""
    for i,doc in enumerate(docs):
        retrived_doc_text += "Doc %s: "%str(i+1) + doc.page_content + " "
    retrived = {"question": eval_qa_pair["question"], "answer": eval_qa_pair["answer"], "result": retrived_doc_text}
    retrived_docs.append(retrived)
        
    # Grade
    graded_answers = grade_model_answer(gt_dataset, predictions, grade_prompt)
    graded_retrieval = grade_model_retrieval(gt_dataset, retrived_docs, grade_prompt)
    return graded_answers, graded_retrieval, latency, predictions

def generate_eval(text, N, chunk):

    # Generate N questions from context of chunk chars
    # IN: text, N questions, chunk size to draw question from in the doc
    # OUT: list of JSON
    # TODO: Refactor, PR for Langchain repo, add error handling for JSON load in QA chain

    print("`Generating eval set ...`")
    n = len(text)
    starting_indices = [random.randint(0, n-chunk) for _ in range(N)]
    sub_sequences = [text[i:i+chunk] for i in starting_indices]
    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
    eval_set = []
    for i, b in enumerate(sub_sequences):
        try:
            qa = chain.run(b)
            eval_set.append(qa)
        except:
            print("Error on question %s"%i)
    eval_pair = list(itertools.chain.from_iterable(eval_set))
    return eval_pair

num_eval_questions = 1
grade_prompt = "Fast"
json_input = True

if json_input:
    with open("docs/karpathy-lex-pod/karpathy-pod-eval.json") as f:
        eval_set_full = json.load(f)
        num_eval_questions = len(eval_set_full)

for i in range(num_eval_questions):
    

    # Generate one question
    if json_input:
        eval_pair = eval_set_full[i]
    else:
        eval_pair = generate_eval(text, 1, 3000)
    if len(eval_pair) == 0:  # Error in eval generation
        continue
    else:
        # This returns a list, so we unpack to dict
        eval_pair = eval_pair[0]
        # Run eval 
        graded_answers, graded_retrieval, latency, predictions = run_eval(qa_chain, retriever, eval_pair, grade_prompt)
        # Assemble output 
        d = pd.DataFrame(predictions)
        d['answer score'] = [g['text'] for g in graded_answers]
        d['docs score'] = [g['text'] for g in graded_retrieval]
        d['latency'] = latency
        # Summary statistics
        d['answer correct'] = ["TRUE" if "INCORRECT" not in text else "FALSE" for text in d['answer score']]
        d['docs relevant'] = ["TRUE" if "Context is relevant: True" in text else "FALSE" for text in d['docs score']]
        # Convert the dictionary to a JSON string
        out_data = {'tableA': d.to_dict('records')}
        json_str = json.dumps(out_data)
        return {"output dataframe": json_str}

`Generating eval set ...`
`Running eval ...`
`Grading model answer ...`
`Grading relevance of retrived docs ...`


In [20]:
json_str

'{"tableA": [{"question": "What is the potential downside of having additional sensors in a car?", "answer": "Additional sensors can be a liability because they require a supply chain, maintenance, firmware, and can hold back the production line.", "result": "According to the context, the potential downside of having additional sensors in a car is that they are not free and can be a liability. They require computer vision engineers, procurement, maintenance, firmware writing, and incorporation into the system. They can also contribute noise and entropy into everything, bloat the data engine, and hold back the line in production. Therefore, it is important to consider the full cost of any one sensor and be really sure if it is necessary. In some cases, the cost is high and not worth it.", "answer score": "CORRECT", "docs score": "Context is relevant: True.", "latency": 4.297659158706665, "answer correct": "TRUE", "docs relevant": "TRUE"}]}'

`Test SVM retiever`

https://github.com/karpathy/randomfun/blob/master/knn_vs_svm.ipynb

In [5]:
import multiprocessing

embeddings = OpenAIEmbeddings()
print("---Single-process---")
start_time = time.time()
out = np.array([embeddings.embed_query(split) for split in splits])
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)
print("---Single-process---")
# 758.6700360774994 sec

---Single-process---
---Single-process---


In [7]:
print("---DF---")
start_time = time.time()
d=pd.DataFrame()
d['splits']=splits
d['embeddings']=d['splits'].apply(lambda x: embeddings.embed_query(x))
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)
print("---DF---")

---DF---
1141.9887909889221
---DF---


In [9]:
from multiprocessing import Pool
def embed_query(split):
    return embeddings.embed_query(split)
start_time = time.time()
with Pool() as p:
    out = np.array(p.map(embed_query, splits))
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

Process SpawnPoolWorker-55:
Process SpawnPoolWorker-56:
Process SpawnPoolWorker-53:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'embed_query' on <module '__main__' (built-in)>
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/31treehaus/opt/anaconda3/env

Process SpawnPoolWorker-62:
Traceback (most recent call last):
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'embed_query' on <module '__main__' (built-in)>
Process SpawnPoolWorker-64:
Traceback (most recent call last):
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 108, in ru

Process SpawnPoolWorker-74:
Traceback (most recent call last):
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'embed_query' on <module '__main__' (built-in)>
Process SpawnPoolWorker-75:
Traceback (most recent call last):
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 108, in ru

KeyboardInterrupt: 

ocess.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/queues.py", line 365, in get
    res = self._reader.recv_bytes()
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
Traceback (most recent call last):
  File "/Users/31treehaus/opt/anaconda3/envs/ml/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/31treehaus/o

In [6]:
elapsed_time

758.6700360774994

In [None]:
print("---Multi-process---")
start_time = time.time()
def embed_chunk(chunk):
    return embeddings.embed_query(chunk)
# Multiprocessing pool with all available CPU cores
num_processes = multiprocessing.cpu_count()  
pool = multiprocessing.Pool(num_processes)
results = pool.map(embed_chunk, splits)
pool.close()
pool.join()
out = np.array(results)
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)
print("---Multi-process---")

In [116]:
from langchain.chains.question_answering import load_qa_chain
llm = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0)
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents=top_k_docs, question=question)

'Software 2.0 refers to the idea that the way we program computers is changing from people writing software in traditional programming languages like C++ to accumulating training sets and data sets and crafting objectives by which we train neural networks. The neural networks are then compiled into a binary, which is the neural net weights and the forward pass of the neural net. This type of programming is also known as machine learning or artificial intelligence programming.'

In [21]:
def make_retriever(splits, retriever_type, embeddings, num_neighbors):

    # Make retriever
    # IN: list of str splits, retriever type, and embedding type
    # OUT: retriever

    if retriever_type == "similarity-search":
        if embeddings == "OpenAI":
            vectorstore = FAISS.from_texts(splits, OpenAIEmbeddings())
        elif embeddings == "HuggingFace":
            vectorstore = FAISS.from_texts(splits, HuggingFaceEmbeddings())
        retriever = vectorstore.as_retriever(k=num_neighbors)
    elif retriever_type == "TF-IDF":
        retriever = TFIDFRetriever.from_texts(splits, k=num_neighbors)
    return retriever

def make_chain(model_version, retriever):

    # Make chain
    # IN: model version, vectorstore
    # OUT: chain
    # TODO: Support for multiple model types

    if (model_version == "gpt-3.5-turbo") or (model_version == "gpt-4"):
        llm = ChatOpenAI(model_name=model_version, temperature=0)
    
    elif model_version == "Anthropic":
        llm = Anthropic(temperature=0)
    qa = RetrievalQA.from_chain_type(llm,
                                     chain_type="stuff",
                                     retriever=retriever,
                                     input_key="question")
    return qa

retriever_type = "similarity-search"
embeddings = "OpenAI"
num_neighbors = 3
retriever = make_retriever(splits, retriever_type, embeddings, num_neighbors)

model_version = "gpt-3.5-turbo"
qa_chain = make_chain(model_version,retriever)

`Run eval`

In [36]:
from langchain.prompts import PromptTemplate

template = """You are a teacher grading a quiz. You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: CORRECT or INCORRECT here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:"""

GRADE_ANSWER_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template)

# Citation
# https://github.com/jerryjliu/llama_index/blob/main/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb
template = """ 
    Given the question below. \n
    ---------------------\n
    {query}
    \n---------------------\n
    Decide if the following retreived context is relevant. \n
    \n---------------------\n
    {result}
    Answer in the following format:\n
    'Context is relevant: True or False'
    and explain why it supports the correct answer: {answer}"""

GRADE_DOCS_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template)


`QAEvalChain` expected `input_variables=["query", "result", "answer"]`
 
TODO: Make this more general.

In [64]:
def grade_model_answer(gt_dataset, predictions):

    # Grade the model
    # IN: model predictions and ground truth (lists)
    # OUT: list of scores
    # TODO: Support for multiple grading types

    print("`Grading model answer ...`")
    eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 
                                      prompt=GRADE_ANSWER_PROMPT)
    graded_outputs = eval_chain.evaluate(gt_dataset,
                                         predictions,
                                         question_key="question",
                                         prediction_key="result")
    return graded_outputs

def grade_model_retrieval(gt_dataset, predictions):
    
    # Grade the docs retrieval
    # IN: model predictions and ground truth (lists)
    # OUT: list of scores
    # TODO: Support for multiple grading types

    print("`Grading relevance of retrived docs ...`")
    eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0), 
                                      prompt=GRADE_DOCS_PROMPT)
    graded_outputs = eval_chain.evaluate(gt_dataset,
                                         predictions,
                                         question_key="question",
                                         prediction_key="result")
    return graded_outputs

def run_eval(chain, retriever, eval_set):

    # Compute eval
    # IN: chain and ground truth (lists)
    # OUT: list of scores, latenct, predictions
    # TODO: Support for multiple grading types

    print("`Running eval ...`")
    predictions = []
    retrived_docs = []
    gt_dataset = []
    latency = []
        
    for data in eval_set:
        
        # Get answer and log latency
        start_time = time.time()
        predictions.append(chain(data))
        gt_dataset.append(data)
        end_time = time.time()
        elapsed_time = end_time - start_time
        latency.append(elapsed_time)
        # Retrive data
        docs=retriever.get_relevant_documents(data["question"])
        # Extract text from retrived docs
        retrived_doc_text = ""
        for i,doc in enumerate(docs):
            retrived_doc_text += "Doc %s: "%str(i+1) + doc.page_content + " "
        retrived = {"question": data["question"],"answer": data["answer"], "result": retrived_doc_text}
        retrived_docs.append(retrived)
        
    # Grade
    graded_answers = grade_model_answer(gt_dataset, predictions)
    graded_retrieval = grade_model_retrieval(gt_dataset, retrived_docs)
    return graded_answers, graded_retrieval, latency, predictions

graded_answers, graded_retrieval, latency, predictions = run_eval(qa_chain, retriever, eval_set_full)

`Running eval ...`
`Grading model answer ...`
`Grading relevance of retrived docs ...`


In [67]:
predictions

[{'question': 'How many parameters does the largest GLaM model have?',
  'answer': 'The largest GLaM model has 1.2T (trillion) parameters.',
  'result': 'The largest version of GLaM has 1.2T parameters in total with 64 experts per MoE layer.'},
 {'question': 'How many experts per layer does the GLaM model have?',
  'answer': 'The GLaM model has 64 experts per layer, with each token activating 96.6B parameters.',
  'result': 'The GLaM model can have from 1 to 256 experts per MoE layer, as stated in the context: "We then increase the number of experts in each MoE layer from 1 to 256."'},
 {'question': 'How many parameters, layers, and heads does the LaMDA model have?',
  'answer': 'The LaMDA model has 137B params, 64 layers, and 128 heads.',
  'result': 'The largest LaMDA model has 137B non-embedding parameters, and it uses a decoder-only Transformer language model with 64 layers, h=128, dk=dv=128, and relative attention as described in T5 [11]. However, the number of heads is not explic

In [65]:
graded_answers

[{'text': 'CORRECT'},
 {'text': 'INCORRECT'},
 {'text': 'CORRECT'},
 {'text': 'CORRECT'},
 {'text': 'INCORRECT'},
 {'text': 'CORRECT'},
 {'text': 'CORRECT'},
 {'text': 'CORRECT'}]

In [66]:
graded_retrieval

[{'text': 'Context is relevant: True. \n\nThe retrieved context directly answers the question by stating that the largest GLaM model has 1.2T parameters. It provides specific details about the GLaM models and their sizes, including the number of activated parameters per token. Therefore, the context is relevant to the question.'},
 {'text': 'Context is relevant: True. \n\nThe retrieved context mentions the GLaM model and its use of sparsely activated Mixture-of-Experts (MoE) layers. It also specifically states that the largest version of GLaM has 64 experts per MoE layer, with each token activating a subnetwork of 96.6B parameters. Therefore, the context supports the answer that the GLaM model has 64 experts per layer.'},
 {'text': 'Context is relevant: True\n\nThe relevant context from Doc 1 states that the largest LaMDA model has 137B non-embedding parameters, 64 layers, and h=128 heads. Therefore, the context provides the exact information required to answer the question.'},
 {'text