In [None]:
!pip install openai langchain datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.214-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)

In [None]:
import os
from getpass import getpass

from langchain.chains import LLMChain
from langchain.evaluation.qa import QAEvalChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

In [None]:
# Never store the key in the notebook itself: take it from the OPENAI_API_KEY
# environment variable, and fall back to a hidden interactive prompt when the
# variable is unset or empty.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
# Minimal question/answer prompt with a single "question" placeholder.
prompt = PromptTemplate(
    input_variables=["question"],
    template="Question: {question} \nAnswer:",
)

# Completion model that produces the answers (temperature fixed at 0).
llm = OpenAI(
    openai_api_key=api_key,
    temperature=0,
    model="text-davinci-003",
)

# Chain that formats the prompt and sends it to the model.
chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# A single reference question/answer pair. The "question" and "answer" keys are
# the ones later handed to the evaluator as question_key / answer_key.
example = [
    dict(
        question="Describe AI camp to me?",
        answer="AI Camp teaches future leaders about AI, coding, and how to work through creating impressive AI products and tech internships.",
    )
]

In [None]:
# Run the chain over the list of example dicts; the result (shown in the next
# cell) is a list of dicts keyed by "text".
pred = chain.apply(example)

In [None]:
# Display the model's raw answer so it can be compared with the reference answer.
pred

[{'text': ' AI Camp is an educational program designed to teach students the fundamentals of artificial intelligence (AI). It is a hands-on program that provides students with the opportunity to learn about AI through lectures, workshops, and projects. The program is designed to help students develop the skills necessary to become successful AI professionals. AI Camp also provides students with the opportunity to network with industry professionals and gain valuable experience in the field.'}]

## Let's evaluate the answer using another LLM

In [None]:
# Second LLM instance that acts as the grader (temperature fixed at 0).
llm_eval = OpenAI(openai_api_key=api_key, temperature=0)

# QA evaluation chain built with its default grading prompt.
eval_chain = QAEvalChain.from_llm(llm_eval)

# Grade the chain's prediction against the reference answer; the *_key
# arguments name the dict keys used in `example` and `pred`.
eval_outputs = eval_chain.evaluate(
    examples=example,
    predictions=pred,
    question_key="question",
    answer_key="answer",
    prediction_key="text",
)

In [None]:
# Display the grader's verdict for the chain's own prediction.
eval_outputs

[{'text': ' CORRECT'}]

Suppose we got a wrong answer:

In [None]:
# Deliberately off-topic prediction, stored under the same "text" key that the
# chain's real output uses.
wrong_pred = [
    {
        "text": (
            "Dell is an American based technology company. "
            "It develops, sells, repairs, and supports computers and related products and services."
        )
    }
]

In [None]:
# Grade the off-topic prediction against the same reference example.
# Note: this rebinds `eval_outputs`, replacing the previous result.
eval_outputs = eval_chain.evaluate(
    examples=example,
    predictions=wrong_pred,
    question_key="question",
    answer_key="answer",
    prediction_key="text",
)

In [None]:
# Display the grader's verdict for the deliberately wrong prediction.
eval_outputs

[{'text': ' INCORRECT'}]

## Evaluate using a custom prompt

In [None]:
# Custom grading prompt, assembled line by line. It contains the {query},
# {answer} and {result} placeholders and asks for a written reason followed by
# a score out of 10.
new_template = "\n".join(
    [
        "You are an expert professor specialized in grading students'",
        "answers to questions.",
        "You are grading following questions:",
        "{query}",
        "Here is the real answer:",
        "{answer}",
        "You are grading the following predicted answer:",
        "{result}",
        "Use step by step reasoning to grade the answer. Be very critical. Write your reasoning before you grade the answer. Grade the answer on factuality.",
        "You must include a similarity score between the real answer and the predicted answer. What grade do you give from 0 to 10.",
        "Where 0 is the lowest (very low similarity) and 10 is the highest similarity. Your answer should be following format dont write anything after 10",
        "reason: (Reason)",
        " score: (Score-number/10 format)",
    ]
)

In [None]:
# Wrap the custom grading text in a PromptTemplate with its three placeholders.
new_prompt = PromptTemplate(
    input_variables=["query", "answer", "result"],
    template=new_template,
)

# Rebuild the grader LLM and the evaluation chain, this time with the custom
# prompt; both `llm_eval` and `eval_chain` are rebound here.
llm_eval = OpenAI(openai_api_key=api_key, temperature=0)
eval_chain = QAEvalChain.from_llm(llm_eval, prompt=new_prompt)

In [None]:
# Grade the chain's original prediction again, now through the custom-prompt
# evaluation chain. This rebinds `eval_outputs`.
eval_outputs = eval_chain.evaluate(
    examples=example,
    predictions=pred,
    question_key="question",
    answer_key="answer",
    prediction_key="text",
)

In [None]:
# Display the grader's reasoning and score produced with the custom prompt.
eval_outputs

[{'text': '\n\nReason: The predicted answer provides a good overview of AI Camp, including the purpose of the program, the activities offered, and the benefits of attending. However, the predicted answer does not mention the topics of AI, coding, and creating AI products, which are all mentioned in the real answer.\n\nScore: 8/10'}]