In [None]:
!pip install langchain openai langchain_community

In [None]:
import openai
import torch
from langchain import LLMChain, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import QAEvalChain
from langchain.llms import HuggingFacePipeline # for HF models
from transformers import pipeline

In [None]:
import os
import re

In [None]:
from google.colab import userdata

# Read both credentials through Colab's userdata helper so that no key is
# hard-coded in the notebook itself.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
HF_TOKEN = userdata.get("HF_TOKEN")

In [None]:
# Export the OpenAI key as an environment variable for the current process.
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

In [None]:
# Multiple-choice prompt: a fixed instruction followed by the question text,
# which is substituted for the {question} placeholder.
prompt_template = PromptTemplate(
    template="""Pick the right answer from A/B/C. Only reply with A, B, or C.

    {question}""",
    input_variables=["question"],
)

In [None]:
# OpenAI chat model, configured with temperature 0.
llm_openai = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
)

Note: some Hugging Face models are gated, so you may need to request access on the model's page before your HF token can download the model you choose.

In [None]:
# Hugging Face model to evaluate; replace with your desired model ID.
model_id = "google/flan-t5-large"

# Ask PyTorch whether a CUDA device is actually usable. Checking the
# CUDA_VISIBLE_DEVICES environment variable is not reliable: when it is unset
# on a machine that has a GPU, the model would silently run on the CPU.
# transformers expects the ordinal of the CUDA device, or -1 for the CPU.
device = 0 if torch.cuda.is_available() else -1

pipe = pipeline(model=model_id, device=device, token=HF_TOKEN)
llm_hf = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Chain under test: combines the multiple-choice prompt with the selected
# model. Pass llm_openai instead of llm_hf to test the OpenAI model.
llm_chain = LLMChain(prompt=prompt_template, llm=llm_hf)

In [None]:
# Grader chain used later to score the predictions.
# Swap llm_hf for llm_openai to grade with the OpenAI model instead.
evaluator = QAEvalChain.from_llm(
    llm_hf,
    chain_type="stuff",
)

In [None]:
# Multiple-choice test set: each entry pairs a question (with its A/B/C
# options inline) with the letter of the expected answer.
test_data = [
    {"question": question_text, "expected_answer": answer_letter}
    for question_text, answer_letter in (
        ("What is the capital of France? A:Berlin B:Paris C:Amsterdam", "B"),
        ("Who wrote '1984'? A:Shakespeare B:Franzen C:George Orwell", "C"),
        ("What is the chemical symbol for water? A:H2O B:NO C:Mg", "A"),
    )
]

In [None]:
def extract_choice(raw_reply, choices="ABC"):
    """Return the option letter chosen in a model reply, or None.

    A reply consisting only of the letter (any case, optionally wrapped in
    parentheses or followed by ')', '.' or ':') is accepted directly.
    Otherwise the first stand-alone option letter in the text is used, so
    letters that are part of a word ("ANSWER", "Paris") are never mistaken
    for a choice.

    Args:
        raw_reply: Text returned by the model.
        choices: The valid option letters.

    Returns:
        The chosen letter in upper case, or None when no choice is found.
    """
    reply = raw_reply.strip()
    letters = re.escape(choices)

    # Case-insensitive only here: a lone "b" is unambiguous, whereas a
    # lower-case "a" inside a sentence is most likely an article.
    bare = re.fullmatch(rf"\(?([{letters}])[).:]*", reply, flags=re.IGNORECASE)
    if bare:
        return bare.group(1).upper()

    standalone = re.search(rf"\b([{letters}])\b", reply)
    return standalone.group(1).upper() if standalone else None


# Recorded when no option letter can be recovered from a reply. It is
# deliberately not one of the options, so an unparseable reply can never be
# scored as a correct answer.
UNPARSED_ANSWER = "INVALID"

predictions = []
for example in test_data:
    raw_reply = llm_chain.run(example["question"])  # Run the LLM to get the prediction
    print(raw_reply)  # Show the unprocessed reply for inspection

    choice = extract_choice(raw_reply)
    predictions.append(
        {
            "question": example["question"],
            "generated_answer": choice if choice is not None else UNPARSED_ANSWER,
        }
    )



In [None]:
# Display the answer recorded for each question.
predictions

In [None]:
# Grade the predictions against the test set. The *_key arguments name the
# dictionary fields that hold the question, the expected answer and the
# model's answer.
outputs = evaluator.evaluate(
    predictions=predictions,
    examples=test_data,
    prediction_key="generated_answer",
    answer_key="expected_answer",
    question_key="question",
)

In [None]:
# Display the raw evaluator output (expected to hold one entry per example).
outputs

In [None]:
def is_graded_correct(verdict_text):
    """Return True when the grader's reply says CORRECT.

    The reply is upper-cased and searched for a whole-word verdict, so
    surrounding whitespace, a different case or a prefix such as "GRADE:" do
    not turn a correct grade into a failure. "INCORRECT" is not matched as
    "CORRECT" because the word boundary does not occur inside a word.

    Args:
        verdict_text: The text stored under the 'results' key of an
            evaluator output.

    Returns:
        True if the first verdict word found is CORRECT; False if it is
        INCORRECT or if no verdict word is present.
    """
    verdict = re.search(r"\b(INCORRECT|CORRECT)\b", str(verdict_text).upper())
    return verdict is not None and verdict.group(1) == "CORRECT"


# Walk the questions, recorded predictions and grader outputs side by side.
for example, prediction, graded in zip(test_data, predictions, outputs):
    # 'results' holds the grader's text; a missing key counts as not correct.
    is_correct = is_graded_correct(graded.get('results', 'N/A'))

    print(f"Question: {example['question']}")
    print(f"Expected: {example['expected_answer']}")
    print(f"Predicted: {prediction['generated_answer']}")
    print(f"Correct: {is_correct}")
    print()