In [362]:
import os 
from langchain_core.prompts import PromptTemplate

from langchain.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import BaseOutputParser
from dotenv import load_dotenv 
# python-dotenv: read a local .env file (if any) into the process environment.
load_dotenv()
# Indexing os.environ raises KeyError right here when GROQ_API_KEY is not set.
groq_api_key = os.environ["GROQ_API_KEY"]
from langchain_groq import ChatGroq

# Chat model handle reused by the chains and evaluators in the cells below.
llm = ChatGroq(
    model="mixtral-8x7b-32768",
    groq_api_key=groq_api_key,
)

In [323]:
# Prompt for generating one quiz item about {topic}.
# The model's reply is parsed downstream as JSON (see dictOutput.parse), so the
# prompt asks for a single strict JSON object rather than a loosely defined
# "dictionary". Literal braces in the example are doubled ({{ }}) because
# PromptTemplate / str.format treat single braces as template variables.
prompt_template = """
Generate one simple, random question related to the topic {topic}.

Respond with a single JSON object and nothing else. Use double quotes and exactly these two keys:

"question": A question text related to the topic.
"answer": A one-word or one-line answer to the question.

Example of the expected format:
{{"question": "<question text>", "answer": "<short answer>"}}

Vary which aspect of the topic the question covers so that repeated requests span a wide range of related subtopics.
"""


In [87]:
# prompt_template = """
# Generate a simple one question related to the topic {topic} in dictionary format.


#     "question": "<Your Question Here>",
#     "answer": "<Your Answer Here>"

# """


In [324]:
from typing import Dict

In [328]:
class dictOutput(BaseOutputParser):
    """Output parser that extracts a question/answer pair from an LLM reply.

    The reply may wrap the JSON object in surrounding prose; the first JSON
    object in ``text`` that carries both expected keys is used.
    """

    def parse(self, text: str) -> Dict[str, str]:
        """Return ``{"question": ..., "answer": ...}`` parsed from ``text``.

        Args:
            text: Raw model output expected to contain a JSON object with
                ``"question"`` and ``"answer"`` keys.

        Returns:
            A dict holding only the ``"question"`` and ``"answer"`` values.

        Raises:
            ValueError: If no JSON object can be decoded from ``text``.
            KeyError: If JSON objects were found but none of them has both
                the ``"question"`` and ``"answer"`` keys.
        """
        import json

        decoder = json.JSONDecoder()
        incomplete = None  # first decoded object that lacked a required key

        # Try every "{" as a candidate start. raw_decode understands nesting and
        # braces inside string values, which a pattern like \{[^}]+\} cannot,
        # and a non-JSON "{...}" span earlier in the text no longer aborts parsing.
        start = text.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None  # not valid JSON from this brace; try the next one

            if isinstance(candidate, dict):
                if "question" in candidate and "answer" in candidate:
                    return {
                        "question": candidate["question"],
                        "answer": candidate["answer"],
                    }
                if incomplete is None:
                    incomplete = candidate

            start = text.find("{", start + 1)

        if incomplete is not None:
            # Keep the original failure mode for an object that misses a key.
            missing = "question" if "question" not in incomplete else "answer"
            raise KeyError(missing)

        raise ValueError("No JSON object found in the input text.")

In [331]:
# Wrap the raw template; `topic` is the only variable filled in at invoke time.
prompt = PromptTemplate(template=prompt_template, input_variables=["topic"])

# Pipeline: render the prompt, call the model, then parse the reply with dictOutput.
chain = prompt | llm | dictOutput()

In [344]:
# Run the generation pipeline once for a sample topic.
response = chain.invoke(
    {"topic": "sports"}
)

In [345]:
# Display the value returned by the generation chain.
response

{'question': "What is the maximum number of sets in a men's singles tennis match?",
 'answer': 'Five sets'}

In [282]:
# Print the generated question first, then its reference answer.
for field in ("question", "answer"):
    print(response[field])


What is a common cause of iron deficiency in humans?
Anemia


In [283]:
from langchain.evaluation.qa import QAEvalChain

In [284]:
# Build a QAEvalChain on top of the shared chat model.
# NOTE(review): `eval_chain` is not referenced by any later cell visible here — confirm it is still needed.
eval_chain = QAEvalChain.from_llm(llm)

In [285]:
# Custom rubric handed to the evaluator as `criteria`: a single "accuracy"
# entry whose text describes the Incorrect and Correct outcomes.
accuracy_criteria = {
    "accuracy": """
Incorrect:  The answer is incorrect or unrelated to the reference
Correct:  The answer is correct and aligns with the reference, including minor misspellings.
"""
}

In [286]:
from langchain.evaluation import load_evaluator

In [287]:
# Reference-based scoring evaluator driven by the custom accuracy rubric.
evaluator = load_evaluator(
    "labeled_score_string",
    llm=llm,
    criteria=accuracy_criteria,
)

In [288]:
# Alternative evaluator, kept for reference:
# evaluator = load_evaluator("labeled_criteria", llm=llm, criteria="correctness")

# Grade a hard-coded sample answer ("Anemia") against the generated reference answer.
# NOTE(review): the prediction is fixed while `response` changes on every run —
# confirm the sample answer still matches the question being graded.
eval_result = evaluator.evaluate_strings(
    input=response["question"],
    prediction= "Anemia",
    reference=response["answer"],
)
print(f'With ground truth: {eval_result["score"]}')

With ground truth: 2


In [289]:
# Map the numeric score to a verdict: anything above 6 counts as correct.
# NOTE(review): presumably the score is on a 1-10 scale — confirm against the evaluator docs.
print("correct" if eval_result["score"] > 6 else "incorrect")

incorrect


In [290]:
# Show the evaluator's written justification that accompanies the score.
print(eval_result["reasoning"])

Explanation:
The assistant's response is incomplete and not accurate. Anemia is a condition that results from a lack of red blood cells or insufficient hemoglobin, which can be caused by an iron deficiency. However, the assistant's response does not specify that iron deficiency is a cause of anemia, nor does it explain what iron deficiency is or its relationship to anemia.

Rating: [[2]]

The assistant's response is not entirely incorrect, but it is incomplete and lacks sufficient detail to be considered accurate. The assistant could have provided a more comprehensive answer by explaining that iron deficiency is a common cause of anemia, and then elaborating on the reasons for iron deficiency.


In [350]:
# Grading prompt with two placeholders:
#   {qa}         - the reference question and answer
#   {userAnswer} - the answer to be graded
# The model is asked to reply with a "Result:" line and a "Reason:" line.
prompt_template1 = """
You are an expert evaluator. Below, you will find a reference question with its correct answer, as well as the user's answer. Your task is to evaluate the user's answer based on the provided criteria.

Reference Question and Answer:
Question and answer: {qa}

User's Answer:
{userAnswer}

Evaluation Criteria:
Incorrect: The answer is incorrect or unrelated to the reference.
Correct: The answer is correct and aligns with the reference, including minor misspellings.

Please provide your evaluation based on the criteria above in the following format:
Result: [Correct/Incorrect]
Reason: [Explanation for your evaluation in short]
"""


In [351]:
# Grading prompt object: `qa` and `userAnswer` are supplied at invoke time.
# NOTE(review): this rebinds `prompt`, which an earlier cell used for the
# question-generation template — a distinct name would be less confusing.
prompt = PromptTemplate(
    template=prompt_template1,
    input_variables=["qa", "userAnswer"],
)


In [353]:
# Grading pipeline: render `prompt`, then call the model (no output parser attached).
chain2 = prompt | llm 

In [360]:
# Grade a sample user answer ("5") against the generated question/answer dict.
final_response = chain2.invoke(
    {
        "qa": response,
        "userAnswer": "5",
    }
)

In [361]:
# Print the text content of the model's verdict.
print(final_response.content)

Result: Correct
Reason: The user's answer of "5" matches the correct answer of "Five sets", as it refers to the same quantity. The missing word "sets" does not affect the accuracy of the answer.
