# In [None]:
import os
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.evaluation import load_evaluator, EvaluatorType
from langchain.evaluation.qa import QAEvalChain
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# The OpenAI API key is expected to come from an environment variable (e.g. OPENAI_API_KEY)

# Define prompt
# Instruction texts only: none of the four strings below contains the
# {context}/{question}/{choices} placeholders, so they describe the task but
# carry no per-example fields themselves.

# Multiple-choice instruction: pick the answer(s) from the candidate answers
# ("DK" = "Don't Know") and reply in 1-5 words without any explanation.
# NOTE(review): not referenced by the prompt_templates mapping in this cell.
Base_prompt = """Please answer the question by choosing the answer(s) from the candidate answers, where DK means "Don't Know".
Important: Pay special attention to the output format. The output should be 1-5 words. Do not include the reasoning process and explanation in the output answer.
"""
# Multiple-choice instruction with an explicit step-by-step recipe; the output
# format constraint (1-5 words, no reasoning) is the same as in Base_prompt.
# NOTE(review): not referenced by the prompt_templates mapping in this cell.
CoT_prompt = """
You are an expert in spatial reasoning tasks. Always think step by step to answer the question.
1. Carefully read the context and question, identify the key objects, their attributes and relative relations mentioned in the context.
2. Based on step by step reasoning, select the appropriate answer(s) from the candidate answers, where DK means "Don't Know".
Important: Pay special attention to the output format. The output should be 1-5 words. Do not include the reasoning process and explanation in the output answer.
"""
# Yes/No instruction: the model is told to reply with only "Yes" or "No".
Resq_base_prompt = """
You are a helper bot who is especially skilled in spatial reasoning and other common sense reasoning tasks.
Please answer Yes/No question with only Yes or No. Donot include the explanation and reasoning process in the output"""

# Yes/No instruction with a step-by-step recipe; the final note still asks for
# an answer without any reasoning in the output.
Resq_CoT_prompt = """
You are a helper bot who is especially skilled in spatial reasoning and other common sense reasoning tasks.
When answering the Yes/No questions, try to think step by step.
1. Carefully read the context and question.
2. Identify the key objects and their attributes, and spatial relations mentioned in the question.
3. Based on step by step reasoning, answer the question  with Yes or NO.
Note: No reasoning process or explanation is needed.
"""
# Initialize language model
# The credential is taken from the OPENAI_API_KEY environment variable so that
# no secret lives in the source. It is passed by keyword: a bare positional
# value after `model=` is a syntax error.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    # Answers are expected to be a few words, so a small completion budget is enough.
    max_tokens=100,
    temperature=0.1,
)

# Define different prompt templates
# The instruction texts carry no placeholders of their own, so the per-example
# fields are appended here. Without them the declared input variables would
# never reach the model and it would be asked to answer without seeing the
# story or the question.
_EXAMPLE_FIELDS = (
    "\nContext: {context}\n"
    "Question: {question}\n"
    "Choices: {choices}\n"
    "Answer:"
)

# Maps a short prompt name to the template evaluated under that name.
prompt_templates = {
    "base": PromptTemplate(
        input_variables=["context", "question", "choices"],
        template=Resq_base_prompt + _EXAMPLE_FIELDS,
    ),
    "cot": PromptTemplate(
        input_variables=["context", "question", "choices"],
        template=Resq_CoT_prompt + _EXAMPLE_FIELDS,
    ),
}

# Separate model intended for LLM-based grading. In the visible code it is only
# referenced by the criteria evaluator that is commented out below.
# The credential is read from OPENAI_API_KEY and passed by keyword: a bare
# positional value after `model=` is a syntax error.
eval_llm = ChatOpenAI(
    model="gpt-4o",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    max_tokens=100,
    temperature=0.1,
)
# Load evaluators
exact_match_evaluator = load_evaluator(EvaluatorType.EXACT_MATCH)
string_distance_evaluator = load_evaluator(EvaluatorType.STRING_DISTANCE)
# criteria_evaluator = load_evaluator(
#         EvaluatorType.CRITERIA,
#         criteria={
#             "correctness": "Is the answer correct based on the given context and question?"
#         },
#         llm=eval_llm  # Specify the model to use for criteria evaluation
# )


# Initialize the sentence transformer model
# Loaded once at module level so every cosine_sim() call reuses the same model.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def cosine_sim(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately (as a one-element batch) with the
    module-level ``sentence_model``; the ``[0][0]`` entry of the resulting
    similarity matrix is returned.

    :param text1: First text to embed
    :param text2: Second text to embed
    :return: Similarity score of the two embeddings
    """
    # Encode in argument order, one single-item batch per text.
    vec_a, vec_b = (sentence_model.encode([text]) for text in (text1, text2))
    return cosine_similarity(vec_a, vec_b)[0][0]

def evaluate_prompt(dataset, prompt_templates, llm):
    """
    Run every prompt template over the dataset and average the metric scores.

    :param dataset: Iterable of records (mappings) providing the keys
        "Story", "Question", "Candidate_Answers" and "Answer"
    :param prompt_templates: Dictionary mapping a prompt name to a prompt
        template that takes ``context``, ``question`` and ``choices``
    :param llm: Language model used to answer the questions
    :return: Dictionary ``{prompt_name: {metric: mean score}}``; a metric
        with no recorded scores is reported as 0
    """
    # "qa_score" is kept so the result shape stays the same for callers; no QA
    # evaluator is wired up here, so it stays empty and averages to 0.
    metric_names = ("exact_match", "string_distance", "qa_score", "semantic_similarity")
    results = {
        prompt_name: {metric: [] for metric in metric_names}
        for prompt_name in prompt_templates
    }

    # Load LangChain evaluators
    exact_match_evaluator = load_evaluator(EvaluatorType.EXACT_MATCH)
    string_distance_evaluator = load_evaluator(EvaluatorType.STRING_DISTANCE)

    for prompt_name, prompt_template in prompt_templates.items():
        chain = LLMChain(llm=llm, prompt=prompt_template)
        scores = results[prompt_name]

        for row in dataset:
            context = row["Story"]
            question = row["Question"]
            choices = row["Candidate_Answers"]
            # Records may hold non-string answers (e.g. values parsed from a
            # CSV); the string evaluators need text, so coerce and trim.
            ground_truth = str(row["Answer"]).strip()

            # Trim surrounding whitespace so a trailing newline in the model
            # output does not count as a mismatch.
            response = chain.run(context=context, question=question, choices=choices).strip()

            # Exact Match and String Distance
            exact_match_result = exact_match_evaluator.evaluate_strings(
                prediction=response, reference=ground_truth
            )
            string_distance_result = string_distance_evaluator.evaluate_strings(
                prediction=response, reference=ground_truth
            )

            scores["exact_match"].append(exact_match_result['score'])
            scores["string_distance"].append(string_distance_result['score'])
            # Custom Semantic Similarity
            scores["semantic_similarity"].append(cosine_sim(response, ground_truth))

    # Calculate average scores for each metric and prompt
    return {
        prompt_name: {
            metric: np.mean(values) if values else 0
            for metric, values in prompt_results.items()
        }
        for prompt_name, prompt_results in results.items()
    }

# Draw a reproducible 210-row sample, then keep a reproducible half of it.
sampled = pd.read_csv('Resq.csv').sample(n=210, random_state=61)
df = sampled.sample(frac=0.5, random_state=200)

# One dict per row, keyed by column name, as evaluate_prompt expects.
dataset = df.to_dict(orient='records')
results = evaluate_prompt(dataset, prompt_templates, llm)

# Report every averaged metric per prompt, formatted as a percentage.
for prompt_name, metrics in results.items():
    print(f"Results for {prompt_name} prompt:")
    for metric in metrics:
        score = metrics[metric]
        print(f"  {metric}: {score:.2%}")


# In [None]:
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.evaluation import EvaluatorType
from langchain.evaluation.loading import load_evaluator

# The OpenAI API key is loaded from an environment variable (make sure to set OPENAI_API_KEY in your environment)

# Instruction texts only: neither string contains the
# {context}/{question}/{choices} placeholders.

# Yes/No instruction: the model is told to reply with only "Yes" or "No".
Resq_base_prompt = """
You are a helper bot who is especially skilled in spatial reasoning and other common sense reasoning tasks.
Please answer Yes/No question with only Yes or No. Donot include the explanation and reasoning process in the output"""

# Yes/No instruction with a step-by-step recipe; the final note still asks for
# an answer without any reasoning in the output.
Resq_CoT_prompt = """
You are a helper bot who is especially skilled in spatial reasoning and other common sense reasoning tasks.
When answering the Yes/No questions, try to think step by step.
1. Carefully read the context and question.
2. Identify the key objects and their attributes, and spatial relations mentioned in the question.
3. Based on step by step reasoning, answer the question  with Yes or NO.
Note: No reasoning process or explanation is needed.
"""
# Initialize language model
import os  # local import: this cell's own import list does not bring in os

# The credential is taken from the OPENAI_API_KEY environment variable so that
# no secret lives in the source. It is passed by keyword: a bare positional
# value after `model=` is a syntax error.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    max_tokens=1000,
    temperature=0.1,
)

# Define different prompt templates
# The instruction texts carry no placeholders of their own, so the per-example
# fields are appended here. Without them the declared input variables would
# never reach the model and it would be asked to answer without seeing the
# story or the question.
_EXAMPLE_FIELDS = (
    "\nContext: {context}\n"
    "Question: {question}\n"
    "Choices: {choices}\n"
    "Answer:"
)

# Maps a short prompt name to the template evaluated under that name.
prompt_templates = {
    "base": PromptTemplate(
        input_variables=["context", "question", "choices"],
        template=Resq_base_prompt + _EXAMPLE_FIELDS,
    ),
    "cot": PromptTemplate(
        input_variables=["context", "question", "choices"],
        template=Resq_CoT_prompt + _EXAMPLE_FIELDS,
    ),
}

def evaluate_and_save_responses(df, prompt_templates, llm, output_file):
    """
    Collect one model response per row for every prompt and write the result to CSV.

    A ``<prompt_name>_response`` column is added to ``df`` (in place) for each
    entry of ``prompt_templates``; the frame is then written to ``output_file``
    without its index.

    :param df: DataFrame with "Story", "Question" and "Candidate_Answers" columns
    :param prompt_templates: Dictionary of prompt templates keyed by prompt name
    :param llm: Language model to use
    :param output_file: Name of the output CSV file
    :return: The same DataFrame, now carrying the response columns
    """
    for name, template in prompt_templates.items():
        runner = LLMChain(llm=llm, prompt=template)

        # Query the model row by row, in DataFrame order.
        df[f'{name}_response'] = [
            runner.run(
                context=record["Story"],
                question=record["Question"],
                choices=record["Candidate_Answers"],
            )
            for _, record in df.iterrows()
        ]

    # Persist everything once all prompts have been processed.
    df.to_csv(output_file, index=False)
    return df


# Read the dataset and keep a reproducible 99-row sample of it.
full_df = pd.read_csv('Resq.csv')
df = full_df.sample(n=99, random_state=61)

# Generate one response column per prompt and write everything to CSV.
updated_df = evaluate_and_save_responses(
    df=df,
    prompt_templates=prompt_templates,
    llm=llm,
    output_file='model_responses.csv',
)

print("Responses have been saved to 'model_responses.csv'")


# In [None]:
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import logging

# Configure logging at INFO level so the per-row prompt/response messages
# emitted below are shown.
logging.basicConfig(level=logging.INFO)
# Logger named after this module; used for all messages in this cell.
logger = logging.getLogger(__name__)

def evaluate_and_save_responses(df, prompt_templates, llm, output_file):
    """
    Evaluate prompts and save responses to a CSV file with error handling.

    A ``<prompt_name>_response`` column is added to ``df`` (in place) for each
    entry of ``prompt_templates``. A row that fails is logged with its
    traceback and recorded as ``"ERROR: <message>"``, so one failure does not
    abort the run.

    :param df: DataFrame containing the dataset ("Story", "Question" and
        "Candidate_Answers" columns are read)
    :param prompt_templates: Dictionary of prompt templates
    :param llm: Language model to use
    :param output_file: Name of the output CSV file
    :return: Updated DataFrame with model responses
    """
    for prompt_name, prompt_template in prompt_templates.items():
        chain = LLMChain(llm=llm, prompt=prompt_template)

        responses = []
        for index, row in df.iterrows():
            try:
                fields = {
                    "context": row["Story"],
                    "question": row["Question"],
                    "choices": row["Candidate_Answers"],
                }

                # Log the fully rendered prompt so each request can be audited.
                logger.info(
                    "Full prompt for %s, index %s:\n%s",
                    prompt_name, index, prompt_template.format(**fields),
                )

                response = chain.run(**fields)
                logger.info("Response for %s, index %s: %s", prompt_name, index, response)
            except Exception as e:
                # Deliberate best-effort boundary: keep going, but keep the
                # traceback in the log (logger.exception) for diagnosis.
                logger.exception("Error processing row %s for %s: %s", index, prompt_name, e)
                response = "ERROR: " + str(e)

            # Exactly one entry per row keeps the column aligned with df.
            responses.append(response)

        # Add responses to the DataFrame
        df[f'{prompt_name}_response'] = responses

    # Save the updated DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    logger.info("Responses saved to %s", output_file)

    return df

# Example usage:
# Maps a short prompt name to its template. Both templates embed the
# {context}, {question} and {choices} fields directly in their text.
# The leading indentation inside the triple-quoted strings is part of the
# prompt text that gets rendered.
prompt_templates = {
    # Direct-answer prompt: pick the best choice, or 'DK' when unsure.
    "base": PromptTemplate(
        input_variables=["context", "question", "choices"],
        template="""
        Context: {context}
        Question: {question}
        Choices: {choices}
        
        Based on the context provided, please answer the question. Choose the best answer from the given choices. If you're not sure, you can answer 'DK' for "Don't Know".
        
        Your answer:
        """
    ),
    # Step-by-step prompt: explicitly asks for the reasoning and the final
    # answer in the output.
    "cot": PromptTemplate(
        input_variables=["context", "question", "choices"],
        template="""
        Context: {context}
        Question: {question}
        Choices: {choices}
        
        Let's approach this step-by-step:
        1) First, carefully read the context and question.
        2) Consider each of the given choices.
        3) Think through how each choice relates to the information in the context.
        4) Select the best answer based on this analysis.
        
        If you're not sure after this process, you can answer 'DK' for "Don't Know".
        
        Your step-by-step reasoning and final answer:
        """
    )
}

# Read the full dataset.
# NOTE(review): earlier cells read 'Resq.csv' — confirm which casing matches
# the file on disk.
df = pd.read_csv('ReSQ.csv')

# Keep a small reproducible subset (adjust n as necessary).
df_sample = df.sample(n=10, random_state=42)

# Generate one response column per prompt and write everything to CSV.
# `llm` is not created in this cell; it must already be defined.
updated_df = evaluate_and_save_responses(
    df=df_sample,
    prompt_templates=prompt_templates,
    llm=llm,
    output_file='model_responses.csv',
)

print("Responses have been saved to 'model_responses.csv'")