In [None]:
from llama_index.core.evaluation import CorrectnessEvaluator
from llama_index.llms.azure_openai import AzureOpenAI

import pandas as pd
import os

from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Dict

In [None]:
# API DETAILS
# Azure OpenAI connection settings are read from the environment; os.getenv
# returns None for any variable that is not set.
AZURE_API_KEY = os.getenv('AZURE_API_KEY')
# NOTE(review): AZURE_DEPLOYMENT_NAME is read here but never used below -- the
# client is created with the literal deployment name "gpt-4o-mini". Confirm
# which of the two is intended.
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_DEPLOYMENT_NAME")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")

# Judge LLM shared by every evaluator in this notebook.
# temperature=0 is presumably chosen to keep the grading as repeatable as possible.
llm_gpt4o = AzureOpenAI(
    deployment_name="gpt-4o-mini",
    temperature=0, 
    api_key=AZURE_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_API_VERSION
)

### Load data

In [None]:
# Evaluation set used to compare the judges: each row holds a question, its
# reference answer and the answer produced by the agent.
df = pd.read_csv('results_no_rewriter_k3_correctness.csv')

# Pull the three text columns out as plain Python lists, in row order
questions, reference_answers, agent_answers = (
    df[column].tolist()
    for column in ('question', 'reference_answer', 'agent_answer')
)

# Quick look at the first few agent answers
print(agent_answers[:5])

#### LlamaIndex Correctness Evaluator

Evaluate the default LlamaIndex CorrectnessEvaluator and save the score and feedback for each question.

In [None]:
import nest_asyncio

# Presumably required so the evaluator can drive async code from inside the
# notebook's already-running event loop.
nest_asyncio.apply()

evaluator = CorrectnessEvaluator(llm=llm_gpt4o)

# One evaluation result per (question, reference answer, agent answer) row
results_default = [
    evaluator.evaluate(query=question, response=agent_answer, reference=ref_answer)
    for question, ref_answer, agent_answer in zip(questions, reference_answers, agent_answers)
]

# Store the numeric scores next to the answers and persist them to the CSV file
correctness_scores = [evaluation.score for evaluation in results_default]
df['correctness_default'] = correctness_scores
df.to_csv('results_no_rewriter_k3_correctness.csv', index=False)

# Mean score over all evaluated rows
total_score = sum(correctness_scores)
average_score = total_score / len(results_default)
print(average_score)

In [None]:
# Collect the textual feedback the default evaluator returned for each row
correctness_reasons1 = [evaluation.feedback for evaluation in results_default]

# Keep the feedback in its own column, next to the scores
df['reason_default'] = correctness_reasons1

# Persist the updated DataFrame back to the CSV file
df.to_csv('results_no_rewriter_k3_correctness.csv', index=False)


### Make Correctness Evaluator

In [None]:
# System prompt for the custom judge ("method 1"): a 1-5 rubric. The model is
# asked to put the score alone on one line and its reasoning on a separate
# line, which is the layout run_correctness_eval parses.
CORRECTNESS_SYS_TMPL = """
You are an expert evaluation system for a question answering chatbot.

You are given the following information:
- a user query,
- a reference answer, and
- a generated answer.

Your job is to judge the correctness of the generated answer.
Output a single score that represents a holistic evaluation.
You must return your response in a line with only the score.
Do not return answers in any other format.
On a separate line provide your reasoning for the score as well.

Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- Use the following criteria for scoring correctness:

1. Score of 1:
    - The generated answer is completely incorrect.
    - The generated answer contains major factual errors or misconceptions.
    - The generated answer does not address any components of the user query correctly.

2. Score of 2:
    - The generated answer has significant mistakes.
    - The generated answer addresses at least one component of the user query correctly but has major errors in other parts.

3. Score of 3:
    - The generated answer is partially correct.
    - The generated answer addresses multiple components of the user query correctly but includes some incorrect information.
    - Minor factual errors are present.

4. Score of 4:
    - The generated answer is mostly correct.
    - The generated answer correctly addresses all components of the user query with minimal errors.
    - The errors do not substantially affect the overall correctness of the answer.

5. Score of 5:
    - The generated answer is completely correct.
    - The generated answer addresses all components of the user query correctly without any errors.
    - The answer is factually accurate and aligns perfectly with the reference answer.
"""

# User message template; {query}, {reference_answer} and {generated_answer}
# are the placeholders filled in for each evaluated row.
CORRECTNESS_USER_TMPL = """
## User Query
{query}

## Reference Answer
{reference_answer}

## Generated Answer
{generated_answer}
"""

In [None]:
# Chat prompt for the judge: the system message carries the grading rubric,
# the user message carries the per-question fields to fill in.
eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=role, content=template)
        for role, template in (
            (MessageRole.SYSTEM, CORRECTNESS_SYS_TMPL),
            (MessageRole.USER, CORRECTNESS_USER_TMPL),
        )
    ]
)

In [None]:
def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm: AzureOpenAI,
    threshold: float = 4.0,
) -> Dict:
    """Grade one generated answer against its reference answer with the judge LLM.

    The module-level ``eval_chat_template`` is filled with the query, the
    reference answer and the generated answer and sent to ``llm``. The reply is
    expected to carry the score on its first non-blank line and the reasoning
    on the lines after it.

    Args:
        query_str: The user question.
        reference_answer: The answer considered correct.
        generated_answer: The answer being graded.
        llm: Chat model used as the judge.
        threshold: Minimum score for the answer to count as passing.

    Returns:
        A dict with ``passing`` (``score >= threshold``), ``score`` (float) and
        ``reason`` (the judge's reasoning; empty when the reply holds only a score).

    Raises:
        ValueError: If no numeric score can be read from the judge's reply.
    """
    fmt_messages = eval_chat_template.format_messages(
        llm=llm,
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = chat_response.message.content or ""

    # Extract from response. partition() copes with a reply that has no newline
    # (score only), and lstrip() skips leading blank lines that would otherwise
    # be mistaken for the score line.
    score_str, _, reasoning_str = raw_output.lstrip().partition("\n")
    # Tolerate a label such as "Score: 4" by keeping what follows the last colon
    score_token = score_str.rpartition(":")[2].strip()
    try:
        score = float(score_token)
    except ValueError as exc:
        raise ValueError(
            f"Could not parse a numeric score from the judge output: {raw_output!r}"
        ) from exc
    reasoning = reasoning_str.lstrip("\n")

    return {"passing": score >= threshold, "score": score, "reason": reasoning}


In [None]:
# Grade every row with the custom prompt (method 1); results keep row order
results_method1 = [
    run_correctness_eval(question, ref_answer, agent_answer, llm=llm_gpt4o, threshold=4.0)
    for question, ref_answer, agent_answer in zip(questions, reference_answers, agent_answers)
]

In [None]:
# Extract the scores produced by the method-1 judge
correctness_scores = [result['score'] for result in results_method1]

# Add the correctness_method1 column to the original DataFrame
df['correctness_method1'] = correctness_scores

# Save the updated DataFrame back to the CSV file
df.to_csv('results_no_rewriter_k3_correctness.csv', index=False)

# Calculate the average score. Divide by the number of method-1 results (not
# results_default) so the cell does not depend on the default evaluator having
# run and stays correct if the two result lists ever differ in length.
total_score = sum(correctness_scores)
average_score = total_score / len(results_method1)
print(average_score)


In [None]:
# System prompt for the second custom judge ("method 2"). Compared with the
# earlier rubric it limits the reasoning to one sentence, adds a worked example
# per score, an evaluation checklist and edge-case rules.
# NOTE: this rebinds CORRECTNESS_SYS_TMPL, replacing the method-1 prompt
# defined earlier in the notebook.
CORRECTNESS_SYS_TMPL = """
You are an expert evaluation system for a question answering chatbot.

You are given the following information:
- a user query,
- a reference answer, and
- a generated answer.

Your job is to judge the correctness of the generated answer.
Output a single score that represents a holistic evaluation.
You must return your response in a line with only the score.
Do not return answers in any other format.
On a separate line provide your reasoning for the score as well.
The reasoning MUST NOT UNDER ANY CIRCUMSTANCES BE LONGER THAN 1 SENTENCE.

Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- Use the following criteria for scoring correctness:

1. Score of 1:
    - The generated answer is completely incorrect.
    - Contains major factual errors or misconceptions.
    - Does not address any components of the user query correctly.
    - Example:
      - Query: "What is the capital of France?"
      - Generated Answer: "The capital of France is Berlin."

2. Score of 2:
    - Significant mistakes are present.
    - Addresses at least one component of the user query correctly but has major errors in other parts.
    - Example:
      - Query: "What is the capital of France and its population?"
      - Generated Answer: "The capital of France is Paris, and its population is 100 million."

3. Score of 3:
    - Partially correct with some incorrect information.
    - Addresses multiple components of the user query correctly.
    - Minor factual errors are present.
    - Example:
      - Query: "What is the capital of France and its population?"
      - Generated Answer: "The capital of France is Paris, and its population is around 3 million."

4. Score of 4:
    - Mostly correct with minimal errors.
    - Correctly addresses all components of the user query.
    - Errors do not substantially affect the overall correctness.
    - Example:
      - Query: "What is the capital of France and its population?"
      - Generated Answer: "The capital of France is Paris, and its population is approximately 2.1 million."

5. Score of 5:
    - Completely correct.
    - Addresses all components of the user query correctly without any errors.
    - Providing more information than necessary should not be penalized as long as all provided information is correct.
    - Example:
      - Query: "What is the capital of France and its population?"
      - Generated Answer: "The capital of France is Paris, and its population is approximately 2.1 million. Paris is known for its rich history and iconic landmarks such as the Eiffel Tower and Notre-Dame Cathedral."

Checklist for Evaluation:
  - Component Coverage: Does the answer cover all parts of the query?
  - Factual Accuracy: Are the facts presented in the answer correct?
  - Error Severity: How severe are any errors present in the answer?
  - Comparison to Reference: How closely does the answer align with the reference answer?

Edge Cases:
  - If the answer includes both correct and completely irrelevant information, focus only on the relevant portions for scoring.
  - If the answer is correct but incomplete, score based on the completeness criteria within the relevant score range.
  - If the answer provides more information than necessary, it should not be penalized as long as all information is correct.
"""

# User message template; {query}, {reference_answer} and {generated_answer}
# are the placeholders filled in for each evaluated row.
CORRECTNESS_USER_TMPL = """
## User Query
{query}

## Reference Answer
{reference_answer}

## Generated Answer
{generated_answer}
"""

# Rebuild the chat prompt from the templates defined in this cell: rubric as
# the system message, per-question fields as the user message.
eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=role, content=template)
        for role, template in (
            (MessageRole.SYSTEM, CORRECTNESS_SYS_TMPL),
            (MessageRole.USER, CORRECTNESS_USER_TMPL),
        )
    ]
)

def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm: AzureOpenAI,
    threshold: float = 4.0,
) -> Dict:
    """Grade one generated answer against its reference answer with the judge LLM.

    The module-level ``eval_chat_template`` is filled with the query, the
    reference answer and the generated answer and sent to ``llm``. The reply is
    expected to carry the score on its first non-blank line and the reasoning
    on the lines after it.

    Args:
        query_str: The user question.
        reference_answer: The answer considered correct.
        generated_answer: The answer being graded.
        llm: Chat model used as the judge.
        threshold: Minimum score for the answer to count as passing.

    Returns:
        A dict with ``passing`` (``score >= threshold``), ``score`` (float) and
        ``reason`` (the judge's reasoning; empty when the reply holds only a score).

    Raises:
        ValueError: If no numeric score can be read from the judge's reply.
    """
    fmt_messages = eval_chat_template.format_messages(
        llm=llm,
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = chat_response.message.content or ""

    # Extract from response. partition() copes with a reply that has no newline
    # (score only), and lstrip() skips leading blank lines that would otherwise
    # be mistaken for the score line.
    score_str, _, reasoning_str = raw_output.lstrip().partition("\n")
    # Tolerate a label such as "Score: 4" by keeping what follows the last colon
    score_token = score_str.rpartition(":")[2].strip()
    try:
        score = float(score_token)
    except ValueError as exc:
        raise ValueError(
            f"Could not parse a numeric score from the judge output: {raw_output!r}"
        ) from exc
    reasoning = reasoning_str.lstrip("\n")

    return {"passing": score >= threshold, "score": score, "reason": reasoning}


import pandas as pd
from tqdm.notebook import tqdm

def process_correctness_scores(file_path, llm, threshold=4.0, num_rows=None):
    """Grade the rows of a results CSV and write the scores back to the same file.

    Reads ``file_path``, runs ``run_correctness_eval`` on the ``question``,
    ``reference_answer`` and ``agent_answer`` columns of the first ``num_rows``
    rows, stores the outcome in the ``correctness_method2`` and
    ``reason_method2`` columns and overwrites ``file_path`` with the result.

    Args:
        file_path: CSV file to read and overwrite.
        llm: Judge model passed through to ``run_correctness_eval``.
        threshold: Passing threshold passed through to ``run_correctness_eval``.
        num_rows: Number of leading rows to grade; ``None`` grades every row.
            Values outside ``0..len(file)`` are clamped to that range.

    Returns:
        The mean score of the graded rows, or NaN when no row was graded.
    """
    # Load the data
    df = pd.read_csv(file_path)

    # Clamp the requested row count so the progress-bar total and the write
    # below always match the number of rows that are actually graded.
    total_rows = len(df)
    if num_rows is None:
        num_rows = total_rows
    num_rows = max(0, min(int(num_rows), total_rows))

    # Extract the 'question', 'reference answer', and 'agent answer' columns
    selected = df.iloc[:num_rows]
    row_iter = zip(
        selected['question'].tolist(),
        selected['reference_answer'].tolist(),
        selected['agent_answer'].tolist(),
    )

    # Use tqdm for the loading bar
    results_method2 = []
    for question, ref_answer, agent_answer in tqdm(row_iter, total=num_rows, desc="Processing"):
        result = run_correctness_eval(question, ref_answer, agent_answer, llm=llm, threshold=threshold)
        results_method2.append(result)

    correctness_scores2 = [result['score'] for result in results_method2]
    correctness_reasons2 = [result['reason'] for result in results_method2]

    # Add correctness scores and reasons to the graded rows only; rows beyond
    # num_rows keep whatever they had (or stay empty).
    if results_method2:
        graded_index = df.index[:num_rows]
        df.loc[graded_index, 'correctness_method2'] = correctness_scores2
        df.loc[graded_index, 'reason_method2'] = correctness_reasons2

    df.to_csv(file_path, index=False)

    # Calculate the average score; with nothing graded there is no meaningful
    # average, so report NaN instead of raising ZeroDivisionError.
    if results_method2:
        average_score = sum(correctness_scores2) / len(results_method2)
    else:
        average_score = float('nan')
    print(f"Average Score: {average_score}")

    return average_score


# Grade the results file below with the method-2 prompt. The call overwrites
# the CSV with the added score/reason columns and returns the mean score.
score = process_correctness_scores(
    file_path='../exp3/class3_HyPA2_k_Q_eu.csv',
    llm=llm_gpt4o,
    threshold=4.0,
    num_rows=None,
)
# Other result files that were graded the same way:
#score = process_correctness_scores(file_path='rewriter_with_topics_n5.csv', llm=llm_gpt4o, threshold=4.0, num_rows=None)
#score = process_correctness_scores(file_path='rewriter_with_topics_n7.csv', llm=llm_gpt4o, threshold=4.0, num_rows=None)


In [None]:
!pip install ipywidgets
!pip install tqdm

!jupyter nbextension enable --py widgetsnbextension --sys-prefix
!jupyter nbextension install --py widgetsnbextension --sys-prefix
!jupyter nbextension enable --py widgetsnbextension


##### Average Scores:
- Default: 4.213754646840148
- Method 1: 3.8513011152416357
- Method 2: 4.074349442379182


##### Runtime for each:

- Default: 14m 23.1s
- Method 1: 19m 04.6s
- Method 2: 45m 27.3s

### Calculation and analysis

In [None]:
# Reload the results file for the agreement analysis in the cells below.
# NOTE(review): those cells read correctness_human, correctness_default,
# correctness_method1, correctness_method2, correctness and metadata from this
# frame -- confirm the CSV on disk contains all of them.
df1 = pd.read_csv('results_no_rewriter_k3_correctness.csv')

In [None]:
# Function to map a judge's score column to True/False
def map_correctness(col):
    """Return a boolean Series that is True where ``df1[col]`` is 4 or higher."""
    return df1[col].ge(4)

# Human labels: True where the annotator recorded 1
df1['correctness_human_mapped'] = df1['correctness_human'].eq(1)

# Binarise each judge's scores with the same >= 4 rule
for _method in ('default', 'method1', 'method2'):
    df1[f'correctness_{_method}_mapped'] = map_correctness(f'correctness_{_method}')

# Alignment score = share of rows where the judge's verdict equals the human verdict
alignment_default = df1['correctness_human_mapped'].eq(df1['correctness_default_mapped']).mean()
alignment_method1 = df1['correctness_human_mapped'].eq(df1['correctness_method1_mapped']).mean()
alignment_method2 = df1['correctness_human_mapped'].eq(df1['correctness_method2_mapped']).mean()

# Share of answers counted as correct: raw mean of the human column, and the
# mean of each judge's True/False column
average_correctness_human = df1['correctness_human'].mean()
average_correctness_default_mapped = df1['correctness_default_mapped'].mean()
average_correctness_method1_mapped = df1['correctness_method1_mapped'].mean()
average_correctness_method2_mapped = df1['correctness_method2_mapped'].mean()

# Print results to 5 decimal places

# Alignment Scores
for _method, _value in (
    ('default', alignment_default),
    ('method1', alignment_method1),
    ('method2', alignment_method2),
):
    print(f"Alignment Score for correctness_{_method}: {_value:.5f}")

# Average Correctness Scores (judges mapped 4+ to 1, below 4 to 0)
print(f"Average Correctness Score for correctness_human: {average_correctness_human:.5f}")
for _method, _value in (
    ('default', average_correctness_default_mapped),
    ('method1', average_correctness_method1_mapped),
    ('method2', average_correctness_method2_mapped),
):
    print(f"Average Correctness Score for correctness_{_method} (Mapped): {_value:.5f}")

In [None]:
# Function to calculate metrics
def calculate_metrics(group):
    """Summarise how well each judge agrees with the human labels in ``group``.

    Args:
        group: DataFrame holding ``correctness_human``,
            ``correctness_human_mapped``, the three ``correctness_*_mapped``
            judge columns and the Giskard verdict (``correctness_mapped`` or
            ``correctness``).

    Returns:
        A Series with one alignment value per judge (share of rows whose
        verdict equals the human verdict) followed by the mean of the human
        column and of each judge's mapped column.
    """
    human_flags = group['correctness_human_mapped']
    methods = ('default', 'method1', 'method2')

    # No cell in this notebook creates 'correctness_mapped', and the later
    # versions of this function compare against 'correctness'; use the mapped
    # column when it exists and fall back to 'correctness' otherwise.
    giskard_col = 'correctness_mapped' if 'correctness_mapped' in group.columns else 'correctness'

    metrics = {}
    for method in methods:
        metrics[f'alignment_{method}'] = (human_flags == group[f'correctness_{method}_mapped']).mean()
    metrics['alignment_giskard'] = (human_flags == group[giskard_col]).mean()

    metrics['average_correctness_human'] = group['correctness_human'].mean()
    for method in methods:
        metrics[f'average_correctness_{method}_mapped'] = group[f'correctness_{method}_mapped'].mean()

    return pd.Series(metrics)

# Build the per-question-type groups from df1, the frame that carries the
# *_mapped columns calculate_metrics reads. `grouped` is not defined by any
# earlier cell, so it is created here.
if 'question_type' not in df1.columns:
    # NOTE(review): eval() runs the metadata strings as Python code, which is
    # only acceptable for a trusted CSV. It is kept to match how the later
    # cells parse this column.
    df1['question_type'] = df1['metadata'].apply(lambda x: eval(x)['question_type'])
grouped = df1.groupby('question_type')

# Apply the function to each group
metrics_by_question_type = grouped.apply(calculate_metrics)

# Calculate overall scores across all question types (on df1, not df: df does
# not have the mapped columns)
overall_metrics = calculate_metrics(df1)
overall_metrics.name = 'Overall'

# Append overall metrics to the dataframe using concat
metrics_by_question_type = pd.concat([metrics_by_question_type, overall_metrics.to_frame().T])

metrics_by_question_type

In [None]:
from scipy.stats import spearmanr

# Function to map a judge's score column to True/False
def map_correctness(col):
    """Return a boolean Series that is True where ``df1[col]`` is 4 or higher."""
    return df1[col].ge(4)

# Human labels: True where the annotator recorded 1
df1['correctness_human_mapped'] = df1['correctness_human'].eq(1)

# Binarise each judge's scores with the same >= 4 rule
for _method in ('default', 'method1', 'method2'):
    df1[f'correctness_{_method}_mapped'] = map_correctness(f'correctness_{_method}')

# Function to calculate metrics
def calculate_metrics(group):
    """Summarise how well each judge agrees with the human labels in ``group``.

    Returns a Series holding, in this order: the alignment of each judge and of
    the Giskard ``correctness`` column with ``correctness_human_mapped`` (share
    of equal rows), the mean of ``correctness_human`` and of each judge's
    mapped column, and the Spearman correlation between the human flags and
    each judge's mapped column.
    """
    methods = ('default', 'method1', 'method2')
    human_flags = group['correctness_human_mapped']
    metrics = {}

    # Share of rows on which the verdict equals the human verdict
    for method in methods:
        metrics[f'alignment_{method}'] = (human_flags == group[f'correctness_{method}_mapped']).mean()
    metrics['alignment_giskard'] = (human_flags == group['correctness']).mean()

    # Share of rows counted as correct
    metrics['average_correctness_human'] = group['correctness_human'].mean()
    for method in methods:
        metrics[f'average_correctness_{method}_mapped'] = group[f'correctness_{method}_mapped'].mean()

    # Compute Spearman correlations
    for method in methods:
        metrics[f'spearman_{method}'] = spearmanr(human_flags, group[f'correctness_{method}_mapped']).correlation

    return pd.Series(metrics)

# Apply the function to the whole dataset
overall_metrics = calculate_metrics(df1)

# Apply the function by question type
# NOTE(review): eval() runs the metadata strings as Python code; that is only
# acceptable if the CSV is trusted. ast.literal_eval would likely be a safer
# parser if the column holds plain dict literals -- confirm the format first.
df1['question_type'] = df1['metadata'].apply(lambda x: eval(x)['question_type'])
grouped_metrics = df1.groupby('question_type').apply(calculate_metrics)
# Only the overall metrics are displayed; grouped_metrics is computed but not shown here
overall_metrics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Assuming df1 is already defined and loaded with the required data
# df1 = pd.read_csv('results_no_rewriter_k3_correctness.csv')  # Example loading data

# Function to map a judge's score column to True/False
def map_correctness(col):
    """Return a boolean Series that is True where ``df1[col]`` is 4 or higher."""
    return df1[col].ge(4)

# Human labels: True where the annotator recorded 1
df1['correctness_human_mapped'] = df1['correctness_human'].eq(1)

# Binarise each judge's scores with the same >= 4 rule
for _method in ('default', 'method1', 'method2'):
    df1[f'correctness_{_method}_mapped'] = map_correctness(f'correctness_{_method}')

# Function to calculate metrics
def calculate_metrics(group):
    """Summarise how well each judge agrees with the human labels in ``group``.

    Returns a Series holding, in this order: the alignment of each judge and of
    the Giskard ``correctness`` column with ``correctness_human_mapped`` (share
    of equal rows), the mean of ``correctness_human`` and of each judge's
    mapped column, and the Spearman correlation between the human flags and
    each judge's mapped column.
    """
    human_flags = group['correctness_human_mapped']
    # Mapped True/False verdicts of each judge, keyed by the method name
    judge_flags = {
        method: group[f'correctness_{method}_mapped']
        for method in ('default', 'method1', 'method2')
    }

    metrics = {
        f'alignment_{method}': (human_flags == flags).mean()
        for method, flags in judge_flags.items()
    }
    metrics['alignment_giskard'] = (human_flags == group['correctness']).mean()
    metrics['average_correctness_human'] = group['correctness_human'].mean()
    metrics.update({
        f'average_correctness_{method}_mapped': flags.mean()
        for method, flags in judge_flags.items()
    })
    # Compute Spearman correlations
    metrics.update({
        f'spearman_{method}': spearmanr(human_flags, flags).correlation
        for method, flags in judge_flags.items()
    })

    return pd.Series(metrics)

# Apply the function to the whole dataset
overall_metrics = calculate_metrics(df1).to_frame('Overall').T

# Apply the function by question type
# NOTE(review): eval() runs the metadata strings as Python code; only
# acceptable for a trusted CSV.
df1['question_type'] = df1['metadata'].apply(lambda x: eval(x)['question_type'])
grouped_metrics = df1.groupby('question_type').apply(calculate_metrics)

# Combine overall metrics with grouped metrics
combined_metrics = pd.concat([grouped_metrics, overall_metrics])

# Extract the alignment metrics
alignment_metrics = combined_metrics[['alignment_default', 'alignment_method1', 'alignment_method2', 'alignment_giskard']]

# Plotting the alignment metrics with enhanced styling
fig, ax = plt.subplots(figsize=(20, 14))  # Make the figure wider

alignment_metrics.plot(kind='bar', ax=ax, color=['coral', 'lightblue', 'indianred', 'cornflowerblue'], width=0.8)  # Adjust width for larger bars
ax.set_title('Percentage Agreement by Question Type', fontsize=40, weight='bold')
ax.set_xlabel('Question Type', fontsize=35)
ax.set_ylabel('Percentage Agreement', fontsize=35)
ax.tick_params(axis='x', labelsize=30)
ax.tick_params(axis='y', labelsize=30)
ax.yaxis.set_major_locator(plt.MaxNLocator(10))  # Set smaller y-axis tick intervals

# The figure is shown without a legend (bar order: Baseline, Prompt 1,
# Prompt 2, Giskard). Remove the legend pandas adds by default directly,
# instead of building a styled legend that is immediately replaced and removed.
ax.get_legend().remove()

# Setting thicker border around the plot
for spine in ax.spines.values():
    spine.set_linewidth(4)

plt.xticks(rotation=65)
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
from scipy.stats import spearmanr

# Binarise every verdict column of df to 1/0.
# NOTE: the columns are overwritten in place, so this cell is not idempotent --
# run a second time, the >= 4 rule is applied to values that are already 0/1.
df['correctness'] = df['correctness'].map({True: 1, False: 0})
for _score_col in ('correctness_default', 'correctness_method1', 'correctness_method2'):
    df[_score_col] = df[_score_col].apply(lambda x: 1 if x >= 4 else 0)
df['correctness_human'] = df['correctness_human'].apply(lambda x: 1 if x == 1 else 0)

# Spearman correlation of each judge with the human labels (full result objects)
spearman_default = spearmanr(df['correctness_default'], df['correctness_human'])
spearman_method1 = spearmanr(df['correctness_method1'], df['correctness_human'])
spearman_method2 = spearmanr(df['correctness_method2'], df['correctness_human'])

spearman_default, spearman_method1, spearman_method2


In [None]:
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

# Start again from the file on disk so the remapping below sees the raw scores
file_path = 'results_no_rewriter_k3_correctness.csv'
df = pd.read_csv(file_path)

# Binarise every verdict column to 1/0
df['correctness'] = df['correctness'].map({True: 1, False: 0})
for _score_col in ('correctness_default', 'correctness_method1', 'correctness_method2'):
    df[_score_col] = df[_score_col].apply(lambda x: 1 if x >= 4 else 0)
df['correctness_human'] = df['correctness_human'].apply(lambda x: 1 if x == 1 else 0)

# Extract question types from metadata
# NOTE(review): eval() runs the metadata strings as Python code; only
# acceptable for a trusted CSV.
df['question_type'] = df['metadata'].apply(lambda x: eval(x)['question_type'])

# Overall Spearman correlation of each judge with the human labels
spearman_default = spearmanr(df['correctness_default'], df['correctness_human']).correlation
spearman_method1 = spearmanr(df['correctness_method1'], df['correctness_human']).correlation
spearman_method2 = spearmanr(df['correctness_method2'], df['correctness_human']).correlation
spearman_giskard = spearmanr(df['correctness'], df['correctness_human']).correlation

# Plot label -> verdict column, in the order the bars are drawn
judge_columns = {
    'Baseline': 'correctness_default',
    'Prompt 1': 'correctness_method1',
    'Prompt 2': 'correctness_method2',
    'Giskard': 'correctness',
}

# Compute Spearman coefficients by question type
spearman_by_question_type = df.groupby('question_type').apply(
    lambda x: pd.Series({
        label: spearmanr(x[column], x['correctness_human']).correlation
        for label, column in judge_columns.items()
    })
)

# Add overall Spearman scores as an extra row
spearman_by_question_type.loc['Overall'] = dict(
    zip(judge_columns, (spearman_default, spearman_method1, spearman_method2, spearman_giskard))
)

# Plotting the scores with enhanced styling
fig, ax = plt.subplots(figsize=(20, 14))  # Make the figure wider

# Plot Spearman's Coefficient: one bar group per question type (plus the
# 'Overall' row), one bar per judge
spearman_by_question_type.plot(kind='bar', ax=ax, color=['coral', 'lightblue', 'indianred', 'cornflowerblue'], width=0.8)
ax.set_title("Spearman's Coefficient by Question Type", fontsize=33, weight='bold')
ax.set_xlabel('Question Type', fontsize=30)
ax.set_ylabel("Spearman's Coefficient", fontsize=30)
ax.tick_params(axis='x', labelsize=26)
ax.tick_params(axis='y', labelsize=26)
# Ask for up to 10 major ticks on the y-axis
ax.yaxis.set_major_locator(plt.MaxNLocator(10))
# Legend is anchored below the axes (negative y offset) and laid out in 4 columns
ax.legend(['Baseline', 'Prompt 1', 'Prompt 2', 'Giskard'], fontsize=24, title_fontsize=30, frameon=True, edgecolor='black', loc='upper center', bbox_to_anchor=(0.5, -0.27), ncol=4, framealpha=1, borderpad=0.7, fancybox=True, shadow=True, facecolor='white')
# Thicker border around the plot
for spine in ax.spines.values():
    spine.set_linewidth(3)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Start again from the file on disk so the remapping below sees the raw scores
file_path = 'results_no_rewriter_k3_correctness.csv'
df = pd.read_csv(file_path)

# Binarise every verdict column to 1/0
df['correctness'] = df['correctness'].map({True: 1, False: 0})
for _score_col in ('correctness_default', 'correctness_method1', 'correctness_method2'):
    df[_score_col] = df[_score_col].apply(lambda x: 1 if x >= 4 else 0)
df['correctness_human'] = df['correctness_human'].apply(lambda x: 1 if x == 1 else 0)

# Extract question types from metadata
# NOTE(review): eval() runs the metadata strings as Python code; only
# acceptable for a trusted CSV.
df['question_type'] = df['metadata'].apply(lambda x: eval(x)['question_type'])

# Plot label -> verdict column, in the order the bars are drawn
judge_columns = {
    'Baseline': 'correctness_default',
    'Prompt 1': 'correctness_method1',
    'Prompt 2': 'correctness_method2',
    'Giskard': 'correctness',
}

# Compute the p-values for Spearman coefficients by question type
spearman_pvalues_by_question_type = df.groupby('question_type').apply(
    lambda x: pd.Series({
        label: spearmanr(x[column], x['correctness_human']).pvalue
        for label, column in judge_columns.items()
    })
)

# Add overall Spearman p-values (computed on all rows) as an extra row
spearman_pvalues_by_question_type.loc['Overall'] = {
    label: spearmanr(df[column], df['correctness_human']).pvalue
    for label, column in judge_columns.items()
}

# Plotting the p-values for Spearman coefficients with enhanced styling
fig, ax = plt.subplots(figsize=(20, 14))  # Make the figure wider

# Plot Spearman p-values by question type: one bar group per question type
# (plus the 'Overall' row), one bar per judge
spearman_pvalues_by_question_type.plot(kind='bar', ax=ax, color=['coral', 'lightblue', 'indianred', 'cornflowerblue'], width=0.85)
ax.set_title("P-Values for Spearman Coefficient by Question Type", fontsize=40, weight='bold')
ax.set_xlabel('Question Type', fontsize=35)
ax.set_ylabel("P-Value", fontsize=35)
ax.tick_params(axis='x', labelsize=30)
ax.tick_params(axis='y', labelsize=30)
# Ask for up to 10 major ticks on the y-axis
ax.yaxis.set_major_locator(plt.MaxNLocator(10))
# Thicker border around the plot
for spine in ax.spines.values():
    spine.set_linewidth(4)
ax.set_xticklabels(ax.get_xticklabels(), rotation=65)
ax.get_legend().remove()  # Remove the legend

# Add dashed horizontal lines for specified p-values with different colors and labels
thresholds = [0.05, 0.10, 0.20, 0.50]
colors = ['red', 'blue', 'green', 'purple']
# Place the labels slightly left of the right-hand edge of the x-axis
x_lim = ax.get_xlim()[1] - 0.25
for threshold, color in zip(thresholds, colors):
    ax.axhline(y=threshold, color=color, linestyle='--', linewidth=2)
    ax.text(x_lim, threshold, f'p = {threshold}', color=color, 
            ha='right', va='bottom', fontsize=25, weight='bold', backgroundcolor='white')

plt.tight_layout()
plt.show()

# Display the p-values
spearman_pvalues_by_question_type


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Start again from the file on disk so the remapping below sees the raw scores
file_path = '../exp2/results/no_rewriter_rag_results/results_no_rewriter_k3_correctness.csv'
df = pd.read_csv(file_path)

# Binarise every verdict column to 1/0
df['correctness'] = df['correctness'].map({True: 1, False: 0})
for _score_col in ('correctness_default', 'correctness_method1', 'correctness_method2'):
    df[_score_col] = df[_score_col].apply(lambda x: 1 if x >= 4 else 0)
df['correctness_human'] = df['correctness_human'].apply(lambda x: 1 if x == 1 else 0)

# Extract question types from metadata
# NOTE(review): eval() runs the metadata strings as Python code; only
# acceptable for a trusted CSV.
df['question_type'] = df['metadata'].apply(lambda x: eval(x)['question_type'])

# Overall Spearman correlation of each judge with the human labels
spearman_default = spearmanr(df['correctness_default'], df['correctness_human']).correlation
spearman_method1 = spearmanr(df['correctness_method1'], df['correctness_human']).correlation
spearman_method2 = spearmanr(df['correctness_method2'], df['correctness_human']).correlation
spearman_giskard = spearmanr(df['correctness'], df['correctness_human']).correlation

# Plot label -> verdict column, in the order the bars are drawn
judge_columns = {
    'Baseline': 'correctness_default',
    'Prompt 1': 'correctness_method1',
    'Prompt 2': 'correctness_method2',
    'Giskard': 'correctness',
}

# Per question type, keep the full spearmanr result (coefficient and p-value)
# of every judge in one cell of the frame
spearman_results = df.groupby('question_type').apply(
    lambda x: pd.Series({
        label: spearmanr(x[column], x['correctness_human'])
        for label, column in judge_columns.items()
    })
)

# Extract correlations and p-values into separate dataframes
spearman_by_question_type = spearman_results.map(lambda result: result.correlation)
spearman_pvalues_by_question_type = spearman_results.map(lambda result: result.pvalue)

# Add overall Spearman scores as an extra row
spearman_by_question_type.loc['Overall'] = dict(
    zip(judge_columns, (spearman_default, spearman_method1, spearman_method2, spearman_giskard))
)

# Function to calculate overall Spearman scores excluding values based on p-value threshold
def calculate_overall_spearman_excluding(df, pvalue_threshold):
    filtered_df = df.map(lambda x: x if x.pvalue <= pvalue_threshold else None).dropna(how='any')
    overall_spearman = filtered_df.map(lambda x: x.correlation).mean()
    return overall_spearman

# Run calculate_overall_spearman_excluding once per p-value cut-off and append each
# result as an extra row, so it can be plotted next to the plain 'Overall' row.
spearman_exclude_0_5 = calculate_overall_spearman_excluding(spearman_results, 0.5)
spearman_by_question_type.loc['Overall Excluding p > 0.5'] = spearman_exclude_0_5

spearman_exclude_0_2 = calculate_overall_spearman_excluding(spearman_results, 0.2)
spearman_by_question_type.loc['Overall Excluding p > 0.2'] = spearman_exclude_0_2

spearman_exclude_0_1 = calculate_overall_spearman_excluding(spearman_results, 0.1)
spearman_by_question_type.loc['Overall Excluding p > 0.1'] = spearman_exclude_0_1

spearman_exclude_0_05 = calculate_overall_spearman_excluding(spearman_results, 0.05)
spearman_by_question_type.loc['Overall Excluding p > 0.05'] = spearman_exclude_0_05

# ---- Figure 1: coefficient per question type (all 'Overall...' rows left out) ----
fig, ax = plt.subplots(figsize=(20, 14))

spearman_by_question_type.drop([
    'Overall',
    'Overall Excluding p > 0.5',
    'Overall Excluding p > 0.2',
    'Overall Excluding p > 0.1',
    'Overall Excluding p > 0.05',
]).plot.bar(
    ax=ax,
    color=['coral', 'lightblue', 'indianred', 'cornflowerblue'],
    width=0.8,
)
ax.set_title("Spearman's Coefficient by Question Type", fontsize=40, weight='bold')
ax.set_xlabel('Question Type', fontsize=35)
ax.set_ylabel("Spearman's Coefficient", fontsize=35)
ax.tick_params(axis='both', labelsize=30)
ax.yaxis.set_major_locator(plt.MaxNLocator(10))
ax.set_xticklabels(ax.get_xticklabels(), rotation=65)
for border in ax.spines.values():
    border.set_linewidth(4)
# This figure is shown without a legend.
ax.get_legend().remove()

plt.tight_layout()
plt.show()

# ---- Figure 2: overall coefficient, plain and per p-value cut-off ----
fig, ax = plt.subplots(figsize=(20, 16))

# Keep only the summary rows and give them short tick labels.
overall_scores = spearman_by_question_type.loc[[
    'Overall',
    'Overall Excluding p > 0.5',
    'Overall Excluding p > 0.2',
    'Overall Excluding p > 0.1',
    'Overall Excluding p > 0.05',
]]
overall_scores.index = ['Overall', 'Ex. p > 0.5', 'Ex. p > 0.2', 'Ex. p > 0.1', 'Ex. p > 0.05']

overall_scores.plot.bar(
    ax=ax,
    color=['coral', 'lightblue', 'indianred', 'cornflowerblue'],
    width=0.8,
)
ax.set_title("Overall Spearman's Coefficient Excluding High P-Values", fontsize=40, weight='bold')
ax.set_xlabel('P-Value Exclusion Criteria', fontsize=35)
ax.set_ylabel("Spearman's Coefficient", fontsize=35)
ax.tick_params(axis='both', labelsize=30)
ax.yaxis.set_major_locator(plt.MaxNLocator(10))
# Legend anchored at y = -0.55 in axes coordinates, i.e. below the axes.
ax.legend(
    ['Baseline', 'Prompt 1', 'Prompt 2', 'Giskard'],
    fontsize=30,
    title_fontsize=35,
    frameon=True,
    edgecolor='black',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.55),
    ncol=4,
    framealpha=1.,
    borderpad=.85,
    fancybox=True,
    shadow=False,
    facecolor='white',
)
for border in ax.spines.values():
    border.set_linewidth(4)
ax.set_xticklabels(ax.get_xticklabels(), rotation=65)

plt.tight_layout()
plt.show()

# ---- Figure 3: p-value of every per-question-type coefficient ----
fig, ax = plt.subplots(figsize=(20, 14))

spearman_pvalues_by_question_type.plot.bar(
    ax=ax,
    color=['coral', 'lightblue', 'indianred', 'cornflowerblue'],
    width=0.85,
)
ax.set_title("P-Values for Spearman Coefficient by Question Type", fontsize=40, weight='bold')
ax.set_xlabel('Question Type', fontsize=35)
ax.set_ylabel("P-Value", fontsize=35)
ax.tick_params(axis='both', labelsize=30)
ax.yaxis.set_major_locator(plt.MaxNLocator(10))
for border in ax.spines.values():
    border.set_linewidth(4)
ax.set_xticklabels(ax.get_xticklabels(), rotation=65)
ax.get_legend().remove()

# Dashed reference line plus a text label for each cut-off used above.
# The label x-position is taken half a unit left of the right axis limit.
label_x = ax.get_xlim()[1] - 0.5
for threshold, color in ((0.05, 'red'), (0.10, 'blue'), (0.20, 'green'), (0.50, 'purple')):
    ax.axhline(y=threshold, color=color, linestyle='--', linewidth=2)
    ax.text(
        label_x, threshold, f'p = {threshold}',
        color=color, ha='right', va='bottom',
        fontsize=25, weight='bold', backgroundcolor='white',
    )

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Start again from the file on disk so this cell does not depend on an earlier `df`.
file_path = '../exp2/results/no_rewriter_rag_results/results_no_rewriter_k3_correctness.csv'
df = pd.read_csv(file_path)

# Binarise every evaluator column (1 = judged correct, 0 = not):
#   * 'correctness' is mapped True -> 1 / False -> 0 (any other value becomes NaN),
#   * the three scored columns count as correct from a score of 4 upwards,
#   * the human label counts as correct only when it equals 1.
df['correctness'] = df['correctness'].map({True: 1, False: 0})
for scored_column in ('correctness_default', 'correctness_method1', 'correctness_method2'):
    df[scored_column] = df[scored_column].apply(lambda score: 1 if score >= 4 else 0)
df['correctness_human'] = df['correctness_human'].apply(lambda label: 1 if label == 1 else 0)

# Question type comes from the stringified metadata.
# NOTE(review): eval() executes whatever text is stored in the 'metadata' column. That is
# only acceptable for a trusted, self-produced CSV; if the values are plain dict literals,
# ast.literal_eval would be the safer parser - confirm before switching.
df['question_type'] = df['metadata'].apply(lambda raw_metadata: eval(raw_metadata)['question_type'])

# Whole-dataset Spearman correlation of each automatic evaluator with the human label.
spearman_default, spearman_method1, spearman_method2, spearman_giskard = (
    spearmanr(df[evaluator_column], df['correctness_human']).correlation
    for evaluator_column in (
        'correctness_default',
        'correctness_method1',
        'correctness_method2',
        'correctness',
    )
)

# Per-question-type results; every cell keeps the full spearmanr result object
# (coefficient and p-value) for the two extraction steps below.
spearman_results = df.groupby('question_type').apply(
    lambda group: pd.Series({
        method_label: spearmanr(group[evaluator_column], group['correctness_human'])
        for method_label, evaluator_column in (
            ('Baseline', 'correctness_default'),
            ('Prompt 1', 'correctness_method1'),
            ('Prompt 2', 'correctness_method2'),
            ('Giskard', 'correctness'),
        )
    })
)

# One frame with the coefficients, one with the matching p-values.
spearman_by_question_type = spearman_results.map(lambda result: result.correlation)
spearman_pvalues_by_question_type = spearman_results.map(lambda result: result.pvalue)

# Extra 'Overall' row holding the whole-dataset coefficients.
spearman_by_question_type.loc['Overall'] = dict(
    zip(
        ['Baseline', 'Prompt 1', 'Prompt 2', 'Giskard'],
        [spearman_default, spearman_method1, spearman_method2, spearman_giskard],
    )
)

# Function to calculate overall Spearman scores excluding values based on p-value threshold
def calculate_overall_spearman_excluding(df, pvalue_threshold):
    filtered_df = df.map(lambda x: x if x.pvalue <= pvalue_threshold else None).dropna(how='any')
    overall_spearman = filtered_df.map(lambda x: x.correlation).mean()
    return overall_spearman

# Run calculate_overall_spearman_excluding once per p-value cut-off and append each
# result as an extra row next to the plain 'Overall' row.
spearman_exclude_0_5 = calculate_overall_spearman_excluding(spearman_results, 0.5)
spearman_by_question_type.loc['Overall Excluding p > 0.5'] = spearman_exclude_0_5

spearman_exclude_0_2 = calculate_overall_spearman_excluding(spearman_results, 0.2)
spearman_by_question_type.loc['Overall Excluding p > 0.2'] = spearman_exclude_0_2

spearman_exclude_0_1 = calculate_overall_spearman_excluding(spearman_results, 0.1)
spearman_by_question_type.loc['Overall Excluding p > 0.1'] = spearman_exclude_0_1

spearman_exclude_0_05 = calculate_overall_spearman_excluding(spearman_results, 0.05)
spearman_by_question_type.loc['Overall Excluding p > 0.05'] = spearman_exclude_0_05

# Square figure that shows only the summary rows.
fig, ax = plt.subplots(figsize=(14, 14))

overall_scores = spearman_by_question_type.loc[[
    'Overall',
    'Overall Excluding p > 0.5',
    'Overall Excluding p > 0.2',
    'Overall Excluding p > 0.1',
    'Overall Excluding p > 0.05',
]]
# Shorter row labels for the x axis.
overall_scores.index = ['Overall', 'Ex. p > 0.5', 'Ex. p > 0.2', 'Ex. p > 0.1', 'Ex. p > 0.05']

overall_scores.plot.bar(
    ax=ax,
    color=['coral', 'lightblue', 'indianred', 'cornflowerblue'],
    width=0.8,
)
# No title is set on this figure.
ax.set_xlabel('P-Value Exclusion Criteria', fontsize=40)
ax.set_ylabel("Spearman's Coefficient", fontsize=40)
ax.tick_params(axis='both', labelsize=35)
ax.yaxis.set_major_locator(plt.MaxNLocator(10))

# Single-row legend anchored at y = 1.15 in axes coordinates, i.e. above the top
# edge of the plot, with a thick black frame.
legend = ax.legend(
    ['Baseline', 'Prompt 1', 'Prompt 2', 'Giskard'],
    fontsize=23.4,
    title_fontsize=40,
    frameon=True,
    loc='upper center',
    bbox_to_anchor=(0.5, 1.15),
    ncol=4,
    framealpha=1.,
    borderpad=.9,
    fancybox=True,
    shadow=False,
)
legend_frame = legend.get_frame()
legend_frame.set_linewidth(3)
legend_frame.set_edgecolor('black')

for border in ax.spines.values():
    border.set_linewidth(4)
# Vertical tick labels.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Question type comes from the stringified metadata.
# NOTE(review): eval() executes whatever text is stored in the 'metadata' column; fine
# only for a trusted CSV - ast.literal_eval is the safer parser if the values are plain
# dict literals (confirm).
df['question_type'] = df['metadata'].apply(lambda raw_metadata: eval(raw_metadata)['question_type'])

# Column means over all rows, then the same means per question type.
# NOTE(review): uses whatever `df` is currently in the kernel - presumably the binarised
# frame from the preceding cell, in which case each mean is the share of answers judged
# correct. Confirm the intended input.
overall_scores = df[['correctness_default', 'correctness_method1', 'correctness_method2', 'correctness']].mean()
metrics_by_question_type = (
    df.groupby('question_type')[
        ['correctness_default', 'correctness_method1', 'correctness_method2', 'correctness']
    ].mean()
)

# The 'Overall' row has to be added while the columns still carry their original
# names, because the Series is matched to the columns by label.
metrics_by_question_type.loc['Overall'] = overall_scores

# Display names used in the chart.
metrics_by_question_type.columns = ['Baseline', 'Prompt 1', 'Prompt 2', 'Giskard']

# Grouped bar chart: one group per question type, one bar per evaluator.
fig, ax = plt.subplots(figsize=(20, 14))

metrics_by_question_type.plot.bar(
    ax=ax,
    color=['coral', 'lightblue', 'indianred', 'cornflowerblue'],
    width=0.8,
)
ax.set_title('Evaluation Metrics by Question Type', fontsize=33, weight='bold')
ax.set_xlabel('Question Type', fontsize=30)
ax.set_ylabel('Evaluation Score', fontsize=30)
ax.tick_params(axis='both', labelsize=26)
ax.yaxis.set_major_locator(plt.MaxNLocator(10))
# Legend anchored at y = -0.27 in axes coordinates, i.e. below the axes, in one row.
ax.legend(
    ['Baseline', 'Prompt 1', 'Prompt 2', 'Giskard'],
    fontsize=24,
    title_fontsize=30,
    frameon=True,
    edgecolor='black',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.27),
    ncol=4,
    framealpha=1,
    borderpad=0.7,
    fancybox=True,
    shadow=True,
    facecolor='white',
)

# Slightly thicker frame around the plot area.
for border in ax.spines.values():
    border.set_linewidth(2)

plt.xticks(rotation=45)
plt.tight_layout()

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

# F1 score of every automatic evaluator over all rows, with the human label as ground truth.
f1_scores = {
    method_label: f1_score(df['correctness_human'], df[prediction_column])
    for method_label, prediction_column in (
        ('Baseline', 'correctness_default'),
        ('Prompt 1', 'correctness_method1'),
        ('Prompt 2', 'correctness_method2'),
        ('Giskard', 'correctness'),
    )
}

# Same scores computed separately inside each question type.
f1_scores_by_question_type = df.groupby('question_type').apply(
    lambda group: pd.Series({
        method_label: f1_score(group['correctness_human'], group[prediction_column])
        for method_label, prediction_column in (
            ('Baseline', 'correctness_default'),
            ('Prompt 1', 'correctness_method1'),
            ('Prompt 2', 'correctness_method2'),
            ('Giskard', 'correctness'),
        )
    })
)

# Whole-dataset scores become the final 'Overall' row.
f1_scores_by_question_type.loc['Overall'] = f1_scores

# Grouped bar chart: one group per question type (plus 'Overall'), one bar per evaluator.
fig, ax = plt.subplots(figsize=(20, 14))

f1_scores_by_question_type.plot.bar(
    ax=ax,
    color=['coral', 'lightblue', 'indianred', 'cornflowerblue'],
    width=0.8,
)
ax.set_title('F1 Scores by Question Type', fontsize=40, weight='bold')
ax.set_xlabel('Question Type', fontsize=35)
ax.set_ylabel('F1 Score', fontsize=35)
ax.tick_params(axis='both', labelsize=30)
ax.yaxis.set_major_locator(plt.MaxNLocator(10))
# This figure is shown without a legend.
ax.get_legend().remove()

# Thick frame around the plot area.
for border in ax.spines.values():
    border.set_linewidth(4)

plt.xticks(rotation=65)
plt.tight_layout()

plt.show()


In [None]:
import pandas as pd
from sklearn.metrics import f1_score

# Relies on the `df` already present in the kernel.
# NOTE(review): presumably the frame with 0/1 evaluator columns built in the earlier
# cells, not a fresh read of the CSV - confirm.

# F1 score of each automatic evaluator against the human label.
f1_scores = {
    method_label: f1_score(df['correctness_human'], df[prediction_column])
    for method_label, prediction_column in (
        ('Baseline', 'correctness_default'),
        ('Prompt 1', 'correctness_method1'),
        ('Prompt 2', 'correctness_method2'),
        ('Giskard', 'correctness'),
    )
}

# One "<method>: <score>" line per evaluator, four decimals.
print("\n".join(f"{method}: {score:.4f}" for method, score in f1_scores.items()))


In [None]:
import seaborn as sns

# Imported here so the cell does not depend on `confusion_matrix` having been
# brought into the kernel by some other cell.
from sklearn.metrics import confusion_matrix

# Confusion matrix of every automatic evaluator against the human label.
# Rows are the human label ("Actual"), columns the evaluator verdict ("Predicted").
# labels=[0, 1] pins the shape to 2x2 even if one of the classes never occurs,
# so the four panels stay comparable.
conf_matrix_default = confusion_matrix(df['correctness_human'], df['correctness_default'], labels=[0, 1])
conf_matrix_method1 = confusion_matrix(df['correctness_human'], df['correctness_method1'], labels=[0, 1])
conf_matrix_method2 = confusion_matrix(df['correctness_human'], df['correctness_method2'], labels=[0, 1])
conf_matrix_giskard = confusion_matrix(df['correctness_human'], df['correctness'], labels=[0, 1])

# 2x2 grid of heatmaps, filled row by row: Baseline, Prompt 1, Prompt 2, Giskard.
fig, axes = plt.subplots(2, 2, figsize=(20, 14))

panels = [
    ('Baseline', conf_matrix_default),
    ('Prompt 1', conf_matrix_method1),
    ('Prompt 2', conf_matrix_method2),
    ('Giskard', conf_matrix_giskard),
]
for panel_ax, (method_label, matrix) in zip(axes.flat, panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel_ax, annot_kws={"size": 20})
    panel_ax.set_title(f'Confusion Matrix: {method_label}', fontsize=24)
    panel_ax.set_xlabel('Predicted', fontsize=20)
    panel_ax.set_ylabel('Actual', fontsize=20)
    panel_ax.tick_params(axis='both', which='major', labelsize=18)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score

# Whole-dataset precision and recall of each automatic evaluator, with the human
# label as ground truth.
precision_default, precision_method1, precision_method2, precision_giskard = (
    precision_score(df['correctness_human'], df[prediction_column])
    for prediction_column in (
        'correctness_default',
        'correctness_method1',
        'correctness_method2',
        'correctness',
    )
)
recall_default, recall_method1, recall_method2, recall_giskard = (
    recall_score(df['correctness_human'], df[prediction_column])
    for prediction_column in (
        'correctness_default',
        'correctness_method1',
        'correctness_method2',
        'correctness',
    )
)


def _precision_recall_row(group):
    """Return precision and recall of every evaluator for the rows in ``group``."""
    human = group['correctness_human']
    return pd.Series({
        'Baseline Precision': precision_score(human, group['correctness_default']),
        'Baseline Recall': recall_score(human, group['correctness_default']),
        'Prompt 1 Precision': precision_score(human, group['correctness_method1']),
        'Prompt 1 Recall': recall_score(human, group['correctness_method1']),
        'Prompt 2 Precision': precision_score(human, group['correctness_method2']),
        'Prompt 2 Recall': recall_score(human, group['correctness_method2']),
        'Giskard Precision': precision_score(human, group['correctness']),
        'Giskard Recall': recall_score(human, group['correctness']),
    })


# One row per question type, one column per (evaluator, metric) pair.
precision_recall_by_question_type = df.groupby('question_type').apply(_precision_recall_row)

# Whole-dataset values become the final 'Overall' row.
precision_recall_by_question_type.loc['Overall'] = {
    'Baseline Precision': precision_default,
    'Baseline Recall': recall_default,
    'Prompt 1 Precision': precision_method1,
    'Prompt 1 Recall': recall_method1,
    'Prompt 2 Precision': precision_method2,
    'Prompt 2 Recall': recall_method2,
    'Giskard Precision': precision_giskard,
    'Giskard Recall': recall_giskard,
}

# Precision and recall are drawn as two separate figures with the same layout.
def _plot_metric_by_question_type(scores, metric_name, spine_linewidth):
    """Draw one grouped bar chart of ``metric_name`` per question type and show it.

    Parameters
    ----------
    scores : DataFrame with columns named '<method> <metric_name>' for the methods
        'Baseline', 'Prompt 1', 'Prompt 2' and 'Giskard'.
    metric_name : 'Precision' or 'Recall'; selects the columns and is used in the
        title and the y-axis label.
    spine_linewidth : line width of the frame around the plot area.

    Returns
    -------
    (fig, ax) of the figure that was shown.
    """
    fig, ax = plt.subplots(figsize=(20, 14))

    metric_columns = [
        f'{method_label} {metric_name}'
        for method_label in ('Baseline', 'Prompt 1', 'Prompt 2', 'Giskard')
    ]
    scores[metric_columns].plot(
        kind='bar',
        ax=ax,
        color=['coral', 'lightblue', 'indianred', 'cornflowerblue'],
        width=0.8,
    )
    ax.set_title(f"{metric_name} by Question Type", fontsize=40, weight='bold')
    ax.set_xlabel('Question Type', fontsize=35)
    ax.set_ylabel(metric_name, fontsize=35)
    ax.tick_params(axis='x', labelsize=30)
    ax.tick_params(axis='y', labelsize=30)
    ax.yaxis.set_major_locator(plt.MaxNLocator(10))
    for spine in ax.spines.values():
        spine.set_linewidth(spine_linewidth)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=65)
    # The figures are shown without a legend. Removing the one pandas adds by default
    # is enough; building a styled legend first and then removing it has no visible effect.
    ax.get_legend().remove()
    plt.tight_layout()
    plt.show()
    return fig, ax


# The two figures keep their original, slightly different frame widths (3 and 4).
fig, ax1 = _plot_metric_by_question_type(precision_recall_by_question_type, 'Precision', spine_linewidth=3)
fig, ax2 = _plot_metric_by_question_type(precision_recall_by_question_type, 'Recall', spine_linewidth=4)
