In [None]:
!pip install pandas
!pip install evaluate
!pip install bert_score
!pip install ragas

# Calculate BERTScore

In [None]:
from evaluate import load
import pandas as pd

# Load the BERTScore metric through the `evaluate` library
bertscore = load("bertscore")

# Load the Excel file into a pandas DataFrame
excel_file = 'dataset.xlsx'  # Path to your dataset Excel file
df = pd.read_excel(excel_file)

# Ensure the DataFrame has every column this cell reads.
# 'question_type' is part of the check because it is extracted below and
# copied into `df_score`; without it the cell would fail with a bare KeyError
# instead of the explanatory ValueError.
required_columns = ['question', 'question_type', 'essence_coach', 'chatgpt_4o', 'ground_truth']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The Excel file must contain the following columns: {required_columns}")

# Extract data for processing (one list entry per spreadsheet row)
questions = df['question'].tolist()
question_type = df['question_type'].tolist()
predictions_coach = df['essence_coach'].tolist()
predictions_gpt = df['chatgpt_4o'].tolist()
references = df['ground_truth'].tolist()

# Compute BERTScore for Essence Coach answers against the ground truth
results_coach = bertscore.compute(predictions=predictions_coach, references=references, lang="en")

# Compute BERTScore for ChatGPT answers against the same ground truth
results_gpt = bertscore.compute(predictions=predictions_gpt, references=references, lang="en")

# Combine the per-row precision/recall/F1 of both systems with the questions
df_score = pd.DataFrame({
    'question': questions,
    'question_type': question_type,
    'precision_coach': results_coach['precision'],
    'recall_coach': results_coach['recall'],
    'f1_coach': results_coach['f1'],
    'precision_gpt': results_gpt['precision'],
    'recall_gpt': results_gpt['recall'],
    'f1_gpt': results_gpt['f1'],
})

# Save the evaluation scores to a CSV file
df_score.to_csv('scores_bert.csv', index=False)

print("Evaluation scores saved to 'scores_bert.csv'")


# BERTScore Averages

In [None]:
# Score columns to average, in the order they are written to the output file
score_columns = ['precision_coach', 'recall_coach', 'f1_coach',
                 'precision_gpt', 'recall_gpt', 'f1_gpt']

# Question type of every scored row, used to build the per-type row masks
type_labels = df_score['question_type']

# Mean of each score column over all rows
overall_avg = df_score[score_columns].mean()

# Mean of each score column restricted to one question type at a time
info_avg = df_score.loc[type_labels == 'information', score_columns].mean()
decision_avg = df_score.loc[type_labels == 'decision-making', score_columns].mean()
translation_avg = df_score.loc[type_labels == 'translation', score_columns].mean()

# One row per metric, one column per question-type grouping
avg_data = pd.DataFrame({
    'Metric': list(score_columns),
    'Overall': overall_avg.to_numpy(),
    'Information': info_avg.to_numpy(),
    'Decision-Making': decision_avg.to_numpy(),
    'Translation': translation_avg.to_numpy(),
})

# Persist the averages next to the per-row scores
avg_data.to_csv('scores_bert_avg.csv', index=False)

print("Average scores saved to 'scores_bert_avg.csv'")


# Human Evaluation Averages

In [None]:
import pandas as pd

# Load the Excel file into a pandas DataFrame
excel_file = 'dataset.xlsx'  # Path to your dataset Excel file
df = pd.read_excel(excel_file)

# Human-rating columns that are averaged
rating_columns = [
    'relevance_coach', 'accuracy_coach', 'completeness_coach',
    'relevance_gpt', 'accuracy_gpt', 'completeness_gpt'
]

# Ensure the DataFrame has the required columns
required_columns = rating_columns + ['question_type']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The Excel file must contain the following columns: {required_columns}")

# Compute the overall averages (one value per rating column)
overall_avg = df[rating_columns].mean()

# Compute averages for each question_type
grouped_avg = df.groupby('question_type')[rating_columns].mean()

# Turn the overall averages into a single row and label it in 'question_type'.
# The CSV is written without the index, so a label kept only in the index
# would be lost and the row would be saved with an empty question_type.
overall_avg_df = overall_avg.to_frame().T
overall_avg_df['question_type'] = 'Overall'

# Move 'question_type' from the index back into a regular column
grouped_avg_df = grouped_avg.reset_index()

# Overall row first, then one row per question type
combined_df = pd.concat([overall_avg_df, grouped_avg_df], ignore_index=True)

# Save the combined averages to a CSV file
combined_df.to_csv('scores_human_avg.csv', index=False)

print("Averages saved to 'scores_human_avg.csv'")


# Table images

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the averaged BERTScore results and keep 3 decimal places for display
csv_file = 'scores_bert_avg.csv'  # Replace with your file path
df_avg = pd.read_csv(csv_file).round(3)

# Flip the table: metrics become columns, each category becomes a row whose
# label is stored in a 'Category' column
df_avg_transposed = (
    df_avg
    .set_index('Metric')
    .T
    .reset_index()
    .rename(columns={'index': 'Category'})
)

# Blank canvas: the axes only host the table, so hide them
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust size as needed
ax.axis('tight')
ax.axis('off')

# Render the DataFrame as a centered Matplotlib table
table = ax.table(
    cellText=df_avg_transposed.values,
    colLabels=df_avg_transposed.columns,
    cellLoc='center',
    loc='center',
)

# Fixed font size, column widths fitted to their content
n_columns = df_avg_transposed.shape[1]
table.auto_set_font_size(False)
table.set_fontsize(10)
table.auto_set_column_width(col=list(range(n_columns)))

# Write the figure to disk, then display it
output_image = 'scores_bert_avg_table_inverted.png'  # File name for the image
plt.savefig(output_image, dpi=300, bbox_inches='tight')
plt.show()

print(f"Table saved as an image: {output_image}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV file
csv_file = 'scores_human_avg.csv'  # Replace with your file path
df_human_avg = pd.read_csv(csv_file)

# Ensure the required columns are present
required_columns = ['relevance_coach', 'accuracy_coach', 'completeness_coach',
                    'relevance_gpt', 'accuracy_gpt', 'completeness_gpt', 'question_type']

if not all(col in df_human_avg.columns for col in required_columns):
    raise ValueError(f"The CSV file must contain the following columns: {required_columns}")

# Every required column except the trailing 'question_type' is a score column
metric_columns = required_columns[:-1]

# Normalized row labels. The file already holds averages: a row with an empty
# question_type (or one labeled "Overall") carries the overall averages, and
# the remaining rows carry the per-question-type averages.
row_labels = (
    df_human_avg['question_type']
    .fillna('overall')
    .astype(str)
    .str.strip()
    .str.lower()
)

# Pick the stored row for each category instead of averaging the rows again:
# taking the mean over all rows would mix the overall row with the per-type
# rows and report a wrong "overall" value.
categories = ['overall', 'information', 'decision-making', 'translation']
averages = []
for category in categories:
    filtered = df_human_avg.loc[row_labels == category, metric_columns]
    # Normally exactly one row matches, so the mean is that row itself;
    # a missing category yields NaN values. Round to 3 decimals for display.
    category_avg = filtered.mean().round(3).tolist()
    averages.append([category] + category_avg)

# Create a new DataFrame for the table
table_df = pd.DataFrame(averages, columns=['Category'] + metric_columns)

# Create the table using Matplotlib; the axes only host the table, so hide them
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust size as needed
ax.axis('tight')
ax.axis('off')

# Create the table
table = ax.table(cellText=table_df.values,
                 colLabels=table_df.columns,
                 cellLoc='center',
                 loc='center')

# Style the table
table.auto_set_font_size(False)
table.set_fontsize(10)
table.auto_set_column_width(col=list(range(len(table_df.columns))))

# Save the table as an image
output_image = 'scores_human_avg_table.png'  # File name for the image
plt.savefig(output_image, dpi=300, bbox_inches='tight')
plt.show()

print(f"Table saved as an image: {output_image}")


# Bar plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the CSV file
csv_file = 'scores_human_avg.csv'  # Replace with your file path
df_human_avg = pd.read_csv(csv_file)

# Ensure the required columns are present. 'question_type' is validated too,
# because it is indexed right below; skipping it would only trade this
# explanatory error for a bare KeyError.
required_columns = ['relevance_coach', 'accuracy_coach', 'completeness_coach',
                    'relevance_gpt', 'accuracy_gpt', 'completeness_gpt', 'question_type']

if not all(col in df_human_avg.columns for col in required_columns):
    raise ValueError(f"The CSV file must contain the following columns: {required_columns}")

# Manually set the `question_type` for the first row if missing
# (the overall-averages row may be stored without a label).
# The emptiness guard avoids a KeyError on a file that has a header but no rows.
if not df_human_avg.empty and pd.isna(df_human_avg.loc[0, 'question_type']):
    df_human_avg.loc[0, 'question_type'] = 'Overall'

# Define metrics (column-name prefixes; '_coach' / '_gpt' are appended below)
metrics = ['relevance', 'accuracy', 'completeness']

# Define question types (compared case-insensitively against the file)
question_types = ['overall', 'information', 'decision-making', 'translation']

# Create a bar plot for each question type
for qtype in question_types:
    # Filter the data for the current question type
    row = df_human_avg[df_human_avg['question_type'].str.lower() == qtype]

    if not row.empty:
        coach_scores = row[[f"{metric}_coach" for metric in metrics]].values.flatten()
        gpt_scores = row[[f"{metric}_gpt" for metric in metrics]].values.flatten()

        # Create the bar plot
        fig, ax = plt.subplots(figsize=(10, 6))
        width = 0.35  # Bar width
        x = np.arange(len(metrics))  # Metric positions

        # Plot the bars for Essence Coach and GPT-4o side by side
        ax.bar(x - width/2, coach_scores, width, label='Essence Coach', alpha=0.6)
        ax.bar(x + width/2, gpt_scores, width, label='GPT-4o', alpha=0.6)

        # Set plot labels and title
        ax.set_xticks(x)
        ax.set_xticklabels(['Relevance', 'Accuracy', 'Completeness'], fontsize=12)
        ax.set_ylabel('Scores', fontsize=12)
        ax.set_title(f'{qtype.capitalize()} Scores Comparison', fontsize=14)
        ax.legend(fontsize=10)

        # Save and show the plot
        output_image = f'scores_comparison_{qtype}.png'
        plt.tight_layout()
        plt.savefig(output_image, dpi=300)
        plt.show()
        # Release the figure so figures do not accumulate across iterations
        plt.close(fig)

        print(f"Bar plot for {qtype.capitalize()} saved as an image: {output_image}")
    else:
        print(f"No data found for question type '{qtype}'. It will be skipped.")


# Print all contexts

In [None]:
import json

# File path to the data.jsonl file
file_path = "data.jsonl"

# Read and process the JSONL file (one JSON object per line).
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. an extra trailing newline); json.loads
        # would raise on them
        if not line.strip():
            continue

        # Parse the JSON object
        record = json.loads(line)

        # Extract the question and contexts, with fallbacks for missing keys
        question = record.get("user_question", "No question found")
        # NOTE(review): assumes "context" holds a list of context strings — confirm
        contexts = record.get("context", [])

        # Print the question
        print("User Question:")
        print(question)
        print("\nContexts:")

        # Print each context clearly separated
        for i, context in enumerate(contexts, start=1):
            print(f"Context {i}:")
            print(context)
            print("-" * 40)
        print("=" * 80)  # Separator for each record

# Evaluate Retrieved Contexts

In [None]:
import pandas as pd


def _reciprocal_rank(positions):
    """Return 1 / (best rank among the relevant documents), or 0 if there are none.

    The best rank is the smallest position, regardless of which
    `pos_rel_doc_*` column it was entered in.
    NOTE(review): positions are assumed to be 1-based ranks — confirm.
    """
    return 1 / min(positions) if positions else 0


def _average_precision(positions, k):
    """Return the average precision of one query, or 0 if `positions` is empty.

    Positions are sorted ascending; the i-th relevant document found at rank
    `pos` contributes precision `i / pos` when `pos <= k`. The sum is divided
    by the number of listed positions, so documents ranked beyond `k`
    contribute 0.
    """
    if not positions:
        return 0
    ranked = sorted(positions)
    precision_scores = [hit / pos for hit, pos in enumerate(ranked, start=1) if pos <= k]
    return sum(precision_scores) / len(ranked)


# Load the file
excel_file = 'scores_retrieval.xlsx'
df = pd.read_excel(excel_file)

# Columns holding the rank of each relevant document (empty when not applicable)
position_columns = ['pos_rel_doc_1', 'pos_rel_doc_2', 'pos_rel_doc_3', 'pos_rel_doc_4']

# Ensure required columns exist
required_columns = ['k', 'relevant_results', 'total_relevant_results'] + position_columns
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The Excel file must contain the following columns: {required_columns}")

# Initialize metrics (one entry per question / spreadsheet row)
precision_at_k = []
recall_at_k = []
reciprocal_ranks = []
average_precisions = []

# Process each question
for _, row in df.iterrows():
    k = row['k']
    relevant_results = row['relevant_results']
    total_relevant = row['total_relevant_results']
    # Keep only the position cells that are filled in
    positions = [row[col] for col in position_columns if not pd.isna(row[col])]

    # Precision@K: share of the k retrieved results that are relevant
    precision_at_k.append(relevant_results / k if k > 0 else 0)

    # Recall@K: share of all relevant results that were retrieved
    recall_at_k.append(relevant_results / total_relevant if total_relevant > 0 else 0)

    # Reciprocal rank of the best-ranked relevant document (averaged into MRR)
    reciprocal_ranks.append(_reciprocal_rank(positions))

    # Average precision of this question (averaged into MAP)
    average_precisions.append(_average_precision(positions, k))

# Compute averages for the metrics (0 when the spreadsheet has no rows)
mean_precision_at_k = sum(precision_at_k) / len(precision_at_k) if precision_at_k else 0
mean_recall_at_k = sum(recall_at_k) / len(recall_at_k) if recall_at_k else 0
mean_reciprocal_rank = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0
mean_average_precision = sum(average_precisions) / len(average_precisions) if average_precisions else 0

# Print the results
print(f"Precision@K: {mean_precision_at_k:.3f}")
print(f"Recall@K: {mean_recall_at_k:.3f}")
print(f"Mean Reciprocal Rank (MRR): {mean_reciprocal_rank:.3f}")
print(f"Mean Average Precision (MAP): {mean_average_precision:.3f}")
