In [211]:
import copy
import requests
from tqdm.notebook import tqdm




In [212]:
from helpers.parallel_evaluate import parallel_evaluate

# Settings forwarded to `parallel_evaluate` by the evaluation cells further down.
# NOTE(review): presumably `num_threads` is the number of worker threads and
# `batch_size` the number of submissions handed to each call of the evaluation
# function - confirm against helpers.parallel_evaluate.
num_threads = 4
batch_size = 250

In [213]:
import pandas as pd

def filter_grading_data_by_points(grading_data: pd.DataFrame, step: float = 0.5) -> pd.DataFrame:
    """
    Keeps only the grading rows whose assigned points are a whole multiple of ``step``.

    Args:
        grading_data (pd.DataFrame): The DataFrame containing grading data, including a
            numeric 'points_assigned' column.
        step (float): Point granularity a row has to conform to. Defaults to 0.5, which
            reproduces the original hard-coded behaviour.

    Returns:
        pd.DataFrame: An independent copy of the rows whose 'points_assigned' is a
            multiple of ``step`` (original index labels are kept). Rows with missing or
            infinite points are dropped because they are not a multiple of anything.

    Raises:
        ValueError: If the 'points_assigned' column is missing or ``step`` is not positive.
    """
    # Ensure the 'points_assigned' column exists
    if 'points_assigned' not in grading_data.columns:
        raise ValueError("The 'points_assigned' column is missing from the grading data.")

    # `not step > 0` also rejects NaN, which would otherwise silently drop every row.
    if not step > 0:
        raise ValueError("step must be a positive number.")

    # Compare the number of steps against its nearest integer with a small tolerance
    # instead of using `% step == 0`: a float modulo reports e.g. 0.3 % 0.1 as ~0.1,
    # so exact comparison only works for steps that are exactly representable in binary.
    steps_awarded = grading_data['points_assigned'] / step
    is_whole_multiple = (steps_awarded - steps_awarded.round()).abs() < 1e-9

    # Return an explicit copy so later in-place edits never touch the caller's frame.
    return grading_data.loc[is_whole_multiple].copy()


In [214]:
import pandas as pd

# Input CSV locations
exam_data_path = "data_banking/exam_data.csv"
grading_data_path = "data_banking/grading.csv"
adjustments_data_path = "data_banking/point_adjustments.csv"  # not read in this cell
responses_data_path = "data_banking/responses.csv"

# Load CSVs into DataFrames
exam_data = pd.read_csv(exam_data_path)
grading_data = pd.read_csv(grading_data_path)

# Remove inconsistent point allocations so accuracy computation is not impacted.
# NOTE(review): a rubric whose grading row is removed here (or never existed) falls
# back to 0 assigned points below, so it still feeds into the accuracy numbers -
# confirm that this is the intended treatment.
grading_data = filter_grading_data_by_points(grading_data)

responses_data = pd.read_csv(responses_data_path)

# Index the exam rows once by exercise so each submission is a dictionary lookup
# instead of a full scan over `exam_data` (the previous nested iterrows loop was
# O(len(responses_data) * len(exam_data))). Row order within an exercise is kept.
exam_rows_by_exercise = {}
for exam_row in exam_data.to_dict("records"):
    exercise_key = (exam_row["exam_id"], exam_row["exercise_id"], exam_row["exercise_index"])
    exam_rows_by_exercise.setdefault(exercise_key, []).append(exam_row)

# Human-assigned points per (answer, rubric). `setdefault` keeps the FIRST grading row
# for a pair, matching the previous `.values[0]` lookup.
points_by_answer_and_rubric = {}
for answer_id, rubric_id, points in zip(
    grading_data["answer_identifier"],
    grading_data["rubric_id"],
    grading_data["points_assigned"],
):
    points_by_answer_and_rubric.setdefault((answer_id, rubric_id), points)

data = []

# Build one evaluation payload per row in `responses_data`
for submission in responses_data.to_dict("records"):
    exercise_rows = exam_rows_by_exercise.get(
        (submission["exam_id"], submission["exercise_id"], submission["exercise_index"]),
        [],
    )

    rubrics = [
        {
            "id": str(exam_row["rubric_id"]),
            "title": exam_row["rubric_title"],
            "points": exam_row["rubric_points"],
            # Ungraded rubrics count as 0 points
            "points_assigned": points_by_answer_and_rubric.get(
                (submission["answer_identifier"], exam_row["rubric_id"]), 0
            ),
        }
        for exam_row in exercise_rows
    ]

    # Blank CSV cells are loaded as NaN, which bypasses the "" default of `.get`
    # and is not a valid JSON string; normalise them to an empty answer.
    answer = submission.get("student_answer", "")
    if pd.isna(answer):
        answer = ""

    # The fallbacks below apply when the submission's exercise has no rows in
    # `exam_data`. The previous guard tested `exam_data.empty`, so it never fired
    # and an unmatched submission raised IndexError on `.values[0]`.
    first_exam_row = exercise_rows[0] if exercise_rows else None

    data.append({
        "question": first_exam_row["exercise_questions"] if first_exam_row else "",  # Question text from exam_data
        "answer": answer,  # Student's answer
        "maxPoints": first_exam_row["exercise_points"] if first_exam_row else 0,  # Max points for the exercise
        "minPoints": 0,  # Assuming minPoints is not provided explicitly in the data
        "rubrics": rubrics,  # Constructed rubrics list
        "pointStep": 0.5,  # Default point step
        "llmType": "gpt"  # Static field
    })


print(len(data))
print(data[:5])

3819
[{'question': 'Bank A vergibt auf der Aktivseite nur Kredite mit einer Laufzeit von mindestens 10 Jahren, während Bank B nur Kredite mit einer Laufzeit bis maximal 7 Jahre vergibt. Auf der Passiv-seite nehmen beide Banken gleichermassen Depositen an und haben nach allem, was man weiss, vergleichbare Depositoren. Keine der Banken nutzt andere Instrumente, die das Zins-risiko beeinflussen. Welchen Teilen des Zinsrisikos sind die beiden Banken ausgesetzt? Begründe deine Antworten.', 'answer': 'Markstrukturrisiko: Das aufgrund der Makrtsturkturänderung von seiten des Kunden die Zinskosten nicht mehr bezahlt werden können und so ein verlust für die bank resultiert. \\n Zinsbindungsdauerrisiko: Aufgrund der Unterschiedlichen Zinsbindungsdauer, kann es sein dass der passivzins angepasst werden muss und aber der Aktivzins für die Hypothek fest bleibt und sich dadurch die Marge verkleiner rsp. negativ werden könnte', 'maxPoints': np.float64(2.0), 'minPoints': 0, 'rubrics': [{'id': '1', 'ti

In [215]:
from helpers.stratified_sample import stratified_random_sample
import random

# Fix the RNG seed so that "Restart Kernel -> Run All" draws the same sample and the
# accuracy figures reported below can be reproduced.
# NOTE(review): this assumes stratified_random_sample draws from Python's global
# `random` module - confirm in helpers.stratified_sample; if it uses another RNG,
# the seed has to be set there instead.
random.seed(42)

# Get well distributed sample (assuming maxPoints = rough difficulty)
sample = stratified_random_sample(data, sample_size=1000, num_groups=10)


In [216]:

# Evaluate the sample in parallel; presumably this is the with-rubrics run, since its
# tracker is later saved as 'results/banking_with_rubrics_result.csv'.
# NOTE(review): `evaluate` is not defined in any cell visible in this notebook export.
# Confirm it is defined or imported before this cell, otherwise a fresh-kernel
# "Run All" fails here with NameError.
tracker, average_accuracy, logs = parallel_evaluate(sample, num_threads=num_threads, batch_size=batch_size, func=evaluate)

print(f"Average Accuracy Statistics: {average_accuracy:.2%}")


Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Average Accuracy Statistics: 73.07%


In [217]:
import copy
import requests
from tqdm.notebook import tqdm

def evaluate_without_rubrics(to_evaluate, evaluation_url="http://localhost:4000/evaluate", timeout=None):
    """
    Grade each submission through the evaluation service with its rubrics removed and
    compare the returned points against the human-assigned points.

    Args:
        to_evaluate (iterable): Submission dictionaries. Each needs a "rubrics" list whose
            items carry "points_assigned", and a "maxPoints" value.
        evaluation_url (str): Endpoint the submissions are POSTed to. Defaults to the
            previously hard-coded local service URL.
        timeout (float | None): Timeout in seconds passed to `requests.post`. The default
            None waits indefinitely, which is the previous behaviour.

    Returns:
        tuple: (tracker, average_accuracy, logs)
            - tracker: one dict per successfully evaluated submission with the keys
              "human_points", "gpt_points", "accuracy", "evaluation_result", "submission".
            - average_accuracy: mean of the tracked accuracies, 0 if nothing was evaluated.
            - logs: human-readable result and error lines.

    Submissions whose request fails or whose response cannot be scored are logged and
    left out of the tracker and of the average.
    """
    tracker = []
    accuracy_tracker = []
    logs = []

    # Calculates accuracy by comparing human-assigned points with LLM-assigned points.
    # The value is not clamped: it is negative when the difference exceeds max_points.
    def calculate_accuracy(human_points, gpt_points, max_points):
        if max_points == 0:
            return 1  # Avoid division by zero
        diff = abs(human_points - gpt_points)
        return 1 - (diff / max_points)

    for idx, submission in enumerate(tqdm(to_evaluate, desc="Evaluating submissions"), start=1):
        # Human-assigned points are the sum over the full rubric list
        human_points = sum(rubric["points_assigned"] for rubric in submission["rubrics"])
        max_points = submission["maxPoints"]

        # Same submission but without rubrics. A shallow copy is enough because only
        # the top-level "rubrics" entry is replaced; the original object is not mutated.
        payload = {**submission, "rubrics": []}

        try:
            # Make API call for evaluation
            response = requests.post(
                evaluation_url,
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=timeout,
            )
            response.raise_for_status()
            evaluation_result = response.json()

            # Extract LLM-assigned points; a response without "points" counts as 0
            llm_points = evaluation_result.get("points", 0)

            # Calculate accuracy
            accuracy = calculate_accuracy(human_points, llm_points, max_points)
        except requests.exceptions.RequestException as e:
            logs.append(f"Error evaluating submission {idx}: {e}")
            continue
        except (AttributeError, TypeError, ValueError) as e:
            # The service answered, but not with a scorable JSON object (e.g. a non-dict
            # body or "points": null). Previously this escaped the handler and aborted
            # the whole batch; now only this submission is skipped.
            logs.append(f"Error evaluating submission {idx}: malformed evaluation result ({e})")
            continue

        tracker.append({
            "human_points": human_points,
            "gpt_points": llm_points,
            "accuracy": accuracy,
            "evaluation_result": evaluation_result,
            "submission": submission,  # Store the original submission (rubrics intact)
        })
        accuracy_tracker.append(accuracy)

        # Add logs
        logs.append(f"Submission {idx} Result: {evaluation_result}")
        logs.append(f"Human Points: {human_points}, GPT Points: {llm_points}")
        logs.append(f"Accuracy: {accuracy:.2%}")

    average_accuracy = sum(accuracy_tracker) / len(accuracy_tracker) if accuracy_tracker else 0

    return tracker, average_accuracy, logs


In [218]:
# Reuse the same `sample` as the run above so both runs are scored on identical submissions
tracker_no_rubrics, average_accuracy_no_rubrics, logs_no_rubrics = parallel_evaluate(sample, num_threads=num_threads, batch_size=batch_size, func=evaluate_without_rubrics)
print(f"Average Accuracy Statistics: {average_accuracy_no_rubrics:.2%}")



Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Average Accuracy Statistics: 69.70%


In [219]:
# Spot-check the first tracked result of each run.
# NOTE(review): the two entries are not guaranteed to be the same submission -
# evaluate_without_rubrics leaves failed requests out of its tracker, and whether
# parallel_evaluate preserves input order is not visible here. Confirm before comparing.
print(tracker[0])
print(tracker_no_rubrics[0])

{'human_points': np.float64(2.0), 'gpt_points': 1, 'accuracy': np.float64(0.5), 'evaluation_result': {'status': 'incomplete', 'feedback': "Your answer identifies 'Character' and 'Conditions' as relevant factors for evaluating creditworthiness, which is correct. However, you did not describe 'Character' in sufficient detail, specifically regarding how the bank would assess the reputation, credit history, and previous business relationships of the potential borrower. Additionally, while you mentioned 'Conditions', the description lacks clarity on how they relate to the borrower's ability to service the debt considering various factors. To improve, ensure you provide detailed explanations for each concept, referencing how they specifically apply to the bank's evaluation process.", 'hint': "Focus on elaborating your descriptions to cover the specific assessment methods the bank might use for 'Character' and 'Conditions'. Understanding the implications of these factors in credit evaluations

In [226]:
def save_evaluation_result(t, filename):
    """
    Save the evaluation results from a list of dictionaries to a CSV file.

    Each tracker entry is flattened into one row: its own scalar fields, followed by the
    keys of its 'evaluation_result' dict, followed by the keys of its 'submission' dict
    (without 'human_points', 'question' and 'answer').

    Args:
        t (list): A list of dictionaries containing evaluation data. Every dictionary
            needs an 'evaluation_result' and a 'submission' entry; items that are not
            dictionaries are ignored.
        filename (str): The name of the file to save the results to. Missing parent
            directories are created.

    Returns:
        pd.DataFrame: The flattened table that was written to ``filename``.

    Raises:
        ValueError: If ``t`` contains no dictionary items.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Filter out non-dictionary items
    valid_data = [item for item in t if isinstance(item, dict)]

    if not valid_data:
        raise ValueError("No valid dictionary items in tracker.")

    # Convert the tracker list to a DataFrame
    df = pd.DataFrame(valid_data)

    # Flatten 'evaluation_result'. Building the frame from the list of dicts yields the
    # same columns as `.apply(pd.Series)` without constructing one Series per row.
    eval_df = pd.DataFrame(df['evaluation_result'].tolist(), index=df.index)

    # Flatten 'submission' and drop the columns that are either duplicated by the
    # tracker ('human_points') or too bulky for the result file ('question', 'answer')
    submission_df = pd.DataFrame(df['submission'].tolist(), index=df.index)
    submission_df = submission_df.drop(columns=['human_points', 'question', 'answer'], errors='ignore')

    # Concatenate the flattened DataFrames with the original, after dropping the nested columns
    df = pd.concat([df.drop(columns=['evaluation_result', 'submission']), eval_df, submission_df], axis=1)

    # `to_csv` does not create directories; make sure the target folder exists so the
    # save does not fail after a long evaluation run. Skipped for non-path targets.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save the resulting DataFrame to a CSV file
    df.to_csv(filename, index=False)

    # Return the DataFrame for verification / display
    return df


In [227]:
# Persist both runs as CSV. Only the DataFrame returned by the second call is rendered
# as this cell's output, because it is the last expression in the cell.
save_evaluation_result(tracker, 'results/banking_with_rubrics_result.csv')
save_evaluation_result(tracker_no_rubrics, 'results/banking_no_rubrics_result.csv')


Unnamed: 0,human_points,gpt_points,accuracy,status,feedback,hint,points,passedRubricsIds,failedRubricsIds,maxPoints,minPoints,rubrics,pointStep,llmType
0,1.5,2.0,0.900000,incomplete,Your answer touches on the Zinskanal (interest...,Try to think of another transmission channel s...,2.0,[1],"[2, 3, 4]",5.0,0,"[{'id': '48', 'title': 'Die Antwort nennt den ...",0.5,gpt
1,0.0,1.0,0.333333,incomplete,Die Antwort enthält eine grundlegende Erklärun...,"Versuche, weitere Informationen über die Auswi...",1.0,[],"[1, 2, 3]",1.5,0,"[{'id': '46', 'title': 'Die Antwort erklärt, d...",0.5,gpt
2,1.0,1.0,1.000000,incomplete,Your answer correctly identifies the situation...,"To improve, consider also discussing how 'Adve...",1.0,[1],"[2, 3]",1.5,0,"[{'id': '20', 'title': 'In der Antwort wird di...",0.5,gpt
3,1.0,0.0,0.333333,incorrect,"Die Antwort erwähnt wichtige Punkte, wie den S...","Überlege, welche spezifischen Ziele für die Ba...",0.0,[],"[1, 2, 3]",1.5,0,"[{'id': '33', 'title': 'Die Antwort nennt den ...",0.5,gpt
4,1.0,1.0,1.000000,incomplete,Your answer identified a relevant qualitative ...,Consider looking into other aspects of the 5C ...,1.0,[1],"[2, 3]",2.0,0,"[{'id': '59', 'title': 'Die Antwort nennt Char...",0.5,gpt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,0.0,1.0,0.500000,incomplete,Ihre Antwort enthält Beispiele für makro- und ...,"Versuchen Sie, die Zielsetzungen und Wirkungen...",1.0,"[1, 2]",[3],2.0,0,"[{'id': '38', 'title': 'Die Antwort gibt die E...",0.5,gpt
843,0.5,0.5,1.000000,incomplete,Your answer identifies that the statement made...,Consider providing more context about the bank...,0.5,[],"[R1, R2]",1.0,0,"[{'id': '6', 'title': 'Die Antwort stimmt der ...",0.5,gpt
844,1.0,1.0,1.000000,incomplete,Your answer touches on the concept of 'Last Le...,Consider looking into how implicit state guara...,1.0,[],"[1, 2, 3]",1.5,0,"[{'id': '46', 'title': 'Die Antwort erklärt, d...",0.5,gpt
845,0.0,1.0,0.333333,incomplete,Your answer touches on the concept of state bo...,Consider discussing how an implicit state guar...,1.0,[],"[rubric1, rubric2, rubric3]",1.5,0,"[{'id': '46', 'title': 'Die Antwort erklärt, d...",0.5,gpt


In [None]:
# Sanity check: print every tracked result whose accuracy is negative.
# Only `tracker` is inspected here; `tracker_no_rubrics` is not.
# NOTE(review): presumably a negative value means the human/LLM point difference
# exceeded maxPoints, as in the accuracy formula of evaluate_without_rubrics -
# confirm that `evaluate` computes accuracy the same way.
for x in tracker:
    if x["accuracy"] < 0:
        print(x)