# Evaluate the data with the backend
We evaluate the GPT integration by simulating student responses from previously collected real data and comparing the generated feedback and points with the real human grading.

## Load Data

In [200]:
import pandas as pd

In [201]:
# Read the three CSV exports in order: tasks, submissions, users.
tasks_df, submissions_df, users_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/tasks_df.csv',
        'data/submissions_df.csv',
        'data/users_df.csv',
    )
)


## Join Data

In [202]:
# Show the first rows of every loaded table, each under its own banner line.
for banner, table in (
    ("---Tasks---", tasks_df),
    ("---Submissions---", submissions_df),
    ("---Users---", users_df),
):
    print(banner)
    print(table.head())

---Tasks---
    id            slug  assignment_id  max_points  solution_file_id  \
0  402  1_week1_sta120            302         1.0               606   
1  403  2_week1_sta120            302         1.0               611   
2  753  2_week8_sta120            652         1.0              1261   
3  709  3_week6_sta120            603         1.0              1146   
4  404  3_week1_sta120            302         1.0               617   

  solution_file_name solution_file_mime_type  \
0           script.R              text/plain   
1           script.R              text/plain   
2           script.R              text/plain   
3           script.R              text/plain   
4           script.R              text/plain   

                                            solution  instruction_file_id  \
0  png(file = "solution.png")\nrequire(fields) # ...                  602   
1                                 demo(persp) \n\n\n                  607   
2   png(file="solution.png")\nmu <- c(1, 

In [203]:
# Prepare the evaluation data
# Resolve every task once into a plain dict keyed by task id instead of
# re-scanning tasks_df with a boolean mask for each field of each submission.
# drop_duplicates keeps the first row per id, which is the row the former
# `.values[0]` lookup selected.
tasks_by_id = (
    tasks_df.drop_duplicates(subset='id')
    .set_index('id')[['course_slug', 'instruction', 'solution', 'max_points']]
    .to_dict('index')
)

evaluation_data_stat = []
evaluation_data_mat = []
# Route each submission to the list of the course its task belongs to.
evaluation_data_by_course = {
    "fs24-sta120": evaluation_data_stat,
    "fs24-mat183": evaluation_data_mat,
}

for _, submission in submissions_df.iterrows():
    task = tasks_by_id.get(submission['task_id'])
    if task is None:
        # A submission whose task is missing from tasks_df is reported and
        # skipped rather than aborting the whole preparation loop.
        print(f"Unknown task_id: {submission['task_id']}")
        continue

    course = task['course_slug']
    course_bucket = evaluation_data_by_course.get(course)
    if course_bucket is None:
        print(f"Unknown course_slug: {course}")
        continue

    course_bucket.append({
        "question": task['instruction'],  # Task instruction looked up via the submission's task_id
        "answer": submission['content'],
        "modelSolution": task['solution'],
        "maxPoints": task['max_points'],
        "minPoints": 0,  # Default minPoints
        "pointStep": 0.5,  # Default pointStep
        "human_points": submission['points']
    })
    

In [204]:
# Peek at the first prepared records of each course, then report how many
# submissions each course contributes. The counts are labelled per course so
# the two lines can be told apart.
print(evaluation_data_stat[:5])
print(evaluation_data_mat[:5])
print(f"Total sta120 submissions to evaluate: {len(evaluation_data_stat)}")
print(f"Total mat183 submissions to evaluate: {len(evaluation_data_mat)}")

[{'question': "The dataset Oral is available in the R package spam and contains oral cavity cancer counts for $544$ districts in Germany.\nSimulate a 95\\% confidence interval for the sample mean of the variable SRM based on the following bootstrap scheme (sampling with replacement):\nRepeat $10'000$ times\n    - Sample $544$ observations $Z_{i}$ with replacement\\\n    - Calculate and store the mean of these sampled observations\\\nConstruct the confidence interval by taking the 2.5\\% and the 97.5\\% quantiles of the stored means.\n", 'answer': 'require(spam)\ndata(Oral)\n\nn <- 544\nset.seed(3)\n\n# Room for creativity!\nboot_means <- replicate(10000, {\n  sampled_indices <- sample(1:n, replace = TRUE)\n  mean(Oral$q[sampled_indices])\n})\n\nsol <- quantile(boot_means, c(0.025, 0.975), na.rm = TRUE)\n', 'modelSolution': 'require(spam) \ndata(Oral) \nn <- 544\nmybootstrap <- function(n.replications){ \n temporary <- array(0, dim = n.replications) # Preallocation is always advisable! 

## Send Request to Service

In [205]:
import requests
from tqdm.notebook import tqdm  # Use notebook-friendly tqdm
from IPython.display import display, Markdown

def evaluate(to_evaluate, evaluation_url="http://localhost:4000/evaluate", timeout=120):
    """
    Send each submission to the evaluation service and compare its grade with the human grade.

    Args:
        to_evaluate (list): Submission dicts with the keys "question", "answer",
            "modelSolution", "maxPoints", "minPoints", "pointStep" and "human_points".
        evaluation_url (str): Endpoint that receives the JSON payload via POST.
        timeout (float): Seconds to wait for the service before a request is
            abandoned and logged as an error.

    Returns:
        tuple: (tracker, average_accuracy, logs). tracker holds one dict per
        successfully evaluated submission, average_accuracy is the mean of their
        accuracies (NaN when no submission could be evaluated) and logs is a
        list of human-readable log lines.
    """
    tracker = []
    accuracy_tracker = []
    logs = []  # Collect logs for later display

    def calculate_accuracy(human_points, llm_points, max_points):
        """Return 1 minus the absolute point difference relative to max_points."""
        if max_points == 0:
            return 1  # This should never happen, but just in case, so we don't divide by zero
        diff = abs(human_points - llm_points)
        return 1 - diff / max_points

    # Use tqdm for progress bar
    for idx, submission in enumerate(tqdm(to_evaluate, desc="Evaluating submissions"), start=1):
        data = {
            "question": submission.get("question", ""),
            "answer": submission.get("answer", ""),
            "modelSolution": submission.get("modelSolution", ""),
            "maxPoints": float(submission.get("maxPoints")),
            "minPoints": float(submission.get("minPoints")),
            "pointStep": float(submission.get("pointStep")),
            "llmType": "gpt"
        }

        try:
            # Without a timeout a stalled service would block this worker forever.
            response = requests.post(
                evaluation_url,
                json=data,
                headers={"Content-Type": "application/json"},
                timeout=timeout,
            )
            response.raise_for_status()
            evaluation_result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError also covers a response body that is not valid JSON.
            logs.append(f"Error evaluating submission {idx}: {e}")
            continue

        if not isinstance(evaluation_result, dict):
            # The fields below are read with .get(), so anything but a JSON object is unusable.
            logs.append(f"Error evaluating submission {idx}: unexpected response {evaluation_result!r}")
            continue

        llm_points = evaluation_result.get("points", 0)
        human_points = submission["human_points"]
        max_points = submission["maxPoints"]

        # Calculate accuracy
        try:
            accuracy = calculate_accuracy(human_points, llm_points, max_points)
        except TypeError as e:
            # Non-numeric points (e.g. null in the response) must not abort the whole batch.
            logs.append(f"Error evaluating submission {idx}: {e}")
            continue

        tracker.append({
            "human_points": human_points,
            "gpt_points": llm_points,
            "accuracy": accuracy,
            "evaluation_result": evaluation_result,
            "submission": submission,
        })
        accuracy_tracker.append(accuracy)

        logs.append(f"Submission {idx} Result: {evaluation_result}")
        logs.append(f"Human Points: {human_points}, GPT Points: {llm_points}")
        logs.append(f"Accuracy: {accuracy:.2%}")

    # Calculate average accuracy; it is undefined (NaN) when nothing was evaluated
    if accuracy_tracker:
        average_accuracy = sum(accuracy_tracker) / len(accuracy_tracker)
    else:
        average_accuracy = float("nan")

    return tracker, average_accuracy, logs


In [206]:
# Parallel evaluation setup
from helpers.parallel_evaluate import parallel_evaluate
#TODO Try Voting Mechanism to see if accuracy improves!
# The values below are shared by every evaluation run in this notebook.
num_threads = 4  # Adjust based on your CPU cores; passed to parallel_evaluate as num_threads
batch_size = 250  # Passed to parallel_evaluate as batch_size
sample_size = 1000  # Passed to stratified_random_sample as sample_size
num_groups = 10  # Passed to stratified_random_sample as num_groups

In [207]:
from helpers.stratified_sample import stratified_random_sample

# Draw a stratified sample of the sta120 submissions
# (maxPoints is taken as a rough proxy for difficulty).
evaluation_data_stat_rand = stratified_random_sample(
    evaluation_data_stat,
    sample_size=sample_size,
    num_groups=num_groups,
)

# Grade the sample through the service in parallel batches.
stat_tracker, stat_accuracy, stat_logs = parallel_evaluate(
    evaluation_data_stat_rand,
    func=evaluate,
    num_threads=num_threads,
    batch_size=batch_size,
)
print(f"Average Accuracy: {stat_accuracy:.2%}")

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Average Accuracy: nan%


In [208]:

# Draw a stratified sample of the mat183 submissions.
evaluation_data_mat_rand = stratified_random_sample(
    evaluation_data_mat,
    sample_size=sample_size,
    num_groups=num_groups,
)

# Grade the sample through the service in parallel batches.
mat_tracker, mat_accuracy, mat_logs = parallel_evaluate(
    evaluation_data_mat_rand,
    func=evaluate,
    num_threads=num_threads,
    batch_size=batch_size,
)
print(f"Average Accuracy: {mat_accuracy:.2%}")

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Average Accuracy: 198.30%


In [214]:
# Rebuild the same sta120 sample with the model solution withheld: every record
# carries the placeholder "undefined" in place of the real solution.
evaluation_data_stat_no_solutions_rand = [
    {
        "question": sampled.get("question", ""),
        "answer": sampled.get("answer", ""),
        "modelSolution": "undefined",
        "maxPoints": float(sampled.get("maxPoints")),
        "minPoints": float(sampled.get("minPoints")),
        "pointStep": float(sampled.get("pointStep")),
        "human_points": sampled["human_points"],
    }
    for sampled in evaluation_data_stat_rand
]

# Grade the solution-free sta120 sample in parallel batches.
stat_tracker_no_solutions, stat_accuracy_no_solutions, stat_logs_no_solutions = parallel_evaluate(
    evaluation_data_stat_no_solutions_rand,
    func=evaluate,
    num_threads=num_threads,
    batch_size=batch_size,
)

print(f"Average Accuracy: {stat_accuracy_no_solutions:.2%}")


Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Average Accuracy: nan%


In [215]:
# Rebuild the same mat183 sample with the model solution withheld: every record
# carries the placeholder "undefined" in place of the real solution.
evaluation_data_mat_no_solutions_rand = [
    {
        "question": sampled.get("question", ""),
        "answer": sampled.get("answer", ""),
        "modelSolution": "undefined",
        "maxPoints": float(sampled.get("maxPoints")),
        "minPoints": float(sampled.get("minPoints")),
        "pointStep": float(sampled.get("pointStep")),
        "human_points": sampled["human_points"],
    }
    for sampled in evaluation_data_mat_rand
]

# Grade the solution-free mat183 sample in parallel batches.
mat_tracker_no_solutions, mat_accuracy_no_solutions, mat_logs_no_solutions = parallel_evaluate(
    evaluation_data_mat_no_solutions_rand,
    func=evaluate,
    num_threads=num_threads,
    batch_size=batch_size,
)
print(f"Average Accuracy: {mat_accuracy_no_solutions:.2%}")

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating submissions:   0%|          | 0/250 [00:00<?, ?it/s]

Average Accuracy: 189.85%


In [216]:
def get_average_accuracy(tracker):
    """
    Average the accuracies in a tracker, counting invalid entries as 0.

    An entry is invalid when its accuracy is NaN or negative. Invalid entries
    add nothing to the sum but still count in the denominator.

    Args:
        tracker (list): Dicts that each carry an 'accuracy' value.

    Returns:
        tuple: (average, cleaned_tracker). cleaned_tracker has one item per
        input entry: the entry itself when valid, otherwise the placeholder 0.
        An empty tracker yields (0.0, []).
    """
    if not tracker:
        # Nothing to average; avoid dividing by zero.
        return 0.0, []

    cleaned_tracker = []
    accuracy_sum = 0
    for entry in tracker:
        accuracy = entry['accuracy']
        # NaN is the only value that is not equal to itself.
        if accuracy == accuracy and accuracy >= 0:
            cleaned_tracker.append(entry)
            accuracy_sum += accuracy
        else:
            # Add 0 for wrong data to still count it in the average
            cleaned_tracker.append(0)
    average = accuracy_sum / len(cleaned_tracker)
    return average, cleaned_tracker

In [217]:
# One request went wrong during the evaluation, so the average is recomputed
# from the tracker with invalid entries counted as 0.
average_stat_accuracy = get_average_accuracy(stat_tracker)[0]
print(f"Average Accuracy Statistics cleaned: {average_stat_accuracy:.2%}")

Average Accuracy Statistics cleaned: 66.12%


In [218]:
# Since something went wrong with one request during the evaluation, we need to clean the data
average_mat_accuracy, _ = get_average_accuracy(mat_tracker)
# Name the course in the label so this result is not mistaken for the statistics run.
print(f"Average Accuracy MAT183 cleaned: {average_mat_accuracy:.2%}")

Average Accuracy Statistics cleaned: 66.90%


In [219]:
# Since something went wrong with one request during the evaluation, we need to clean the data
average_stat_no_solution_accuracy, _ = get_average_accuracy(stat_tracker_no_solutions)
# Mark the run variant in the label so it is not mistaken for the with-solution result.
print(f"Average Accuracy Statistics (no solutions) cleaned: {average_stat_no_solution_accuracy:.2%}")

Average Accuracy Statistics cleaned: 61.60%


In [220]:
# Since something went wrong with one request during the evaluation, we need to clean the data
average_mat_no_solution_accuracy, _ = get_average_accuracy(mat_tracker_no_solutions)
# Name the course and the run variant in the label so the four cleaned averages stay distinguishable.
print(f"Average Accuracy MAT183 (no solutions) cleaned: {average_mat_no_solution_accuracy:.2%}")

Average Accuracy Statistics cleaned: 64.48%


In [221]:
import pandas as pd

def save_evaluation_result(tracker, filename):
    """
    Save the evaluation results from a list of dictionaries to a CSV file.

    The nested 'evaluation_result' and 'submission' dicts of each entry are
    expanded into top-level columns. The bulky or redundant submission fields
    ('human_points', 'modelSolution', 'question', 'answer') are left out.
    Missing parent directories of the target file are created.

    Args:
        tracker (list): A list of dictionaries containing evaluation data.
        filename (str): The name of the file to save the results to.

    Returns:
        pandas.DataFrame: The flattened frame that was written (empty when
        tracker is empty).
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Convert the tracker list to a DataFrame
    df = pd.DataFrame(tracker)

    # An empty tracker produces a frame without the nested columns, so only
    # touch the ones that are actually present.
    nested_columns = [col for col in ('evaluation_result', 'submission') if col in df.columns]
    parts = [df.drop(columns=nested_columns)]

    # Flatten 'evaluation_result' in one pass instead of building a Series per row
    if 'evaluation_result' in df.columns:
        parts.append(pd.DataFrame(df['evaluation_result'].tolist(), index=df.index))

    # Flatten 'submission' and drop the columns that are redundant or too large for the CSV
    if 'submission' in df.columns:
        submission_df = pd.DataFrame(df['submission'].tolist(), index=df.index)
        submission_df = submission_df.drop(
            columns=['human_points', 'modelSolution', 'question', 'answer'],
            errors='ignore',
        )
        parts.append(submission_df)

    # Concatenate the flattened DataFrames with the original, after dropping the nested columns
    if len(parts) > 1:
        df = pd.concat(parts, axis=1)
    else:
        df = parts[0]

    # Make sure the target directory exists, then save the resulting DataFrame to a CSV file
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(filename, index=False)

    # Return the DataFrame for verification if needed
    return df


In [224]:
# Write the with-solution results of both courses to CSV files under results/.
save_evaluation_result(stat_tracker, 'results/stat_evaluation_result.csv')
# This call is the cell's last expression, so its returned DataFrame (the mat results) is what the cell displays.
save_evaluation_result(mat_tracker, 'results/mat_evaluation_result.csv')

Unnamed: 0,human_points,gpt_points,accuracy,status,feedback,hint,points,passedRubricsIds,failedRubricsIds,maxPoints,minPoints,pointStep
0,2.0,1.0,0.5,incomplete,Your answer includes the correct overall struc...,Review how to assign colors based on factor le...,1.0,[],"[color_assignment, commentary]",2.0,0,0.5
1,0.0,0.0,1.0,incorrect,Your answer contains a few critical errors tha...,Focus on how to correctly compute the probabil...,0.0,[],"[1, 2, 3]",1.0,0,0.5
2,0.0,1.0,0.0,correct,Your answer meets all the required criteria. Y...,,1.0,[],[],1.0,0,0.5
3,0.0,0.5,0.5,incomplete,Your answer covers the basic structure require...,Review how the geometric distribution is imple...,0.5,[],"[parameters, expected output]",1.0,0,0.5
4,2.0,1.0,0.5,incomplete,Your answer correctly implements the generatio...,Consider elaborating on the relationship betwe...,1.0,[],[N/A],2.0,0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.0,1.0,1.0,correct,Your answer successfully implements the steps ...,Continue practicing with similar plotting task...,1.0,[],[],1.0,0,0.5
996,1.0,1.0,1.0,correct,Your answer aligns well with the requirements ...,Continue practicing plotting and customizing g...,1.0,[],[],1.0,0,0.5
997,0.0,0.0,1.0,incorrect,The answer contains inaccuracies in the standa...,Review the concept of standard deviation and e...,0.0,[],"[1, 2]",1.0,0,0.5
998,1.0,1.0,1.0,correct,Your answer correctly implemented the requirem...,Continue practicing how to manipulate vectors ...,1.0,[],[],1.0,0,0.5


In [225]:
# Write the no-solution results of both courses to CSV files under results/.
save_evaluation_result(stat_tracker_no_solutions, 'results/stat_evaluation_no_solutions_result.csv')
# This call is the cell's last expression, so its returned DataFrame (the mat no-solution results) is what the cell displays.
save_evaluation_result(mat_tracker_no_solutions, 'results/mat_evaluation_no_solutions_result.csv')

Unnamed: 0,human_points,gpt_points,accuracy,status,feedback,hint,points,passedRubricsIds,failedRubricsIds,maxPoints,minPoints,pointStep
0,1.0,1.0,1.0,correct,Your answer correctly provides the name of the...,,1.0,[],[],1.0,0.0,0.5
1,0.5,0.0,0.5,incorrect,Your answer contains several logical issues th...,Review the concept of conditional probability ...,0.0,[],"[rubric1, rubric2, rubric3]",1.0,0.0,0.5
2,1.0,1.0,1.0,correct,Your answer successfully meets the requirement...,,1.0,[],[],1.0,0.0,0.5
3,1.0,0.5,0.5,incomplete,Your answer shows some understanding of the pr...,Ensure that you elaborate on how your calculat...,0.5,[],"[rubric1, rubric2]",1.0,0.0,0.5
4,0.5,0.0,0.5,incorrect,Your approach to calculating the confidence le...,"Review the concepts of confidence intervals, s...",0.0,[],"[conf_interval, calc_conf_level]",1.0,0.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2.0,1.0,0.5,incomplete,The answer correctly generates the three trans...,Consider providing a more detailed analysis of...,1.0,[],[N/A],2.0,0.0,0.5
996,0.5,0.0,0.5,incorrect,The calculation for P[A] and P[B] is performed...,Review how to calculate the joint probability ...,0.0,[],"[probA, probB, probAandB]",1.0,0.0,0.5
997,2.0,1.0,0.5,incomplete,Your answer correctly calculates the trimmed m...,Focus on the implications of the difference be...,1.0,[],"[rubric_id_1, rubric_id_2]",2.0,0.0,0.5
998,0.5,0.0,0.5,incorrect,Your answer contains a critical typo in the fu...,Carefully check your variable names in the `ir...,0.0,[],"[1, 2, 3]",1.0,0.0,0.5
