# Baselines

This file calculates the baselines used in the paper.

In [None]:
# Imports

import os
import math
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, mean_squared_error

# NOTE(review): presumably switches off Weights & Biases logging for any
# library that honours this variable; nothing in the visible cells uses
# wandb directly -- confirm it is still needed.
os.environ["WANDB_DISABLED"] = "true"

# From https://galea.medium.com/how-to-love-jsonl-using-json-line-format-in-your-workflow-b6884f65175b

from json import JSONEncoder

class MyEncoder(JSONEncoder):
    """JSON encoder that serializes otherwise-unsupported objects via ``__dict__``.

    ``json`` only calls :meth:`default` for values it cannot encode natively,
    so built-in types (dict, list, str, numbers, ...) are unaffected.
    """

    def default(self, o):
        """Return the attribute dictionary of ``o`` as its JSON representation.

        Objects that have no ``__dict__`` (e.g. sets or slotted instances) are
        handed to the base implementation, which raises the standard
        ``TypeError`` for non-serializable values.
        """
        attributes = getattr(o, "__dict__", None)
        if attributes is None:
            return super().default(o)
        return attributes
        
import json

def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON lines file, one record per line.

    Args:
        data: Iterable of records. Any iterable works (lists, generators, ...);
            each record is encoded with ``MyEncoder``.
        output_path: Path of the file to write.
        append: If true, add the records to the end of an existing file
            instead of overwriting it.
    """
    mode = 'a' if append else 'w'
    # Count while writing instead of calling len(data) afterwards, so that
    # generators and other unsized iterables are supported as well.
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            json_record = json.dumps(record, ensure_ascii=False, cls=MyEncoder)
            f.write(json_record + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of being passed to the JSON parser.

    Args:
        input_path: Path of the UTF-8 encoded JSON lines file.

    Returns:
        The decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip only whitespace: the previous rstrip('\n|\r') also removed
            # literal '|' characters, which was never intended.
            stripped = line.strip()
            if not stripped:
                continue
            data.append(json.loads(stripped))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

class JSONLDataObject:
    """Simple container pairing a prompt with its completion."""

    # Class-level defaults; every instance overrides both in __init__.
    prompt = ""
    completion = ""

    def __init__(self, prompt, completion):
        """Store the given prompt and completion on the instance."""
        self.prompt, self.completion = prompt, completion

    def __repr__(self):
        """Return the repr of the ``(prompt, completion)`` tuple."""
        pair = (self.prompt, self.completion)
        return repr(pair)


In [None]:
# Mean baseline
#
# For every dataset/approach pair, predict the majority label of the training
# split for each test example and use the training-set fraction of "CORRECT"
# completions as the predicted probability. Results go to mean_baseline.csv.

metric_columns = ["dataset", "approach", "balanced_accuracy", "raw_accuracy", "f1", "precision", "recall", "auc", "rmse"]
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []

for dataset_name in ["statics", "assistments09", "assistments17"]:
    for approach in ["minimal", "extended"]:
        test_data = load_jsonl(f"jsonl_files/{dataset_name}-{approach}-test.jsonl")
        train_data = load_jsonl(f"jsonl_files/{dataset_name}-{approach}-train.jsonl")
        if not train_data:
            # Without training records there is no label distribution to use.
            raise ValueError(f"No training records for {dataset_name}-{approach}")

        number_total = len(train_data)
        number_of_correct = sum(1 for record in train_data if record["completion"] == "CORRECT")
        # Ties count as "not correct" (label 0), as in the strict comparison below.
        majority_label = 1 if number_of_correct > (number_total - number_of_correct) else 0
        prob_of_correct = number_of_correct / number_total

        # Ground truth from the test split; the prediction is constant.
        all_original = [1 if record["completion"] == "CORRECT" else 0 for record in test_data]
        all_completions = [majority_label] * len(test_data)
        all_probs = [prob_of_correct] * len(test_data)

        balanced_accuracy = balanced_accuracy_score(all_original, all_completions)
        raw_accuracy = accuracy_score(all_original, all_completions)
        f1 = f1_score(all_original, all_completions)
        if f1 == 0.0:
            # Debug aid: dump labels and predictions when F1 collapses to zero.
            print(all_original)
            print(all_completions)
        precision = precision_score(all_original, all_completions)
        recall = recall_score(all_original, all_completions)
        auc = roc_auc_score(all_original, all_probs)
        mse = mean_squared_error(all_original, all_probs)
        rmse = math.sqrt(mse)

        rows.append({"dataset": dataset_name, "approach": approach, "balanced_accuracy": balanced_accuracy, "raw_accuracy": raw_accuracy, "f1": f1, "precision": precision, "recall": recall, "auc": auc, "rmse": rmse})

df = pd.DataFrame(rows, columns=metric_columns)
df.to_csv("mean_baseline.csv", index=False)



In [None]:
import random
# Seed the module-level RNG so that the random.choice tie-breaks used in the
# NaP baseline cells give the same results when the cells are run in order.
random.seed(1)

In [None]:
# NaP Full baseline
#
# For every test example, read the "Total correct until now" and
# "Total wrong until now" counters from the prompt and predict the more
# frequent outcome. Ties are broken with random.choice and get probability
# 0.5; otherwise the probability is the fraction of correct answers so far.
# Results go to nap_full_baseline.csv.

metric_columns = ["dataset", "approach", "balanced_accuracy", "raw_accuracy", "f1", "precision", "recall", "auc", "rmse"]
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []

for dataset_name in ["statics", "assistments09", "assistments17"]:
    for approach in ["minimal", "extended"]:
        test_data = load_jsonl(f"jsonl_files/{dataset_name}-{approach}-test.jsonl")
        # The two counter lines start at prompt line 0 for "minimal" prompts
        # and at prompt line 3 for all other approaches.
        start_index = 0 if approach == "minimal" else 3

        all_completions = []
        all_original = []
        all_probs = []

        for record in test_data:
            lines = record["prompt"].split("\n")
            correct_until_now = int(lines[start_index].replace("Total correct until now: ", "").replace(" ", ""))
            wrong_until_now = int(lines[start_index + 1].replace("Total wrong until now: ", "").replace(" ", ""))
            if wrong_until_now == correct_until_now:
                # No majority (this includes the 0/0 case): flip a coin.
                all_completions.append(random.choice([0, 1]))
                all_probs.append(0.5)
            else:
                all_completions.append(1 if correct_until_now > wrong_until_now else 0)
                all_probs.append(correct_until_now / (correct_until_now + wrong_until_now))
            all_original.append(1 if record["completion"] == "CORRECT" else 0)

        balanced_accuracy = balanced_accuracy_score(all_original, all_completions)
        raw_accuracy = accuracy_score(all_original, all_completions)
        f1 = f1_score(all_original, all_completions)
        precision = precision_score(all_original, all_completions)
        recall = recall_score(all_original, all_completions)
        auc = roc_auc_score(all_original, all_probs)
        mse = mean_squared_error(all_original, all_probs)
        rmse = math.sqrt(mse)

        rows.append({"dataset": dataset_name, "approach": approach, "balanced_accuracy": balanced_accuracy, "raw_accuracy": raw_accuracy, "f1": f1, "precision": precision, "recall": recall, "auc": auc, "rmse": rmse})

df = pd.DataFrame(rows, columns=metric_columns)
df.to_csv("nap_full_baseline.csv", index=False)



In [None]:
# NaP Full Skill baseline
#
# For every test example of the "extended" prompts, read two integer counters
# from prompt lines 1 and 2 (the text after the first colon) and predict the
# more frequent outcome. Ties are broken with random.choice and get
# probability 0.5; otherwise the probability is the fraction of correct
# answers. Results go to nap_full_skill_baseline.csv.
# NOTE(review): given the baseline's name these are presumably per-skill
# counters -- confirm against the prompt template.

metric_columns = ["dataset", "approach", "balanced_accuracy", "raw_accuracy", "f1", "precision", "recall", "auc", "rmse"]
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []

for dataset_name in ["statics", "assistments09", "assistments17"]:
    for approach in ["extended"]:
        # Only the test split is needed; this baseline uses no training data.
        test_data = load_jsonl(f"jsonl_files/{dataset_name}-{approach}-test.jsonl")

        all_completions = []
        all_original = []
        all_probs = []

        for record in test_data:
            lines = record["prompt"].split("\n")
            correct_until_now = int(lines[1].split(":")[1].replace(" ", ""))
            wrong_until_now = int(lines[2].split(":")[1].replace(" ", ""))
            if wrong_until_now == correct_until_now:
                # No majority (this includes the 0/0 case): flip a coin.
                all_completions.append(random.choice([0, 1]))
                all_probs.append(0.5)
            else:
                all_completions.append(1 if correct_until_now > wrong_until_now else 0)
                all_probs.append(correct_until_now / (correct_until_now + wrong_until_now))
            all_original.append(1 if record["completion"] == "CORRECT" else 0)

        balanced_accuracy = balanced_accuracy_score(all_original, all_completions)
        raw_accuracy = accuracy_score(all_original, all_completions)
        f1 = f1_score(all_original, all_completions)
        precision = precision_score(all_original, all_completions)
        recall = recall_score(all_original, all_completions)
        auc = roc_auc_score(all_original, all_probs)
        mse = mean_squared_error(all_original, all_probs)
        rmse = math.sqrt(mse)

        rows.append({"dataset": dataset_name, "approach": approach, "balanced_accuracy": balanced_accuracy, "raw_accuracy": raw_accuracy, "f1": f1, "precision": precision, "recall": recall, "auc": auc, "rmse": rmse})

df = pd.DataFrame(rows, columns=metric_columns)
df.to_csv("nap_full_skill_baseline.csv", index=False)

