# GPT Knowledge Tracing.ipynb

This notebook contains the code for running inference with the GPT-3.5 API in this project.

In [18]:
# Import libraries and helper codes

import os
import json
import random
from time import sleep

import pandas
from sklearn.metrics import classification_report, balanced_accuracy_score

from openai import AzureOpenAI, OpenAI
import tiktoken

from gpt_helpers import *

In [2]:
# Prepare the data and topics (= KC) for our inference

# All dataset CSVs used by this call live in the same folder, so the prefix is spelled out once.
DBE_KT22_DIR = 'dataverse_files/2_DBE_KT22_datafiles_100102_csv'

generate_new_file(
    f'{DBE_KT22_DIR}/KCs.csv',
    f'{DBE_KT22_DIR}/Question_KC_Relationships.csv',
    # NOTE(review): presumably the generated output file, since the same path is
    # passed to read_data in the next cell - confirm against gpt_helpers.
    f'{DBE_KT22_DIR}/Generated_KC_Questions.csv',
)

In [5]:
# Continue preparing the data: read the interaction sequences, then strip padding

DBE_KT22_DIR = 'dataverse_files/2_DBE_KT22_datafiles_100102_csv'
TEST_DATA_FILE = ''  # change with the test data file you want to use for inference

# remove_padding is kept for compatibility with the original code, which added padding
# in case of unequal-length subsequences. (The current code keeps all subsequences of
# the same length.)
data = remove_padding(
    read_data(
        TEST_DATA_FILE,
        f'{DBE_KT22_DIR}/Questions.csv',
        f'{DBE_KT22_DIR}/Generated_KC_Questions.csv',
        N=-1,  # NOTE(review): presumably "no limit on the number of rows/sequences" - confirm in gpt_helpers
    )
)

In [6]:
# API information for the OpenAI API.
#
# The values are read from environment variables so that credentials never have to be
# typed into (and accidentally committed with) the notebook. Each entry falls back to
# an empty string when its variable is not set, so the dictionary always has the same
# four string-valued keys. Set the variables before starting the kernel, e.g.:
#   export AZURE_OPENAI_API_KEY=...      -> 'api_key'
#   export OPENAI_API_VERSION=...        -> 'api_version'
#   export AZURE_OPENAI_ENDPOINT=...     -> 'azure_endpoint'
#   export AZURE_OPENAI_MODEL=...        -> 'model' (variable name chosen for this notebook)

api_info = {
    'api_key': os.environ.get('AZURE_OPENAI_API_KEY', ""),
    'api_version': os.environ.get('OPENAI_API_VERSION', ""),
    'azure_endpoint': os.environ.get('AZURE_OPENAI_ENDPOINT', ""),
    'model': os.environ.get('AZURE_OPENAI_MODEL', "")
}

In [22]:
def evaluate_predictions_of_model(preds, gts):
    """
    Evaluate the predictions of a model, given the ground truth labels.

    Labels are binarised by exact string match: "CORRECT" becomes 1 and every
    other value (including "WRONG") becomes 0.

    Args:
        preds: List of predictions ("CORRECT" / "WRONG").
        gts: List of ground truth labels ("CORRECT" / "WRONG"), same length as preds.

    Returns:
        metrics: Dictionary with the keys
            'f1_0', 'f1_1'     - F1-score of class 0 (WRONG) and class 1 (CORRECT);
                                 0.0 when the score is undefined for that class,
            'supp_0', 'supp_1' - number of ground truth samples per class,
            'acc'              - accuracy,
            'bal_acc'          - balanced accuracy.

    Raises:
        ValueError: If preds and gts differ in length or are empty.
    """

    # The labels "CORRECT" and "WRONG", as returned by the GPT-3.5 model, are converted to 1 and 0, respectively
    preds = [1 if pred == "CORRECT" else 0 for pred in preds]
    gts = [1 if gt == "CORRECT" else 0 for gt in gts]

    if len(preds) != len(gts):
        raise ValueError(
            f"preds and gts must have the same length (got {len(preds)} and {len(gts)})"
        )
    if not gts:
        raise ValueError("Cannot evaluate an empty list of predictions")

    # Pin the label set so that report['0'] and report['1'] always exist, even when
    # one class never occurs in either list (e.g. the model always answers "CORRECT"
    # on an all-"CORRECT" sample). Without `labels`, such a sample raised a KeyError.
    # zero_division=0 keeps the undefined scores at 0.0 without emitting a warning.
    report = classification_report(
        gts, preds, labels=[0, 1], output_dict=True, zero_division=0
    )

    # Extract the per-class metrics from the classification report
    f1_0 = report['0']['f1-score']
    f1_1 = report['1']['f1-score']

    supp_0 = report['0']['support']
    supp_1 = report['1']['support']

    # Accuracy is computed directly rather than read from the report: depending on
    # the scikit-learn version, the report may contain a 'micro avg' entry instead of
    # 'accuracy' when explicit labels are passed that do not all occur in the data.
    acc = sum(pred == gt for pred, gt in zip(preds, gts)) / len(gts)
    balanced_acc = balanced_accuracy_score(gts, preds)

    metrics = {
        'f1_0': f1_0,
        'f1_1': f1_1,
        'supp_0': supp_0,
        'supp_1': supp_1,
        'acc': acc,
        'bal_acc': balanced_acc
    }

    return metrics

In [23]:
# Generate the prompts (and their ground truth labels) to be used for GPT-3.5 inference
# NOTE(review): the incl_* flags presumably select which fields go into each prompt - confirm in gpt_helpers
prompts, gts = generate_prompts(
    data,
    incl_id=False,
    incl_q=False,
    incl_kc=False,
    incl_diff=True,
)

# In the original code, we randomly "sampled" prompts, but for the final run this
# function effectively only shuffles the prompts and keeps those that fit within the
# maximum context length of our model.
prompts_sample, gts_sample = randomly_sample_prompts(
    prompts,
    gts,
    N=100,
    seed=0,
    max_token_len=4096,
)

# Generate predictions from the GPT-3.5 model
# (the cell output lists each "OpenAI result" next to its "Ground truth")
preds = predict(prompts_sample, gts_sample, api_info)

OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: WRONG
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: WRONG
Ground truth: CORRECT
OpenAI result: WRONG
Ground truth: WRONG
OpenAI result: CORRECT
Ground truth: WRONG
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: WRONG
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: WRONG
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: CORRECT
Ground truth: CORRECT
OpenAI result: WRONG
Ground truth: CORRECT
OpenAI result: CORRECT
Gro

In [26]:
# Evaluate the predictions of the GPT-3.5 model against the ground truth labels
# of the same sampled prompts (preds and gts_sample come from the inference cell)
metrics = evaluate_predictions_of_model(preds, gts_sample)

# Show the metrics dictionary: per-class F1 and support, accuracy and balanced accuracy
print(metrics)

{'f1_0': 0.3243243243243243, 'f1_1': 0.8466257668711655, 'supp_0': 19.0, 'supp_1': 81.0, 'acc': 0.75, 'bal_acc': 0.5838206627680311}
