This file prepares the JSONL files to be submitted to OpenAI's fine-tuning API. All experiments were run with the `babbage-002` GPT-3 model in February 2023, using the default fine-tuning parameters.

In [None]:
# Configuration variables

# Dataset to process: selects the ../initial_data/<dataset_name>/ input folder
# and is embedded in the names of all intermediate and JSONL output files.
dataset_name = "statics" # statics, assistments09, assistments17
# Prompt format: "extended" prepends per-skill statistics to every prompt;
# any other value produces the basic prompt with overall counts only.
approach = "minimal" # minimal, extended

In [None]:
# Imports and helper functions

import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import os
import pandas as pd
import random

# NOTE(review): presumably turns off Weights & Biases logging in libraries that
# honor this variable; nothing in the visible code uses wandb — confirm it is needed.
os.environ["WANDB_DISABLED"] = "true"

class NpEncoder(json.JSONEncoder): # to fix the "Object of type 'int64' is not JSON serializable" error. Source: https://stackoverflow.com/a/57915246
    """JSON encoder that converts NumPy values to native Python types.

    Handles NumPy integers, floating-point scalars, booleans and arrays;
    every other unsupported object is delegated to the base class, which
    raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: the object the json module could not serialize natively

        Returns:
            an ``int``, ``float``, ``bool`` or ``list`` mirroring ``obj``

        Raises:
            TypeError: if ``obj`` is not a supported NumPy type
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            # np.bool_ is not a subclass of bool, so json rejects it by default
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
    
# From https://galea.medium.com/how-to-love-jsonl-using-json-line-format-in-your-workflow-b6884f65175b

from json import JSONEncoder

class MyEncoder(JSONEncoder):
    """Encoder that serializes an otherwise unsupported object as its attribute dictionary."""

    def default(self, o):
        """Return the instance ``__dict__`` of ``o`` so json can encode it as an object."""
        # Only reached for objects the json module cannot serialize natively
        attributes = o.__dict__
        return attributes

import json

def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON lines file, one record per line.

    Args:
        data: iterable of records (a list, but also a generator or iterator);
            objects json cannot encode natively are serialized via MyEncoder
        output_path: path of the file to write
        append: if True, add to the end of an existing file instead of
            overwriting it
    """
    mode = 'a+' if append else 'w'
    # Count while writing so that inputs without len() (generators) are supported
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            json_record = json.dumps(record, ensure_ascii=False, cls=MyEncoder)
            f.write(json_record + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Blank (whitespace-only) lines are skipped, so a trailing newline or
    an empty separator line does not abort loading.

    Args:
        input_path: path of the UTF-8 encoded JSON lines file

    Returns:
        list with one decoded object per non-blank line, in file order

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            data.append(json.loads(stripped))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

class JSONLDataObject:
    """One fine-tuning example: a prompt and the completion expected for it."""

    # Class-level defaults; __init__ always overrides both on the instance
    prompt = ""
    completion = ""

    def __init__(self, prompt, completion):
        self.prompt, self.completion = prompt, completion

    def __repr__(self):
        # Same text as the repr of the (prompt, completion) tuple
        return f"({self.prompt!r}, {self.completion!r})"


In [None]:
# Load the tab-separated train and test splits of the selected dataset
full_train_df = pd.read_csv(
    f'../initial_data/{dataset_name}/preprocessed_data_train.csv',
    sep='\t',
)
full_test_df = pd.read_csv(
    f'../initial_data/{dataset_name}/preprocessed_data_test.csv',
    sep='\t',
)

In [None]:
# Find statistics about the dataset

def _print_split_statistics(split_df):
    """Print size, ID cardinalities and answer distribution of one dataframe."""
    correctness = split_df['correct']
    print("Number of entries:", len(split_df))
    print("Unique problem IDs:", len(split_df['item_id'].unique()))
    print("Unique user IDs:", len(split_df['user_id'].unique()))
    print("Set of possible answers for each question:", correctness.unique())
    print("Number of wrong answers:", len(split_df[correctness == 0]))
    print("Number of correct answers:", len(split_df[correctness == 1]))


print("In train set:")
_print_split_statistics(full_train_df)
print("")

print("In test set:")
_print_split_statistics(full_test_df)
print("===")

# concat the train and test sets
full_df = pd.concat([full_train_df, full_test_df])

print("In the whole dataset:")
_print_split_statistics(full_df)

# Find if there are any user IDs common between the train and test sets
train_user_ids = set(full_train_df['user_id'].unique())
test_user_ids = set(full_test_df['user_id'].unique())
print("Intersection of user IDs in train and test sets:", len(train_user_ids & test_user_ids))

In [None]:
class DataEntry:
    """
    Class for saving the data entries (sequences) extracted from the raw data.

    Fields:
        student_id: ID of the student
        question_ids: list of question IDs the student has answered
        answers: list of answers the student has given to the questions
        skill_ids: list of skill IDs of the questions
    """

    # Class-level defaults kept for backward compatibility; __init__ always
    # replaces them on the instance, so the shared lists are never mutated here.
    student_id = -1
    question_ids = []
    answers = []
    skill_ids = []

    # Instances are mutable and compared by value, hence explicitly unhashable
    __hash__ = None

    def __init__(self, student_id, question_ids, answers, skill_ids):
        self.student_id = student_id
        self.question_ids = question_ids
        self.answers = answers
        self.skill_ids = skill_ids

    def __str__(self): # for printing the object
        return f'student_id: {self.student_id}, question_ids: {self.question_ids}, answers: {self.answers}, skill_ids: {self.skill_ids}'

    def __eq__(self, other): # necessary to find duplicates in a list of DataEntry objects
        """Return True only if both entries belong to the same student and hold identical sequences."""
        if not isinstance(other, DataEntry):
            return NotImplemented
        if self.student_id != other.student_id:
            return False # two similar sequences but with different student IDs are not considered duplicates
        own_fields = (self.question_ids, self.answers, self.skill_ids)
        other_fields = (other.question_ids, other.answers, other.skill_ids)
        # Sequences of different length can never be duplicates; checking this
        # first also prevents a prefix from matching a longer sequence.
        if any(len(own) != len(theirs) for own, theirs in zip(own_fields, other_fields)):
            return False
        return all(
            own_item == other_item
            for own, theirs in zip(own_fields, other_fields)
            for own_item, other_item in zip(own, theirs)
        )

In [None]:
def get_all_data_entries(df):
    """
    Function to extract all sequences from the raw data.

    One sequence is built per student, in order of the student's first
    appearance in ``df``; rows inside a sequence keep their dataframe order.
    Students whose answers are all 1 or all 0 are skipped. Rows with a
    missing ``user_id`` are ignored by the grouping.

    Args:
        df: the pandas dataframe from the raw data; must contain the columns
            'user_id', 'item_id', 'correct' and 'skill_id'

    Returns:
        all_data_entries: list of all sequences from the raw data which are extracted from the students in the given dataframe
    """

    removed_student_ids = []
    all_data_entries = []
    # A single grouping pass replaces re-filtering the whole dataframe for every
    # student. Each student is visited exactly once, so no duplicate check is needed.
    for student_id, student_df in df.groupby('user_id', sort=False):
        answers = student_df['correct'].tolist()
        # If all of answers are 1 or 0, then skip this data entry, as it is not informative (i.e., the student totally knows the concept, or does not know it at all and does not seem to learn it in the given sequence of questions)
        total_correct = sum(answers)
        if total_correct == 0 or total_correct == len(answers):
            removed_student_ids.append(student_id)
            continue
        question_ids = student_df['item_id'].tolist()
        skill_ids = student_df['skill_id'].tolist()
        all_data_entries.append(DataEntry(student_id, question_ids, answers, skill_ids))
    print(f"Number of to be removed students: {len(removed_student_ids)}")
    print("Number of total sequences:", len(all_data_entries))
    return all_data_entries


# Extract one sequence per student from each split
all_train_data_entries = get_all_data_entries(full_train_df)
all_test_data_entries = get_all_data_entries(full_test_df)

print("Number of sequences in the train set:", len(all_train_data_entries))
print("Number of sequences in the test set:", len(all_test_data_entries))

# Create the output folder if it is missing, so that a fresh checkout does not
# fail with FileNotFoundError on the first write
os.makedirs('intermediate_files', exist_ok=True)

# Persist the sequences as JSON; NpEncoder converts NumPy scalars to native types
with open(f'intermediate_files/train-data-entries-{dataset_name}.json', 'w') as f:
    json.dump([entry.__dict__ for entry in all_train_data_entries], f, cls=NpEncoder)
with open(f'intermediate_files/test-data-entries-{dataset_name}.json', 'w') as f:
    json.dump([entry.__dict__ for entry in all_test_data_entries], f, cls=NpEncoder)

In [None]:
# Load each split back from its intermediate JSON file and shuffle the rows
# with a fixed seed, renumbering the index afterwards
train_df = (
    pd.read_json(f'intermediate_files/train-data-entries-{dataset_name}.json')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
test_df = (
    pd.read_json(f'intermediate_files/test-data-entries-{dataset_name}.json')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Check for correct loading
for split_label, split_frame in (("train", train_df), ("test", test_df)):
    print(f"Number of sequences in the {split_label} set:", len(split_frame))

In [None]:
def _spaced(value):
    """Return str(value) with a space between every character (e.g. 123 -> '1 2 3')."""
    return ' '.join(str(value))


def _build_finetuning_examples(df, approach):
    """
    Turn every student sequence of ``df`` into prompt/completion examples.

    Args:
        df: dataframe with the list-valued columns 'question_ids' and 'answers'
            (and 'skill_ids' when approach is "extended"), one row per student
        approach: "extended" prepends per-skill statistics to the prompt; any
            other value produces the prompt with overall counts only

    Returns:
        list of JSONLDataObject, in row order and then in sequence order
    """
    examples = []
    for _, row in df.iterrows():
        all_question_ids = row['question_ids']
        all_answers = row['answers']
        # NOTE(review): the slice [:i] ends at interaction i-1, which is used as
        # the prediction target, so with range(1, len) the LAST interaction of
        # every sequence never becomes a target. This looks like an off-by-one;
        # behavior is kept unchanged here — confirm the intended target set.
        for i in range(1, len(all_question_ids)):
            answers = all_answers[:i]
            prior_answers = answers[:-1]  # history available before the current question
            question_ids = all_question_ids[:i]
            prompt = ""
            if approach == "extended":
                skill_ids = row['skill_ids'][:i]
                current_skill_id = skill_ids[-1]
                current_skill_id_str = _spaced(current_skill_id)
                prompt += "Current skill ID: " + current_skill_id_str + "\n"
                # Count prior outcomes on the current skill; any answer other
                # than 1 is counted as wrong
                skill_correct = 0
                skill_wrong = 0
                for past_skill_id, past_answer in zip(skill_ids[:-1], prior_answers):
                    if past_skill_id == current_skill_id:
                        if past_answer == 1:
                            skill_correct += 1
                        else:
                            skill_wrong += 1
                prompt += "Total correct for prior questions with skill ID " + current_skill_id_str + " : " + _spaced(skill_correct) + "\n"
                prompt += "Total wrong for prior questions with skill ID " + current_skill_id_str + " : " + _spaced(skill_wrong) + "\n"
            # Numbers are written with a space between digits (see _spaced)
            count_of_one = _spaced(prior_answers.count(1))
            count_of_zero = _spaced(prior_answers.count(0))
            current_question_id = _spaced(question_ids[-1])
            prompt += f"Total correct until now: {count_of_one}\nTotal wrong until now: {count_of_zero}\nCurrent question ID: {current_question_id}\nStudent response: "
            completion = 'CORRECT' if int(answers[-1]) == 1 else 'WRONG'
            examples.append(JSONLDataObject(prompt, completion))
    return examples


dfs = [train_df, test_df]

# Generate texts for fine-tuning GPT-3 for each dataframe
all_jsonl_files = [_build_finetuning_examples(df, approach) for df in dfs]

# Shuffle with a fixed seed so the generated files are reproducible, matching
# the seeded (random_state=1) shuffling of the dataframes
_shuffler = random.Random(1)
for jsonl_examples in all_jsonl_files:
    _shuffler.shuffle(jsonl_examples)

# Create the output folder if it is missing so the dumps below cannot fail on it
os.makedirs('jsonl_files', exist_ok=True)
dump_jsonl(all_jsonl_files[0], f'jsonl_files/{dataset_name}-{approach}-train.jsonl')
dump_jsonl(all_jsonl_files[1], f'jsonl_files/{dataset_name}-{approach}-test.jsonl')