# DNN Knowledge Tracing.ipynb

This file contains the code for the training the DNNs in this project.

In [1]:
# Import libraries and helper codes

import json
from collections import Counter
import re

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score, confusion_matrix

from tqdm import tqdm

from seq_helpers import *

# Config variables to downsample and/or include KCs
should_downsample = False
should_include_kcs = True

In [2]:
device = torch.device("mps") # change to cuda or cpu if necessary

device

# Grid Search
We perform a grid search over N (the sequence length in the input data) and n_layers (the number of layers in each model). We select only these two changes, because we want to approximate the effect of varying the input sequence length on model performance (data source variation) AND the effect of increasing model complexity via n_layers. We could have chosen a different complexity hyperparameter (like hidden_size), but opt to choose only one due to computational constraints (adding one extra hyperparameter to the search increased our costs significantly).

In [3]:
# Change the parameters if necessary
hparams = {
    'batch_size': 32,
    'lr': 0.001,
    'epochs': 100,
    'input_size': 2
}

In [9]:
# Dictionary to store the best models. acc = normal accuracy, bal_acc = balanced accuracy, f1_0 = f1 score for label 0 (wrong), f1_1 = f1 score for label 1 (correct), sup_0 = support for label 0 (wrong), sup_1 = support for label 1 (correct), auc = AUC score, epoch = epoch number in which the best model was found
best_metrics = {
    'RNN': {
        'val': {'acc': 0, 'bal_acc': 0, 'f1_0': 0, 'f1_1': 0, 'sup_0': 0, 'sup_1': 0, 'auc': 0, 'epoch': 0},
        'test': {'acc': 0, 'bal_acc': 0, 'f1_0': 0, 'f1_1': 0, 'sup_0': 0, 'sup_1': 0, 'auc': 0, 'epoch': 0}
        },
    'LSTM': {
        'val': {'acc': 0, 'bal_acc': 0, 'f1_0': 0, 'f1_1': 0, 'sup_0': 0, 'sup_1': 0, 'auc': 0, 'epoch': 0},
        'test': {'acc': 0, 'bal_acc': 0, 'f1_0': 0, 'f1_1': 0, 'sup_0': 0, 'sup_1': 0, 'auc': 0, 'epoch': 0}
        },
    'Transformer': {
        'val': {'acc': 0, 'bal_acc': 0, 'f1_0': 0, 'f1_1': 0, 'sup_0': 0, 'sup_1': 0, 'auc': 0, 'epoch': 0},
        'test': {'acc': 0, 'bal_acc': 0, 'f1_0': 0, 'f1_1': 0, 'sup_0': 0, 'sup_1': 0, 'auc': 0, 'epoch': 0}
        },
}

for N in [5, 10, 20]:
    for n_layers in [1, 2, 5]:
        for model_name in ['RNN', 'LSTM', 'Transformer']:
            print(f"Training {model_name} with N = {N} and n_layers = {n_layers}")

            # Load data
            train_data = load_and_process_data(f'data_outputs/train_fold_1_n_{N}.json', seq_len = N)
            val_data = load_and_process_data(f'data_outputs/val_fold_1_n_{N}.json', seq_len = N)
            test_data = load_and_process_data(f'data_outputs/test_n_{N}.json', seq_len = N)

            train_dataset = StudentSequenceDataset(train_data, embedding_dim=0)
            val_dataset = StudentSequenceDataset(val_data, embedding_dim=0)
            test_dataset = StudentSequenceDataset(test_data, embedding_dim=0)

            train_data_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True)
            val_data_loader = DataLoader(val_dataset, batch_size=hparams['batch_size'], shuffle=False)
            test_data_loader = DataLoader(test_dataset, batch_size=hparams['batch_size'], shuffle=False)

            # Create the model and evaluator
            if model_name == 'RNN':
                model = SimpleRNN(input_size=hparams['input_size'], num_layers=n_layers).to(device)
            elif model_name == 'LSTM':
                model = SimpleLSTM(input_size=hparams['input_size'], num_layers=n_layers).to(device)
            elif model_name == 'Transformer':
                model = SimpleTransformer(input_size=hparams['input_size'], num_layers=n_layers).to(device)
            evaluator = ModelEvaluator({f'{model_name}_{N}_{n_layers}_epoch': model}, train_data_loader, val_data_loader, test_data_loader, device, epochs=hparams['epochs'], lr=hparams['lr'])
            
            # Train and evaluate the model
            evaluator.train_and_evaluate()
            val_metrics, test_metrics = evaluator.compute_best_metrics()

            # Check if the model is the best one
            if val_metrics['bal_acc'] > best_metrics[model_name]['val']['bal_acc']:
                best_metrics[model_name]['val'] = val_metrics
                best_metrics[model_name]['test'] = test_metrics
                print(f"New best {model_name} model with N = {N} and n_layers = {n_layers}!")
                print(f"    Val: {val_metrics}")
                print(f"    Test: {test_metrics}")




# Confidence Score Demo
We select one of our best models (RNN with N=5 and n_layers=5), and use it to both predict the correctness of a random student's answer as well as a confidence in this prediction (approximated by the raw sigmoid output).

In [4]:
hparams = {
    'batch_size': 32,
    'lr': 0.001,
    'epochs': 100,
    'input_size': 2
}

In [None]:
model_name = 'RNN'
N = 5
n_layers = 5

print(f"Training {model_name} with N = {N} and n_layers = {n_layers}")

# Load data

train_data = load_and_process_data(f'data_outputs/train_fold_1_n_{N}.json', seq_len = N)
val_data = load_and_process_data(f'data_outputs/val_fold_1_n_{N}.json', seq_len = N)
test_data = load_and_process_data(f'data_outputs/test_n_{N}.json', seq_len = N)

train_dataset = StudentSequenceDataset(train_data, embedding_dim=0)
val_dataset = StudentSequenceDataset(val_data, embedding_dim=0)
test_dataset = StudentSequenceDataset(test_data, embedding_dim=0)

train_data_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=hparams['batch_size'], shuffle=False)
test_data_loader = DataLoader(test_dataset, batch_size=hparams['batch_size'], shuffle=False)

# Create the model and evaluator

model = SimpleRNN(input_size=hparams['input_size'], num_layers=n_layers).to(device)

evaluator = ModelEvaluator({f'{model_name}_{N}_{n_layers}_epoch': model}, train_data_loader, val_data_loader, test_data_loader, device, epochs=hparams['epochs'], lr=hparams['lr'])

# Train and evaluate the model

evaluator.train_and_evaluate()
val_metrics, test_metrics = evaluator.compute_best_metrics(save_dict=False)

In [10]:
# Predict with confidence score

labels, preds, confs = evaluator.predict(f'{model_name}_{N}_{n_layers}_epoch', test_data_loader, device, with_confidence=True)

# Print 10 predictions and their confidences (see the Ethical Risk Assessment section in the report)
for i in range(55, 66):
    print(f"True Label: {labels[i]}")
    print(f"Prediction: {preds[i]}")
    print(f"Confidence: {confs[i] * 100}%")
    if labels[i] != preds[i]:
        if confs[i] > 0.9:
            print("Verdict: Overconfident\n")
        else:
            print("Verdict: Balanced\n")
    
    if labels[i] == preds[i]:
        if confs[i] < 0.1:
            print("Verdict: Underconfident\n")
        else:
            print("Verdict: Balanced\n")

True Label: 1.0
Prediction: 1.0
Confidence: 99.99145269393921%
Verdict: Balanced

True Label: 0.0
Prediction: 0.0
Confidence: 0.0017738397218636237%
Verdict: Underconfident

True Label: 1.0
Prediction: 1.0
Confidence: 99.18718338012695%
Verdict: Balanced

True Label: 1.0
Prediction: 1.0
Confidence: 99.9940276145935%
Verdict: Balanced

True Label: 1.0
Prediction: 1.0
Confidence: 99.95296001434326%
Verdict: Balanced

True Label: 1.0
Prediction: 1.0
Confidence: 99.92734789848328%
Verdict: Balanced

True Label: 0.0
Prediction: 0.0
Confidence: 3.0384546789719025e-05%
Verdict: Underconfident

True Label: 1.0
Prediction: 1.0
Confidence: 99.97549653053284%
Verdict: Balanced

True Label: 0.0
Prediction: 1.0
Confidence: 99.97177720069885%
Verdict: Overconfident

True Label: 1.0
Prediction: 1.0
Confidence: 99.90384578704834%
Verdict: Balanced

True Label: 1.0
Prediction: 0.0
Confidence: 0.8072282187640667%
Verdict: Balanced

