In [1]:
#  -- PART 0: Import Relevant Modules 
import pandas as pd
import numpy as np
import torch 

from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer 
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# -- PART 1: Load the cleaned training and test CSVs
training_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("cleaned_training_data.csv", "cleaned_test_data.csv")
)

# sanity check: number of rows in each split (training, test)
print(training_df.shape[0], test_df.shape[0])

255 9941


In [3]:
# Column guide for the raw data:
# Topic = class in question 
# ID = student id 
# Segment = section of student notes 
# IdeaUnit = idea that the notes should contain 
# label = 1 if NoteText accurately represents the IdeaUnit, 0 otherwise 
# NoteText = student's note (what's evaluated against IdeaUnit, needs to match each of those key components)

training_df.head(3)

Unnamed: 0,Topic,ID,Segment,IdeaUnit,label,NoteText
0,ComputerScience,6260226,1,declarative knowledge is a factual statement,1,basics of computer science declarative knowled...
1,ComputerScience,6260226,1,imperative knowledge is solving a problem or a...,1,basics of computer science declarative knowled...
2,ComputerScience,6260226,1,algorithms are instructions with steps to comp...,1,basics of computer science declarative knowled...


In [4]:
# preview the first rows of the test split (same column layout as the training split)
test_df.head(3)

Unnamed: 0,Topic,ID,Segment,IdeaUnit,label,NoteText
0,ComputerScience,6260230,1,declarative knowledge is a factual statement,1,declarative factual statementsdeclarative says...
1,ComputerScience,6260230,1,imperative knowledge is solving a problem or a...,1,declarative factual statementsdeclarative says...
2,ComputerScience,6260230,1,algorithms are instructions with steps to comp...,0,declarative factual statementsdeclarative says...


In [5]:
# -- PART 2: Pre-Processing Steps

# -- (a) create minimal df
# Keep only the sentence pair (IdeaUnit, NoteText) and its target (label).
# .copy() makes each result an independent frame: a column is assigned to
# test_df later (predicted_class), and assigning into a column-subset slice
# can otherwise trigger pandas' SettingWithCopyWarning.
model_columns = ['IdeaUnit', 'NoteText', 'label']
training_df = training_df[model_columns].copy()
test_df = test_df[model_columns].copy()

training_df.head(3)

Unnamed: 0,IdeaUnit,NoteText,label
0,declarative knowledge is a factual statement,basics of computer science declarative knowled...,1
1,imperative knowledge is solving a problem or a...,basics of computer science declarative knowled...,1
2,algorithms are instructions with steps to comp...,basics of computer science declarative knowled...,1


In [6]:
# -- (b) wrap each pandas frame in a HuggingFace Dataset (training first, then test)
hf_training_df, hf_test_df = (
    Dataset.from_pandas(frame) for frame in (training_df, test_df)
)

In [7]:
# -- PART 3: Load Pre-Trained BERT Model and Tokenizer
# (sentence-pair classifier: IdeaUnit vs. NoteText -> binary label)

# (a) load models and tokenization 
model_name = 'bert-base-uncased' # 'bert-large-uncased'

# The classification head (classifier.weight / classifier.bias) is not in the
# checkpoint and is randomly initialized by from_pretrained. Seed torch first so
# that initialization -- and therefore the reported scores -- is reproducible.
# 42 matches the default `seed` of TrainingArguments.
SEED = 42
torch.manual_seed(SEED)

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# -- (b) encode IdeaUnit and NoteText as a sentence pair
# NOTE: the checkpoint selected in model_name is 'bert-base-uncased', whose
# tokenizer lowercases text, so capitalization is NOT preserved. If case
# matters for this task, switch model_name to a cased checkpoint.
def tokenize_input(examples, max_length = 512): 
    """Tokenize a batch of (IdeaUnit, NoteText) pairs for BERT.

    Args:
        examples: batch mapping with 'IdeaUnit' and 'NoteText' entries, as
            supplied by Dataset.map(..., batched=True).
        max_length: length every encoded pair is padded or truncated to.
            Defaults to 512, the value originally hard-coded here.

    Returns:
        The tokenizer's encoding for the batch (input_ids, token_type_ids,
        attention_mask).
    """
    return tokenizer(
        examples['IdeaUnit'],
        examples['NoteText'], 
        max_length = max_length, 
        # fixed-length padding: every example comes out exactly max_length long
        padding = 'max_length', 
        truncation = True
    )

In [9]:
# tokenize both splits in batches and have them yield torch tensors
tokenized_train_df = hf_training_df.map(tokenize_input, batched = True).with_format('torch')

tokenized_test_df = hf_test_df.map(tokenize_input, batched = True).with_format('torch')

Map:   0%|          | 0/255 [00:00<?, ? examples/s]

Map:   0%|          | 0/9941 [00:00<?, ? examples/s]

In [10]:
# input_ids = tokens that represent text after tokenization 
# token_type_ids = which token belongs to unit (first segment) vs. student note (second segment) 
# attention_mask = which tokens are real vs. padding 
tokenized_train_df

Dataset({
    features: ['IdeaUnit', 'NoteText', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 255
})

In [11]:
# -- PART 4: Fine-Tune BERT on the Training Set

# hyperparameters, collected in one place so they are easy to tweak
hyperparameters = dict(
    output_dir = 'training_arguments',
    learning_rate = 4e-05,
    num_train_epochs = 4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
)
training_args = TrainingArguments(**hyperparameters)

# build the trainer around the loaded model and the tokenized training split
trainer = Trainer(model = model, args = training_args, train_dataset = tokenized_train_df)

# run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

Step,Training Loss


TrainOutput(global_step=128, training_loss=0.4036652147769928, metrics={'train_runtime': 22.7513, 'train_samples_per_second': 44.833, 'train_steps_per_second': 5.626, 'total_flos': 268373276467200.0, 'train_loss': 0.4036652147769928, 'epoch': 4.0})

In [12]:
# -- PART 5: Predict on Test Data

# get predictions 
# Runs the trained model over the tokenized test split. The result exposes the
# raw per-class scores via `.predictions`; they are converted to class ids in
# the next cell.
predicted_class = trainer.predict(tokenized_test_df)

In [13]:
# convert raw scores to binary classification (choose larger)
# Store the class ids under a new name instead of rebinding `predicted_class`:
# overwriting the prediction output with an array would make this cell fail on
# a second run, because the array has no `.predictions` attribute.
predicted_labels = np.argmax(predicted_class.predictions, axis = 1)

# add predicted values to original dataset
test_df['predicted_class'] = predicted_labels

In [14]:
# -- PART 6: Evaluate Results (F1 and Accuracy, as percentages)

y_true = test_df['label']
y_pred = test_df['predicted_class']

f1_pct = 100*f1_score(y_true, y_pred)
accuracy_pct = 100*accuracy_score(y_true, y_pred)

print("F1 Score:", f1_pct)
print("Accuracy:", accuracy_pct)


F1 Score: 65.56726596404215
Accuracy: 72.06518458907554


In [15]:
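# Experiment log from earlier runs (metric not recorded -- presumably accuracy; TODO confirm):
# original model is 69.56%
# pushed num_train_epochs from 3 to 5 = 70.11%
# per_device_train_batch_size from 8 to 16 = 70.81%
# NOTE: the run shown above uses num_train_epochs = 4 and per_device_train_batch_size = 8,
# so the figures in this log come from other configurations.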