### Initialization and Starter Code

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from datetime import datetime
from tqdm.auto import tqdm #progress bars to see progress 

# Valid (min, max) score range of every scored trait, keyed by prompt id (1-8).
# The ranges differ from prompt to prompt; the 'holistic' entry is the one read by
# normalize_scores / denormalize_scores to rescale holistic scores.
SCORE_RANGES = {
        1: {'sentence_fluency': (1, 6), 'word_choice': (1, 6), 'conventions': (1, 6),'organization': (1, 6),
            'content': (1, 6), 'holistic': (2, 12)},
        2: {'sentence_fluency': (1, 6), 'word_choice': (1, 6), 'conventions': (1, 6),'organization': (1, 6),
            'content': (1, 6), 'holistic': (1, 6)},
        3: {'narrativity': (0, 3), 'language': (0, 3), 'prompt_adherence': (0, 3), 'content': (0, 3),
            'holistic': (0, 3)},
        4: {'narrativity': (0, 3), 'language': (0, 3), 'prompt_adherence': (0, 3), 'content': (0, 3),
            'holistic': (0, 3)},
        5: {'narrativity': (0, 4), 'language': (0, 4), 'prompt_adherence': (0, 4), 'content': (0, 4),
            'holistic': (0, 4)},
        6: {'narrativity': (0, 4), 'language': (0, 4), 'prompt_adherence': (0, 4), 'content': (0, 4),
            'holistic': (0, 4)},
        7: {'conventions': (0, 6), 'organization': (0, 6), 'content': (0, 6),'holistic': (0, 30)},
        8: {'sentence_fluency': (2, 12), 'word_choice': (2, 12), 'conventions': (2, 12),'organization': (2, 12),
            'content': (2, 12), 'holistic': (0, 60)}}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #use the GPU if one is available, otherwise the CPU
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #shared BERT tokenizer, used by essays_dataset.__getitem__
def read_data(path):
    """Load the essay CSV at *path* and return its columns as NumPy arrays.

    The result is a dict with the keys 'essay_ids', 'prompt_ids',
    'essay_text', 'features' and 'holistic'. 'features' holds every column
    from position 12 onward as a 2-D array; the other entries are taken
    from the columns 'essay_id', 'prompt_id', 'essay_text' and 'holistic'.
    """
    frame = pd.read_csv(path)

    # output key -> CSV column for the leading single-column entries
    leading_columns = {
        'essay_ids': 'essay_id',
        'prompt_ids': 'prompt_id',
        'essay_text': 'essay_text',
    }
    loaded = {key: frame[column].values for key, column in leading_columns.items()}

    # every column from the 13th onward is treated as the feature block
    loaded['features'] = frame.iloc[:, 12:].values
    loaded['holistic'] = frame['holistic'].values
    return loaded

def normalize_scores(scores, prompt_id):
    """Rescale holistic *scores* of prompt *prompt_id* so the prompt's range maps to [0, 1].

    Needed because each prompt has its own holistic range in SCORE_RANGES.
    The range minimum maps to 0 and the maximum to 1.
    """
    low, high = SCORE_RANGES[prompt_id]['holistic']
    return (scores - low) / (high - low)

def denormalize_scores(norm_scores, prompt_id):
    """Map normalized scores back onto the holistic range of prompt *prompt_id*.

    Inverse of normalize_scores: 0 maps to the range minimum and 1 to the
    range maximum found in SCORE_RANGES.
    """
    low, high = SCORE_RANGES[prompt_id]['holistic']
    return norm_scores * (high - low) + low

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa with quadratic weights between *y_true* and rounded *y_pred*.

    Only the predictions are rounded here; *y_true* is passed through as given.
    """
    rounded_pred = np.round(y_pred)
    return cohen_kappa_score(y_true, rounded_pred, weights='quadratic')

class essays_dataset(Dataset): #class to help organize and prepare our data for training
    """Dataset that serves one tokenized essay with its features, score and prompt id.

    All four input sequences are indexed in parallel, so they must have the
    same length and ordering.

    Args:
        essays: sequence of essay texts (each item is passed through ``str``).
        features: sequence of per-essay numeric feature rows.
        scores: sequence of per-essay target scores.
        prompt_ids: sequence of per-essay prompt ids.
        max_length: number of tokens every essay is padded/truncated to
            (512 by default, BERT's maximum).
    """

    def __init__(self, essays, features, scores, prompt_ids, max_length=512):
        self.essays = essays
        self.features = features
        self.scores = scores
        self.prompt_ids = prompt_ids
        self.max_length = max_length #max token length for BERT, which is 512

    def __len__(self):
        """Return the number of essays in the dataset."""
        return len(self.essays)

    def __getitem__(self, idx):
        """Return the training example at position *idx* as a dict.

        Keys: 'input_ids' and 'attention_mask' (1-D tensors of length
        ``max_length``), 'features' (float tensor), 'score' (float tensor
        holding one value) and 'prompt_id' (returned unchanged).
        """
        essay = str(self.essays[idx])
        features = self.features[idx]
        score = self.scores[idx]
        prompt_id = self.prompt_ids[idx]

        #tokenize the essay text with the module-level tokenizer
        encoding = tokenizer(
            essay,
            max_length=self.max_length, #BERT's max length is 512
            padding='max_length', #pad shorter essays up to max_length
            truncation=True, #cut off anything beyond max_length
            return_tensors='pt'
        )

        #drop only the leading batch dimension added by return_tensors='pt';
        #a bare squeeze() would also collapse the token axis when max_length == 1
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0), #mask to ignore padding
            'features': torch.FloatTensor(features),
            'score': torch.FloatTensor([score]),
            'prompt_id': prompt_id
        }

class Our_BERT(nn.Module):
    """Pretrained BERT encoder with a regression head over the [CLS] embedding plus extra features.

    The head receives the [CLS] embedding concatenated with the per-essay
    feature vector and outputs one value per essay.
    """

    def __init__(self, hidden_size=0, num_features=86):
        """Build the encoder and the regression head.

        Args:
            hidden_size: width of the optional hidden layer in the regression
                head. If greater than 0 the head is Linear -> ReLU -> Linear;
                otherwise it is a single Linear layer.
            num_features: length of the feature vector concatenated to the
                [CLS] embedding (86 by default, the value that used to be
                hard-coded).
        """
        super().__init__()
        #load pretrained bert model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        #encoder width (768 for bert-base) comes from the loaded config, not a literal
        input_size = self.bert.config.hidden_size + num_features

        #create regression head based on hidden_size
        if hidden_size > 0: #if a hidden size is specified, use a hidden layer with ReLU
            self.regression_head = nn.Sequential(
                nn.Linear(input_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, 1)
            )
        else: #otherwise use a single linear layer
            self.regression_head = nn.Linear(input_size, 1)

        #initialize every Linear layer of the head with He/Kaiming weights and zero bias
        for module in self.regression_head.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(self, input_ids, attention_mask, features):
        """Return the regression head's output for a batch.

        Args:
            input_ids: token ids passed to BERT.
            attention_mask: padding mask passed to BERT.
            features: per-essay feature vectors, concatenated to the [CLS]
                embedding along dim 1.
        """
        #contextual embedding for every token of the essay
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        cls_embedding = outputs.last_hidden_state[:, 0, :] #embedding at position 0 ([CLS]) represents the essay
        combined_features = torch.cat([cls_embedding, features], dim=1) #combine CLS with features
        return self.regression_head(combined_features) #pass them through the regression head

def train_model(model, train_loader, criterion, optimizer, epoch):
    """Train *model* for one pass over *train_loader* and return the mean batch loss.

    Each batch is moved to the module-level ``device``, run through the model,
    and used for one optimizer step. *epoch* is only used for the progress-bar
    label and the summary line printed at the end.
    """
    model.train() #put the model in training mode
    running_loss = 0
    num_batches = 0
    tensor_keys = ('input_ids', 'attention_mask', 'features', 'score')

    #walk the batches behind a progress bar, counting them as we go
    progress = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)
    for num_batches, batch in enumerate(progress, start=1):
        #move every tensor of the batch to the gpu or cpu
        input_ids, attention_mask, features, scores = (
            batch[key].to(device) for key in tensor_keys
        )

        optimizer.zero_grad() #drop gradients left from the previous step
        predictions = model(input_ids, attention_mask, features) #forward pass
        batch_loss = criterion(predictions, scores)
        batch_loss.backward() #backpropagation
        optimizer.step() #update weights

        running_loss += batch_loss.item()

    avg_loss = running_loss / num_batches #mean of the per-batch losses for this epoch
    print(f'epoch {epoch}: avg training loss = {avg_loss:.4f}')
    return avg_loss

def evaluate_model(model, val_loader, criterion, val_prompt_id):
    """Evaluate *model* on *val_loader* and return ``(avg_loss, qwk)``.

    The loss is computed on the normalized scores. Predictions and targets are
    then mapped back to the holistic range of *val_prompt_id*, rounded, and
    compared with quadratic weighted kappa. A short report is printed.
    """
    model.eval() #put the model in evaluation mode
    running_loss = 0
    num_batches = 0
    prediction_chunks = [] #denormalized predictions, one array per batch
    target_chunks = [] #denormalized real scores, one array per batch
    tensor_keys = ('input_ids', 'attention_mask', 'features', 'score')

    print("\n--------------------Evaluating-----------------------")
    with torch.no_grad(): #no gradient tracking during evaluation
        progress = tqdm(val_loader, desc='Validation', leave=False)
        for num_batches, batch in enumerate(progress, start=1):
            input_ids, attention_mask, features, scores = (
                batch[key].to(device) for key in tensor_keys
            )

            outputs = model(input_ids, attention_mask, features)
            running_loss += criterion(outputs, scores).item()

            #convert normalized scores back to the prompt's original range
            prediction_chunks.append(denormalize_scores(outputs.cpu().numpy(), val_prompt_id))
            target_chunks.append(denormalize_scores(scores.cpu().numpy(), val_prompt_id))

    #average loss first, then stack the batches and score them with QWK
    avg_loss = running_loss / num_batches
    all_predictions = np.concatenate(prediction_chunks)
    all_real_scores = np.concatenate(target_chunks)
    qwk = quadratic_weighted_kappa(np.round(all_real_scores), np.round(all_predictions))

    print(f"\nValidation results for prompt {val_prompt_id}:")
    print(f"Avg loss: {avg_loss:.4f}")
    print(f"QWK: {qwk:.4f}")
    #min/max/mean of predicted vs. actual scores: a sanity check on the (de)normalization and on how far off we are
    print(f"Predicted min: {np.min(all_predictions):.2f}, max: {np.max(all_predictions):.2f}, mean: {np.mean(all_predictions):.2f}")
    print(f"Actual min: {np.min(all_real_scores):.2f}, max: {np.max(all_real_scores):.2f}, mean: {np.mean(all_real_scores):.2f}")

    return avg_loss, qwk



### Grid search

In [None]:

# Grid search over regression-head sizes. For every head size, each prompt in
# validation_prompts is held out in turn, a fresh model is trained on the remaining
# prompts of that list, and the QWK after the last epoch is recorded for the fold.
print(f"\n{'='*50}")
print(f"Starting Holistic scoring using BERT model at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"{'='*50}")

data_dict = read_data('dataset.csv')
validation_prompts = list(range(2, 9)) #prompts 2-8 used for cross validation
results = [] #one summary dict per hidden size
hidden_sizes = [0, 4, 8] #0 = single linear layer, otherwise width of the hidden layer

for hidden_size in hidden_sizes: #grid search over different regression head configs
    print(f"\n{'='*50}")
    print(f"Trying regression head with {hidden_size} hidden units")
    fold_scores = [] #final-epoch QWK of each validation prompt

    for val_prompt in validation_prompts:
        print(f"\n{'-'*50}")
        print(f"Validation prompt no. {val_prompt}")
        #train on every prompt of validation_prompts except the held-out one; validate on the held-out one
        #NOTE(review): prompt 1 is in neither mask - presumably reserved as a test set; confirm
        train_mask = np.isin(data_dict['prompt_ids'], [prompt for prompt in validation_prompts if prompt != val_prompt])
        val_mask = (data_dict['prompt_ids'] == val_prompt)

        #create datasets using the class we defined for training and validation
        #scores are normalized with each essay's own prompt range
        train_dataset = essays_dataset(
            data_dict['essay_text'][train_mask],
            data_dict['features'][train_mask],
            [normalize_scores(score, pid) for score, pid in zip(data_dict['holistic'][train_mask], data_dict['prompt_ids'][train_mask])],
            data_dict['prompt_ids'][train_mask]
        )

        val_dataset = essays_dataset(
            data_dict['essay_text'][val_mask],
            data_dict['features'][val_mask],
            [normalize_scores(score, val_prompt) for score in data_dict['holistic'][val_mask]],
            data_dict['prompt_ids'][val_mask]
        )

        #DataLoaders feed the datasets to the model in batches of 32; only the training data is shuffled
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32)
        #fresh model and optimizer for every fold
        #NOTE(review): lr=0.005 is applied to all parameters, BERT encoder included; this is far above
        #the ~2e-5 to 5e-5 usually used to fine-tune BERT - confirm it is intended
        model = Our_BERT(hidden_size=hidden_size).to(device)
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=0.005,
            betas=(0.9, 0.999),
            weight_decay=0.1
        )
        criterion = nn.MSELoss() #mean squared error on the normalized scores

        print(f"\nStarting training...")
        for epoch in range(5): 
            train_loss = train_model(model, train_loader, criterion, optimizer, epoch + 1)
            val_loss, val_qwk = evaluate_model(model, val_loader, criterion, val_prompt)

        #val_qwk holds the value from the last epoch, not the best epoch
        fold_scores.append(val_qwk)
    #mean and standard deviation of the fold QWKs for this head size
    avg_qwk = np.mean(fold_scores)
    std_qwk = np.std(fold_scores)
    results.append({
        'hidden_size': hidden_size,
        'avg_qwk': avg_qwk,
        'std_qwk': std_qwk,
        'fold_scores': fold_scores
    })
#pick the configuration with the highest mean QWK across folds
best_result = max(results, key=lambda x: x['avg_qwk'])
best_hidden_size = best_result['hidden_size']



Starting Holistic scoring using BERT model at 2024-12-07 16:12:17

Trying regression head with 0 hidden units

--------------------------------------------------
Validation prompt no. 2

Starting training...


                                                          

epoch 1: avg training loss = 2.5709

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0409
QWK: 0.4115
Predicted min: 1.40, max: 7.40, mean: 4.22
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 2: avg training loss = 0.0359

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0163
QWK: 0.6073
Predicted min: 1.18, max: 6.95, mean: 3.67
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 3: avg training loss = 0.0334

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0488
QWK: 0.3798
Predicted min: 1.91, max: 7.88, mean: 4.35
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 4: avg training loss = 0.0315

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0237
QWK: 0.5367
Predicted min: 1.54, max: 7.51, mean: 3.91
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 5: avg training loss = 0.0302

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0149
QWK: 0.6270
Predicted min: 1.35, max: 7.12, mean: 3.60
Actual min: 1.00, max: 6.00, mean: 3.41

--------------------------------------------------
Validation prompt no. 3

Starting training...


                                                          

epoch 1: avg training loss = 2.1774

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0523
QWK: 0.3320
Predicted min: 0.96, max: 3.29, mean: 1.74
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 2: avg training loss = 0.0310

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0437
QWK: 0.5199
Predicted min: 0.66, max: 3.95, mean: 1.71
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 3: avg training loss = 0.0276

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0449
QWK: 0.5430
Predicted min: 0.42, max: 3.95, mean: 1.64
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 4: avg training loss = 0.0264

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0391
QWK: 0.5230
Predicted min: 0.52, max: 4.17, mean: 1.86
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 5: avg training loss = 0.0257

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0386
QWK: 0.5504
Predicted min: 0.43, max: 4.14, mean: 1.84
Actual min: 0.00, max: 3.00, mean: 1.85

--------------------------------------------------
Validation prompt no. 4

Starting training...


                                                          

epoch 1: avg training loss = 1.9327

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.0734
QWK: 0.3583
Predicted min: 0.79, max: 3.44, mean: 1.79
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 2: avg training loss = 0.0289

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.0903
QWK: 0.2944
Predicted min: 0.89, max: 3.94, mean: 2.00
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 3: avg training loss = 0.0256

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.0583
QWK: 0.5491
Predicted min: 0.51, max: 3.77, mean: 1.66
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 4: avg training loss = 0.0243

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.0787
QWK: 0.4003
Predicted min: 0.77, max: 4.09, mean: 1.92
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 5: avg training loss = 0.0231

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.0685
QWK: 0.4649
Predicted min: 0.69, max: 4.06, mean: 1.83
Actual min: 0.00, max: 3.00, mean: 1.43

--------------------------------------------------
Validation prompt no. 5

Starting training...


                                                          

epoch 1: avg training loss = 1.9546

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0280
QWK: 0.5840
Predicted min: 0.74, max: 4.43, mean: 2.35
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 2: avg training loss = 0.0392

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0249
QWK: 0.6330
Predicted min: 0.76, max: 4.78, mean: 2.40
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 3: avg training loss = 0.0324

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0237
QWK: 0.6615
Predicted min: 0.76, max: 4.97, mean: 2.42
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 4: avg training loss = 0.0301

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0231
QWK: 0.6708
Predicted min: 0.79, max: 5.01, mean: 2.42
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 5: avg training loss = 0.0289

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0231
QWK: 0.6743
Predicted min: 0.72, max: 4.94, mean: 2.34
Actual min: 0.00, max: 4.00, mean: 2.41

--------------------------------------------------
Validation prompt no. 6

Starting training...


                                                          

epoch 1: avg training loss = 1.8334

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0469
QWK: 0.3564
Predicted min: 1.12, max: 4.25, mean: 2.41
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 2: avg training loss = 0.0359

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0345
QWK: 0.5222
Predicted min: 1.16, max: 5.36, mean: 2.79
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 3: avg training loss = 0.0304

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0357
QWK: 0.5316
Predicted min: 1.18, max: 5.70, mean: 2.94
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 4: avg training loss = 0.0282

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0336
QWK: 0.5501
Predicted min: 0.75, max: 5.26, mean: 2.56
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 5: avg training loss = 0.0272

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0338
QWK: 0.5564
Predicted min: 0.57, max: 5.33, mean: 2.53
Actual min: 0.00, max: 4.00, mean: 2.72

--------------------------------------------------
Validation prompt no. 7

Starting training...


                                                          

epoch 1: avg training loss = 1.6764

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0279
QWK: 0.4471
Predicted min: 0.24, max: 29.71, mean: 12.84
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 2: avg training loss = 0.0365

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0280
QWK: 0.5052
Predicted min: -0.47, max: 35.25, mean: 12.61
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 3: avg training loss = 0.0317

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0313
QWK: 0.4929
Predicted min: -0.98, max: 36.98, mean: 12.16
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 4: avg training loss = 0.0300

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0291
QWK: 0.5185
Predicted min: -0.92, max: 38.23, mean: 12.42
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 5: avg training loss = 0.0290

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0168
QWK: 0.6448
Predicted min: 0.96, max: 40.34, mean: 14.46
Actual min: 2.00, max: 24.00, mean: 16.06

--------------------------------------------------
Validation prompt no. 8

Starting training...


                                                          

epoch 1: avg training loss = 2.0311

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0310
QWK: 0.3342
Predicted min: 18.15, max: 64.63, mean: 45.38
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 2: avg training loss = 0.0309

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0498
QWK: 0.3055
Predicted min: 14.40, max: 70.71, mean: 47.14
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 3: avg training loss = 0.0282

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0331
QWK: 0.4086
Predicted min: 9.37, max: 66.71, mean: 43.09
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 4: avg training loss = 0.0274

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0572
QWK: 0.2920
Predicted min: 12.67, max: 71.51, mean: 47.89
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 5: avg training loss = 0.0270

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0608
QWK: 0.2818
Predicted min: 12.09, max: 72.39, mean: 48.38
Actual min: 10.00, max: 60.00, mean: 36.95

Trying regression head with 4 hidden units

--------------------------------------------------
Validation prompt no. 2

Starting training...


                                                          

epoch 1: avg training loss = 0.1190

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0319
QWK: 0.4458
Predicted min: 2.24, max: 7.09, mean: 4.11
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 2: avg training loss = 0.0321

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0364
QWK: 0.4431
Predicted min: 2.03, max: 7.48, mean: 4.19
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 3: avg training loss = 0.0295

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0259
QWK: 0.5277
Predicted min: 1.68, max: 6.66, mean: 3.99
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 4: avg training loss = 0.0279

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0258
QWK: 0.5458
Predicted min: 1.46, max: 6.71, mean: 3.97
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 5: avg training loss = 0.0267

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0200
QWK: 0.6143
Predicted min: 1.33, max: 6.47, mean: 3.81
Actual min: 1.00, max: 6.00, mean: 3.41

--------------------------------------------------
Validation prompt no. 3

Starting training...


                                                          

epoch 1: avg training loss = 0.8549

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0443
QWK: 0.4657
Predicted min: 0.87, max: 3.48, mean: 1.75
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 2: avg training loss = 0.0280

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0394
QWK: 0.5662
Predicted min: 0.40, max: 4.05, mean: 1.74
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 3: avg training loss = 0.0265

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0384
QWK: 0.5671
Predicted min: 0.39, max: 4.08, mean: 1.79
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 4: avg training loss = 0.0255

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0459
QWK: 0.5519
Predicted min: 0.19, max: 3.79, mean: 1.59
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 5: avg training loss = 0.0254

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0378
QWK: 0.5668
Predicted min: 0.36, max: 4.13, mean: 1.85
Actual min: 0.00, max: 3.00, mean: 1.85

--------------------------------------------------
Validation prompt no. 4

Starting training...


                                                          

epoch 1: avg training loss = 6.5197

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1623
QWK: 0.0000
Predicted min: 0.67, max: 0.67, mean: 0.67
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 2: avg training loss = 0.1273

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1049
QWK: 0.0000
Predicted min: 1.17, max: 1.17, mean: 1.17
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 3: avg training loss = 0.0707

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.0977
QWK: 0.0000
Predicted min: 1.45, max: 1.45, mean: 1.45
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 4: avg training loss = 0.0562

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.0999
QWK: 0.0000
Predicted min: 1.57, max: 1.57, mean: 1.57
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 5: avg training loss = 0.0526

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1017
QWK: 0.0000
Predicted min: 1.62, max: 1.62, mean: 1.62
Actual min: 0.00, max: 3.00, mean: 1.43

--------------------------------------------------
Validation prompt no. 5

Starting training...


                                                          

epoch 1: avg training loss = 0.8053

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0512
QWK: 0.0763
Predicted min: 1.95, max: 2.80, mean: 2.28
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 2: avg training loss = 0.0450

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0354
QWK: 0.5569
Predicted min: 1.67, max: 3.95, mean: 2.48
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 3: avg training loss = 0.0361

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0316
QWK: 0.4770
Predicted min: 1.17, max: 4.16, mean: 2.24
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 4: avg training loss = 0.0325

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0300
QWK: 0.5377
Predicted min: 0.98, max: 4.37, mean: 2.20
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 5: avg training loss = 0.0313

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0293
QWK: 0.5680
Predicted min: 0.88, max: 4.48, mean: 2.18
Actual min: 0.00, max: 4.00, mean: 2.41

--------------------------------------------------
Validation prompt no. 6

Starting training...


                                                          

epoch 1: avg training loss = 1.7907

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0474
QWK: 0.4472
Predicted min: 0.52, max: 4.65, mean: 2.26
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 2: avg training loss = 0.0342

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0429
QWK: 0.4775
Predicted min: 0.69, max: 4.79, mean: 2.31
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 3: avg training loss = 0.0298

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0349
QWK: 0.5387
Predicted min: 0.95, max: 5.02, mean: 2.51
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 4: avg training loss = 0.0283

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0400
QWK: 0.4966
Predicted min: 0.82, max: 4.87, mean: 2.37
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 5: avg training loss = 0.0283

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0404
QWK: 0.4919
Predicted min: 0.85, max: 4.83, mean: 2.36
Actual min: 0.00, max: 4.00, mean: 2.72

--------------------------------------------------
Validation prompt no. 7

Starting training...


                                                          

epoch 1: avg training loss = 0.3065

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0182
QWK: 0.5823
Predicted min: 3.24, max: 35.38, mean: 13.80
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 2: avg training loss = 0.0317

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0194
QWK: 0.6089
Predicted min: 1.42, max: 39.11, mean: 13.72
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 3: avg training loss = 0.0305

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0225
QWK: 0.5786
Predicted min: 1.09, max: 39.42, mean: 13.22
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 4: avg training loss = 0.0296

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0318
QWK: 0.4976
Predicted min: 1.01, max: 38.17, mean: 11.98
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 5: avg training loss = 0.0289

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0184
QWK: 0.6313
Predicted min: 0.99, max: 39.88, mean: 13.88
Actual min: 2.00, max: 24.00, mean: 16.06

--------------------------------------------------
Validation prompt no. 8

Starting training...


                                                          

epoch 1: avg training loss = 0.5849

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0848
QWK: 0.1965
Predicted min: 13.38, max: 74.01, mean: 51.93
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 2: avg training loss = 0.0325

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0572
QWK: 0.2788
Predicted min: 9.70, max: 70.56, mean: 48.21
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 3: avg training loss = 0.0293

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0710
QWK: 0.2419
Predicted min: 12.14, max: 72.85, mean: 50.17
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 4: avg training loss = 0.0283

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0675
QWK: 0.2500
Predicted min: 12.00, max: 72.25, mean: 49.73
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 5: avg training loss = 0.0276

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0567
QWK: 0.2832
Predicted min: 11.52, max: 70.49, mean: 48.24
Actual min: 10.00, max: 60.00, mean: 36.95

Trying regression head with 8 hidden units

--------------------------------------------------
Validation prompt no. 2

Starting training...


                                                          

epoch 1: avg training loss = 1.9938

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0818
QWK: 0.2630
Predicted min: 2.35, max: 8.07, mean: 4.71
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 2: avg training loss = 0.0368

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0308
QWK: 0.4684
Predicted min: 2.23, max: 7.54, mean: 4.08
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 3: avg training loss = 0.0322

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0348
QWK: 0.4409
Predicted min: 2.26, max: 7.63, mean: 4.15
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 4: avg training loss = 0.0312

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0250
QWK: 0.5115
Predicted min: 2.05, max: 7.41, mean: 3.96
Actual min: 1.00, max: 6.00, mean: 3.41


                                                          

epoch 5: avg training loss = 0.0304

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 2:
Avg loss: 0.0232
QWK: 0.5249
Predicted min: 1.96, max: 6.72, mean: 3.93
Actual min: 1.00, max: 6.00, mean: 3.41

--------------------------------------------------
Validation prompt no. 3

Starting training...


                                                          

epoch 1: avg training loss = 1.5046

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0513
QWK: 0.4943
Predicted min: 0.48, max: 3.40, mean: 1.60
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 2: avg training loss = 0.0315

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0454
QWK: 0.5334
Predicted min: 0.30, max: 3.74, mean: 1.63
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 3: avg training loss = 0.0275

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0460
QWK: 0.5327
Predicted min: 0.29, max: 3.68, mean: 1.61
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 4: avg training loss = 0.0263

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0404
QWK: 0.5482
Predicted min: 0.39, max: 3.78, mean: 1.74
Actual min: 0.00, max: 3.00, mean: 1.85


                                                          

epoch 5: avg training loss = 0.0255

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 3:
Avg loss: 0.0415
QWK: 0.5495
Predicted min: 0.38, max: 3.68, mean: 1.70
Actual min: 0.00, max: 3.00, mean: 1.85

--------------------------------------------------
Validation prompt no. 4

Starting training...


                                                          

epoch 1: avg training loss = 0.6885

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1003
QWK: 0.0000
Predicted min: 1.27, max: 1.27, mean: 1.27
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 2: avg training loss = 0.0572

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1029
QWK: 0.0000
Predicted min: 1.65, max: 1.65, mean: 1.65
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 3: avg training loss = 0.0504

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1053
QWK: 0.0000
Predicted min: 1.69, max: 1.69, mean: 1.69
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 4: avg training loss = 0.0500

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1068
QWK: 0.0000
Predicted min: 1.71, max: 1.71, mean: 1.71
Actual min: 0.00, max: 3.00, mean: 1.43


                                                          

epoch 5: avg training loss = 0.0498

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 4:
Avg loss: 0.1070
QWK: 0.0000
Predicted min: 1.72, max: 1.72, mean: 1.72
Actual min: 0.00, max: 3.00, mean: 1.43

--------------------------------------------------
Validation prompt no. 5

Starting training...


                                                          

epoch 1: avg training loss = 1.2754

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0322
QWK: 0.5061
Predicted min: 0.85, max: 4.14, mean: 2.16
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 2: avg training loss = 0.0338

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0337
QWK: 0.5675
Predicted min: 0.60, max: 4.34, mean: 2.05
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 3: avg training loss = 0.0307

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0293
QWK: 0.5958
Predicted min: 0.70, max: 4.50, mean: 2.15
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 4: avg training loss = 0.0298

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0290
QWK: 0.6000
Predicted min: 0.73, max: 4.50, mean: 2.16
Actual min: 0.00, max: 4.00, mean: 2.41


                                                          

epoch 5: avg training loss = 0.0294

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 5:
Avg loss: 0.0284
QWK: 0.6114
Predicted min: 0.74, max: 4.49, mean: 2.17
Actual min: 0.00, max: 4.00, mean: 2.41

--------------------------------------------------
Validation prompt no. 6

Starting training...


                                                          

epoch 1: avg training loss = 0.6517

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0388
QWK: 0.4998
Predicted min: 0.65, max: 4.77, mean: 2.40
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 2: avg training loss = 0.0315

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0324
QWK: 0.5627
Predicted min: 0.86, max: 5.23, mean: 2.60
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 3: avg training loss = 0.0281

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0325
QWK: 0.5679
Predicted min: 0.84, max: 5.30, mean: 2.59
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 4: avg training loss = 0.0273

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0321
QWK: 0.5749
Predicted min: 0.85, max: 5.27, mean: 2.63
Actual min: 0.00, max: 4.00, mean: 2.72


                                                          

epoch 5: avg training loss = 0.0264

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 6:
Avg loss: 0.0326
QWK: 0.5675
Predicted min: 0.81, max: 5.06, mean: 2.60
Actual min: 0.00, max: 4.00, mean: 2.72

--------------------------------------------------
Validation prompt no. 7

Starting training...


                                                          

epoch 1: avg training loss = 0.6575

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0346
QWK: 0.0000
Predicted min: 12.80, max: 12.80, mean: 12.80
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 2: avg training loss = 0.0707

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0230
QWK: 0.0000
Predicted min: 16.28, max: 16.28, mean: 16.28
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 3: avg training loss = 0.0652

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0234
QWK: 0.0000
Predicted min: 16.73, max: 16.73, mean: 16.73
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 4: avg training loss = 0.0648

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0235
QWK: 0.0000
Predicted min: 16.77, max: 16.77, mean: 16.77
Actual min: 2.00, max: 24.00, mean: 16.06


                                                          

epoch 5: avg training loss = 0.0647

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 7:
Avg loss: 0.0236
QWK: 0.0000
Predicted min: 16.85, max: 16.85, mean: 16.85
Actual min: 2.00, max: 24.00, mean: 16.06

--------------------------------------------------
Validation prompt no. 8

Starting training...


                                                          

epoch 1: avg training loss = 0.9058

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0156
QWK: 0.4577
Predicted min: 18.25, max: 55.65, mean: 41.54
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 2: avg training loss = 0.0347

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0469
QWK: 0.3010
Predicted min: 11.27, max: 67.63, mean: 46.85
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 3: avg training loss = 0.0303

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0448
QWK: 0.3292
Predicted min: 7.55, max: 67.92, mean: 46.04
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 4: avg training loss = 0.0291

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0928
QWK: 0.2021
Predicted min: 11.96, max: 75.95, mean: 52.53
Actual min: 10.00, max: 60.00, mean: 36.95


                                                          

epoch 5: avg training loss = 0.0282

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 8:
Avg loss: 0.0463
QWK: 0.3346
Predicted min: 6.12, max: 68.84, mean: 45.95
Actual min: 10.00, max: 60.00, mean: 36.95




### Training final model

In [None]:

# Report the chosen regression-head configuration
# (best_hidden_size / best_result are set before this cell runs).
print(f"\n{'='*50}")
print("Best option for regression head:")
print(f"Hidden units: {best_hidden_size}")
print(f"Average QWK: {best_result['avg_qwk']:.4f} (±{best_result['std_qwk']:.4f})")
print(f"{'='*50}")

# Boolean masks over all essays: prompt 1 is held out as the test set,
# prompts 2-8 make up the final training set.
test_mask = (data_dict['prompt_ids'] == 1)
train_mask = np.isin(data_dict['prompt_ids'], range(2, 9))

# Final training dataset (prompts 2-8); each holistic score goes through
# normalize_scores together with the prompt id of its essay.
final_train_prompt_ids = data_dict['prompt_ids'][train_mask]
final_train_targets = list(
    map(normalize_scores, data_dict['holistic'][train_mask], final_train_prompt_ids)
)
final_dataset = essays_dataset(
    data_dict['essay_text'][train_mask],
    data_dict['features'][train_mask],
    final_train_targets,
    final_train_prompt_ids,
)

# Test dataset (prompt 1 only), so the prompt id passed to normalize_scores is fixed.
final_test_targets = [
    normalize_scores(raw_score, 1) for raw_score in data_dict['holistic'][test_mask]
]
test_dataset = essays_dataset(
    data_dict['essay_text'][test_mask],
    data_dict['features'][test_mask],
    final_test_targets,
    data_dict['prompt_ids'][test_mask],
)

# Only the training loader shuffles; the test loader keeps dataset order.
final_loader = DataLoader(final_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Final model built with the selected hidden size, plus its optimizer and loss.
final_model = Our_BERT(hidden_size=best_hidden_size).to(device)
final_optimizer = torch.optim.AdamW(
    final_model.parameters(), lr=0.005, betas=(0.9, 0.999), weight_decay=0.1
)
final_criterion = nn.MSELoss()

# Train for five epochs. `epoch` and `train_loss` keep their last-iteration
# values after the loop and are read again when the checkpoint is saved.
print("\nFinal training on prompts 2-8...")
for epoch in range(5):
    # train_model is given the 1-based epoch number
    train_loss = train_model(
        final_model, final_loader, final_criterion, final_optimizer, epoch + 1
    )

# Single evaluation on the held-out prompt.
print("\nEvaluating final model on Prompt 1 (test set)...")
test_loss, test_qwk = evaluate_model(
    final_model, test_loader, final_criterion, val_prompt_id=1
)
print(f"\nFinal test results on prompt 1:")
print(f"Test loss: {test_loss:.4f}")
print(f"Test QWK: {test_qwk:.4f}")




Best option for regression head:
Hidden units: 0
Average QWK: 0.5428 (±0.1249)

Final training on prompts 2-8...


                                                          

epoch 1: avg training loss = 1.4347


                                                          

epoch 2: avg training loss = 0.0309


                                                          

epoch 3: avg training loss = 0.0282


                                                          

epoch 4: avg training loss = 0.0275


                                                          

epoch 5: avg training loss = 0.0268

Evaluating final model on Prompt 1 (test set)...

--------------------Evaluating-----------------------


                                                           


Validation results for prompt 1:
Avg loss: 0.0094
QWK: 0.7965
Predicted min: 2.48, max: 13.95, mean: 8.19
Actual min: 2.00, max: 12.00, mean: 8.53

Final test results on prompt 1:
Test loss: 0.0094
Test QWK: 0.7965




### Saving the model

In [None]:
def save_model(model, optimizer, epoch, train_loss, test_qwk, path):
    """Save a training checkpoint to ``path`` using ``torch.save``.

    The checkpoint is a dict with the model and optimizer state dicts plus
    the ``epoch``, ``train_loss`` and ``test_qwk`` values passed in.
    Returns nothing.
    """
    # Snapshot both state dicts first, then bundle them with the run metadata.
    model_state = model.state_dict()
    optimizer_state = optimizer.state_dict()
    torch.save(
        {
            'model_state_dict': model_state,
            'optimizer_state_dict': optimizer_state,
            'epoch': epoch,
            'train_loss': train_loss,
            'test_qwk': test_qwk,
        },
        path,
    )

In [None]:
# Checkpoint the final model; `epoch` is the 0-based index left over from the
# training loop, so epoch + 1 is the number of epochs that were run.
path = "model-C-1"
save_model(
    model=final_model,
    optimizer=final_optimizer,
    epoch=epoch + 1,
    train_loss=train_loss,
    test_qwk=test_qwk,
    path=path,
)