In [1]:
#NER Tagging For Essay Claim Identification 
# Ambra, Nikki, and Drew

In [2]:
import os
from torch import cuda

%load_ext autoreload
%autoreload 2

In [3]:
config = {'model_name': None,   
         'max_length': 1024,
         'train_batch_size':4,
         'valid_batch_size':4,
         'epochs':5,
         'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
         'max_grad_norm':10,
         'device': 'cuda' if cuda.is_available() else 'cpu'}

In [4]:
import numpy as np, os 
import pandas as pd, gc 
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import accuracy_score

In [5]:
# Read in the dataset

train_df = pd.read_csv('./feedback-prize-2021/train.csv')
print("Train Dataset Shape:", train_df.shape )
train_df.head()

Train Dataset Shape: (144293, 8)


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [83]:
# Parse through all the training files and makes a dataframe with columns ['train_ID (filename)', 'text']
test_names, train_texts = [], []
for file_name in list(os.listdir('./feedback-prize-2021/train')):
    ID = file_name.replace('.txt', '')
    train_text = open('./feedback-prize-2021/train/' + file_name, 'r').read()
    
    test_names.append(ID)
    train_texts.append(train_text)
train_text_df = pd.DataFrame({'id': test_names, 'text': train_texts})

print("Training files & text")
train_text_df.head()

Training files & text


Unnamed: 0,id,text
0,3321A3E87AD3,I do agree that some students would benefit fr...
1,DFEAEC512BAB,Should students design a summer project for sc...
2,2E4AFCD3987F,"Dear State Senator\n\n,\n\nIn the ruels of vot..."
3,EB6C2AF20BFE,People sometimes have a different opinion than...
4,A91A08E523D5,"Dear senator,\n\nAs you know the Electoral Col..."


In [8]:
# Get the ground truth NER tags for all the texts 
all_entities = []
print(train_text_df.iterrows())

essay_num = 0
for index, row in train_text_df.iterrows():
    essay_num += 1
    if (essay_num % 1000 == 0):
        print("Computed Tags for", essay_num, "essays")
    
    #initialize all tags to "O"
    num_words_in_essay = len(row['text'].split())
    entities = ["O"]*num_words_in_essay
    
    #Get Cooresponding ground truth rows
    text_id = row["id"]
    cooresponding_ground_truth = train_df[train_df['id'] == text_id]
    
    for gt_ind, gt_row in cooresponding_ground_truth.iterrows():
        discourse_type = gt_row['discourse_type']
        prediction_string = gt_row['predictionstring']
        
        list_ix = [int(x) for x in prediction_string.split(' ')]
        
        beginning_index = list_ix[0]
        entities[beginning_index] = "B-" + discourse_type
        
        for i in list_ix[1:]:
            entities[i] = "I-" + discourse_type
    all_entities.append(entities)
    
train_text_df['entities'] = all_entities
train_text_df.to_csv('train_NER.csv',index=False)

<generator object DataFrame.iterrows at 0x7fa0a97e2580>
Computed Tags for 1000 essays
Computed Tags for 2000 essays
Computed Tags for 3000 essays
Computed Tags for 4000 essays
Computed Tags for 5000 essays
Computed Tags for 6000 essays
Computed Tags for 7000 essays
Computed Tags for 8000 essays
Computed Tags for 9000 essays
Computed Tags for 10000 essays
Computed Tags for 11000 essays
Computed Tags for 12000 essays
Computed Tags for 13000 essays
Computed Tags for 14000 essays
Computed Tags for 15000 essays


In [9]:
print("ground truth NER tags")
train_text_df.head()

ground truth NER tags


Unnamed: 0,id,text,entities
0,3321A3E87AD3,I do agree that some students would benefit fr...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
1,DFEAEC512BAB,Should students design a summer project for sc...,"[O, O, O, O, O, O, O, O, B-Position, I-Positio..."
2,2E4AFCD3987F,"Dear State Senator\n\n,\n\nIn the ruels of vot...","[O, O, O, O, B-Position, I-Position, I-Positio..."
3,EB6C2AF20BFE,People sometimes have a different opinion than...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,A91A08E523D5,"Dear senator,\n\nAs you know the Electoral Col...","[O, O, B-Lead, I-Lead, I-Lead, I-Lead, I-Lead,..."


In [10]:
#TODO: CAN SIMPLIFY THIS MAYBE? (O, B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement')

output_labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
          'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']

labels_to_ids = {v:k for k,v in enumerate(output_labels)}
ids_to_labels = {k:v for k,v in enumerate(output_labels)}
labels_to_ids


{'O': 0,
 'B-Lead': 1,
 'I-Lead': 2,
 'B-Position': 3,
 'I-Position': 4,
 'B-Claim': 5,
 'I-Claim': 6,
 'B-Counterclaim': 7,
 'I-Counterclaim': 8,
 'B-Rebuttal': 9,
 'I-Rebuttal': 10,
 'B-Evidence': 11,
 'I-Evidence': 12,
 'B-Concluding Statement': 13,
 'I-Concluding Statement': 14}

## Create Dataloaders

In [11]:
from other import NERDataset

In [12]:
IDS = train_df["id"].unique()

train_idx = np.random.choice(np.arange(len(IDS)),int(0.9*len(IDS)),replace=False)
valid_idx = np.setdiff1d(np.arange(len(IDS)),train_idx)

data = train_text_df[['id','text', 'entities']]
train_dataset = data.loc[data['id'].isin(IDS[train_idx]),['text', 'entities']].reset_index(drop=True)
test_dataset = data.loc[data['id'].isin(IDS[valid_idx])].reset_index(drop=True)


In [13]:
print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (15594, 3)
TRAIN Dataset: (14034, 2)
TEST Dataset: (1560, 3)


In [53]:
train_dataset.head()

Unnamed: 0,text,entities
0,Should students design a summer project for sc...,"[O, O, O, O, O, O, O, O, B-Position, I-Positio..."
1,"Dear State Senator\n\n,\n\nIn the ruels of vot...","[O, O, O, O, B-Position, I-Position, I-Positio..."
2,People sometimes have a different opinion than...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
3,"Dear senator,\n\nAs you know the Electoral Col...","[O, O, B-Lead, I-Lead, I-Lead, I-Lead, I-Lead,..."
4,"""Can you imagine a time in the future when no ...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."


In [54]:
test_dataset.head()

Unnamed: 0,id,text,entities
0,3321A3E87AD3,I do agree that some students would benefit fr...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
1,617D56A15483,Do you want to go on daily trips and help peop...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,35279D8353D6,"Dear TEACHER_NAME,\n\nI heard that you are pla...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,3E2658E49362,So we can all agree that Venus is a very inter...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,0A2EE9B6BF75,"With the rise of technology in schools, many q...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."


## Make Dataloaders

In [28]:
max_length = 1024

tokenizer = AutoTokenizer.from_pretrained(".") 
training_set = NERDataset(train_dataset, tokenizer, max_length, False)
testing_set = NERDataset(test_dataset, tokenizer, max_length, True)

train_params = {'batch_size': 4,
                'shuffle': True,
                'num_workers': 1,
                'pin_memory':True
                }

test_params = {'batch_size': 4,
                'shuffle': False,
                'num_workers': 1,
                'pin_memory':True
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# # TEST DATASET
#test_texts_set = NERDataset(test_texts, tokenizer, max_length, True)
#test_texts_loader = DataLoader(test_texts_set, **test_params)

loading file spiece.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


## Download Model

In [15]:
#Download an NER Tagger model -- BigBird I think
from transformers import *

config_model = AutoConfig.from_pretrained('./config.json') 
model = AutoModelForTokenClassification.from_pretrained('./pytorch_model.bin',config=config_model)
model.to('cpu')

loading configuration file ./config.json
Model config BigBirdConfig {
  "_name_or_path": "./config.json",
  "architectures": [
    "BigBirdForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL

BigBirdForTokenClassification(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0): BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        

## Train Model

In [16]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    #tr_preds, tr_labels = [], []
    
    # put model in training mode
    model.train()
    print("Length of loader: ", len(training_loader))
    for idx, batch in enumerate(training_loader):
        print(idx)
        return
        
        ids = batch['input_ids'].to('cpu', dtype = torch.long)
        mask = batch['attention_mask'].to('cpu', dtype = torch.long)
        labels = batch['labels'].to('cpu', dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels,
                               return_dict=False)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 1==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss after {idx:04d} training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        #tr_labels.extend(labels)
        #tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=10
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

## Run below to train own model from scratch (need GPU)

In [98]:

optimizer = torch.optim.Adam(params=model.parameters(), lr=2.5e-5)

#Do the training step
num_epochs = 5
learning_rates = [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7]

for epoch in range(num_epochs):
    print(f"### Training epoch: {epoch + 1}")
    for g in optimizer.param_groups: 
        g['lr'] = learning_rates[epoch]
    lr = optimizer.param_groups[0]['lr']
    print(f'### LR = {lr}\n')

    train(epoch)
    torch.cuda.empty_cache()
    gc.collect()

### Training epoch: 1
### LR = 2.5e-05

Length of loader:  3509
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0
### Training epoch: 2
### LR = 2.5e-05

Length of loader:  3509
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0
### Training epoch: 3
### LR = 2.5e-06

Length of loader:  3509
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment va

In [None]:
#Run this to save the model
import random
x = random.randint(0,123789174)

torch.save(model.state_dict(), f'new_model_{x}.pt')

## Run the cell below to load a model

In [17]:
path = "./bigbird_v26.pt"
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
print('Model loaded.')

Model loaded.


## Inference Code

In [18]:
def inference(batch):         
    # MOVE BATCH TO GPU AND INFER
    ids = batch["input_ids"].to('cpu')
    mask = batch["attention_mask"].to('cpu')
    outputs = model(ids, attention_mask=mask, return_dict=False)
    all_preds = torch.argmax(outputs[0], axis=-1).cpu().numpy() 

    # INTERATE THROUGH EACH TEXT AND GET PRED
    predictions = []
    for k,text_preds in enumerate(all_preds):
        token_preds = [ids_to_labels[i] for i in text_preds]

        prediction = []
        word_ids = batch['wids'][k].numpy()  
        previous_word_idx = -1
        for idx,word_idx in enumerate(word_ids):                            
            if word_idx == -1:
                pass
            elif word_idx != previous_word_idx:              
                prediction.append(token_preds[idx])
                previous_word_idx = word_idx
        predictions.append(prediction)
    
    return predictions

In [22]:
def get_predictions(df=test_dataset, loader=testing_loader):
    
    # put model in training mode
    model.eval()
    
    # GET WORD LABEL PREDICTIONS
    y_pred2 = []
    for batch in loader:
        labels = inference(batch)
        print(labels)
        print(1/0)
        y_pred2.extend(labels)

    final_preds2 = []
    for i in range(len(df)):

        idx = df.id.values[i]
        #pred = [x.replace('B-','').replace('I-','') for x in y_pred2[i]]
        pred = y_pred2[i] # Leave "B" and "I"
        preds = []
        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls == 'O': j += 1
            else: cls = cls.replace('B','I') # spans start with B
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            
            if cls != 'O' and cls != '' and end - j > 7:
                final_preds2.append((idx, cls.replace('I-',''),
                                     ' '.join(map(str, list(range(j, end))))))
        
            j = end
        
    oof = pd.DataFrame(final_preds2)
    oof.columns = ['id','class','predictionstring']

    return oof

In [23]:
sub = get_predictions(test_texts, test_texts_loader)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[['B-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'B-Position', 'I-Position', 'I-Posi

ZeroDivisionError: division by zero

In [31]:
valid = train_df.loc[train_df['id'].isin(IDS[valid_idx])]
valid


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
14,6B4F7A0165B9,1.622644e+12,36.0,512.0,The ability to stay connected to people we kno...,Lead,Lead 1,5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 ...
15,6B4F7A0165B9,1.622644e+12,513.0,593.0,"However, this device is taken to areas that i...",Position,Position 1,92 93 94 95 96 97 98 99 100 101 102 103 104
16,6B4F7A0165B9,1.622644e+12,594.0,726.0,Within a vehicle capable of traveling upwards ...,Claim,Claim 1,105 106 107 108 109 110 111 112 113 114 115 11...
17,6B4F7A0165B9,1.622644e+12,727.0,1244.0,The most common of these distractions is a cel...,Evidence,Evidence 1,125 126 127 128 129 130 131 132 133 134 135 13...
18,6B4F7A0165B9,1.622644e+12,1245.0,1439.0,Conversely people may believe that laws in pre...,Counterclaim,Counterclaim 1,221 222 223 224 225 226 227 228 229 230 231 23...
...,...,...,...,...,...,...,...,...
144256,6B5809C83978,1.618239e+12,277.0,349.0,they will have more information on the situati...,Claim,Claim 3,44 45 46 47 48 49 50 51 52 53 54 55
144257,6B5809C83978,1.618239e+12,462.0,657.0,I was having trouble deciding what I should do...,Evidence,Evidence 1,73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 8...
144258,6B5809C83978,1.618239e+12,910.0,1162.0,"I was going to wear an outfit, but I could no...",Evidence,Evidence 2,151 152 153 154 155 156 157 158 159 160 161 16...
144259,6B5809C83978,1.618239e+12,1427.0,1688.0,My family could not decide where to go for va...,Evidence,Evidence 3,247 248 249 250 251 252 253 254 255 256 257 25...


In [55]:
print(len(valid_idx))
valid = train_df.loc[train_df['id'].isin(IDS[valid_idx])]
print(len(valid))
print(len(valid['id'].unique()))

test_dataset.head()

1560
14699
1560


Unnamed: 0,id,text,entities
0,3321A3E87AD3,I do agree that some students would benefit fr...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
1,617D56A15483,Do you want to go on daily trips and help peop...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,35279D8353D6,"Dear TEACHER_NAME,\n\nI heard that you are pla...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,3E2658E49362,So we can all agree that Venus is a very inter...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,0A2EE9B6BF75,"With the rise of technology in schools, many q...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."


In [72]:
print("Calculating Ground truth labels")
gt_labels = []
for i, row in test_dataset.iterrows():
    gt_labels.append(row['entities'])
print(f"Calculated gt labels for {len(gt_labels)} essays")
    
print()
print("Getting the predicted labels")
pred_labels = []
for i, batch in enumerate(testing_loader):
    print(f"starting {i} batch out of {len(testing_loader)}")
    labels = inference(batch)
    for essay_prediction in labels:
        pred_labels.append(essay_prediction)
        
    if (i == 5):
        break

Calculating Ground truth labels
Calculated gt labels for 1560 essays

Getting the predicted labels
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
starting 0th batch out of 390
starting 1th batch out of 390
starting 2th batch out of 390
starting 3th batch out of 390
starting 4th batch out of 390
starting 5th batch out of 390
{'input_ids': tensor([ 65, 415, 567,  ...,   0,   0,   0]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'wids': tensor([-1,  0,  1,  ..., -1, -1, -1])}
{'input_ids': tensor([  65, 2242,  446,  ...,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'wids': tensor([-1,  0,  1,  ..., -1, -1, -1])}
{'input_ids': tensor([   65, 23521, 13469,  ...,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1,  ...,

In [82]:
from hmm import get_accuracy

gt_labels2 = gt_labels[:24]

print(f"Calculated gt labels for {len(gt_labels2)} essays")
print(f"Calculated pred labels for {len(pred_labels)} essays")

print("The accuracy on the validation set is", get_accuracy(pred_labels, gt_labels2))

Calculated gt labels for 24 essays
Calculated pred labels for 24 essays
The accuracy on the validation set is 0.9125361620057859
