In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
from torch.nn import Linear
from torch.optim import AdamW
from torch.cuda import empty_cache
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForTokenClassification

In [2]:
# Read the training records. The encoding is given explicitly so that parsing
# does not depend on the platform's default text encoding.
with open('./data/train.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

In [3]:
# Number of records, then the field names of the first record (one per line).
print(len(data), data[0].keys(), sep='\n')

6807
dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])


In [None]:
# Scratch cell (disabled): print one document and its tokens whose label is not 'O'.
# idx = 99

# print(data[idx]['full_text'])
# print(data[idx]['tokens'])
# print(data[idx]['trailing_whitespace'])

# pii = []

# for i, label in enumerate(data[idx]['labels']):
#     if label != 'O':
#         pii.append(i)
#         print(data[idx]['tokens'][i])

In [4]:
from dataset import CustomDataset
from model import model, tokenizer

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Inspect the field names available on the first record.
data[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

In [8]:
# Tokenize the first document. Its text is already split into words, so the
# tokenizer is told to treat the input as a word list; no special tokens are
# added around the sequence.
x = tokenizer(
    data[0]['tokens'],
    is_split_into_words=True,
    add_special_tokens=False,
)

In [20]:
# Show how the tokenizer splits a single word (word 9 of the first document)
# into sub-word pieces.
tokenizer.tokenize(data[0]['tokens'][9])

['nat', '##hal', '##ie']

In [10]:
# Map every sub-word piece back to the index of the word it came from.
# Repeated indices (e.g. 9, 9, 9) mark words that were split into several
# pieces. The output also skips some indices (e.g. 11, 15) — presumably words
# for which the tokenizer emitted no piece, such as whitespace tokens; TODO confirm.
x.word_ids(batch_index=0)

[0,
 1,
 2,
 3,
 4,
 4,
 5,
 6,
 6,
 7,
 8,
 9,
 9,
 9,
 10,
 10,
 12,
 13,
 14,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 36,
 37,
 38,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 52,
 53,
 53,
 54,
 55,
 55,
 56,
 56,
 57,
 58,
 59,
 60,
 60,
 60,
 61,
 62,
 64,
 64,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 70,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 79,
 79,
 80,
 81,
 82,
 83,
 84,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 100,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 108,
 110,
 111,
 112,
 113,
 114,
 115,
 117,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 136,
 138,
 140,
 141,
 142,
 142,
 144,
 146,
 147,
 148,
 148,
 148,
 149,
 150,
 151,
 152,
 154,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 176,
 177,
 179,
 181,
 182,
 183,
 18

In [None]:
%%time
# Wrap the raw records in the project's dataset class.
# NOTE(review): max_len=512 presumably caps the tokenized sequence length —
# confirm against dataset.py.
ds = CustomDataset(
    data=data,
    tokenizer=tokenizer,
    max_len=512,
)

In [None]:
# Number of distinct labels known to the dataset.
num_labels = len(ds.label2id)

# Shuffled mini-batches of 32 examples for training.
train_data = DataLoader(dataset=ds, batch_size=32, shuffle=True)

In [None]:
# Training hyperparameters.
learning_rate = 2e-5
num_epochs = 1

# Total number of optimizer steps across all epochs (one step per batch).
total_steps = num_epochs * len(train_data)

optimizer = AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Per-step training losses, kept as detached scalar tensors for later plotting.
all_losses = []

# Make sure the model sits on the same device the batches are sent to
# (a no-op when it is already there).
model.to(device)

for epoch in range(num_epochs):
    model.train()
    with tqdm(total=len(train_data), desc=f'Epoch {epoch+1}/{num_epochs}') as pbar:
        for batch in train_data:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Store a detached tensor: appending the attached loss would keep a
            # reference to every step's autograd graph and grow memory use.
            all_losses.append(loss.detach())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})
            pbar.update(1)

In [None]:
# Convert the recorded losses to plain Python floats for plotting.
# float() accepts scalar tensors (on any device), numpy scalars and floats,
# so re-running this cell is harmless, and no precision is thrown away by
# casting to float16.
all_losses = [float(a) for a in all_losses]

In [None]:
# Plot the loss recorded at every training step, with labels so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(all_losses)
ax.set_xlabel('Training step')
ax.set_ylabel('Loss')
ax.set_title('Training loss')
plt.show()

In [None]:
def evaluate_model(model, data_loader, device=None, ignore_index=-100):
    """Compute token-level accuracy of a token-classification model.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes per-token scores as ``.logits`` with the label
            dimension last.
        data_loader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            Labels are assumed to have the same shape as the attention mask.
        device: Device to run on. Defaults to the device of the model's first
            parameter (CPU if the model has no parameters).
        ignore_index: Label value excluded from scoring. -100 is the value
            ignored by the default cross-entropy loss; presumably the dataset
            uses it for positions without a label — confirm against dataset.py.

    Returns:
        float: Fraction of scored tokens whose predicted label equals the
        reference label, or 0.0 when no token is scored.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else 'cpu'

    model.eval()  # Disable dropout etc. for evaluation
    correct_predictions = 0
    total_tokens = 0

    with torch.no_grad():  # No gradients are needed for evaluation
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids, attention_mask=attention_mask)

            # The predicted label is the arg-max over the last (label) axis of
            # the logits, not over the sequence axis.
            predicted_labels = outputs.logits.argmax(dim=-1)

            # Score only real (non-padding) tokens that carry a label, and
            # count tokens — not sequences — in the denominator.
            scored = attention_mask.bool() & (labels != ignore_index)
            correct_predictions += ((predicted_labels == labels) & scored).sum().item()
            total_tokens += scored.sum().item()

    if total_tokens == 0:
        return 0.0

    return correct_predictions / total_tokens

# Score the model on the same loader that was used for training, so this is
# accuracy on the training data rather than on held-out data.
accuracy = evaluate_model(model=model, data_loader=train_data)
print(f"Accuracy: {accuracy:.4f}")