In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
from torch.nn import Linear, CrossEntropyLoss
from torch.optim import AdamW
from torch.cuda import empty_cache
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import AutoTokenizer, AutoModelForTokenClassification

from dataset import CustomDataset, label2id, id2label
from datasets import Dataset
from model import model, tokenizer

In [None]:
# Load the raw training records from disk.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('./data/train.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

In [None]:
%%time
x = Dataset.from_list(data)
x = x.map(tokenizer_and_align, num_proc=16)

ds = x.map(chunk_examples, num_proc=16, batched=True, batch_size=10, remove_columns=x.column_names)

In [None]:
# --- Training configuration -------------------------------------------------
num_epochs = 5
learning_rate = 5e-5
batch_size=16

optimizer = AdamW(model.parameters(), lr=learning_rate)
# The optimizer steps once per mini-batch, so the step count is the number of
# batches per epoch (rounded up to include a final partial batch) times the
# number of epochs - not the number of examples.
total_steps = ((len(ds) + batch_size - 1) // batch_size) * num_epochs

device = 'cuda'
# Per-class loss weights: class id 0 gets weight 1 and the other 14 classes get
# weight 100 (15 classes in total).
# NOTE(review): presumably id 0 is the majority 'O' label - confirm against label2id.
class_weights = [1] + [100] * 14
# The weights are cast to bfloat16.
# NOTE(review): this assumes the model emits bfloat16 logits - confirm.
loss_fn = CrossEntropyLoss(
    weight=torch.tensor(class_weights).to(device, dtype=torch.bfloat16)
)

In [None]:
# Number of distinct labels known to the label -> id mapping.
num_labels = len(label2id)

# Sequential (unshuffled) batches over the chunked dataset.
# NOTE(review): the training and evaluation loops visible in this notebook slice
# `ds` directly and never iterate over `train_data`.
train_data = DataLoader(ds, shuffle=False, batch_size=batch_size)

In [None]:
# Train for `num_epochs` passes over `ds`, taking consecutive slices of
# `batch_size` examples as mini-batches and recording the loss of every step.
all_losses = []

# Batches per epoch, rounded up so a final partial batch is counted in the
# progress bar as well.
steps_per_epoch = (len(ds) + batch_size - 1) // batch_size

for epoch in range(num_epochs):
    model.train()
    with tqdm(total=steps_per_epoch, desc=f'Epoch {epoch+1}/{num_epochs}') as pbar:
        for s in range(0, len(ds), batch_size):
            batch = ds[s:s+batch_size]

            input_ids = torch.tensor(batch['input_ids']).to(device)
            attention_mask = torch.tensor(batch['attention_mask']).to(device)
            labels = torch.tensor(batch['labels']).to(device)

            # `labels` is not passed to the model: the loss it would compute
            # internally was never used, the weighted `loss_fn` below is.
            outputs = model(input_ids, attention_mask=attention_mask)

            # The class scores live on the last axis of the logits (the
            # evaluation loop takes the max over that axis). Collapse batch and
            # sequence into one axis so every token is one row of class scores.
            # A plain reshape to (batch, classes, seq) would NOT move the class
            # axis - it would only reinterpret memory and scramble the scores.
            logits = outputs.logits
            loss = loss_fn(
                logits.reshape(-1, logits.size(-1)),
                labels.reshape(-1)
            )

            # Keep a detached copy so the history does not hold on to autograd state.
            all_losses.append(loss.detach())
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})
            pbar.update(1)

In [None]:
# Turn every recorded loss into a plain Python float. `float()` accepts both the
# tensors stored by the training loop and values that were already converted,
# so this cell can be re-run without failing, and no float16 round-trip is needed.
all_losses = [float(a) for a in all_losses]

# Show only the most recent 100 training steps.
plt.plot(all_losses[-100:])
plt.title('Training loss (last 100 steps)')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.show()

In [None]:
# One counter record per label id:
#   total_samples       - tokens whose true label is this id
#   total_predicted     - tokens for which this id was predicted
#   correct_predictions - tokens where prediction and true label are both this id
label_metrics = {
    label_id: {'total_samples': 0, 'total_predicted': 0, 'correct_predictions': 0}
    for label_id in label2id.values()
}

In [None]:
%%time
model.eval()
correct_predictions = 0
total_samples = 0

with torch.no_grad():  # Disable gradient calculation for evaluation
    for s in tqdm(range(0, len(ds), batch_size)):
        batch = ds[s:s+batch_size]
        
        input_ids = torch.tensor(batch['input_ids']).to(device)
        attention_mask = torch.tensor(batch['attention_mask']).to(device)
        labels = torch.tensor(batch['labels']).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        
        _, predicted_labels = torch.max(outputs.logits, -1)

        for p, l in zip(predicted_labels.flatten(), labels.flatten()):
            
            if l == -100:
                continue

            if p==l:
                correct_predictions +=1
                label_metrics[l.item()]['correct_predictions'] += 1

            label_metrics[l.item()]['total_samples'] += 1
            label_metrics[p.item()]['total_predicted'] += 1

            total_samples +=1

In [None]:
import pandas as pd

In [None]:
# Build a table from the per-label counters, then flip it so that each row is a
# label id and each column is one of the counters.
metrics_by_counter = pd.DataFrame.from_records(label_metrics)
df = metrics_by_counter.transpose()

In [None]:
# Share of evaluated tokens per label, as a percentage of all counted tokens.
df['total_samples'].mul(100).div(df['total_samples'].sum())

In [None]:
# Tokenize the first document in windows of at most 12 tokens and show the ids
# of every window. Truncation is requested explicitly rather than relying on the
# tokenizer's implicit fallback when only `max_length` is given.
tokenizer(data[0]['full_text'], max_length=12, truncation=True, return_overflowing_tokens=True)['input_ids']

In [None]:
# Smaller sequences
# Different model
# Write test case for validating data
# Data augmentation
# Resampling data?

In [None]:
# What would validation look like?
# Document random sequence of tokens
# Same place tokenization
# Same place labels (Should make sense)

In [None]:
import numpy as np

In [None]:
# Manual spot-check of `tokenizer_and_align`: walk a window of one document and,
# for every token whose label is not 'O', print the original token/label next to
# what `tokenizer_and_align` returns for that single token.
#
# To inspect a random position instead of the fixed one below:
#   idx = np.random.randint(0, len(data))
#   start = np.random.randint(0, len(data[idx]['tokens']))
buffer = 2000
idx, start = 0, 0

temp = data[idx]
window = slice(start, start + buffer)
fields = ('tokens', 'labels', 'trailing_whitespace')
for tokens, labels, ws in zip(*(temp[name][window] for name in fields)):
    if labels == 'O':
        continue

    # Wrap the single token in one-element lists, keyed like a full record.
    local = dict(zip(fields, ([tokens], [labels], [ws])))
    ans = tokenizer_and_align(local)

    print(f"""
    Original: {tokens} {labels}
    Transformed: {ans['tokens']} {ans['aligned_tokens']['input_ids']} {ans['aligned_labels']}
    """)

In [None]:
# Show how the tokenizer splits an e-mail address into sub-word pieces.
sample_email = 'gwilliams@yahoo.com'
tokenizer.tokenize(sample_email)

In [None]:
# Print the complete raw text of the first training document.
temp = data[0]
full_text = temp['full_text']
print(full_text)

In [1]:
# Load the "seqeval" metric through the `evaluate` library.
# NOTE(review): the recorded output of this cell is an ImportError asking for
# `pip install seqeval`; the `seqeval` package has to be installed in the
# kernel's environment before this cell can run.
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

ImportError: To be able to use evaluate-metric/seqeval, you need to install the following dependencies['seqeval'] using 'pip install seqeval' for instance'