In [None]:
import os
import sys
import torch
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from loguru import logger
from typing import List, Dict
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoModelForTokenClassification, AutoTokenizer
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset

# Use the GPU when one is visible to torch; otherwise fall back to the CPU so
# that every later `.to(device)` call still works on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Log line layout: timestamp | level padded to 8 chars | module:function:line - message.
# The surrounding <level> markup tags wrap the whole line.
log_format = (
    "<level>"
    "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"
    "</level>"
)

# Drop the handlers registered so far, then log INFO and above to stdout only.
logger.remove()
logger.add(sys.stdout, level='INFO', format=log_format, colorize=True)

In [None]:
# Single source of truth for the tag set: the position in this list is the
# class id. 'O' comes first, followed by every B- tag and then every I- tag.
_LABELS = [
    'O',
    'B-NAME_STUDENT',
    'B-EMAIL',
    'B-USERNAME',
    'B-ID_NUM',
    'B-PHONE_NUM',
    'B-URL_PERSONAL',
    'B-STREET_ADDRESS',
    'I-NAME_STUDENT',
    'I-EMAIL',
    'I-USERNAME',
    'I-ID_NUM',
    'I-PHONE_NUM',
    'I-URL_PERSONAL',
    'I-STREET_ADDRESS',
]

# Tag name -> class id.
label2id = {name: class_id for class_id, name in enumerate(_LABELS)}

# Class id -> tag name (exact inverse of label2id).
id2label = {class_id: name for class_id, name in enumerate(_LABELS)}

In [None]:
def get_data(path):
    """Load a JSON file and wrap its content in a ``datasets.Dataset``.

    Args:
        path: Path of the JSON file. The parsed top-level object is handed
            unchanged to ``Dataset.from_dict``.

    Returns:
        The ``Dataset`` built from the file. Its length is logged at INFO level.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(path, 'r', encoding='utf-8') as fp:
        data = json.load(fp)

    x = Dataset.from_dict(data)
    logger.info(f'Size of dataset: {len(x)}')

    return x


def get_model(model_path, tokenizer_path):
    """Load a token-classification model together with its tokenizer.

    Args:
        model_path: Location passed to
            ``AutoModelForTokenClassification.from_pretrained``.
        tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to the
        module-level ``device`` before it is returned.
    """
    # NOTE(review): ignore_mismatched_sizes=True presumably tolerates weights
    # whose shapes differ from the checkpoint instead of raising - confirm this
    # is intended when loading a model for evaluation.
    loaded_model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        ignore_mismatched_sizes=True,
    )
    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    loaded_model.to(device)
    return loaded_model, loaded_tokenizer


def eval_model(trained_model, eval_dataset):
    """Run inference over ``eval_dataset`` and collect every non-'O' prediction.

    Each record of ``eval_dataset`` is processed on its own as a batch of one
    sequence and must provide the keys ``document_id``, ``input_ids``,
    ``attention_mask`` and ``word_ids``. Consecutive records that share a
    ``document_id`` are counted as successive chunks of the same document
    (the counter is only used for logging).

    For every position whose word id is not ``None``, whose attention mask is
    non-zero and whose predicted class id is greater than 0, one row is
    recorded. A word is not recorded twice in a row within the same document,
    so only the first qualifying sub-token of a word contributes a row.

    Args:
        trained_model: Callable model; it is switched to eval mode and invoked
            as ``trained_model(input_ids, attention_mask=...)``. The result
            must expose ``.logits``.
        eval_dataset: Iterable of records as described above.

    Returns:
        A dict with the keys ``'document'``, ``'token'`` and ``'label'`` holding
        three parallel lists. Index 0 of each list is a ``None`` placeholder
        (kept for backward compatibility); real predictions start at index 1.
    """
    metrics = {
        'document': [None],
        'token': [None],
        'label': [None]
    }

    trained_model.eval()

    with torch.no_grad():
        prev_document_idx = None
        curr_document_counter = 0

        for batch in eval_dataset:
            document_idx = batch['document_id']
            # reshape(1, -1) turns the single record into a batch of one sequence.
            input_ids = torch.tensor(batch['input_ids']).reshape(1, -1).to(device)
            attention_mask = torch.tensor(batch['attention_mask']).reshape(1, -1).to(device)
            word_ids = batch['word_ids']

            # A new document restarts the per-document chunk counter.
            if document_idx != prev_document_idx:
                curr_document_counter = 0

            logger.info(f'Evaluating: {document_idx}, previous document: {prev_document_idx}, counter: {curr_document_counter}')

            # Use the model that was passed in, not a notebook-level global.
            outputs = trained_model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, -1)

            # The counter is 0 for the first chunk of a document, so this is 0
            # there and a multiple of the chunk length afterwards.
            idx = len(batch['input_ids']) * curr_document_counter
            logger.info(f'Starting idx: {idx}')

            # Pull the per-position values to Python once instead of calling
            # .item() (one device round trip) for every token.
            mask_values = attention_mask[0].tolist()
            class_ids = predicted_labels[0].tolist()

            for mask_value, class_id, w in zip(mask_values, class_ids, word_ids):
                # Positions without a word id carry no prediction to report.
                if w is None:
                    continue

                # Skip masked positions and class id 0 ('O').
                if mask_value == 0 or class_id <= 0:
                    continue

                word_idx = int(w)

                # Skip a word that was just recorded for this same document.
                # The document is compared as well, so a word index left over
                # from the previous document cannot suppress a new row.
                if (
                    metrics['document'][-1] == document_idx
                    and metrics['token'][-1] == word_idx
                ):
                    continue

                metrics['document'].append(document_idx)
                metrics['token'].append(word_idx)
                metrics['label'].append(id2label[class_id])

            prev_document_idx = document_idx
            curr_document_counter += 1

    return metrics

In [None]:
# The model and the tokenizer are read from the same run directory.
_model_run_dir = './model/20240406_2252'

dataset_path = './data/processed/test/test_processed.json'
tokenizer_path = f'{_model_run_dir}/tokenizer/'
model_path = f'{_model_run_dir}/model/'

In [None]:
# Load the model with its tokenizer first, then the dataset to evaluate.
model, tokenizer = get_model(model_path=model_path, tokenizer_path=tokenizer_path)
test_ds = get_data(path=dataset_path)

In [None]:
# Collect the document / token / label lists for every record of the dataset.
test_metrics = eval_model(model, test_ds)

In [None]:
# One row per collected prediction; the former positional index becomes the
# first column and is named row_id.
df = (
    pd.DataFrame(test_metrics)
    .reset_index()
    .set_axis(['row_id', 'document', 'token', 'label'], axis='columns')
)

### Rough work: spot-check the predictions for a single document

In [None]:
# Token indices of every prediction recorded for document 7.
df.loc[df['document'] == 7, 'token']

In [None]:
# Load the raw documents before they are used: the loop below reads `data`,
# so the load has to run first for the notebook to work on a fresh kernel.
with open('./data/train.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

# Print every predicted token of document 7 next to its predicted label.
# NOTE(review): assumes data[0] is the record of document 7 - confirm against the file.
doc7_predictions = df.query('document==7')
for t, label in zip(doc7_predictions.token, doc7_predictions.label):
    print(f"{data[0]['tokens'][int(t)]} {label}")