# <span style="font-family:cursive;text-align:center">⬇️ Import Libraries</span>

In [None]:
import numpy as np
import pandas as pd
import glob
import os
from datasets import Dataset
import torch
# Set the display options for pandas
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', None)

In [None]:
# Set the device to cuda if available, otherwise cpu
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

# <span style="font-family:cursive;text-align:center">⬇️ Import Data</span>

In [None]:
# Load the data from a tsv file
data = pd.read_csv("/kaggle/input/demo-data/dataset/train/boxes_transcripts_labels/004a4c67-561d-4d9c-9ef2-47cb15fbdf08_document-3_page-1.tsv", header = None)
# Assign column names to the data
data.columns = ['start_index', 'end_index', 'x_top_left', 'y_top_left', 'x_bottom_right', 'y_bottom_right', 'transcript', 'field']

In [None]:
data.sample(100)

In [None]:
data['field'].unique()

In [None]:
# Define a dictionary of ner labels and their corresponding numeric codes
ner_labels = { 0 : "OTHER",
1 : "employerName",
2 : "employerAddressStreet_name",
3 : "employerAddressCity",
4 : "employerAddressState",
5 : "employerAddressZip",
6 : "einEmployerIdentificationNumber",
7 : "employeeName",
8 : "ssnOfEmployee",
9 : "box1WagesTipsAndOtherCompensations",
10 : "box2FederalIncomeTaxWithheld",
11 : "box3SocialSecurityWages",
12 : "box4SocialSecurityTaxWithheld",
13 : "box16StateWagesTips",
14 : "box17StateIncomeTax",
15 : "taxYear"
}

In [None]:
# Invert ner_labels so that every label name maps to its numeric code
ner_labels_dict = {label: code for code, label in ner_labels.items()}

In [None]:
ner_labels_dict

In [None]:
# Create a new column in the data with the numeric codes for the ner tags
data['ner_tags'] = data['field'].map(lambda x: ner_labels_dict[x])

In [None]:
# Convert the ner tags column to a list
data['ner_tags'].to_list()

In [None]:
# Convert the transcript column to a list
data['transcript'].to_list()

In [None]:
# Get the unique values of the ner tags column
data['ner_tags'].unique()

In [None]:
# Define a function to load data from a given path
def load_data(path, max_len=300):
    """Load every labelled ``.tsv`` file found in *path* into one dataframe.

    Each file contributes one row that holds the list of its transcripts and
    the list of the matching numeric codes, looked up in the module-level
    ``ner_labels_dict``. A file with more than *max_len* transcripts is cut
    in half and stored as two rows.

    Args:
        path: Directory that contains the ``*.tsv`` files.
        max_len: Number of transcripts above which a file is split into two
            halves. A half can still be longer than *max_len* when the file
            holds more than ``2 * max_len`` transcripts.

    Returns:
        A dataframe with the columns ``transcript`` and ``ner_tags``; every
        cell holds a list.

    Raises:
        KeyError: If a value of the ``field`` column is not a key of
            ``ner_labels_dict``.
    """
    column_names = ['start_index', 'end_index', 'x_top_left', 'y_top_left',
                    'x_bottom_right', 'y_bottom_right', 'transcript', 'field']
    data = pd.DataFrame(columns=['transcript', 'ner_tags'])

    # glob returns the files in arbitrary order; sorting makes the row order
    # of the result reproducible between runs.
    for filename in sorted(glob.glob(os.path.join(path, '*.tsv'))):
        # The files are parsed with read_csv's default separator.
        data_i = pd.read_csv(filename, header=None)
        data_i.columns = column_names
        # Rows without a transcript carry no token to classify.
        data_i = data_i.dropna(subset=['transcript'])

        transcript = data_i['transcript'].to_list()
        # Explicit lookup so that an unknown label raises instead of turning into NaN.
        tags = data_i['field'].map(lambda x: ner_labels_dict[x]).to_list()

        if len(transcript) > max_len:
            middle = len(transcript) // 2
            parts = [(transcript[:middle], tags[:middle]),
                     (transcript[middle:], tags[middle:])]
        else:
            parts = [(transcript, tags)]

        # NOTE(review): rows are appended through .loc on purpose.
        # tokenize_data later removes an '__index_level_0__' column, which
        # presumably stems from the index built here - confirm before
        # switching to a list-based construction.
        for words, word_tags in parts:
            data.loc[len(data)] = [words, word_tags]

    return data

In [None]:
# Measure the execution time of loading the train and validation data
%%time
train_data = load_data('/kaggle/input/demo-data/dataset/train/boxes_transcripts_labels')
val_data = load_data('/kaggle/input/demo-data/dataset/val_w_ann/boxes_transcripts_labels')

In [None]:
train_data.loc[0]

In [None]:
val_data.head()

In [None]:
train_data.shape, val_data.shape

# Transformer preprocessing

In [None]:
# Import the AutoTokenizer class from the transformers library
from transformers import AutoTokenizer
# Load the tokenizer for the distilbert-base-uncased model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Get the first transcript and ner tags from the train data
example_1 = train_data['transcript'][0]
ner = train_data['ner_tags'][0]

In [None]:
# Tokenize the transcript using the tokenizer
tokens = tokenizer(example_1, is_split_into_words=True, truncation=True)
# Show the tokens
tokens

In [None]:
# Define a function to tokenize a given data
def tokenize_func(data):
    """Tokenize the already split words stored under ``data['transcript']``.

    The module-level ``tokenizer`` is called with ``is_split_into_words=True``
    and ``truncation=True``; its result is returned unchanged.
    """
    words = data['transcript']
    encoding = tokenizer(words, is_split_into_words=True, truncation=True)
    return encoding

In [None]:
# Define a function that tokenizes a whole dataframe and returns the result as a pandas dataframe
def tokenize_data(data):
    """Tokenize a dataframe of transcripts and return the result as a dataframe.

    The frame is converted to a ``Dataset``, ``tokenize_func`` is mapped over
    its rows, and the result is converted back to pandas.

    Args:
        data: Dataframe with a ``transcript`` column (lists of words).

    Returns:
        A pandas dataframe holding the original columns plus the columns
        produced by ``tokenize_func``.
    """
    dataset = Dataset.from_pandas(data)
    dataset = dataset.map(tokenize_func)
    # The helper column is presumably created by Dataset.from_pandas from the
    # dataframe index and is not always present; removing it unconditionally
    # would raise when it is missing.
    if '__index_level_0__' in dataset.column_names:
        dataset = dataset.remove_columns(['__index_level_0__'])
    return dataset.to_pandas()

In [None]:
# Get the transcript column from the train data
train_data['transcript']

In [None]:
# Tokenize the train and validation data and convert them to pandas dataframes
train = tokenize_data(train_data)
val = tokenize_data(val_data)

In [None]:
train.head()

In [None]:
# Define a function to create ner tags for each token in a given transcript and ner tags list
def create_ner_tags(tokens, ner_tags):
    """Expand word-level ner tags to one tag per tokenizer token.

    Args:
        tokens: List of words (one transcript).
        ner_tags: List of numeric ner codes, one per word in *tokens*.

    Returns:
        A list with one entry per token produced by the module-level
        ``tokenizer``. The first token of every word receives that word's
        tag. Tokens without a word id, and the remaining tokens of a word,
        receive -100 so that they are ignored for loss calculation.
    """
    preprocessed_tokens = tokenizer(tokens, is_split_into_words=True, truncation=True)
    word_ids = preprocessed_tokens.word_ids()

    ner_tags_for_tokens = []
    # Track the previous word id explicitly. Indexing with ``ind - 1`` would
    # compare the first token with the last one.
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None or word_id == previous_word_id:
            # Token without a word id, or continuation of the previous word.
            ner_tags_for_tokens.append(-100)
        else:
            # First token of a new word.
            ner_tags_for_tokens.append(ner_tags[word_id])
        previous_word_id = word_id
    return ner_tags_for_tokens

In [None]:
# Create ner tags for each token in the example transcript and ner tags list
tags = create_ner_tags(example_1, ner)

In [None]:
# Loop through each token id and the corresponding ner tag in the input ids and tags lists
for ind, tok in enumerate(tokenizer.convert_ids_to_tokens(tokens['input_ids'])):
    print(tok, tags[ind])

## Creating the preprocessed ner column

In [None]:
# Measure the execution time of creating ner tags for the train and validation data
%%time
train['ner_tags'] = train_data.apply(lambda x: create_ner_tags(x['transcript'], x['ner_tags']), axis=1)

In [None]:
# Apply the create_ner_tags function to the validation data and store the result as a new column
val['ner_tags'] = val_data.apply(lambda x: create_ner_tags(x['transcript'], x['ner_tags']), axis=1)

In [None]:
# Drop the transcript column from the train and validation data
train.drop('transcript', axis=1, inplace=True)
val.drop('transcript', axis=1, inplace=True)

In [None]:
# Convert the train data to a Dataset object and rename its ner_tags column to labels
train = Dataset.from_pandas(train)
train = train.rename_column("ner_tags", "labels")
# Convert the validation data to a Dataset object and rename its ner_tags column to labels
val = Dataset.from_pandas(val)
val = val.rename_column("ner_tags", "labels")

In [None]:
# Set the format of both datasets to torch
train.set_format('torch')
val.set_format('torch')

In [None]:
train

In [None]:
# Import the DataCollatorForTokenClassification class from the transformers library
from transformers import DataCollatorForTokenClassification
# Create a data collator object using the tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
!pip install evaluate seqeval

In [None]:
# Load the seqeval evaluation metric from the evaluate module
import evaluate

seqeval = evaluate.load("seqeval")

In [None]:
# Load the seqeval metric through the evaluate library
# (datasets.load_metric is deprecated and missing from recent datasets releases)
import evaluate
metric = evaluate.load("seqeval")

In [None]:
# labels = [ner_labels[i] for i in train["ner_tags"]]
# metric.compute(predictions=[labels], references=[labels])

In [None]:
ner_labels_dict

In [None]:
# Get the list of numeric codes (the keys of the ner_labels dictionary)
list(ner_labels.keys())

In [None]:
# Define a dictionary of id to label mapping for each numeric code
id2label = {
    0 : "OTHER",
    1 : "employerName",
    2 : "employerAddressStreet_name",
    3 : "employerAddressCity",
    4 : "employerAddressState",
    5 : "employerAddressZip",
    6 : "einEmployerIdentificationNumber",
    7 : "employeeName",
    8 : "ssnOfEmployee",
    9 : "box1WagesTipsAndOtherCompensations",
    10 : "box2FederalIncomeTaxWithheld",
    11 : "box3SocialSecurityWages",
    12 : "box4SocialSecurityTaxWithheld",
    13 : "box16StateWagesTips",
    14 : "box17StateIncomeTax",
    15 : "taxYear"
}
# Define a dictionary of label to id mapping for each ner label
label2id = {
    'OTHER': 0,
     'employerName': 1,
     'employerAddressStreet_name': 2,
     'employerAddressCity': 3,
     'employerAddressState': 4,
     'employerAddressZip': 5,
     'einEmployerIdentificationNumber': 6,
     'employeeName': 7,
     'ssnOfEmployee': 8,
     'box1WagesTipsAndOtherCompensations': 9,
     'box2FederalIncomeTaxWithheld': 10,
     'box3SocialSecurityWages': 11,
     'box4SocialSecurityTaxWithheld': 12,
     'box16StateWagesTips': 13,
     'box17StateIncomeTax': 14,
     'taxYear': 15
}

# <span style="font-family:cursive;text-align:center">⬇️ Training</span>

In [None]:
# Import the AutoModelForTokenClassification class from the transformers library
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# Load a model for token classification from a pretrained model name, specifying the number of labels and the id2label and label2id dictionaries
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=16, id2label=id2label, label2id=label2id
)

In [None]:
# Move the model to the device (cuda or cpu)
model.to(device)

In [None]:
# Create a training arguments object with various hyperparameters and settings for training and evaluation
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    learning_rate=2e-5,
    logging_strategy = 'epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to = 'none'
)

In [None]:
# Create a trainer object with the model, the training arguments, the train and validation datasets, the tokenizer, and the data collator (compute_metrics is currently commented out)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    data_collator=data_collator,
#     compute_metrics=metric,
)

In [None]:
# Train the model using the trainer object
trainer.train()

# <span style="font-family:cursive;text-align:center">⬇️ Inference</span>

In [None]:
from transformers import pipeline
# Create a classifier object using the ner pipeline, the trained model, the tokenizer, and the device
classifier = pipeline("ner", model=model, tokenizer=tokenizer, device=device)
# Set the tokenizer attribute is_split_into_words to True (this means that the input is already split into words)
# NOTE(review): everywhere else in this file is_split_into_words is passed as a call argument to the
# tokenizer; it is unclear whether setting it as an attribute has any effect on the pipeline - confirm.
# NOTE(review): classifier.tokenizer is presumably the same object as the module-level tokenizer
# passed in above, so this assignment would also change that object - verify.
classifier.tokenizer.is_split_into_words = True

In [None]:
# Run the classifier on a list of words; the result is iterated below as
# one list of prediction dictionaries per input word
tokens = classifier(["77796.34", "3759.51" ,"withheby" ,"2018", "W-2", "and" ,"EARNINGS", "SUMMARY", "ADP", "Employee"])
# Flatten the nested result into a single list holding every 'entity' value
labels = [prediction['entity'] for word_predictions in tokens for prediction in word_predictions]

In [None]:
labels

In [None]:
tokens

In [None]:
# Define a function to extract labels from inference output
def extract_labels_from_inference(opt, default="OTHER"):
    """Return one label per word from the nested inference output.

    Args:
        opt: Iterable with one list of prediction dictionaries per word;
            every dictionary has an ``'entity'`` key.
        default: Label used for a word whose prediction list is empty, so
            that the result always has one entry per word.

    Returns:
        A list holding the ``'entity'`` of the first prediction of every
        word, or *default* where a word has no prediction.
    """
    return [
        word_predictions[0]['entity'] if word_predictions else default
        for word_predictions in opt
    ]

In [None]:
# Apply the function to the tokens list
extract_labels_from_inference(tokens)

In [None]:
words = "77796.34 3759.51 withheby 2018 W-2 and EARNINGS SUMMARY ADP Employee"
words = words.split()

In [None]:
len(words)

In [None]:
len(labels)

In [None]:
tokens

In [None]:
# Preprocess the words using the tokenizer (without adding special tokens)
preprocessed_tokens = tokenizer(words, is_split_into_words=True, truncation=True, add_special_tokens = False)

In [None]:
preprocessed_tokens

## Inference on val data

In [None]:
# Define a function to load validation data from a given path
def load_val_data(path):
    """Load every unlabelled ``.tsv`` file found in *path* into one dataframe.

    Args:
        path: Directory that contains the ``*.tsv`` files (seven columns,
            the last one being the transcript).

    Returns:
        A dataframe with the single column ``transcript``; each row holds
        the list of non-missing transcripts of one file. Rows follow the
        sorted file names.
    """
    column_names = ['start_index', 'end_index', 'x_top_left', 'y_top_left',
                    'x_bottom_right', 'y_bottom_right', 'transcript']
    data = pd.DataFrame(columns=['transcript'])

    # glob returns the files in arbitrary order; sorting keeps positional
    # look-ups into the result stable between runs.
    for filename in sorted(glob.glob(path + '/*.tsv')):
        data_i = pd.read_csv(filename, header=None)
        data_i.columns = column_names
        data_i = data_i.dropna(subset=['transcript'])

        # One row per file; the cell holds the whole list of transcripts.
        data.loc[len(data)] = [data_i['transcript'].to_list()]

    return data

In [None]:
# Load the validation data without labels from a given path
val_data_without_labels = load_val_data('/kaggle/input/demo-data/dataset/val/boxes_transcripts')

In [None]:
# Get the third transcript from the validation data without labels
tok = val_data_without_labels['transcript'][2]

In [None]:
tok, len(tok)

In [None]:
# Define a function to predict the ner tags for a given transcript
def predict_val(example, max_len=300):
    """Predict one ner label per word of a transcript.

    The words are passed to the module-level ``classifier`` and its output
    is reduced to labels with ``extract_labels_from_inference``. Transcripts
    longer than *max_len* words are processed as two halves.

    Args:
        example: List of words.
        max_len: Number of words above which the transcript is split into
            two halves before inference. A half can still be longer than
            *max_len* when the transcript holds more than ``2 * max_len``
            words.

    Returns:
        The list of predicted labels in word order; an empty list for an
        empty transcript.
    """
    if len(example) > max_len:
        middle = len(example) // 2
        chunks = [example[:middle], example[middle:]]
    else:
        chunks = [example]

    ner_tags = []
    for chunk in chunks:
        # Nothing to classify; do not invoke the pipeline on empty input.
        if len(chunk) == 0:
            continue
        ner_tags += extract_labels_from_inference(classifier(chunk))
    return ner_tags

In [None]:
# Get all the tsv files in a given directory

tsv_file = glob.glob('/kaggle/working/val/boxes_transcripts' + '/*.tsv')

In [None]:
len(tsv_file)

In [None]:
import os
# Define a function to generate validation tsv files with predicted labels
def generate_val_tsvs(path, out_dir='val/boxes_transcripts'):
    """Write a copy of every ``.tsv`` file in *path* with a predicted label column.

    Args:
        path: Directory that contains the unlabelled ``*.tsv`` files (seven
            columns, the last one being the transcript).
        out_dir: Directory the labelled files are written to; it is created
            when missing. Every output file keeps the name of its input file.

    The output has one row per input row. Rows with a missing transcript
    are not sent to the model and receive the label ``OTHER``. The
    evaluation code in this file pairs the labels of two files row by row,
    so dropping such rows would shift every following label.
    """
    # Also succeeds when only the parent directory already exists.
    os.makedirs(out_dir, exist_ok=True)

    column_names = ['start_index', 'end_index', 'x_top_left', 'y_top_left',
                    'x_bottom_right', 'y_bottom_right', 'transcript']

    for filename in sorted(glob.glob(os.path.join(path, '*.tsv'))):
        data_i = pd.read_csv(filename, header=None)
        data_i.columns = column_names

        has_text = data_i['transcript'].notna()
        transcript = data_i.loc[has_text, 'transcript'].to_list()

        data_i['field'] = 'OTHER'
        if transcript:
            # predict_val has to return one label per word; a different
            # length makes this assignment raise.
            data_i.loc[has_text, 'field'] = predict_val(transcript)

        target = os.path.join(out_dir, os.path.basename(filename))
        data_i.to_csv(target, index=False, header=False)

In [None]:
# Measure the execution time of generating the validation tsv files with labels
%%time
generate_val_tsvs('/kaggle/input/demo-data/dataset/val/boxes_transcripts')

# <span style="font-family:cursive;text-align:center">⬇️ Generating metrics.tsv</span>

In [None]:
import os
import csv
import pandas as pd

'''
Entities:
1. employerName
2. employerAddressStreet_name
3. employerAddressCity
4. employerAddressState
5. employerAddressZip
6. einEmployerIdentificationNumber
7. employeeName
8. ssnOfEmployee
9. box1WagesTipsAndOtherCompensations
10. box2FederalIncomeTaxWithheld
11. box3SocialSecurityWages
12. box4SocialSecurityTaxWithheld
13. box16StateWagesTips
14. box17StateIncomeTax
15. taxYear
'''



'''
Description: The function yields the standard precision, recall and f1 score metrics

arguments:
    TP -> int
    FP -> int
    FN -> int

returns: float, float, float
'''
def performance(TP, FP, FN):
    """Compute precision, recall and F1 score from raw counts.

    A ratio whose denominator is zero is reported as 0, and the F1 score is
    0 whenever precision or recall is 0.
    """
    predicted_total = TP + FP
    actual_total = TP + FN

    precision = TP / float(predicted_total) if predicted_total != 0 else 0
    recall = TP / float(actual_total) if actual_total != 0 else 0

    # Guard clause: the harmonic mean is only defined for non-zero inputs.
    if recall == 0 or precision == 0:
        return precision, recall, 0

    return precision, recall, (2.0 * precision * recall) / (precision + recall)
    
    
    
    
'''
Description: The function yields a dataframe containing entity-wise performance metrics

arguments:
    true_labels -> list
    pred_labels -> list
    
returns: pandas dataframe
'''
def get_dataset_metrics(true_labels, pred_labels):
    """Compute entity-wise precision, recall, F1 score and support.

    Args:
        true_labels: List of ground-truth labels.
        pred_labels: List of predicted labels, paired with *true_labels*
            by position.

    Returns:
        A dataframe indexed by label (``OTHER`` excluded) with the columns
        ``Precision``, ``Recall``, ``F1-Score`` and ``Support``. Labels
        appear in the order in which they were first counted.
    """
    metrics_dict = dict()

    for true_label, pred_label in zip(true_labels, pred_labels):
        true_counts = metrics_dict.setdefault(
            true_label, {"TP": 0, "FP": 0, "FN": 0, "Support": 0})

        if true_label != "OTHER":
            true_counts["Support"] += 1

            if true_label == pred_label:
                true_counts["TP"] += 1
            elif pred_label == "OTHER":
                true_counts["FN"] += 1
            # NOTE(review): an entity predicted as a *different* entity is
            # counted neither as FN nor as FP - confirm this is the intended
            # metric definition.

        elif pred_label != "OTHER":
            # The predicted label may never have occurred as a true label so
            # far; create its counters instead of raising a KeyError.
            pred_counts = metrics_dict.setdefault(
                pred_label, {"TP": 0, "FP": 0, "FN": 0, "Support": 0})
            pred_counts["FP"] += 1

    rows, fields = [], []
    for field, counts in metrics_dict.items():
        if field == "OTHER":
            continue
        precision, recall, f1_score = performance(counts["TP"], counts["FP"], counts["FN"])
        rows.append([precision, recall, f1_score, counts["Support"]])
        fields.append(field)

    # Build the frame once instead of concatenating one-row frames in a loop.
    return pd.DataFrame(rows, columns=["Precision", "Recall", "F1-Score", "Support"], index=fields)




'''
Description: The function reads the true and the predicted labels (last column) of a single document
(make sure the doc id is the same)

arguments:
    doc_true -> tsv file with labels in the last column (8 th column (1-indexed))
    doc_pred -> tsv file with labels in the last column (8 th column (1-indexed)), as predicted by the model
    
returns: list, list
'''
def get_doc_labels(doc_true, doc_pred):
    """Read the label column (last column) of a true and a predicted file.

    Args:
        doc_true: Path of the file holding the ground-truth labels.
        doc_pred: Path of the file holding the predicted labels.

    Returns:
        Two lists: the true labels and the predicted labels, in row order.
        Blank lines are skipped.
    """
    def read_last_column(tsv_path):
        # The with-block closes the file; newline="" is what the csv module
        # expects from file objects handed to csv.reader.
        with open(tsv_path, "r", newline="") as handle:
            return [row[-1] for row in csv.reader(handle) if row]

    return read_last_column(doc_true), read_last_column(doc_pred)

'''
Description: The function yields a dataframe containing entity-wise performance metrics for all documents
(make sure the doc ids are the same in both the paths)

arguments:
    true_path -> string (directory containing the ground truth tsv files)
    pred_path -> string (directory containing the predicted tsv files)
    save -> bool (saves the metrics file in your working directory)
returns: pandas dataframe
'''
def get_dataset_labels(true_path, pred_path, save=False):
    """Compute and print entity-wise metrics over all documents of two directories.

    Every file name containing ``.tsv`` that exists in both directories is
    read with ``get_doc_labels``. The collected labels are passed to
    ``get_dataset_metrics``.

    Args:
        true_path: Directory containing the ground-truth tsv files.
        pred_path: Directory containing the predicted tsv files.
        save: When true, the metrics are also written to
            ``eval_metrics.tsv`` in the working directory.

    Returns:
        The metrics dataframe returned by ``get_dataset_metrics``.
    """
    import warnings

    y_true, y_pred = [], []

    # Look names up in a set instead of scanning the prediction directory
    # once per ground-truth file.
    pred_files = {name for name in os.listdir(pred_path) if ".tsv" in name}

    # Sorted so that the order in which labels are first seen is reproducible.
    for name in sorted(os.listdir(true_path)):
        if ".tsv" not in name or name not in pred_files:
            continue

        true_labels, pred_labels = get_doc_labels(f"{true_path}/{name}", f"{pred_path}/{name}")

        if len(true_labels) != len(pred_labels):
            # Labels are paired by position; unequal lengths would shift the
            # pairing of every following document, so keep only the common
            # prefix of this one.
            warnings.warn(
                f"{name}: {len(true_labels)} true labels but {len(pred_labels)} "
                "predicted labels; extra rows are ignored"
            )
            common = min(len(true_labels), len(pred_labels))
            true_labels, pred_labels = true_labels[:common], pred_labels[:common]

        y_true.extend(true_labels)
        y_pred.extend(pred_labels)

    df = get_dataset_metrics(y_true, y_pred)
    print(df)
    if save:
        df.to_csv("eval_metrics.tsv")

    return df



if __name__ == "__main__":

    # Template to run your own evaluation: the first directory holds the
    # annotated ground truth, the second one the predicted tsv files.
    doc_true = "/kaggle/input/demo-data/dataset/val_w_ann/boxes_transcripts_labels"
    doc_pred = "/kaggle/working/val/boxes_transcripts"

    get_dataset_labels(true_path=doc_true, pred_path=doc_pred, save=True)

In [None]:
# Read the saved metrics back; the first column of the file holds the entity
# names that were written as the dataframe index
eval_df = pd.read_csv("/kaggle/working/eval_metrics.tsv", index_col=0)
eval_df