In [None]:
# https://huggingface.co/docs/transformers/custom_datasets#tok_ner

# Introduction

This notebook is intended to serve as a simple starter notebook for the Hugging Face Transformers Trainer API. It is intended to be as simple as possible and to stay close to the original Hugging Face Transformers example at https://huggingface.co/docs/transformers/custom_datasets#tok_ner

* Model used = DistilBERT (uncased)
    * This means that the inputs will be truncated to 512 tokens, as the model processes at most 512 tokens.
    * The 512-token limit is the input sequence limit of the BERT family of models
    * Indeed, if you want to be near the top of the leaderboard, this model should be replaced OR rolling-window data processing should be used
* Train for 3 Epochs. This model will overfit as you train more

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import re
import torch
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModelForSequenceClassification
from datasets import Dataset, load_metric, DatasetDict
import nltk
import numpy as np
from tqdm.auto import tqdm

# Switches & Constants

* **ISLOCAL** can control if you want to run the same notebook in your local environment. (It merely switches the names of the directories between my local environment and Kaggle)
    * Set to False to run in Kaggle
* **TOKENIZER_NAME** should be the base name of the model you are interested in (HuggingFace Models https://huggingface.co/models?pipeline_tag=text-classification )
* **USE_SMALL_DATASET** can be used to verify small changes in code. When enabled, the notebook trains on only SMALL_DATASET_SIZE examples (default = 50)
* **USE_LOCAL_DATASET** = True will look for the tokenized dataset + tokenizer + model all in the local directory. This option **must** be used when submitting to Kaggle. But when using local resources, you can disable this option.
* **BATCH_SIZE** depends on your machine's memory. In the Kaggle environment with DistilBERT, a batch size of 8 works fine and I suspect it can be increased to 16/32 as well.
* **EPOCHS** is your choice. But judging by the metrics, after 3-4 epochs the model seems to start overfitting severely.

In [None]:
# --- Run-mode switches (documented in the markdown cell above) ---
ISLOCAL = False
TOKENIZER_NAME = "distilbert-base-uncased"  # alternatives tried: "bert-base-uncased", "allenai/longformer-base-4096"
USE_SMALL_DATASET = False  # False => train on the whole dataset
USE_LOCAL_DATASET = True
INFER_ONLY = True
CHECKPOINT = "checkpoint-125"

# --- Training sizes ---
BATCH_SIZE = 8
EPOCHS = 4
SMALL_DATASET_SIZE = 50

# --- Artifact names, all derived from the tokenizer/model name ---
DATASET_PATH = "raw_dataset"
TOKENIZED_PATH = f"{TOKENIZER_NAME}_tokenized"
TOKENIZER_PATH = f"{TOKENIZER_NAME}_er"
MODEL_PATH = f"{TOKENIZER_NAME}init_model"
TRAINOUT_PATH = f"{TOKENIZER_NAME}-finetuned"

# --- Directory layout: local checkout vs. Kaggle input datasets ---
train_directory, test_directory, main_directory, output_directory = (
    ("./train", "./test", "./", "./output")
    if ISLOCAL
    else (
        "../input/feedback-prize-2021/train",
        "../input/feedback-prize-2021/test",
        "../input/feedback-prize-2021/",
        "../input/distillibertmodelner/output",
    )
)

# Helper Function

* Function to read a text file and return contents as a string

In [None]:
def read_train_file(currid = "423A1CA112E2", curr_dir = train_directory):
    """Return the whole content of ``<curr_dir>/<currid>.txt`` as a single string.

    currid   -- file name without the ``.txt`` extension.
    curr_dir -- directory that holds the text file.
    """
    file_path = os.path.join(curr_dir, f"{currid}.txt")
    with open(file_path, "r") as handle:
        return handle.read()

### Read CSV Files

Read the train and submission files

* train will be used for the training loop
* ss will be used for generating the predictions

In [None]:
# train.csv: the columns read later in this notebook are id, discourse_type,
# discourse_start, discourse_end and predictionstring.
train = pd.read_csv(os.path.join(main_directory, "train.csv"))
# sample_submission.csv: its `id` column names the test text files to predict on.
ss = pd.read_csv(os.path.join(main_directory,'sample_submission.csv'))

# Data Cleanup

In case some integer processing is needed, it is better to change discourse start and end to numeric types.

In [None]:
# Cast the discourse boundaries to plain integers.
train['discourse_start'] = train['discourse_start'].astype(int)
train['discourse_end'] = train['discourse_end'].astype(int)

# discourse type -> running index, in order of first appearance in train.csv.
# Below, only its length is used (to derive the number of target labels).
discourse_labels = {x:i for i,x in enumerate(train['discourse_type'].unique())}

In [None]:
# Display the first rows of the sample submission frame.
ss.head()

# Helper Function 02

## **add_ner_start_ends** 
* This function takes a dataframe as input. (train dataframe)
* This function then returns the 1st position of prediction string and the last position of prediction string concatenated in the same dataframe
* This is useful as a final output in pandas dataframe is easier to process / visualize


## **add_ner_label_ids**
* We will need labels for Beginning-Lead, Inside-Lead, Beginning-Claim, Inside-Claim etc. So, this function simply adds numbers to the pandas dataframe for each row on what is the numeric label for beginning and inside for this particular category

## **labels_to_discourse**
* Return the labels against category as a dictionary

In [None]:
def add_ner_start_ends(df):
    '''
    Append the first and last word index of every row's predictionstring.

    df should be in the same format as train.csv: each row needs a
    whitespace-separated ``predictionstring`` of integer word indices.

    Returns a new dataframe equal to ``df`` plus two integer columns,
    ``word_start`` (first index) and ``word_end`` (last index, inclusive).
    Raises AssertionError if a row's last index is smaller than its first.
    '''
    bounds = []

    for row in tqdm(df.itertuples(), total = len(df)):
        # Split once and reuse the pieces for both ends of the span.
        word_ids = getattr(row, "predictionstring").split()
        word_start = int(word_ids[0])
        word_end = int(word_ids[-1])

        assert word_end >= word_start

        bounds.append([word_start, word_end])

    # Reuse df's own index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows whenever df was filtered or re-indexed.
    # Passing `columns=` up front also keeps an empty input from failing.
    bounds = pd.DataFrame(bounds, columns = ['word_start', 'word_end'], index = df.index)
    return pd.concat([df, bounds], axis = 1)


def add_ner_label_ids(df):
    '''
    Add the numeric NER label ids for each row's discourse type.

    The i-th distinct ``discourse_type`` (order of first appearance in df) gets
    ``label_b = 2*i + 1`` and ``label_e = 2*i + 2``; prepare_dataset writes
    label_b on the first word of a span and label_e on the words after it.
    Id 0 is left for words outside any span. labels_to_discourse enumerates
    the types the same way, so call both on the same dataframe.

    NOTE: the two columns are written onto ``df`` itself, which is also returned.
    '''
    discourse_types = df['discourse_type'].unique()

    df['label_b'] = 0
    df['label_e'] = 0

    for i, x in tqdm(enumerate(discourse_types), total = len(discourse_types)):
        rows_of_type = df['discourse_type'] == x
        df.loc[rows_of_type, 'label_b'] = i * 2 + 1
        df.loc[rows_of_type, 'label_e'] = i * 2 + 2

    return df

def labels_to_discourse(df):
    '''
    Build the reverse lookup label id -> discourse type name.

    Id 0 maps to 'no-label'. The i-th distinct discourse_type of df (order of
    first appearance) is stored under both 2*i + 1 and 2*i + 2.
    The result is a defaultdict(str), so an unknown id yields ''.
    '''
    from collections import defaultdict

    id_to_name = defaultdict(str)
    id_to_name[0] = 'no-label'

    for position, type_name in tqdm(enumerate(df['discourse_type'].unique())):
        first_id = position * 2 + 1
        id_to_name[first_id] = type_name
        id_to_name[first_id + 1] = type_name

    return id_to_name


## **prepare_dataset**
* This function will return a tokenized dataset after reading a processed pandas dataframe.
* If input was 
    * Input : "This is a text which is also a claim. But this other text is Evidence"
    * Output : { "tokens": ["This", "is", "a", "text", "which", "is", "also", "a", "claim.", "But", "this", "other", "text", "is", "Evidence"],
                "ner_tags":[1     , 2   , 2  ,  2    ,       2,    2,       2, 2 , 2       ,   3  ,   4   ,   4    , 4     ,   4,     4      ] }
    * where, we assume that ner_tag == 1 for Lead-Beginning, 2 for Lead-Inside, 3 for Evidence-Beginning, 4 for Evidence-Inside
    
## **prepare_prediction**
* Simple function to load the submission text files and return their tokens (separated strings)

In [None]:
def prepare_dataset(df, direc = train_directory, limit = None):
    '''
    Turn the annotated dataframe into one record per text file.

    df    -- dataframe with id, word_start, word_end, label_b and label_e
             columns (see add_ner_start_ends / add_ner_label_ids).
    direc -- directory holding the ``<id>.txt`` files.
    limit -- if given, stop after this many files.

    Returns a list of dicts with keys
      "id"       : 1-based running number of the file (not the file name),
      "ner_tags" : uint16 array with one label id per whitespace-separated word,
      "tokens"   : the whitespace-separated words of the file.

    Raises IndexError if a span's word_start lies beyond the words of its file.
    '''
    print("Total records to be processed are ", len(df['id'].unique()))

    # Group once instead of filtering the whole frame for every id;
    # sort=False keeps the order of first appearance, like df['id'].unique().
    spans_by_id = df.groupby('id', sort = False)

    ret = []
    for ids, (x, spans) in enumerate(tqdm(spans_by_id, total = spans_by_id.ngroups), start = 1):
        if limit is not None and ids > limit:
            # `>` so that exactly `limit` files are returned.
            break

        text_splits = read_train_file(x, direc).split()

        # 0 means "no label"; spans overwrite it below.
        result = np.zeros( len(text_splits), dtype = "uint16" )
        for y in spans.itertuples():
            word_start = getattr(y, "word_start")
            word_end = getattr(y, "word_end")
            result[word_start] = getattr(y, "label_b")
            # word_end is the last index of the span (inclusive), so the slice
            # must run to word_end + 1 for the final word to be labelled too.
            result[word_start + 1 : word_end + 1] = getattr(y, "label_e")

        ret.append({"id" : ids, "ner_tags" : result, "tokens" : text_splits})
    return ret

def prepare_prediction(df, direc = test_directory):
    '''
    Load the text files to predict on and split them into words.

    df    -- dataframe with an ``id`` column (e.g. the sample submission).
    direc -- directory holding the ``<id>.txt`` files.

    Returns a dataframe with a single ``tokens`` column: one row per distinct
    id, in order of first appearance, holding that file's whitespace-separated
    words.
    '''
    unique_ids = df['id'].unique()
    # Read from `direc`, so that the parameter actually selects the directory.
    ret = [{"tokens": read_train_file(x, direc).split()}
           for x in tqdm(unique_ids, total = len(unique_ids))]

    # Explicit columns keep the "tokens" column present even without any ids.
    return pd.DataFrame(ret, columns = ["tokens"])

# Start of work

* Prepare the train dataframe and return the tokenized + ner_tags version
* Expected time to run = 7 minutes

In [None]:
# Rebuild the labelled records from train.csv and the text files only when
# the cached dataset on disk is not used.
if USE_LOCAL_DATASET == False:
    train = add_ner_start_ends(train)
    train = add_ner_label_ids(train)
    ret = prepare_dataset(train)


    #ret[0]
# Runs in both modes: the label-id -> discourse-type map is needed later to
# turn predictions back into class names.
labels_to_d = labels_to_discourse(train)

In [None]:
if not USE_LOCAL_DATASET:
    # One row per text file, with the columns id / ner_tags / tokens.
    df_train = pd.DataFrame(ret)

    # The list of records is no longer needed once it lives in the frame.
    del ret

## Split the dataset

* Split data into 3 sets for local debugging / troubleshooting

In [None]:
if USE_LOCAL_DATASET == False:
    # 70% of the records for training, the remaining 30% held out ...
    X_train, X_test = train_test_split(df_train, random_state = 91, train_size = 0.7)

    # ... and the held-out part is halved into test and validation sets.
    X_test, X_valid = train_test_split(X_test, random_state = 91, train_size = 0.5)

    del(df_train)

## Convert to transformers dataset

* https://huggingface.co/docs/datasets/

In [None]:
if USE_LOCAL_DATASET == False:
    # Wrap each pandas split in a `datasets.Dataset`.
    train_set = Dataset.from_pandas(X_train)
    test_set = Dataset.from_pandas(X_test)
    valid_set = Dataset.from_pandas(X_valid)

In [None]:
if USE_LOCAL_DATASET == False:
    # Bundle the three splits under the keys the rest of the notebook reads
    # ('train', 'test', 'validation').
    raw_datasets = DatasetDict( {
        'train' : train_set,
        'test': test_set,
        'validation': valid_set
    })

# Dataset Checkpoint

Either we will load the dataset using local storage OR we will save the currently created dataset on disk.

This is based on the switch settings of **USE_LOCAL_DATASET**

In [None]:
# Location of the raw (untokenized) dataset inside the output directory.
paths = os.path.join(output_directory, DATASET_PATH)

import datasets

if USE_LOCAL_DATASET == False:
    # Freshly built dataset: persist it so later runs can load it from disk.
    print("Saving dataset to disk. ", paths)
    
    raw_datasets.save_to_disk( paths )
    
else:
    # Cached mode: the dataset must already exist at `paths`.
    assert os.path.exists(paths), "The dataset local path does not exist. Please retrain the notebook with appropriate switches."
    raw_datasets = datasets.load_from_disk( paths )

# Tokenizer Checkpoint

Either we will load the tokenizer using local storage **OR** we will save the tokenizer to disk after downloading from Internet.

This is based on setting of the **USE_LOCAL_DATASET** switch

In [None]:
from transformers import LongformerTokenizerFast, BertTokenizer, AutoTokenizer
# Location of the saved tokenizer inside the output directory.
paths = os.path.join(output_directory, TOKENIZER_PATH)

if USE_LOCAL_DATASET == False:
    # Fetch the tokenizer by model name and keep a copy on disk.
    # NOTE(review): add_prefix_space is only passed here, not when loading the
    # saved copy below; presumably the saved configuration carries it - confirm.
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, add_prefix_space=True)
    tokenizer.save_pretrained(paths)
else:
    # Cached mode: load the tokenizer saved by an earlier run.
    assert os.path.exists(paths), "Tokenizer local path is not found. Please recheck configuration of the notebook"
    tokenizer = AutoTokenizer.from_pretrained(paths)

# Helper Functions 03

## **tokenize_predictions**
* This applies the loaded tokenizer to the Submission Text file dataset. (No NER Processing is needed)

## **tokenize_and_align_labels**
* This is taken from HuggingFace resource. This function takes into account the addition of special tokens to the text and then aligns the NER tags which we had assigned.
* For example,
    * We may have assigned NER tags = [ 0, 1, 2 , 2 , 2 , 2 , 0]
    * After addition of special tokens like [CLS] etc., the position of these NER tags may need to be changed as well.
    * This is taken care of within this tokenize function

In [None]:
def tokenize_predictions(examples):
    '''
    Tokenize a batch of pre-split texts for inference (no labels involved).

    examples["tokens"] holds the word lists; the notebook-level ``tokenizer``
    is applied with truncation and is_split_into_words=True, and its output
    is returned as is.
    '''
    return tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

def tokenize_and_align_labels(examples, is_train = True):
    '''
    Tokenize a batch of pre-split texts and build a label id for every token.

    examples -- batch with "tokens" (word lists) and, when is_train is true,
                "ner_tags" (one label id per word).
    is_train -- when False, no alignment is done and the raw word lists are
                stored under "labels".

    Alignment rule as implemented here: tokens without a source word (special
    tokens) get -100, and EVERY sub-token of a word gets that word's tag, so
    "labels" always has the same length as the tokenized input. This differs
    from the Hugging Face reference recipe, which labels only the first
    sub-token of a word.
    NOTE(review): any tokenized dataset or checkpoint already saved by this
    notebook was presumably produced with this rule - confirm before changing it.
    '''
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    if is_train == False:
        tokenized_inputs["labels"] = examples["tokens"]
        return tokenized_inputs

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids maps each token to the index of its source word (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        labels.append([-100 if word_idx is None else label[word_idx] for word_idx in word_ids])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

#tokenized_examples = examples.map(tokenize_and_align_labels, batched=True)


# Tokenized Dataset Checkpoint

As always, we will either tokenize the dataset and align labels **OR** save the currently processed dataset to disk

In [None]:
# Location of the tokenized dataset inside the output directory.
paths = os.path.join(output_directory, TOKENIZED_PATH)

import datasets

if USE_LOCAL_DATASET == False:
    # Tokenize every split in batches, align the labels, then persist the result.
    tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)
    print("Saving tokenized dataset to disk. ", paths)
    
    tokenized_datasets.save_to_disk( paths )
    
else:
    # Cached mode: the tokenized dataset must already exist at `paths`.
    assert os.path.exists(paths), "The dataset local path does not exist. Please retrain the notebook with appropriate switches."
    tokenized_datasets = datasets.load_from_disk( paths )

# Caution

* If you set the **USE_SMALL_DATASET** switch to ON, then it will take a shuffled training sample of small size to train the model
* Useful to check if after any changes the notebook runs fine end to end.


In [None]:
def _take_small_subset(split, size, seed = 91):
    """Shuffle `split` with a fixed seed and keep at most `size` rows."""
    # Clamp to the split length: select() raises on indices past the end.
    return split.shuffle(seed = seed).select(range(min(size, len(split))))


if USE_SMALL_DATASET:
    print("CAUTION: Using a small subset of data as per the switches. ")
    small_train_dataset = _take_small_subset(tokenized_datasets["train"], SMALL_DATASET_SIZE)
    small_eval_dataset = _take_small_subset(tokenized_datasets["validation"], SMALL_DATASET_SIZE)
    small_test_dataset = _take_small_subset(tokenized_datasets["test"], SMALL_DATASET_SIZE)
else:
    # Full run: the "small_*" names simply alias the complete splits.
    small_train_dataset = tokenized_datasets["train"]
    small_eval_dataset = tokenized_datasets["validation"]
    small_test_dataset = tokenized_datasets["test"]

# Verify that all is locked and loaded

In [None]:
# Two ids per discourse type plus id 0 for "no label".
print("Total target unique labels are : " , len(discourse_labels) * 2 + 1)

## Data collator

* Data collator makes sure that the sequences have same length. It can utilize padding to achieve this effect
* Check resource from huggingface highlighted in the beginning of notebook

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator built from the tokenizer loaded above; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Model Checkpoint

* We can load the model from tag using Internet OR
* we can load the model using our last checkpoint after training OR
* we can load the downloaded untrained model from local storage

Switches to control are USE_LOCAL_DATASET & CHECKPOINT where CHECKPOINT should be the name of checkpoint after training model. (like checkpoint-100 etc.)

In [None]:
# Default location: the untrained model saved inside the output directory.
paths = os.path.join(output_directory, MODEL_PATH)

import datasets

if USE_LOCAL_DATASET == False:
    # Fresh start: build the model from its name and keep a local copy.
    model = AutoModelForTokenClassification.from_pretrained(TOKENIZER_NAME, 
                                                        num_labels=len(discourse_labels) * 2 + 1) #total labels are twice the discourse types and 1 for NONE token
    print("Saving model to disk. ", paths)
    
    model.save_pretrained( paths )
elif CHECKPOINT is not None:
    # Fine-tuned checkpoint: locally it sits under main_directory; otherwise it
    # is looked up next to (in the parent directory of) output_directory.
    trainout_root = main_directory if ISLOCAL else os.path.split(output_directory)[0]
    paths = os.path.join(trainout_root, TRAINOUT_PATH, CHECKPOINT)
    # Fail early with a clear message, as the branch below does.
    assert os.path.exists(paths), "The model checkpoint path does not exist. Please recheck the CHECKPOINT switch or retrain the notebook."
    model = AutoModelForTokenClassification.from_pretrained(paths)
    
else:
    # Cached, untrained model saved by an earlier run.
    assert os.path.exists(paths), "The model local path does not exist. Please retrain the notebook with appropriate switches."
    model = AutoModelForTokenClassification.from_pretrained(paths)

In [None]:
# Number of examples in the split that will be handed to the Trainer.
print("I shall train on dataset of size: ", len(small_train_dataset))

# Training Arguments

* Output directory to save the model
* Logging steps to report training loss. (If using small datasets, the logging may never be done as default value is 500 steps. So we use the formula specified)
* Batch sizes / learning rates are up to you

In [None]:
# Optimizer steps in one epoch (ceil division). logging_steps counts steps, not
# examples, so it must be derived from this for small runs to log at all.
# This ignores multi-device training and gradient accumulation.
steps_per_epoch = max(1, -(-len(small_train_dataset) // BATCH_SIZE))

# Mixed precision is only enabled when a CUDA device is present, so that the
# notebook can also run (e.g. inference only) on a CPU machine.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir = TRAINOUT_PATH,
    logging_steps = min(500, steps_per_epoch),  # log at least once per epoch, at most every 500 steps
    save_strategy = "epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    save_total_limit = 3,  # keep only the three most recent checkpoints
    fp16 = use_fp16,
    fp16_full_eval = use_fp16,
    report_to = "none",
)

In [None]:
# The Trainer is used both for fine-tuning and, further down, for prediction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = small_train_dataset, 
    eval_dataset = small_eval_dataset,   # evaluated once per epoch (see training_args)
    data_collator = data_collator,
    tokenizer = tokenizer,
)

# Train the Model

* If the switch INFER_ONLY is true, then we won't train but only generate predictions

In [None]:
# Fine-tune only when the notebook is not in inference-only mode.
if not INFER_ONLY:
    trainer.train()

# Inference Helpers

## **get_predictions_on_eval**
* Assumes that the trainer has the target model loaded and outputs the predictions of NER tags.

## **get_predictions_on_submission_file**
* Same as above but for the submission file dataframe

In [None]:
def get_predictions_on_eval(eval_set = small_eval_dataset):
    '''
    Predict label ids for an evaluation dataset with the notebook's trainer.

    Returns the argmax over the last axis of ``trainer.predict(eval_set).predictions``.
    '''
    scores = trainer.predict(eval_set).predictions
    return np.argmax(scores, axis = -1)


def get_predictions_on_submission_file(df_ss):
    '''
        Predict label ids for the texts listed in the submission frame.

        df_ss should be the loaded sample_submission.csv file.
        The texts are loaded with prepare_prediction, tokenized with
        tokenize_predictions and run through the notebook's trainer; the
        argmax over the last axis of the predictions is returned.
    '''
    submission_tokens = prepare_prediction(df_ss)
    submission_dataset = Dataset.from_pandas(submission_tokens).map(tokenize_predictions, batched = True)
    scores = trainer.predict(submission_dataset).predictions
    return np.argmax(scores, axis = -1)
    
# Predicted label ids for the submission texts: one row per distinct id in `ss`.
preds = get_predictions_on_submission_file(ss)


# Process Predictions

After predictions are done, we need to split the predictions by the Categories. 
For example,

If we have 1 Lead and 2 Claims in the same file, then we need to split them accordingly.

## **get_groups** 
function finds consecutive groups in a list https://stackoverflow.com/questions/2154249/identify-groups-of-continuous-numbers-in-a-list


## **get_processed_submission_file** 
function returns the processed entries to be submitted for the competition.
* It will only include an entry if its length (number of consecutive positions) is at least min_length (default is 5 words)



In [None]:
from itertools import groupby
from operator import itemgetter

def get_groups(data, min_length = 2):
    '''
    Split a sequence of integers into runs of consecutive values.

    data       -- increasing integers, e.g. a list, a 1-D array, or the (k, 1)
                  array that np.argwhere returns for a 1-D mask.
    min_length -- runs shorter than this are dropped.

    Returns a list of runs, each a list of plain ints.
    '''
    # https://stackoverflow.com/questions/2154249/identify-groups-of-continuous-numbers-in-a-list
    values = np.asarray(data)
    if values.ndim == 2 and values.shape[1] == 1:
        # Flatten np.argwhere-style column vectors; calling int() on
        # one-element arrays is deprecated in recent NumPy releases.
        values = values[:, 0]

    ranges = []
    # Inside a run of consecutive numbers, (position - value) stays constant.
    for _, run in groupby(enumerate(values.tolist()), lambda pair: pair[0] - pair[1]):
        group = [int(value) for _, value in run]
        if len(group) >= min_length:
            ranges.append(group)
    return ranges



# First we change the Lead-I and Lead-B tokens to simply Lead and so on ...
def get_processed_submission_file(preds, min_length = 5):
    '''
    Turn predicted label ids into submission entries.

    preds      -- 2-D array of label ids, one row per text.
    min_length -- a run of equal labels is kept only if it spans at least this
                  many consecutive positions.

    Returns a list of dicts with keys "predictionstring" (space-separated
    positions of the run), "pred_categ" (odd label id), "class" (discourse
    type name from labels_to_d), "pred_len" (run length) and "pred_index"
    (row of preds the run came from).

    NOTE(review): the positions in "predictionstring" are indices along a row
    of preds; they are not converted to whitespace-word indices here - verify
    that this matches what the submission format expects.
    '''
    # Work on a copy so the caller's array can still be used afterwards.
    preds = np.array(preds, copy = True)

    # Fold every even id (2, 4, ...) onto the odd id of the same discourse type.
    even_labels = (preds > 0) & (preds % 2 == 0)
    preds[even_labels] -= 1

    ret = []
    for i, label_name in tqdm(labels_to_d.items()):
        if i % 2 == 0:
            continue #there should be no even number token identifiers anymore. (apart from the 0)
        print("Processing predictions of type : ", label_name)
        for ind, a in enumerate(preds):
            # Positions in this row predicted as the current discourse type.
            positions = np.flatnonzero(a == i)
            for group in get_groups(positions, min_length = min_length):
                ret.append({"predictionstring" : " ".join(map(str, group)),
                            "pred_categ" : i,
                            "class": label_name,
                            "pred_len" : len(group),
                            "pred_index" : ind})
    return ret

# Cleanup

Cleanup before submission. Assign file IDs and etc.

In [None]:
ret = get_processed_submission_file(preds)
# Explicit columns keep the frame usable even when no entry was produced.
ret = pd.DataFrame(ret, columns = ["predictionstring", "pred_categ", "class", "pred_len", "pred_index"])

# pred_index is the row of `preds`, and prepare_prediction creates those rows
# in the order of ss['id'].unique() - so map through the same ordering rather
# than through the row index of ss.
ret['id'] = ret['pred_index'].map( dict(enumerate(ss['id'].unique())) ) #map the FILE IDs using submission file

In [None]:
# Keep only these three columns, in this order, for the submission file.
keep_cols = ["id", "class", "predictionstring"]
ret = ret[keep_cols]

# Submit (Finally!)

In [None]:
# Order the entries by file id and write them without the dataframe index.
ret = ret.sort_values(by = ['id'])
ret.to_csv("submission.csv", index = False)

# That's all folks.