# Essays!! NER-dy Training (KFolds)

This notebook is based on [this](https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615) notebook by @cdeotte, which uses PyTorch and BigBird.

This code is shorter and easier to debug. I have used **TorchEZ**, a PyTorch wrapper, to make the training, validation and prediction code reusable rather than writing it again and again. [Github Link](https://github.com/kishalxd/torchez)

TorchEZ is still under development. Currently, I have not incorporated the use of schedulers (will update soon). The basic usage can be seen in this notebook.

Moreover, I have tried to train 5 folds of the data. Currently using RoBERTa-L. I also used longformer-L, as you can see in the attached datasets. (Each epoch takes 2:45:00 hrs to train with a batch size of 1 - LB : 0.587 for 2 epochs.) **Version 16 : Longformer**

**About the Cross-Validation** : The metric calculation has been taken from [this](https://www.kaggle.com/robikscube/student-writing-competition-twitch-stream) notebook by @robikscube 

**This notebook also contains the submission code, but it is disabled; the notebook is meant for training only.**

**The Inference and Cross-Validation Notebook can be found here : [Essays!! NERdy indeed | INFER and CV - KFolds ](https://www.kaggle.com/kishalmandal/ner-inference)**

***If you like please UPVOTE :)***

In [None]:
import sys
import os
sys.path.append('../input/torchez/')

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from transformers import logging
logging.set_verbosity(50)

import pandas as pd
import numpy as np
import transformers
from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, AutoModelForTokenClassification
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import ast
from tqdm import tqdm
import gc; gc.enable()
import torchez as ez

In [None]:
# Load the training table; later cells read its `text`, `entities`, `id`
# and `kfold` columns.
df = pd.read_csv('../input/feedback-ner-train/train_folds.csv')#[:10]

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Gather the distinct tag strings of every essay (the `entities` column holds
# a Python-literal list per row), then reduce them to the overall tag set.
le = LabelEncoder()
all_tags = []
for raw_entities in tqdm(df['entities'].values):
    essay_tags = set(ast.literal_eval(raw_entities))
    all_tags += essay_tags

unique_tags = list(set(all_tags))

In [None]:
# Fit the encoder on the tag vocabulary, then display the id -> tag mapping.
le.fit_transform(unique_tags)
{tag_id: tag for tag_id, tag in zip(le.transform(le.classes_), le.classes_)}

In [None]:
class Config:
    """Hyper-parameters and run switches shared by the cells of this notebook."""

    # --- model / tokenisation ---
    model_name = 'allenai/longformer-base-4096'  # checkpoint name given to the Auto* loaders
    max_len = 1024     # every essay is padded/truncated to this many tokens
    num_class = 15     # number of labels of the token-classification head

    # --- optimisation ---
    batch_size = 4     # training batch size; validation/prediction use multiples of it
    lr = 1.25e-7
    weight_decay = 0.01

    # --- run control ---
    fold = 1             # rows with kfold == fold form the validation split
    submission = False   # only referenced by the commented-out submission cell

In [None]:
# When True every sub-token of a word receives the word's label; when False
# only the first sub-token does and the remaining ones are masked with -100.
LABEL_ALL_SUBTOKENS = True


class EntityDataset:
    """Map-style dataset yielding one tokenized essay per item.

    Args:
        dataframe: Frame with a `text` column and, when `get_wids` is False,
            an `entities` column holding a Python-literal list with one tag
            string per whitespace-separated word of `text`.
        tokenizer: Callable tokenizer invoked on the word list with
            `is_split_into_words=True`; its result must expose `word_ids()`
            and `items()`.
        max_len: Length every essay is padded/truncated to.
        get_wids: False -> items carry `labels` (training/validation);
            True -> items carry `wids`, the word index of every token with
            -1 for tokens that belong to no word (prediction).
        label_encoder: Optional object with a `transform(list_of_tags)`
            method. Defaults to the notebook-level encoder `le`, looked up
            when an item is requested.
    """

    def __init__(self, dataframe, tokenizer, max_len, get_wids, label_encoder=None):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids  # for validation
        self.label_encoder = label_encoder

    def __getitem__(self, index):
        """Return a dict of tensors for row `index` (see the class docstring)."""
        # GET TEXT
        text = self.data['text'].values[index]

        # TOKENIZE TEXT
        encoding = self.tokenizer(text.split(),
                                  is_split_into_words=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)
        word_ids = encoding.word_ids()

        # CREATE TARGETS
        if not self.get_wids:
            word_labels = ast.literal_eval(self.data['entities'].values[index])
            encoder = self.label_encoder if self.label_encoder is not None else le
            # Encode all word labels with ONE transform call and keep plain
            # ints: calling the encoder once per sub-token is far slower and
            # leaves 1-element arrays in the label list.
            word_label_ids = [int(label_id) for label_id in encoder.transform(word_labels)]

            label_ids = []
            previous_word_idx = None
            for word_idx in word_ids:
                if word_idx is None:
                    # Token that belongs to no word: masked target.
                    label_ids.append(-100)
                elif word_idx != previous_word_idx or LABEL_ALL_SUBTOKENS:
                    label_ids.append(word_label_ids[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            encoding['labels'] = label_ids

        # CONVERT TO TORCH TENSORS
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.get_wids:
            # Tensors cannot hold None, so "no word" is encoded as -1.
            word_ids2 = [w if w is not None else -1 for w in word_ids]
            item['wids'] = torch.as_tensor(word_ids2)

        return item

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

In [None]:
class FeedbackModel(ez.Model):
    """Token-classification model exposed through the TorchEZ `ez.Model` hooks.

    Reads the notebook-level `args` for the checkpoint name, label count,
    learning rate and weight decay. The backbone is stored as `self.roberta`
    whatever checkpoint is loaded; the attribute name is kept so previously
    saved weights keep their parameter names.
    """

    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained(args.model_name)
        # return_dict=False makes the backbone return plain tuples, which is
        # what `forward` unpacks and `prediction_step` indexes.
        self.config.update({'output_hidden_states': False, 'return_dict': False})
        self.config.num_labels = args.num_class
        self.roberta = AutoModelForTokenClassification.from_pretrained(args.model_name, config=self.config)

    def forward(self, ids, mask, labels=None):
        """Run the backbone.

        Returns `(loss, logits)` when `labels` is given, otherwise whatever
        the backbone returns without labels (a tuple whose first element is
        used as the logits by `prediction_step`).
        """
        if labels is not None:
            loss, logits = self.roberta(ids, attention_mask=mask, labels=labels)
            return loss, logits
        return self.roberta(ids, attention_mask=mask)

    def get_optimizer(self):
        """Build the AdamW optimizer over all parameters."""
        return AdamW(self.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    def training_step(self, input_ids, attention_mask, labels):
        """Return the loss plus the metrics of one training batch."""
        loss, outputs = self(input_ids, attention_mask, labels)
        metric = self.metrics(outputs, labels)

        return {'loss': loss, **metric}

    def validation_step(self, input_ids, attention_mask, labels):
        """Return the loss plus the metrics of one validation batch."""
        loss, outputs = self(input_ids, attention_mask, labels)
        metric = self.metrics(outputs, labels)

        return {'loss': loss, **metric}

    def prediction_step(self, input_ids, attention_mask, wids):
        """Return the predicted label id of every token of the batch.

        `wids` is accepted but not used here; presumably it is passed in
        because the prediction dataset items contain that key (confirm
        against TorchEZ).
        """
        outputs = self(input_ids, attention_mask)
        all_preds = torch.argmax(outputs[0], axis=-1)

        return all_preds

    def metrics(self, outputs, targets):
        """Token accuracy over the positions whose target is not -100.

        Assumes `outputs` holds logits whose last axis is the class axis,
        i.e. (batch, seq_len, num_class) -- TODO confirm against TorchEZ.
        """
        flat_targets = targets.view(-1)
        # Take the argmax of the raw logits along the class axis. The former
        # softmax over dim=1 normalised across sequence positions, which
        # changes the per-token ranking of classes and so corrupted the
        # accuracy; softmax over the class axis would be a no-op for argmax.
        flat_predictions = torch.argmax(outputs.reshape(-1, args.num_class), dim=1)

        # Drop masked positions before comparing.
        keep = flat_targets != -100
        kept_targets = torch.masked_select(flat_targets, keep)
        kept_predictions = torch.masked_select(flat_predictions, keep)

        accuracy = accuracy_score(kept_targets.cpu().numpy(), kept_predictions.cpu().numpy())
        return {'accuracy': accuracy}

In [None]:
# Train one fold: rows whose `kfold` equals `args.fold` are held out for
# validation, every other row is used for training.
args = Config()

gc.collect()
model = FeedbackModel()
tokenizer = AutoTokenizer.from_pretrained(args.model_name, add_prefix_space=True)

traindf, validdf = df[df['kfold']!=args.fold], df[df['kfold']==args.fold]

# all_tags=[]
# for i in tqdm(range(len(traindf))):
#     all_tags.extend(set(ast.literal_eval(traindf['entities'].values[i])))
    
    
# if args.class_weight_compute == 'global':
#     class_weight = compute_class_weight(classes=unique_tags, y=all_tags, class_weight='balanced')
#     class_weight = torch.tensor(class_weight, dtype=torch.float).to(args.device)

train_dataset = EntityDataset(traindf, tokenizer, args.max_len, get_wids=False)
# The validation set must come from the held-out fold. It was previously
# built from `traindf`, so validation accuracy and early stopping were
# measured on the training data.
valid_dataset = EntityDataset(validdf, tokenizer, args.max_len, get_wids=False)

print('='*50)
print(f'Fold : {args.fold}')
print('='*50)

model.fit(
    train_dataset=train_dataset,
    train_batch_size=args.batch_size,
    valid_dataset=valid_dataset,
    valid_batch_size=args.batch_size*8,
    device='cuda',
    epochs=5,
    save=True,
    es=True,
    es_monitor='valid_accuracy',
    es_epochs=1,
    es_mode='max',
    model_path=f'model_f{args.fold}.bin'
)

# Validation

In [None]:
# Read the raw text of every essay of the validation fold into a frame with
# `id` and `text` columns (the layout the prediction dataset reads).
validation_df = pd.DataFrame()
essays=[]
ids=[]
for idx in tqdm(validdf.id.unique()):
    # Use a context manager so every file handle is closed right away
    # instead of being left to the garbage collector.
    with open(f'../input/feedback-prize-2021/train/{idx}.txt', 'r') as essay_file:
        essays.append(essay_file.read())
    ids.append(idx)
validation_df['id'] = ids
validation_df['text'] = essays

In [None]:
def _label_spans(essay_id, pred, min_select):
    """Group one essay's per-word labels into (id, class, predictionstring) rows."""
    spans = []
    j = 0
    while j < len(pred):
        cls = pred[j]
        if cls == 'O':
            # Move on and re-examine the next word. Without `continue` the
            # word right after every 'O' was skipped, so a span starting
            # there lost its first word.
            j += 1
            continue
        cls = cls.replace('B','I') # spans start with B
        end = j + 1
        while end < len(pred) and pred[end] == cls:
            end += 1

        # Keep only spans strictly longer than `min_select` words.
        if cls != '' and end - j > min_select:
            spans.append((essay_id, cls.replace('I-',''),
                          ' '.join(map(str, range(j, end)))))

        j = end
    return spans


def get_predictions(df, min_select=1, load_path=None):
    """Predict discourse spans for every essay of `df`.

    Relies on the notebook-level `tokenizer`, `args`, `le`, `EntityDataset`
    and `FeedbackModel`.

    Args:
        df: Frame with `id` and `text` columns.
        min_select: A span is kept only if it covers more than this many words.
        load_path: Weights file to load; defaults to the checkpoint of the
            current fold, `./model_f{args.fold}.bin`.

    Returns:
        DataFrame with columns `id`, `class`, `predictionstring` (space-joined
        word indices); empty, with the same columns, when no span qualifies.
    """
    test_dataset = EntityDataset(df, tokenizer, args.max_len, get_wids=True)
    model = FeedbackModel()
    model_path = load_path if load_path is not None else f'./model_f{args.fold}.bin'
    preds = model.predict(test_dataset, batch_size=args.batch_size*4, device='cuda', model_path=model_path)

    # Reduce token-level predictions to one label per word: the label of the
    # word's first token. Tokens with word id -1 belong to no word.
    predictions = []
    for k,text_preds in tqdm(enumerate(preds)):
        token_preds = le.inverse_transform(text_preds)
        word_ids = test_dataset[k]['wids'].cpu().detach().numpy()
        prediction = []
        previous_word_idx = -1
        for idx,word_idx in enumerate(word_ids):
            if word_idx == -1:
                continue
            if word_idx != previous_word_idx:
                prediction.append(token_preds[idx])
                previous_word_idx = word_idx
        predictions.append(prediction)

    final_preds2 = []
    for i, essay_id in enumerate(df.id.values):
        final_preds2.extend(_label_spans(essay_id, predictions[i], min_select))

    # Passing `columns=` keeps the frame well-formed even when no span was
    # found; assigning `.columns` afterwards raises on an empty frame.
    oof = pd.DataFrame(final_preds2, columns=['id','class','predictionstring'])

    del model, test_dataset
    gc.collect()
    return oof

In [None]:
# from Rob Mulla @robikscube
# https://www.kaggle.com/robikscube/student-writing-competition-twitch
def _overlap_ratios(pred_string, gt_string):
    """Return [shared/len(gt), shared/len(pred)] for two space-joined index strings."""
    pred_words = set(pred_string.split(' '))
    gt_words = set(gt_string.split(' '))
    shared = len(gt_words & pred_words)
    # `split` never returns an empty list, so both sets hold at least one
    # element and the divisions are safe.
    return [shared / len(gt_words), shared / len(pred_words)]


def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`.
    Returns [overlap relative to the ground truth,
             overlap relative to the prediction].
    """
    return _overlap_ratios(row.predictionstring_pred, row.predictionstring_gt)


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: Frame with columns `id`, `class`, `predictionstring`.
        gt_df: Frame with columns `id`, `discourse_type`, `predictionstring`.

    Returns:
        TP / (TP + 0.5 * (FP + FN)), or 0.0 when there is nothing to score.
    """
    gt_df = gt_df[['id','discourse_type','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df = pred_df[['id','class','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id','class'],
                           right_on=['id','discourse_type'],
                           how='outer',
                           suffixes=('_pred','_gt')
                          )
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    overlaps = [_overlap_ratios(pred_string, gt_string)
                for pred_string, gt_string in zip(joined['predictionstring_pred'],
                                                  joined['predictionstring_gt'])]
    # The ratios are read straight from the lists (no eval round-trip).
    joined['overlap1'] = pd.Series([pair[0] for pair in overlaps],
                                   index=joined.index, dtype='float64')
    joined['overlap2'] = pd.Series([pair[1] for pair in overlaps],
                                   index=joined.index, dtype='float64')

    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1','overlap2']].max(axis=1)
    tp_pred_ids = joined.query('potential_TP') \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id','predictionstring_gt']).first()['pred_id'].values

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer merge leaves NaN ids on rows that have only a ground truth or
    # only a prediction; those NaNs are not real predictions/ground truths and
    # must be dropped, otherwise each adds a spurious FP/FN.
    tp_lookup = set(tp_pred_ids.tolist())
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique().tolist()
                   if p not in tp_lookup]

    matched_gt_ids = set(joined.query('potential_TP')['gt_id'].dropna().unique().tolist())
    unmatched_gt_ids = [c for c in joined['gt_id'].dropna().unique().tolist()
                        if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    #calc microf1
    denominator = TP + 0.5*(FP+FN)
    if denominator == 0:
        # Neither predictions nor ground truths: nothing to score.
        return 0.0
    my_f1_score = TP / denominator
    return my_f1_score

In [None]:
# Competition ground truth; its `id`, `discourse_type` and `predictionstring`
# columns are used when scoring the out-of-fold predictions.
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')

In [None]:
# Score the out-of-fold predictions of the current fold: one F1 per discourse
# class, then their unweighted mean.
COMPUTE_VAL_SCORE = True

print('='*50)
print(f'Fold : {args.fold}')
print('='*50)

if COMPUTE_VAL_SCORE: # note this doesn't run during submit
    # VALID TARGETS
    valid = train_df.loc[train_df['id'].isin(validdf.id.values)]
    # OOF PREDICTIONS
    oof = get_predictions(validation_df, min_select=6)

    # COMPUTE F1 SCORE
    # Score every class present in the ground truth OR in the predictions.
    # Taking the classes from the predictions alone silently dropped any
    # class the model never predicted, inflating the overall mean.
    CLASSES = sorted(set(valid['discourse_type'].unique()) | set(oof['class'].unique()))
    f1s = []
    print()
    for c in CLASSES:
        pred_df = oof.loc[oof['class']==c].copy()
        gt_df = valid.loc[valid['discourse_type']==c].copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        print(c,f1)
        f1s.append(f1)
    print()
    print('Overall',np.mean(f1s))
    print()

# Submission

In [None]:
# from IPython.display import display
# if args.submission:
#     testdf = pd.DataFrame()

#     essays=[]
#     ids = []
#     for file in os.listdir('../input/feedback-prize-2021/test'):
#         essays.append(open(f'../input/feedback-prize-2021/test/{file}', 'r').read())
#         ids.append(file.split('.')[0])

#     testdf['id'] = ids
#     testdf['text'] = essays

#     preddf = get_predictions(testdf, min_select=6)
#     preddf.to_csv('submission.csv', index=False)
#     display(preddf.head())