# Credit to
- [CHRIS DEOTTE, PyTorch - BigBird - NER - [CV 0.615]](https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615/notebook)
- [CPMP, Faster Metric Computation](https://www.kaggle.com/cpmpml/faster-metric-computation/notebook)

## here is the inference part:
[feedback2022_pytorch lightning[infer]](https://www.kaggle.com/fangyu67/feedback2022-pytorch-lightning-infer)


### If you find this useful, please upvote :)

In [None]:
#!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#!python pytorch-xla-env-setup.py --version 1.8.1 --apt-packages libomp5 libopenblas-dev
#!pip install pytorch-lightning==1.5.10

#import torch_xla
#import torch_xla.core.xla_model as xm
#import torch_xla.distributed.xla_multiprocessing as xmp
#!export XLA_USE_BF16=1

In [None]:
from tqdm.auto import tqdm
import os
import random
import numpy as np
import pandas as pd

import gc
pd.set_option('display.max_columns', None)
gc.enable()

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
#from torch.utils.data import RandomSampler, SequentialSampler,TensorDataset
from torch.optim.lr_scheduler import OneCycleLR#,CosineAnnealingLR
#from torch.optim import lr_scheduler

from pytorch_lightning import LightningModule, LightningDataModule,Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,LearningRateMonitor

# transformer
from transformers import AutoTokenizer, AutoModel, AdamW,AutoConfig,AutoModelForTokenClassification

#
from sklearn.metrics import accuracy_score



def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices)
    and puts cuDNN into its deterministic configuration.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current one;
    # it is safe to call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels between
    # runs, which defeats the deterministic flag set above.
    torch.backends.cudnn.benchmark = False
    # Exported for child processes; the hash seed of the interpreter that is
    # already running is fixed at start-up and is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)


# Config

In [None]:
class CFG:
    """Hyper-parameters and input paths for the BigBird token-classification run.

    All values are plain instance attributes set in ``__init__``; the
    attribute order is kept stable because the instance ``__dict__`` is
    handed to the model as its configuration mapping.
    """

    def __init__(self):
        # Values shared by several attributes below.
        pretrained_dir = '../input/py-bigbird-v26'
        per_step_batch = 4
        base_lr = 5e-5

        # ---- runtime ----------------------------------------------------
        self.n_procs = 1
        self.num_workers = 2
        self.precision = 16
        self.seed = 2022
        self.scheduler = 'lr_logging'  # name given to the LR scheduler entry

        # ---- model / tokenizer -------------------------------------------
        self.model_name = 'BigBird'
        self.modelpath = pretrained_dir
        self.tokpath = pretrained_dir
        self.max_length = 1024

        # ---- training loop -----------------------------------------------
        self.num_labels = 15
        self.epochs = 5
        self.batch_size = per_step_batch
        self.val_batch_size = self.batch_size

        # ---- optimisation (OneCycleLR) -------------------------------------
        self.lr = base_lr
        self.max_lr = self.lr
        # Placeholder only: recomputed from the real dataset size before training.
        self.steps_lr = 123 // self.batch_size + 1
        

# NOTE: this rebinds the name CFG from the class to its single instance, so
# the class can no longer be instantiated by name after this line. Every
# later cell reads the settings from this instance.
CFG = CFG()
# Seed all RNGs up front so the random train/validation split is repeatable.
seed_everything(CFG.seed)

# Data

In [None]:
# Annotation table; later cells read its `id`, `discourse_type` and
# `predictionstring` columns.
train_df = pd.read_csv('../input/feedback-prize-2021/train.csv')
# READTOKEN switches the following cells: True loads the precomputed
# train_NER.csv, False rebuilds the word-level labels from the raw essays.
READTOKEN = True

print( train_df.shape )
train_df.head(2)


In [None]:
# df_gt
#df_gt = train_df[['id','discourse_type','predictionstring']].copy()
#df_gt['labelList'] = df_gt['predictionstring'].apply(lambda x: [int(num) for num in x.split()])
#df_gt.head(2)

In [None]:
if not READTOKEN:
    # Build a (id, text) frame from the raw essay files.
    # https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
    train_dir = '../input/feedback-prize-2021/train'
    text_names, train_texts = [], []
    for f in tqdm(os.listdir(train_dir)):
        text_names.append(f.replace('.txt', ''))
        # Context manager closes each file handle instead of leaking one per essay.
        with open(os.path.join(train_dir, f), 'r') as fh:
            text = fh.read()
        # Insert a space after every comma so whitespace splitting separates
        # comma-joined words.
        # NOTE(review): this can change the word count relative to the
        # original text; confirm that `predictionstring` indices still line up.
        text = text.replace(',',', ')
        train_texts.append(text)
    train_text_df = pd.DataFrame({'id': text_names, 'text': train_texts})
    train_text_df.head()

In [None]:

# Imported unconditionally: literal_eval is needed below when loading the CSV
# and again by the model's validation hook to decode the stringified word ids,
# whichever READTOKEN branch runs.
from ast import literal_eval

LOAD_TOKENS_FROM = '../input/feedback2022'

if not READTOKEN:
    # Group the annotation rows by essay once, instead of re-filtering the
    # whole of train_df for every essay.
    annotations_by_id = {
        essay_id: rows for essay_id, rows in train_df.groupby('id', sort=False)
    }
    all_entities = []

    for ii, essay in enumerate(train_text_df.itertuples(index=False)):
        if ii%100==0: print(ii,', ',end='')
        # One label per whitespace-separated word; "O" means outside any span.
        entities = ["O"] * len(essay.text.split())
        essay_rows = annotations_by_id.get(essay.id)
        if essay_rows is not None:
            for discourse, predictionstring in zip(essay_rows['discourse_type'],
                                                   essay_rows['predictionstring']):
                list_ix = [int(x) for x in predictionstring.split(' ')]
                # First word of a span gets B-, the remaining words get I-.
                entities[list_ix[0]] = f"B-{discourse}"
                for k in list_ix[1:]:
                    entities[k] = f"I-{discourse}"
        all_entities.append(entities)
    train_text_df['entities'] = all_entities
    train_text_df.to_csv('train_NER.csv',index=False)

else:
    train_text_df = pd.read_csv(f'{LOAD_TOKENS_FROM}/train_NER.csv')
    # pandas saves lists as strings, so convert them back to real lists.
    train_text_df['entities'] = train_text_df['entities'].apply(literal_eval)

print( train_text_df.shape )
train_text_df.head(3)

In [None]:
# Label vocabulary shared by training and inference: "O" first, then a
# B-/I- pair for each discourse type, in this fixed order.
_DISCOURSE_TYPES = ('Lead', 'Position', 'Claim', 'Counterclaim',
                    'Rebuttal', 'Evidence', 'Concluding Statement')

output_labels = ['O']
for _discourse in _DISCOURSE_TYPES:
    output_labels.extend([f'B-{_discourse}', f'I-{_discourse}'])

# Forward and reverse lookups between label strings and class ids, e.g.
# 'O' <-> 0, 'B-Lead' <-> 1, 'I-Lead' <-> 2, ..., 'I-Concluding Statement' <-> 14.
ids_to_labels = dict(enumerate(output_labels))
labels_to_ids = {label: idx for idx, label in ids_to_labels.items()}

In [None]:
# for debug
#train_text_df = train_text_df.sample(500)

# Unique essay ids: the split is made per essay, not per annotation row.
IDS = train_df.id.unique()
print('There are',len(IDS),'train texts. We will split 90% 10% for validation.')

# Sample 90% of the id positions without replacement for training;
# whatever is left over becomes the validation set.
_all_positions = np.arange(len(IDS))
_n_train = int(0.9*len(IDS))
train_idx = np.random.choice(_all_positions, _n_train, replace=False)
valid_idx = np.setdiff1d(_all_positions, train_idx)

In [None]:
# Essay-level frames for the two splits, each re-indexed from 0.
data = train_text_df[['id','text', 'entities']]
_in_train = data['id'].isin(IDS[train_idx])
_in_valid = data['id'].isin(IDS[valid_idx])
train_dataset = data.loc[_in_train, ['id','text', 'entities']].reset_index(drop=True)
test_dataset = data.loc[_in_valid].reset_index(drop=True)

# Report the shapes so the split sizes can be checked at a glance.
print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

In [None]:
# Annotation-level ground truth for each split, restricted to the three
# columns the metric reads.
_gt_columns = ['id','discourse_type', 'predictionstring']
_train_rows = train_df['id'].isin(IDS[train_idx])
_valid_rows = train_df['id'].isin(IDS[valid_idx])
train_gt = train_df.loc[_train_rows, _gt_columns].reset_index(drop=True)
test_gt = train_df.loc[_valid_rows, _gt_columns].reset_index(drop=True)

print(len(train_gt),len(test_gt))
train_gt.head(1)

# Data Class

In [None]:
class Dataset(Dataset):
    """Token-classification dataset over a frame of essays.

    Each item tokenizes one essay (pre-split on whitespace), pads/truncates it
    to ``max_len`` and aligns the word-level labels to the produced tokens.

    The class keeps the name ``Dataset`` (shadowing the imported torch base
    class) because other cells construct it under that name.

    Args:
        dataframe: Frame with ``id``, ``text`` and ``entities`` columns, where
            ``entities`` holds one label string per whitespace-separated word.
        tokenizer: Callable accepting a list of words with
            ``is_split_into_words=True`` and returning an encoding that exposes
            ``input_ids``, ``attention_mask`` and ``word_ids()``.
        max_len: Sequence length used for padding and truncation.
        label_all_subtokens: When True (default, the previous hard-coded
            behaviour) every sub-token inherits its word's label; when False
            only the first sub-token of each word is labelled and the others
            get -100.
    """

    def __init__(self, dataframe, tokenizer, max_len, label_all_subtokens=True):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_all_subtokens = label_all_subtokens

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional lookup: works even if the frame's index is not 0..n-1.
        text = self.data['text'].iloc[index]
        word_labels = self.data['entities'].iloc[index]
        text_id = self.data['id'].iloc[index]

        # Tokenize the pre-split words so word_ids() maps tokens back to words.
        encoding = self.tokenizer(text.split(),
                                  is_split_into_words=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # One entry per token: the index of the source word, or None for
        # tokens that do not belong to any word (e.g. padding).
        word_ids = encoding.word_ids()

        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # -100 marks positions that the loss should ignore.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or self.label_all_subtokens:
                label_ids.append(labels_to_ids[word_labels[word_idx]])
            else:
                # Continuation sub-token with sub-token labelling disabled.
                label_ids.append(-100)
            previous_word_idx = word_idx

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long),
            # Stringified because word_ids contains None; consumers parse it
            # back with literal_eval.
            'word_ids': str(word_ids),
            'text_id': text_id
        }


class DataModule(LightningDataModule):
    """Lightning data module wrapping the train and validation frames.

    Args:
        train_df: Frame of training essays passed to ``Dataset``.
        val_df: Frame of validation essays passed to ``Dataset``.
        tokenizer: Tokenizer forwarded to ``Dataset``.
        cfg: Config object providing ``max_length``, ``batch_size``,
            ``val_batch_size`` and ``num_workers``.
    """

    def __init__(self, train_df, val_df, tokenizer, cfg=None):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.cfg = cfg
        self.tokenizer = tokenizer

    def setup(self, stage=None):
        """Build the datasets required for ``stage``.

        ``stage`` defaults to None (build everything) so the hook can also be
        called without an argument; the validation set is additionally built
        for ``'validate'`` so a validation-only run finds its dataset.
        """
        if stage in (None, 'fit'):
            self.train_ds = Dataset(self.train_df, self.tokenizer, self.cfg.max_length)
        if stage in (None, 'fit', 'validate'):
            self.valid_ds = Dataset(self.val_df, self.tokenizer, self.cfg.max_length)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return DataLoader(
            self.train_ds, batch_size=self.cfg.batch_size,
            shuffle=True, num_workers=self.cfg.num_workers,
            pin_memory=True
            )

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return DataLoader(
            self.valid_ds, batch_size=self.cfg.val_batch_size,
            shuffle=False, num_workers=self.cfg.num_workers,
            pin_memory=True
            )

# Metric

In [None]:
def calc_overlap3(set_pred, set_gt):
    """Decide whether a prediction and a ground truth overlap enough to match.

    A pair is a potential true positive when the intersection covers at
    least half of the ground truth AND at least half of the prediction.

    Args:
        set_pred: Set of predicted word indices, or a non-set placeholder
            (e.g. NaN produced by an outer merge) when there is no prediction.
        set_gt: Set of ground-truth word indices, or a non-set placeholder.

    Returns:
        True if both overlap ratios are >= 0.5, otherwise False. Placeholder
        inputs and empty sets yield False.
    """
    try:
        len_gt = len(set_gt)
        len_pred = len(set_pred)
        inter = len(set_gt & set_pred)

        overlap_1 = inter / len_gt
        overlap_2 = inter / len_pred
    except (TypeError, ZeroDivisionError):
        # TypeError: at least one input is NaN/None rather than a set.
        # ZeroDivisionError: at least one input is an empty set.
        return False
    return overlap_1 >= 0.5 and overlap_2 >= 0.5

def score_feedback_comp_micro3(pred_df, gt_df, discourse_type):
    """Compute the F1 score for one discourse type.

    Follows the steps on the competition evaluation page:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: Predictions with ``id``, ``discourse_type`` and
            ``predictionstring`` (space-separated word indices) columns.
        gt_df: Ground truth with the same three columns.
        discourse_type: The class to score.

    Returns:
        ``2*TP / (#predictions + #ground truths)`` for the class, or 0.0 when
        the class has neither predictions nor ground truths.
    """
    gt_df = gt_df.loc[gt_df['discourse_type'] == discourse_type,
                      ['id', 'predictionstring']].reset_index(drop=True)
    pred_df = pred_df.loc[pred_df['discourse_type'] == discourse_type,
                          ['id', 'predictionstring']].reset_index(drop=True)
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index

    # Turn each space-separated index string into a set for overlap tests.
    pred_df['predictionstring'] = [set(pred.split(' ')) for pred in pred_df['predictionstring']]
    gt_df['predictionstring'] = [set(pred.split(' ')) for pred in gt_df['predictionstring']]

    # Step 1. All ground truths and predictions of the class are compared
    # within each essay; the outer join leaves NaN on the missing side.
    joined = pred_df.merge(gt_df,
                           left_on='id',
                           right_on='id',
                           how='outer',
                           suffixes=('_pred', '_gt'))

    overlaps = [calc_overlap3(*args) for args in zip(joined.predictionstring_pred,
                                                     joined.predictionstring_gt)]

    # Step 2. A pair whose two overlap ratios are both >= 0.5 is a match.
    # Counting the distinct matched ground truths gives TP without having to
    # resolve which prediction is the best match for each of them.
    TP = joined.loc[overlaps, 'gt_id'].nunique()

    # Step 3. Unmatched ground truths are false negatives and unmatched
    # predictions are false positives, so the denominator is simply the
    # number of predictions plus the number of ground truths.
    TPandFP = len(pred_df)
    TPandFN = len(gt_df)

    denominator = TPandFP + TPandFN
    if denominator == 0:
        # Nothing predicted and nothing to find: avoid dividing by zero.
        return 0.0
    return 2 * TP / denominator


def score_feedback_comp3(pred_df, gt_df, return_class_scores=False):
    """Average the per-class F1 over every discourse type present in ``gt_df``.

    Args:
        pred_df: Prediction frame forwarded to ``score_feedback_comp_micro3``.
        gt_df: Ground-truth frame; its ``discourse_type`` column defines the
            classes that are scored.
        return_class_scores: When True, also return the per-class scores.

    Returns:
        The mean F1, or ``(mean F1, {discourse_type: F1})`` when
        ``return_class_scores`` is True.
    """
    class_scores = {
        kind: score_feedback_comp_micro3(pred_df, gt_df, kind)
        for kind in gt_df.discourse_type.unique()
    }
    f1 = np.mean(list(class_scores.values()))
    return (f1, class_scores) if return_class_scores else f1

#score_feedback_comp3(df_gt, df_gt,True)

# Model

In [None]:
class ModelModule(LightningModule):
    """Lightning module fine-tuning a token-classification transformer.

    Validation converts token predictions back into word-level spans and
    scores them against the ground-truth frame with ``score_feedback_comp3``.

    Args:
        cfg: Mapping with the keys ``modelpath``, ``num_labels``, ``lr``,
            ``max_lr``, ``steps_lr``, ``epochs`` and ``scheduler``.
        train_gt: Ground-truth frame for the training essays (stored only).
        test_gt: Ground-truth frame for the validation essays.
    """

    # Column layout of the prediction frame (same as the ground-truth frames).
    PRED_COLUMNS = ['id', 'discourse_type', 'predictionstring']
    # A predicted span is kept only when it covers MORE than this many words.
    MIN_SPAN_WORDS = 3

    def __init__(self, cfg,train_gt,test_gt):
        super().__init__()
        self.df_pred = pd.DataFrame(columns=self.PRED_COLUMNS)
        self.df_gt_train = train_gt
        self.df_gt_val = test_gt

        self.cfg = cfg
        config = AutoConfig.from_pretrained(self.cfg['modelpath'])
        config.num_labels = self.cfg['num_labels']
        self.model = AutoModelForTokenClassification.from_pretrained(self.cfg['modelpath'], config=config)

    def forward(self, input_ids, attention_mask,labels):
        """Run the wrapped model and return ``(loss, logits)``."""
        out = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return out.loss, out.logits

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch.

        Only the loss is returned: nothing consumes per-batch predictions
        during training, so computing them and copying them to the host on
        every step would only cost time and memory.
        """
        loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute the loss and per-token class predictions for one batch."""
        loss, logits = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        pred = torch.argmax(logits, dim=-1).cpu().detach().numpy()  # (N, seq)
        labels = batch['labels'].cpu().detach().numpy()  # (N, seq)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return {
            'loss': loss,
            'pred': pred,
            'word_ids': batch['word_ids'],  # stringified word-id lists, one per sample
            'text_id': batch['text_id'],
            'labels': labels
        }

    def training_epoch_end(self, output):
        """Intentionally a no-op: no training-set metric is computed."""

    def validation_epoch_end(self,output):
        """Turn the epoch's predictions into spans, score them and log ``val_f1``."""
        # Local import so this hook does not depend on an import that another
        # cell performs only on one of its branches.
        from ast import literal_eval

        rows = []
        for bat in output:
            word_ids = [literal_eval(e) for e in bat['word_ids']]
            for text_id, pred, wids in zip(bat['text_id'], bat['pred'], word_ids):
                rows.extend(self._extract_spans(text_id, pred, wids))
        # Build the frame once per epoch instead of growing it row by row.
        self._append_rows(rows)

        f1 = score_feedback_comp3(self.df_pred, self.df_gt_val)
        self.log('val_f1', f1, prog_bar=True)
        # Start the next validation epoch from an empty prediction frame.
        self.df_pred = pd.DataFrame(columns=self.PRED_COLUMNS)

    def build_df(self,id_, pred_, labels_, word_ids_):
        """Append the spans predicted for one essay to ``self.df_pred``.

        Args:
            id_: Essay id written to the ``id`` column.
            pred_: Per-token predicted class ids.
            labels_: Per-token target ids (accepted for compatibility, unused).
            word_ids_: Per-token word index, or None for non-word tokens.
        """
        self._append_rows(self._extract_spans(id_, pred_, word_ids_))

    def _append_rows(self, rows):
        """Add ``rows`` (lists in PRED_COLUMNS order) to ``self.df_pred``."""
        if not rows:
            return
        new_rows = pd.DataFrame(rows, columns=self.PRED_COLUMNS)
        if self.df_pred.empty:
            self.df_pred = new_rows
        else:
            self.df_pred = pd.concat([self.df_pred, new_rows], ignore_index=True)

    def _extract_spans(self, id_, pred_, word_ids_):
        """Return ``[id, discourse_type, predictionstring]`` rows for one essay."""
        token_labels = [ids_to_labels[i] for i in pred_]

        # Collapse token predictions to word level, keeping only the
        # prediction of each word's first sub-token.
        word_labels = []
        previous_word_idx = -1
        for token_pos, word_idx in enumerate(word_ids_):
            if word_idx is not None and word_idx != previous_word_idx:
                word_labels.append(token_labels[token_pos])
                previous_word_idx = word_idx

        rows = []
        n_words = len(word_labels)
        j = 0
        while j < n_words:
            if word_labels[j] == 'O':
                j += 1
                continue
            # A span may open with either B-x or I-x; it extends over the
            # following I-x words of the same discourse type.
            inside_tag = word_labels[j].replace('B-', 'I-', 1)
            end = j + 1
            while end < n_words and word_labels[end] == inside_tag:
                end += 1

            if end - j > self.MIN_SPAN_WORDS:
                rows.append([id_,
                             inside_tag.replace('I-', '', 1),
                             ' '.join(map(str, range(j, end)))])
            j = end
        return rows

    def configure_optimizers(self):
        """Return AdamW with a per-step OneCycleLR schedule."""
        optimizer = AdamW(self.parameters(), lr=self.cfg['lr'])

        scheduler = {
            'scheduler': OneCycleLR(
                optimizer,
                max_lr=self.cfg['max_lr'],
                steps_per_epoch=self.cfg['steps_lr'],
                epochs=self.cfg['epochs'],
            ),
            'name': self.cfg['scheduler'],
            # OneCycleLR is sized in optimizer steps, so step it every batch.
            'interval': 'step',
            'frequency': 1
            }

        return [optimizer], [scheduler]
        
        
        
        

# Train

In [None]:
###
# Tag used in the checkpoint file name.
tag = 'ep{}-len{}'.format(CFG.epochs,CFG.max_length)

# Optimizer steps per epoch for OneCycleLR: the loader keeps the last partial
# batch, so this is ceil(len / batch_size). The previous `len // batch + 1`
# counted one step too many when the size is an exact multiple of the batch.
CFG.steps_lr = (len(train_dataset) + CFG.batch_size - 1) // CFG.batch_size

tokenizer = AutoTokenizer.from_pretrained(CFG.tokpath,add_prefix_space=True)

dm = DataModule(train_dataset, test_dataset, tokenizer, CFG)
# The model takes the config as a mapping, hence the instance __dict__.
model = ModelModule(CFG.__dict__,train_gt,test_gt)


filename = f"{CFG.model_name}-{tag}"
# Keep only the checkpoint with the best validation F1.
checkpoint_callback = ModelCheckpoint(monitor='val_f1', dirpath='./', mode='max', filename=filename,save_top_k=1)
lr_logger = LearningRateMonitor(logging_interval="step")


trainer = Trainer(
    gpus=CFG.n_procs,
    max_epochs=CFG.epochs,
    precision=CFG.precision,
    # Skip the pre-training sanity validation pass.
    num_sanity_val_steps=0,
    callbacks=[checkpoint_callback,lr_logger],
    )

trainer.fit(model, datamodule=dm)

# Drop this cell's reference to the model and release cached GPU memory.
del model
gc.collect()
torch.cuda.empty_cache()