In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
import os
# Walk the Kaggle input directory. The print below is commented out, so the
# loop body is a no-op and nothing is listed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
#         print(os.path.join(dirname, filename))
        pass

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Notebook explaining how to train with minimal code using datasets and Trainer API 

* Loads data from a pandas DataFrame into Hugging Face Datasets
* Uses BigBird with a maximum sequence length of 1024 tokens
* Trainer API with fp16 enabled so as to optimize the training processes.
* 

### Improvements
* Tune the hyperparameters of the Trainer
* Improving post processing of labels as mentioned here https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615

## Do upvote if you find it useful. It keeps me motivated to do more quality work. Thanks!


### Load the data and convert the text into a DataFrame, then convert `predictionstring` to NER IOB format. See this [notebook](https://www.kaggle.com/raghavendrakotala/pre-processed-data-for-ner-modeling) for how the data was prepared.

In [None]:
# Load the competition's train.csv into a DataFrame.
train_df = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')

# Show the available column names.
train_df.columns

In [None]:
# Read every file in the test folder into a DataFrame with one row per file:
# `id` is the file name with '.txt' removed, `text` is the raw file contents.
test_dir = '../input/feedback-prize-2021/test'
test_names, test_texts = [], []
# os.listdir returns names in arbitrary order; sort so runs are reproducible.
for f in tqdm(sorted(os.listdir(test_dir))):
    test_names.append(f.replace('.txt', ''))
    # Context manager closes each file handle as soon as it has been read
    # (the bare open(...).read() leaked one handle per essay).
    with open(os.path.join(test_dir, f), 'r') as fh:
        test_texts.append(fh.read())
test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})
# test_texts['text'] = test_texts['text'].apply(lambda x:x.split())
test_texts.head()

In [None]:
# test_names, train_texts = [], []
# for f in tqdm(list(os.listdir('../input/feedback-prize-2021/train'))[:100]):
#     test_names.append(f.replace('.txt', ''))
#     train_texts.append(open('../input/feedback-prize-2021/train/' + f, 'r').read())
# train_text_df = pd.DataFrame({'id': test_names, 'text': train_texts})
# # train_texts['text'] = test_texts['text'].apply(lambda x:x.split())
# train_text_df.head()

In [None]:
# all_entities = []
# for i in tqdm(train_text_df.iterrows()):
#     total = i[1]['text'].split().__len__()
# #     entities = []
#     entities = ["O" for i in range(total)]
#     for j in train_df[train_df['id'] == i[1]['id']].iterrows():
#         discourse = j[1]['discourse_type']
#         list_ix = j[1]['predictionstring'].split()
#         for li in list_ix[1:]:
# #             print(li, entities)
#             entities[int(li)] = f"I-{discourse}"
#         entities[int(list_ix[0])] = f"B-{discourse}"
#     all_entities.append(entities)

In [None]:
# Load the pre-tagged training data from the attached Kaggle dataset.
# Later cells read its `text` and `entities` columns.
train_text_df = pd.read_csv("/kaggle/input/feedback-prize-ner-tagged-data/feedback_prize_ner_tagged_data.csv")
train_text_df.head()

In [None]:
import ast

# Each `entities` cell is read from the CSV as a string; parse it back into a
# Python literal (the next cells iterate over it as a sequence of tag names).
train_text_df['entities'] = train_text_df['entities'].apply(ast.literal_eval)

# Show the parsed tags of the first row as a sanity check.
print(train_text_df['entities'].values[0])

In [None]:
# Full slice: keeps every row. NOTE(review): looks like a hook for training on
# a subset (e.g. train_text_df[:100]) while debugging -- confirm.
train_text_df = train_text_df[:]

### Define config and Load model, tokenizer

In [None]:
import datasets
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import BigBirdForTokenClassification, BigBirdTokenizerFast
from torch import cuda
import torch

In [None]:
# Hyperparameters and paths used by the rest of the notebook.
config = dict(
    model_name='/kaggle/input/huggingfacebigbirdrobertabase/',
    max_length=1024,
    train_batch_size=4,
    valid_batch_size=8,
    epochs=5,
    learning_rate=5e-05,
    max_grad_norm=10,
    warmup=0.1,
    grad_acc=8,
    model_save_path="big-bird",
    # Fall back to CPU when no GPU is visible.
    device='cuda' if cuda.is_available() else 'cpu',
)

# IOB tag set; a tag's position in this list is its integer class id.
output_labels = [
    'O',
    'B-Lead', 'I-Lead',
    'B-Position', 'I-Position',
    'B-Claim', 'I-Claim',
    'B-Counterclaim', 'I-Counterclaim',
    'B-Rebuttal', 'I-Rebuttal',
    'B-Evidence', 'I-Evidence',
    'B-Concluding Statement', 'I-Concluding Statement',
]

# Lookup tables in both directions between tag name and class id.
labels_to_ids = {label: idx for idx, label in enumerate(output_labels)}
ids_to_labels = dict(enumerate(output_labels))

In [None]:
# Translate each row's list of tag names into the matching list of class ids.
train_text_df['labels'] = train_text_df['entities'].apply(
    lambda tags: [labels_to_ids[tag] for tag in tags]
)

In [None]:
# Preview the frame now that the `labels` column has been added.
train_text_df.head()

In [None]:
# Load the tokenizer and the token-classification model from the local
# checkpoint directory; the classifier gets one output per IOB tag.
tokenizer = BigBirdTokenizerFast.from_pretrained(config['model_name'])
model = BigBirdForTokenClassification.from_pretrained(config['model_name'],
                                                     num_labels=len(output_labels))

### Check that the `is_split_into_words` and `return_offsets_mapping` parameters work as expected

In [None]:
# Tokenize the first row as a list of whitespace-separated words.
# NOTE(review): .values[1] picks the second column by position -- presumably
# the `text` column; confirm against the CSV schema.
converted = tokenizer(train_text_df.loc[0].values[1].split(),
                      is_split_into_words=True, return_offsets_mapping=True)

In [None]:
# Print the first 15 (token, offset) pairs of the sample encoded above.
sample_tokens = tokenizer.convert_ids_to_tokens(converted['input_ids'])
for position, (token, offsets) in enumerate(zip(sample_tokens, converted['offset_mapping']), start=1):
    print(token, offsets)
    if position == 15:
        break

### Tokenize the data

* Load data into huggingface datasets.
* Make sure you take care of sub-word tokenizing problem when adding labels to tokenized data.
* use .map to map tokenizer_data function to data
* Look at one sample to see whether mapping is done correctly or not


In [None]:
def tokenizer_data(example):
    """Tokenize one example and align its word-level labels to word pieces.

    Args:
        example: mapping with a 'text' string and a 'labels' sequence of class
            ids, one per whitespace-separated word of 'text'.

    Returns:
        dict of tensors: every field produced by the tokenizer plus 'labels',
        which holds the word's class id on the first word piece of each word
        and -100 everywhere else (special tokens, continuation pieces,
        padding, and words that have no label).
    """
    encoding = tokenizer(example['text'].split(),
                         is_split_into_words=True,
                         truncation=True,
                         padding='max_length',
                         return_offsets_mapping=True,
                         max_length=config['max_length'])
    labels = example['labels']
    # Start with -100 everywhere, then fill in only the first piece of each word.
    encoded_labels = np.full(len(encoding["offset_mapping"]), -100, dtype=int)
    word_idx = 0
    for idx, mapping in enumerate(encoding["offset_mapping"]):
        # With is_split_into_words=True offsets are relative to each word, so a
        # start of 0 with a non-zero end marks the first piece of a word.
        if mapping[0] == 0 and mapping[1] != 0:
            # Words beyond the end of `labels` stay at -100. This replaces a
            # bare `except: pass`, which also hid unrelated errors such as
            # malformed label values or KeyboardInterrupt.
            if word_idx < len(labels):
                encoded_labels[idx] = labels[word_idx]
            word_idx += 1
    item = {key: torch.as_tensor(val) for key, val in encoding.items()}
    item['labels'] = torch.as_tensor(encoded_labels)
    return item


In [None]:
# Wrap the pandas frame in a Hugging Face Dataset so it can be split and mapped.
dataset = datasets.Dataset.from_pandas(train_text_df)

In [None]:
# Hold out 10% of the essays for evaluation. A fixed seed keeps the split (and
# therefore the reported validation loss) the same across notebook re-runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

In [None]:
# Pull a single training example (a dict of column values) to test the
# tokenization function on.
text = dataset['train'][1]

# print(text['text'], text['entities'], text['labels'])

In [None]:
# Run the tokenization/label-alignment function on the single example and
# display the resulting tensors.
converted = tokenizer_data(text)

converted

In [None]:
# Print token, aligned label and offsets for the first 15 positions so the
# label alignment can be checked by eye.
example_tokens = tokenizer.convert_ids_to_tokens(converted["input_ids"])
for position, (token, label) in enumerate(zip(example_tokens, converted["labels"])):
    print(token, label, converted['offset_mapping'][position])
    if position + 1 == 15:
        break

In [None]:
# Apply tokenizer_data to every example of both splits.
dataset = dataset.map(tokenizer_data)

dataset

In [None]:
# Return only the three model inputs, as torch tensors, when indexing the dataset.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask',
                                         'labels'])

dataset

### Check that `input_ids` and `attention_mask` have the same length

In [None]:
# Report the first training example whose input_ids and attention_mask differ
# in length; prints nothing when every example is consistent.
for sample in dataset['train']:
    n_ids = sample['input_ids'].shape[0]
    n_mask = sample['attention_mask'].shape[0]
    if n_ids != n_mask:
        print(sample)
        break

### Define training argument and train the model

In [None]:
# Training configuration: evaluate, log and checkpoint once per epoch, keep only
# the latest checkpoint, and train in fp16 with gradient accumulation.
trainer_args = TrainingArguments('test_trainer',
                                 report_to='none',
                                 num_train_epochs=config['epochs'],
                                 evaluation_strategy='epoch',
                                 per_device_train_batch_size=config['train_batch_size'],
                                 per_device_eval_batch_size=config['valid_batch_size'],
                                 # These two were declared in `config` but never
                                 # passed, so Trainer silently used its defaults.
                                 learning_rate=config['learning_rate'],
                                 max_grad_norm=config['max_grad_norm'],
                                 fp16=True,
                                 save_strategy="epoch",
                                 warmup_ratio=config['warmup'],
                                 gradient_accumulation_steps=config['grad_acc'],
                                 logging_strategy="epoch",
                                 save_total_limit=1
                                 )

# Inputs are already padded to max_length by tokenizer_data, so no custom
# data collator is passed.
trainer = Trainer(model=model,
                  args=trainer_args,
                  train_dataset=dataset['train'],
                  eval_dataset=dataset['test'],
#                   data_collator = data_collator,
                  tokenizer=tokenizer)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

In [None]:
# Device string ('cuda' or 'cpu') that inference inputs are moved to.
device = config['device']

### Write inference function, loop through the test_text and dump into submission file

In [None]:
# Switch the fine-tuned model to evaluation mode (disables dropout).
trainer.model.eval()
def inference(sentence):
    """Predict one IOB tag per word of `sentence`.

    Args:
        sentence: raw essay text; it is split on whitespace into words.

    Returns:
        (prediction, out_str): two parallel lists holding, for each kept
        position, the predicted tag name and the word piece it was read from.
        A position is kept when it is the first word piece of a word, or when
        it is position 1 of the sequence.
    """
    inputs = tokenizer(sentence.split(),
                        is_split_into_words=True,
                        return_offsets_mapping=True,
                        padding='max_length',
                        truncation=True,
                        max_length=4096,
                        return_tensors="pt")

    # move to gpu
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)
    # Forward pass without autograd: no gradients are needed at inference time,
    # and skipping the graph avoids holding activations for a 4096-token input.
    with torch.no_grad():
        outputs = trainer.model(input_ids=ids, attention_mask=mask, return_dict=False)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().tolist()]
    wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

    prediction = []
    out_str = []
    off_list = inputs["offset_mapping"].squeeze().tolist()
    for idx, mapping in enumerate(off_list):
        # Only predictions on first word pieces are kept. Offsets are relative
        # to each word, so (0, n>0) marks the first piece of a word.
        is_first_piece = mapping[0] == 0 and mapping[1] != 0
        # Position 1 is always kept, even when its offsets do not look like a
        # first piece. NOTE(review): presumably a workaround for how the
        # tokenizer reports the first real token's offsets -- confirm.
        if is_first_piece or idx == 1:
            prediction.append(wp_preds[idx][1])
            out_str.append(wp_preds[idx][0])
    return prediction, out_str

In [None]:
final_preds = []
import pdb
# Turn per-word predictions into submission rows: for every essay, merge runs of
# consecutive words with the same class into one span and keep spans longer
# than 7 words as (essay id, class, space-separated word indices).
for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]
    pred, _ = inference(test_texts.text.values[i])
    # Drop the B-/I- prefixes so a whole discourse element is one run.
    pred = [x.replace('B-','').replace('I-','') for x in pred]
    j = 0
    while j < len(pred):
        cls = pred[j]
        # Find the end (exclusive) of the run of identical classes starting at j.
        # 'O' runs are scanned the same way and simply not emitted. The earlier
        # `if cls == 'O': j += 1` shortcut skipped pred[j + 1] without looking
        # at it, so a span preceded by exactly one 'O' lost its first word.
        end = j + 1
        while end < len(pred) and pred[end] == cls:
            end += 1

        if cls != 'O' and cls != '' and end - j > 7:
            final_preds.append((idx, cls, ' '.join(map(str, list(range(j, end))))))

        j = end

# print(final_preds[1])
        
# print(final_preds[1])

In [None]:
# Number of predicted spans across all test essays.
len(final_preds)

In [None]:
# The sample submission is read only for its column names.
test_df = pd.read_csv('../input/feedback-prize-2021/sample_submission.csv')
test_df

# Pass the columns to the constructor instead of assigning them afterwards:
# with an empty `final_preds` the frame would have zero columns and the
# assignment would raise a length-mismatch ValueError.
sub = pd.DataFrame(final_preds, columns=test_df.columns)

sub.head()

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv("submission.csv", index=False)