#### A simple RoBERTa (Hugging Face) NER-based approach

I have taken ideas from this notebook:
https://www.kaggle.com/zzy990106/pytorch-ner-infer

I have created a baseline training pipeline based on RoBERTa for token classification.

This is the training notebook.

Find the prediction notebook here: https://www.kaggle.com/revathiprakash/feedback-prize-baseline-roberta-pytorch-wip/edit/run/82697901


In [None]:
import random
import os
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from sklearn.model_selection import *
from transformers import *


In [None]:
# Hyper-parameters shared by the cells of this notebook.
CFG = {
    # --- bookkeeping -------------------------------------------------------
    'fold_num': 5,  # not referenced by the cells visible in this notebook
    'seed': 42,  # passed to seed_everything()

    # --- model / tokenisation ----------------------------------------------
    'model': '../input/roberta-base',  # checkpoint path for tokenizer and model
    'max_len': 512,  # tokenizer max_length (longer inputs are truncated)

    # --- optimisation ------------------------------------------------------
    'epochs': 3,  # num_train_epochs
    'train_bs': 16,  # per_device_train_batch_size
    'valid_bs': 32,  # per_device_eval_batch_size
    'lr': 1e-4,  # learning_rate
    'num_workers': 0,  # not referenced by the cells visible in this notebook
    'weight_decay': 1e-5,  # weight_decay
}

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU generator, the
    current CUDA device and all CUDA devices), exports ``PYTHONHASHSEED`` and
    puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed handed unchanged to every generator.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes spawned after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN auto-tuning for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fix all RNGs before any model weights or data ordering are created.
seed_everything(CFG['seed'])

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Tag vocabulary: the outside tag 'o' followed by a B-/I- pair for each
# discourse type, in this fixed order.
label_list = ['o'] + [
    prefix + '-' + discourse_type
    for discourse_type in (
        'Lead',
        'Position',
        'Claim',
        'Counterclaim',
        'Rebuttal',
        'Evidence',
        'Concluding Statement',
    )
    for prefix in ('B', 'I')
]

# 1-based code for every tag ('o' -> 1 ... 'I-Concluding Statement' -> 15).
label_encoding_dict = {
    tag: code for code, tag in enumerate(label_list, start=1)
}


In [None]:
# add_prefix_space=True because the tokenizer is later called with
# is_split_into_words=True; RoBERTa's tokenizer needs the prefix space to
# handle pre-split words.
tokenizer = AutoTokenizer.from_pretrained(
    CFG['model'],
    add_prefix_space=True,
)

In [None]:
# Annotation table; the cells below read its 'id' and 'token_class_merged'
# columns.
train = pd.read_csv('../input/feedback-train/finaldata.csv')

In [None]:
# Read every training essay into memory and split it into whitespace-delimited
# words, keyed by the file name without its '.txt' extension.
essay_dir = '../input/feedback-prize-2021/train'
train_names, train_texts = [], []
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the positional train/validation split made later) identical
# from run to run.
for file_name in tqdm(sorted(os.listdir(essay_dir))):
    train_names.append(file_name.replace('.txt', ''))
    # The context manager closes each handle immediately instead of leaking
    # one open file descriptor per essay.
    with open(os.path.join(essay_dir, file_name), 'r') as essay_file:
        train_texts.append(essay_file.read())
train_texts = pd.DataFrame({'id': train_names, 'text': train_texts})
train_texts['text'] = train_texts['text'].apply(lambda x: x.split())

In [None]:

def _split_token_classes(raw):
    """Drop the first and last character of ``str(raw)`` and split on commas.

    Presumably this strips the surrounding brackets of a serialised list --
    TODO confirm against the CSV contents.
    """
    return (str(raw)[1:-1]).split(',')


# Left join: every essay row is kept, and 'token_class_merged' is attached
# wherever the annotation table has a matching id.
annotations = train[['id', 'token_class_merged']]
train_texts = train_texts.merge(annotations, how='left', on=['id'])
train_texts['token_class_merged'] = train_texts['token_class_merged'].apply(_split_token_classes)
train_texts.head(2)

In [None]:
# Positional hold-out split: the first `n_train` essays are used for training
# and all remaining essays for evaluation.
# The previous bounds (1:13000 and 13001:) silently dropped rows 0 and 13000
# from both sets; the slices below are contiguous and cover every row.
n_train = 13000
train_df = train_texts.iloc[:n_train]
test_df = train_texts.iloc[n_train:]

In [None]:
# Tag name -> 0-based class index (position in label_list).
label_to_id = dict(zip(label_list, range(len(label_list))))

# For each class index: the index of the matching "I-" tag when the tag is a
# "B-" tag with an "I-" counterpart in label_list, otherwise the index itself.
b_to_i_label = []
for idx, label in enumerate(label_list):
    inside_label = label.replace("B-", "I-")
    has_inside_twin = label.startswith("B-") and inside_label in label_list
    b_to_i_label.append(label_list.index(inside_label) if has_inside_twin else idx)

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split essays and project word-level tags onto sub-tokens.

    Args:
        examples: Batch mapping with a "text" entry (one list of words per
            essay) and a "token_class_merged" entry (one list of tag strings
            per essay, indexed by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list of integer class ids per essay, aligned with the sub-tokens.
        Sub-tokens that belong to no word get -100.
    """
    # When True, every sub-token of a word receives the word's tag; when False,
    # only the first sub-token does and the rest get -100.
    label_all_tokens = True

    encoded = tokenizer(
        list(examples["text"]),
        truncation=True,
        is_split_into_words=True,
        max_length=CFG['max_len'],
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples['token_class_merged']):
        token_tags = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Sub-token not derived from any input word.
                tag_id = -100
            elif word_tags[word] == '0':
                # Raw tag '0' maps to id 0, the index of 'o' in label_list.
                tag_id = 0
            elif word == last_word and not label_all_tokens:
                # Continuation piece of a word whose first piece carries the tag.
                tag_id = -100
            else:
                tag_id = label_to_id[word_tags[word]]
            token_tags.append(tag_id)
            last_word = word
        aligned_labels.append(token_tags)

    encoded["labels"] = aligned_labels
    return encoded


In [None]:
# IPython shell escape: installs seqeval, the metric loaded by name via
# load_metric("seqeval") in the training cell.
!pip install seqeval

In [None]:
#uncomment for training
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
## training loop
from datasets import load_metric
os.environ["WANDB_DISABLED"] = "true"
model = AutoModelForTokenClassification.from_pretrained(CFG['model'], num_labels=len(label_list))
args = TrainingArguments(
    "test-ner",
    evaluation_strategy = "epoch",
    learning_rate=CFG['lr'],
    per_device_train_batch_size=CFG['train_bs'],
    per_device_eval_batch_size=CFG['valid_bs'],
    num_train_epochs=CFG['epochs'],
    weight_decay=CFG['weight_decay'],
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")


def compute_metrics(p):
    """Convert raw model outputs into seqeval precision/recall/F1/accuracy.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` is arg-maxed over
            axis 2 (assumed to be the class axis of a
            (batch, sequence, num_labels) array -- TODO confirm); ``labels``
            holds class ids, with -100 marking positions to ignore.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from seqeval's ``overall_*`` results.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # seqeval only recognises an upper-case 'O' as the outside tag. Passing the
    # lower-case 'o' used in label_list makes it treat outside tokens as entity
    # chunks, which distorts precision/recall/F1. Translate for scoring only;
    # label_list itself (and the class indices) stay untouched.
    tag_names = ['O' if name == 'o' else name for name in label_list]

    true_predictions, true_labels = [], []
    for pred_row, label_row in zip(predictions, labels):
        # Keep only positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([tag_names[pred] for pred, _ in kept])
        true_labels.append([tag_names[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
# Bundle the model, training arguments, both tokenized splits, the padding
# collator and the metric hook into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()


In [None]:
# Final evaluation pass over the held-out split given to the Trainer.
trainer.evaluate()

# Persist the result twice: once through the Trainer's own save routine and
# once as a bare PyTorch state dict.
trainer.save_model('roberta_v1.model')
final_state_dict = model.state_dict()
torch.save(final_state_dict, './roberta-baseline.pt')



### Please upvote if you find the notebook useful