In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import string
import os
from torch import cuda
import seaborn as sns
import matplotlib.pyplot as plt

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig, BertModel
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import transformers
from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from seqeval.metrics import f1_score, accuracy_score, recall_score
from sklearn.metrics import classification_report
from transformers import RobertaTokenizer, RobertaForTokenClassification
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
pip install seqeval

In [None]:
def read_data(train_path="../input/bio-tagged/train_final_all.csv",
              test_path="../input/bio-tagged/test_final_all.csv"):
    """Load the BIO-tagged train and test CSV files.

    Parameters
    ----------
    train_path : str or path-like, optional
        Location of the training CSV. Defaults to the path this notebook
        has always read from.
    test_path : str or path-like, optional
        Location of the test CSV. Defaults to the path this notebook has
        always read from.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, data)`` where ``data`` holds the rows of ``train``
        followed by the rows of ``test`` (original row labels are kept).
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat produces the same stacked frame.
    data = pd.concat([train, test])

    return train, test, data

In [None]:
def data_stats(data):
    """Print summary statistics for the ``BIO`` column of ``data``.

    Prints the number of distinct tags, the per-tag frequencies and the
    per-category totals sorted from most to least frequent. Returns None.
    """
    frequencies = data.BIO.value_counts()

    # A category is characters 2..4 of the tag, i.e. the tag without its
    # two-character prefix; the "O" tag belongs to no category.
    per_category = {}
    for tag, count in frequencies.items():
        if tag == "O":
            continue
        category = tag[2:5]
        per_category[category] = per_category.get(category, 0) + count

    n_tags = len(data.BIO.unique())
    ranked = sorted(per_category.items(), key=lambda item: item[1], reverse=True)

    print("Number of tags: {}".format(n_tags))
    print("Tag frequencies: {}".format(frequencies))
    print("Categories: ")
    print(ranked)

In [None]:
def group_sentences(data, category):
    """Split a token-level frame into one record array per sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Token-level rows; must contain the columns ``Sent_id``, ``Token``,
        ``Token_index`` and ``category``.
    category : str
        Name of the label column appended as the last field of each record.

    Returns
    -------
    list of numpy.recarray
        One entry per distinct ``Sent_id``, in order of first appearance.
        Each record is ``(Token, Token_index, <columns at positions 4..148>,
        category)``.

    Notes
    -----
    A single ``groupby`` pass replaces the previous full-frame boolean filter
    per sentence id, which was quadratic in the number of sentences. Rows
    whose ``Sent_id`` is missing are skipped by ``groupby``.
    """
    all_sents = []
    # sort=False keeps the groups in order of first appearance, matching the
    # order Series.unique() would give.
    for _, sent_df in data.groupby('Sent_id', sort=False):
        sent_df = pd.concat(
            [sent_df['Token'], sent_df["Token_index"], sent_df.iloc[:, 4:149], sent_df[category]],
            axis=1,
        )
        all_sents.append(sent_df.to_records(index=False))
    return all_sents

In [None]:
def set_processor_params():
    """Select the compute device and count the visible GPUs.

    Returns
    -------
    tuple
        ``(device, n_gpu)`` where ``device`` is ``'cuda'`` when CUDA is
        available and ``'cpu'`` otherwise, and ``n_gpu`` is
        ``torch.cuda.device_count()``.
    """
    has_cuda = cuda.is_available()
    device = 'cuda' if has_cuda else 'cpu'
    n_gpu = torch.cuda.device_count()
    if has_cuda:
        # Asking for a device name raises when there is no CUDA device, so
        # only do it on a GPU machine; otherwise the CPU fallback above would
        # never be reached.
        print("Using GPU: {}".format(torch.cuda.get_device_name(0)))
    return device, n_gpu

In [None]:
def set_session_params():
    """Return the session hyper-parameters together with the tokenizer.

    Returns
    -------
    tuple
        ``(max_length, bs, epochs, learning_rate, max_grad_norm, tokenizer)``.
    """
    hyper_params = (
        300,    # max_length
        16,     # bs
        7,      # epochs
        1e-05,  # learning_rate
        10,     # max_grad_norm
    )
    # Alternative checkpoints previously left commented out here:
    # 'roberta-large' (RobertaTokenizer) and 'xlm-roberta-base' (AutoTokenizer).
    roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    return (*hyper_params, roberta_tokenizer)

In [None]:
def remove_sents_over_threshold(sents, threshold):
    """Return the sentences whose length is strictly below ``threshold``.

    The input order is preserved and a new list is returned.
    """
    return [sent for sent in sents if len(sent) < threshold]

In [None]:
def tokenize(sentence, sentence_labels):
    """Split every word into subword tokens and spread its label over them.

    Uses the module-level ``tokenizer``. Each word is converted with
    ``str()`` before tokenisation, and the word's label is repeated once per
    resulting subword.

    Returns
    -------
    tuple of list
        ``(tokenized_sentence, labels)``, two lists of equal length.
    """
    subword_tokens, subword_labels = [], []
    for word, label in zip(sentence, sentence_labels):
        pieces = tokenizer.tokenize(str(word))
        subword_tokens += pieces
        subword_labels += [label] * len(pieces)
    return subword_tokens, subword_labels

In [None]:
def plot_learning_curves():
    """Plot the training and validation loss curves.

    Reads the module-level lists ``loss_values`` and
    ``validation_loss_values`` and shows them on a single figure.
    """
    sns.set(style='darkgrid')
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (6,6)

    curves = (
        (loss_values, 'b-o', "training loss"),
        (validation_loss_values, 'r-o', "validation loss"),
    )
    for series, line_format, legend_name in curves:
        plt.plot(series, line_format, label=legend_name)

    plt.title("Learning curve")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

In [None]:
# ---------------------------------------------------------------------------
# Data preparation, model construction and optimizer / scheduler set-up.
# ---------------------------------------------------------------------------
SEED = 42  # makes the train/validation split reproducible

train, test, data = read_data() # We'll save the test set for later
#data_stats(data)
device, n_gpu = set_processor_params()

MAX_LEN, BATCH_SIZE, EPOCHS, LEARNING_RATE, MAX_GRAD_NORM, tokenizer = set_session_params()

# Sort the tags so that tag ids are stable between runs (iteration order of a
# set of strings changes from one interpreter session to the next).
tag_values = sorted(set(train["BIO"].values), key=str)
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}
idx2tag = {i: t for i, t in enumerate(tag_values)}

sents = group_sentences(data, 'BIO')
sents = remove_sents_over_threshold(sents, MAX_LEN)
sentences = [[word[0] for word in sentence] for sentence in sents]
# The label is the last field of every token record.
labels = [[tag2idx[w[len(w)-1]] for w in s] for s in sents]
train_sents, test_sents, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.25, shuffle = False)

tokenized_texts_and_labels = [
    tokenize(sentence, sentence_labels) for sentence, sentence_labels in zip(train_sents, train_labels)
]

tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels_subwords = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

# Cut / pad the token and label sequences to the max length.
# Pad with the tokenizer's own padding id: for RoBERTa id 0 is the "<s>"
# token, not padding.
PAD_TOKEN_ID = tokenizer.pad_token_id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen = MAX_LEN, dtype="long", value=PAD_TOKEN_ID,
                          truncating="post", padding="post")
input_tags = pad_sequences(labels_subwords,
                           maxlen = MAX_LEN, value = tag2idx["PAD"],
                           padding="post", dtype="long", truncating="post")

# Build the attention mask from the true sequence lengths instead of comparing
# token ids with the padding value, so it stays right whatever the padding id.
attention_masks = np.zeros(input_ids.shape, dtype="float32")
for row, txt in enumerate(tokenized_texts):
    attention_masks[row, :min(len(txt), MAX_LEN)] = 1.0

# Train and validation split.
# All three arrays MUST go through the same call: two independent calls
# shuffle differently and pair every sequence with another sequence's mask.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, input_tags, attention_masks,
                                         test_size=0.2, random_state=SEED)
tr_inputs = torch.tensor(tr_inputs, dtype=torch.long)
val_inputs = torch.tensor(val_inputs, dtype=torch.long)
tr_tags = torch.tensor(tr_tags, dtype=torch.long)
val_tags = torch.tensor(val_tags, dtype=torch.long)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

# Pretrained model params.
# num_labels must match the tag set (including "PAD"); without it the
# classification head is created with the default of 2 classes.
model = RobertaForTokenClassification.from_pretrained('roberta-base',
                                                      num_labels = len(tag2idx),
                                                      output_attentions = False,
                                                      output_hidden_states = False)
model.to(device) # Move the model parameters to the selected device

# Set optimizer parameters.
# Biases and LayerNorm weights are excluded from weight decay. The parameter
# group key has to be 'weight_decay'; any other key is silently ignored by the
# optimizer and leaves the decay at its default.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}]
# Use the learning rate configured in set_session_params(); a hard-coded 3e-5
# here used to override it silently.
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=1e-8)

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Per-epoch history, read later by plot_learning_curves().
loss_values, validation_loss_values = [], []
accuracy_values, validation_accuracy_values = [], []

for epoch in trange(EPOCHS, desc="Epoch"):

    # TRAINING
    # Perform one full pass over the training set
    model.train() # Put the model into training mode
    total_loss = 0 # Reset the total loss for the current epoch

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch) # Move the batch to the device
        b_input_ids, b_input_mask, b_labels = batch # Input ids, mask and labels of the current batch
        model.zero_grad() # Clear previously accumulated gradients before the backward pass

        # Forward pass. Because `labels` is provided, the first output is the loss.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        # Backward pass to calculate the gradients
        loss.backward()
        total_loss += loss.item() # Track train loss

        # Clip the norm of the gradient to help prevent the exploding gradients problem
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)

        optimizer.step() # Update parameters
        scheduler.step() # Update the learning rate

    avg_train_loss = total_loss / len(train_dataloader) # Average loss over the training batches
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss) # Stored for the learning curve

    # VALIDATION
    # After each training epoch, measure performance on the validation set
    model.eval() # Put the model into evaluation mode
    eval_loss = 0 # Reset the validation loss for the current epoch
    predictions , true_labels = [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; this saves memory and time
        with torch.no_grad():
            # Labels are passed here as well, so the outputs are (loss, logits, ...)
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

        logits = outputs[1].detach().cpu().numpy() # Move logits to cpu
        label_ids = b_labels.to('cpu').numpy() # Move labels to cpu
        eval_loss += outputs[0].mean().item() # Validation loss for the current batch

        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))

    # Accuracy over the whole validation set, ignoring positions whose gold label is "PAD"
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    validation_accuracy = accuracy_score(valid_tags, pred_tags) # Computed once, reused below
    validation_accuracy_values.append(validation_accuracy)
    print("Validation Accuracy: {}".format(validation_accuracy))

In [None]:
# Plot the per-epoch training and validation losses collected by the training loop.
plot_learning_curves()

In [None]:
print("Prepare test set...")
test_sentences = [[word for word in sentence] for sentence in test_sents]

print("Tokenize and predict...")
all_predictions = []
all_true_labels = []

for lab in test_labels:
    all_true_labels.extend(lab)

# Tag given to a word for which the model produced no prediction (the word
# yields no subword, or it lies beyond the MAX_LEN truncation point).
fallback_tag = "O" if "O" in tag2idx else tag_values[0]

model.eval() # Make sure dropout is disabled for inference

for test_sentence in test_sentences:
    # Tokenise word by word, exactly like the training data (subwords of
    # str(word), no special tokens), and remember where each word starts.
    subword_ids, first_subword = [], []
    for word in test_sentence:
        pieces = tokenizer.tokenize(str(word))
        first_subword.append(len(subword_ids) if pieces else None)
        subword_ids.extend(tokenizer.convert_tokens_to_ids(pieces))
    subword_ids = subword_ids[:MAX_LEN] # Same truncation as the training inputs

    if subword_ids:
        sentence_ids = torch.tensor([subword_ids], dtype=torch.long).to(device)
        with torch.no_grad():
            output = model(sentence_ids)
        # No labels were passed, so the first output holds the logits.
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)[0]
    else:
        label_indices = []

    # Emit exactly one predicted tag per original word (the prediction of its
    # first subword) so the predictions stay aligned with test_labels.
    new_tokens, new_labels = [], []
    for word, start in zip(test_sentence, first_subword):
        new_tokens.append(word)
        if start is not None and start < len(label_indices):
            new_labels.append(tag_values[label_indices[start]])
        else:
            new_labels.append(fallback_tag)
    all_predictions.extend(new_labels)

In [None]:
# The report is only meaningful when there is exactly one prediction per gold label.
assert len(all_predictions) == len(all_true_labels), (
    "Got {} predictions for {} gold labels".format(len(all_predictions), len(all_true_labels))
)

all_preds = [tag2idx[label] for label in all_predictions] # Integer form of the predictions
# Report by tag name rather than by integer id, so the rows are readable.
true_tag_names = [idx2tag[idx] for idx in all_true_labels]
report = classification_report(true_tag_names, all_predictions)
print(report)

In [None]:
# Display the tag -> integer id mapping used to encode the labels.
tag2idx

In [None]:
# Show token, predicted tag and gold tag for the first test sentence.
# Predictions are appended to all_predictions sentence by sentence, so the
# first len(first_sentence) entries belong to it (assumes one prediction per
# word). The gold tags come from test_labels, not from the validation loop's
# true_labels.
first_sentence = test_sentences[0]
first_predictions = all_predictions[:len(first_sentence)]
for token, label, true in zip(first_sentence, first_predictions, test_labels[0]):
    print("{}\t{}\t{}".format(token,label,idx2tag[true]))