In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import string
import json
import os

from keras.preprocessing.sequence import pad_sequences
from torch import cuda, tensor, no_grad
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import XLMRobertaForTokenClassification, XLMRobertaTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from seqeval.metrics import accuracy_score, f1_score

In [3]:
def read_data():
    #os.chdir('D:/TU_Graz/Thesis/Datasets/Reddit_features')
    train = pd.read_csv("../input/bio-tagged/train_final_all.csv")
    test = pd.read_csv("../input/bio-tagged/test_final_all.csv")
    data = train.append(test)

    return train, test, data

def group_sentences(data, category):
    all_sents = []
    sent_ids = data['Sent_id'].unique()
    for curr_id in sent_ids:
        tmp_df = data[data['Sent_id'] == curr_id]
        tmp_df = pd.concat([tmp_df['Sent_id'], tmp_df['Token'], tmp_df["Token_index"], tmp_df.iloc[:,4:147], tmp_df[category]], axis = 1)
        records = tmp_df.to_records(index=False)
        all_sents.append(records)
    return all_sents

def remove_sents_over_threshold(sents, threshold):
    sentences = list()
    for s in sents:
        if len(s) < threshold:
            sentences.append(s)
    return sentences

def set_processor_params():
    device = 'cuda' if cuda.is_available() else 'cpu'
    n_gpu = cuda.device_count()
    cuda.get_device_name(0)
    return device, n_gpu

def tokenize(sentence, sentence_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, sentence_labels):
        str_word = str(word)
        tokenized_word = tokenizer.tokenize(str_word) # Tokenize the word
        n_subwords = len(tokenized_word) # Count subwords
        tokenized_sentence.extend(tokenized_word) # Add to the final tokenized list
        labels.extend([label] * n_subwords) # Add the same label of the original word to all of its subwords
    return tokenized_sentence, labels

In [4]:
train, test, data = read_data()
device, n_gpu = set_processor_params()

tag_values = list(set(train["BIO"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}
idx2tag = {v: k for k, v in tag2idx.items()}

print("prepare sent")
sents = group_sentences(train, 'BIO')
for i in range(0, len(sents)):
    sents[i] = sents[i][0:300]
sentences = [[word[1] for word in sentence] for sentence in sents]
labels = [[tag2idx[w[len(w)-1]] for w in s] for s in sents]

MAX_LEN = 350
BATCH_SIZE = 4
EPOCHS = 20
LEARNING_RATE = 3e-5
MAX_GRAD_NORM = 1.0

print("tokenize")
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large", do_lower_case=False)
tokenized_texts_and_labels = [tokenize(sentence, sentence_labels) for sentence, sentence_labels in zip(sentences, labels)]

tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels_subwords = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

print("pad")
# Cut the token and label sequences to the max length
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen = MAX_LEN, dtype="long", value=0.0, 
                          truncating="post", padding="post")
input_tags = pad_sequences([[l for l in lab] for lab in labels_subwords], maxlen = MAX_LEN, value = tag2idx["PAD"], 
                           padding="post", dtype="long", truncating="post")
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, input_tags, test_size=0.1, random_state=2021)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids, test_size=0.1, random_state=2021)

prepare sent
1185
1185
tokenize


Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

pad


In [5]:
tr_inputs = tensor(tr_inputs)
val_inputs = tensor(val_inputs)
tr_tags = tensor(tr_tags)
val_tags = tensor(val_tags)
tr_masks = tensor(tr_masks)
val_masks = tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

# Pretrained model params
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-large", num_labels = len(tag2idx), output_attentions = False, output_hidden_states=False)
model.cuda(); # Pass the model parameters to gpu

Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-

In [6]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=LEARNING_RATE,
    eps=1e-8
)

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [9]:
loss_values, validation_loss_values = [], []
accuracy_values, validation_accuracy_values = [], []

for i in trange(EPOCHS, desc="Epoch"):
    
    # TRAINING
    # Perform one full pass over the training set
    model.train() # Put the model into training mode
    total_loss, total_accuracy = 0, 0 # Reset the total loss and acc. for current epoch
    
    # Training loop
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch) # add batch to gpu
        b_input_ids, b_input_mask, b_labels = batch # Input ids, mask and labels of the current batch
        model.zero_grad() # Always clear any previously calculated gradients before performing a backward pass
        #cuda.empty_cache() 
        # Forward pass
        # This will return the loss (rather than the model output) because we have provided the `labels`.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        
        # Perform a backward pass to calculate the gradients
        loss.backward()
        total_loss += loss.item() # track train loss
        
        # Clip the norm of the gradient to help prevent the exploding gradients problem
        from torch.nn.utils import clip_grad_norm_
        clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        
        optimizer.step() # update parameters
        scheduler.step() # Update the learning rate
        
    avg_train_loss = total_loss / len(train_dataloader) # Calc. avg loss over training data
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss) # Store the loss value for plotting the learning curve

    # VALIDATION
    # After the completion of each training epoch, measure performance on validation set  
    model.eval() # Put the model into evaluation mode
    eval_loss, eval_accuracy = 0, 0 # Reset the validation loss for current epoch
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    
    # Validation loop
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        #cuda.empty_cache() 
        # Telling the model not to compute or store gradients, to save memory and speed up validation
        with no_grad():
            cuda.empty_cache() 
            # Forward pass, calculate logit predictions
            # This will return the logits rather than the loss because we have not provided labels
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            
            #encoded_input = tokenizer(text, return_tensors='pt')
            #output = model(**encoded_input)
        
        logits = outputs[1].detach().cpu().numpy() # Move logits to cpu
        label_ids = b_labels.to('cpu').numpy() # Move labels to cpu
        eval_loss += outputs[0].mean().item() # Valid. loss for current batch
        
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)
        
    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    
    # Calculate the accuracy for this batch of test sentences
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    validation_accuracy_values.append(accuracy_score(pred_tags, valid_tags))
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Average train loss: 0.3614864827653442


Epoch:   5%|▌         | 1/20 [02:39<50:38, 159.93s/it]

Validation loss: 0.23296877443790437
Validation Accuracy: 0.934170104450101
Average train loss: 0.2021774271649591


Epoch:  10%|█         | 2/20 [05:18<47:48, 159.37s/it]

Validation loss: 0.2215116299688816
Validation Accuracy: 0.9383832177652944
Average train loss: 0.15368054970429185


Epoch:  15%|█▌        | 3/20 [07:57<45:05, 159.16s/it]

Validation loss: 0.19733600007990995
Validation Accuracy: 0.9466338980075485
Average train loss: 0.11464848398311196


Epoch:  20%|██        | 4/20 [10:36<42:25, 159.11s/it]

Validation loss: 0.20815670775870482
Validation Accuracy: 0.9466338980075485
Average train loss: 0.08217178747905524


Epoch:  25%|██▌       | 5/20 [13:15<39:45, 159.00s/it]

Validation loss: 0.24258441006143888
Validation Accuracy: 0.9447028877380848
Average train loss: 0.06148220700201359


Epoch:  30%|███       | 6/20 [15:54<37:05, 158.94s/it]

Validation loss: 0.2338101178407669
Validation Accuracy: 0.9487404546651452
Average train loss: 0.04652103779694477


Epoch:  35%|███▌      | 7/20 [18:33<34:25, 158.89s/it]

Validation loss: 0.29998933201034866
Validation Accuracy: 0.9487404546651452
Average train loss: 0.03709957897593354


Epoch:  40%|████      | 8/20 [21:11<31:45, 158.83s/it]

Validation loss: 0.27452956984440485
Validation Accuracy: 0.9468094443956816
Average train loss: 0.025844199915393867


Epoch:  45%|████▌     | 9/20 [23:50<29:06, 158.77s/it]

Validation loss: 0.30547475231190524
Validation Accuracy: 0.9494426402176775
Average train loss: 0.018392425092601666


Epoch:  50%|█████     | 10/20 [26:29<26:26, 158.69s/it]

Validation loss: 0.3243030616392692
Validation Accuracy: 0.9484771350829456
Average train loss: 0.015611621671609585


Epoch:  55%|█████▌    | 11/20 [29:07<23:46, 158.51s/it]

Validation loss: 0.32566913465658825
Validation Accuracy: 0.9488282278592118
Average train loss: 0.012784849170400073


Epoch:  60%|██████    | 12/20 [31:45<21:06, 158.34s/it]

Validation loss: 0.3572376176714897
Validation Accuracy: 0.9485649082770122
Average train loss: 0.009104769756659527


Epoch:  65%|██████▌   | 13/20 [34:22<18:26, 158.13s/it]

Validation loss: 0.3553126295407613
Validation Accuracy: 0.9508470113227421
Average train loss: 0.00750851441770284


Epoch:  70%|███████   | 14/20 [37:00<15:47, 157.91s/it]

Validation loss: 0.3646807567526897
Validation Accuracy: 0.9494426402176775
Average train loss: 0.005239606875251963


Epoch:  75%|███████▌  | 15/20 [39:37<13:08, 157.63s/it]

Validation loss: 0.39547887903948625
Validation Accuracy: 0.9497937329939437
Average train loss: 0.004076344563377171


Epoch:  80%|████████  | 16/20 [42:14<10:29, 157.44s/it]

Validation loss: 0.39191478515664735
Validation Accuracy: 0.9510225577108751
Average train loss: 0.003095412393554994


Epoch:  85%|████████▌ | 17/20 [44:51<07:51, 157.25s/it]

Validation loss: 0.4033694473405679
Validation Accuracy: 0.9505836917405425
Average train loss: 0.0028177620559917283


Epoch:  90%|█████████ | 18/20 [47:27<05:14, 157.11s/it]

Validation loss: 0.4054140463471413
Validation Accuracy: 0.9504959185464759
Average train loss: 0.0025637121457787607


Epoch:  95%|█████████▌| 19/20 [50:04<02:36, 156.96s/it]

Validation loss: 0.4106761823097865
Validation Accuracy: 0.9504959185464759
Average train loss: 0.0018704419553784888


Epoch: 100%|██████████| 20/20 [52:41<00:00, 158.05s/it]

Validation loss: 0.4191238063077132
Validation Accuracy: 0.9507592381286755





In [10]:
################ CALCULATE INTEREMEDIATE RESULTS ################
test_sents = group_sentences(test, 'BIO')
for i in range(0,len(test_sents)):
    test_sents[i] = test_sents[i][0:300]
    
test_sentences = [[word[1] for word in sentence] for sentence in test_sents]  

test_labels = [[tag2idx[w[len(w)-1]] for w in s] for s in test_sents]
test_labels_str = [[w[len(w)-1] for w in s] for s in test_sents]

reports = dict()
i = 0
for test_sentence in test_sentences:
    tokenized_sentence = tokenizer.encode(test_sentence)
    input_ids = tensor([tokenized_sentence]).cuda()
    input_masks = tensor([[float(i != 0.0) for i in ii] for ii in input_ids]).cuda()
    with no_grad():
        output = model(input_ids, attention_mask = input_masks)
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
    label_indices = label_indices[0]
    label_indices = label_indices[1:-1]
    
    tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(tag_values[label_idx])
            new_tokens.append(token)
    report = classification_report(test_labels_str[i], new_labels, output_dict=True)
    reports[test_sents[i][0][0]] = report
    i += 1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
import json    
with open("xlmroberta_large_per_submission_tag_evaluation.json", "w") as outfile:
    json.dump(reports, outfile)

In [12]:
########## CALCULATE ALL RESULTS #############

print("Prepare test set...")
test_sents = group_sentences(test, 'BIO')
for i in range(0,len(test_sents)):
    test_sents[i] = test_sents[i][0:300]
    
test_sentences = [[word[1] for word in sentence] for sentence in test_sents]
test_labels = [[tag2idx[w[len(w)-1]] for w in s] for s in test_sents]
test_labels_str = [[w[len(w)-1] for w in s] for s in test_sents]

print("Tokenize and predict...")
all_predictions = []
all_true_labels = []
all_true_labels_str = []

all_predictions_list = []

for lab in test_labels:
    all_true_labels.extend(lab)
    
for lab in test_labels_str:
    all_true_labels_str.extend(lab)

Prepare test set...
Tokenize and predict...


In [13]:
for test_sentence in test_sentences:
    tokenized_sentence = tokenizer.encode(test_sentence)
    input_ids = tensor([tokenized_sentence]).cuda()
    input_masks = tensor([[float(i != 0.0) for i in ii] for ii in input_ids]).cuda()
    with no_grad():
        output = model(input_ids, attention_mask = input_masks)
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
    label_indices = label_indices[0]
    label_indices = label_indices[1:-1]
    
    tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(tag_values[label_idx])
            new_tokens.append(token)
    all_predictions.extend(new_labels)
    all_predictions_list.append(new_labels)
    
all_preds = [tag2idx[label] for label in all_predictions]
report = classification_report(all_true_labels_str, all_predictions)
print(report)

#with open("roberta_large_all_submission_tag_evaluation.json", "w") as outfile:
#    json.dump(report, outfile)

               precision    recall  f1-score   support

  B-actor-neg       0.00      0.00      0.00         7
  B-actor-pos       0.00      0.00      0.00       131
    B-gen-neg       0.75      0.06      0.11        53
    B-gen-pos       0.56      0.11      0.18       637
B-keyword-neg       0.29      0.03      0.05       199
B-keyword-pos       0.41      0.08      0.13      2472
  B-movie-neg       0.60      0.05      0.09        59
  B-movie-pos       0.45      0.20      0.28      4597
  I-actor-neg       0.00      0.00      0.00         6
  I-actor-pos       0.00      0.00      0.00       124
    I-gen-neg       0.00      0.00      0.00         5
    I-gen-pos       0.97      0.28      0.43       109
I-keyword-neg       0.00      0.00      0.00        93
I-keyword-pos       0.48      0.04      0.07      1694
  I-movie-neg       0.33      0.01      0.03        72
  I-movie-pos       0.80      0.16      0.27      6019
            O       0.85      0.99      0.91     72944

     acc

In [14]:
import torch
torch.save(model, 'xlm-roberta')

In [15]:
predictions = all_predictions_list
all_outputs = dict()   # Stores all outputs from the test dataset per entity
all_outputs_per_sentence = dict()  # Stores separate dictionaries per entity for every sentence in the dataset

for i in range(0, len(predictions)): # Sentences iteration
    tmp_dict = dict()
    max_len = len(predictions[i])
    for j in range(0, len(predictions[i])-3): # Word iteration
        if predictions[i][j] == 'B-movie-pos':
            if not 'positive_movies' in all_outputs.keys():
                all_outputs['positive_movies'] = []
            if not 'positive_movies' in tmp_dict.keys():
                tmp_dict['positive_movies'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-movie-pos' and k < max_len-1):
                tmp_entity += ' ' + test_sentences[i][k]
                if k < len(predictions):
                    k +=1 
            all_outputs['positive_movies'].append(tmp_entity)
            if tmp_entity not in tmp_dict['positive_movies']:
                tmp_dict['positive_movies'].append(tmp_entity)
        
        if predictions[i][j] == 'B-movie-neg':
            if not 'negative_movies' in all_outputs.keys():
                all_outputs['negative_movies'] = []
            if not 'negative_movies' in tmp_dict.keys():
                tmp_dict['negative_movies'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-movie-neg'):
                tmp_entity += ' ' + test_sentences[i][k]
                k +=1 
            all_outputs['negative_movies'].append(tmp_entity)
            if tmp_entity not in tmp_dict['negative_movies']:
                tmp_dict['negative_movies'].append(tmp_entity)
            
        if predictions[i][j] == 'B-keyword-pos':
            if not 'positive_keywords' in all_outputs.keys():
                all_outputs['positive_keywords'] = []
            if not 'positive_keywords' in tmp_dict.keys():
                tmp_dict['positive_keywords'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-keyword-pos'):
                tmp_entity += ' ' + test_sentences[i][k]
                #tmp_entity test_sentences[i][k]
                k +=1 
            all_outputs['positive_keywords'].append(tmp_entity)
            if tmp_entity not in tmp_dict['positive_keywords']:
                tmp_dict['positive_keywords'].append(tmp_entity)

                    
        if predictions[i][j] == 'B-keyword-neg':
            if not 'negative_keywords' in all_outputs.keys():
                all_outputs['negative_keywords'] = []
            if not 'negative_keywords' in tmp_dict.keys():
                tmp_dict['negative_keywords'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-keyword-neg'):
                tmp_entity += ' ' + test_sentences[i][k]
                k +=1 
            all_outputs['negative_keywords'].append(tmp_entity)
            if tmp_entity not in tmp_dict['negative_keywords']:
                tmp_dict['negative_keywords'].append(tmp_entity)
                    
                    
        if predictions[i][j] == 'B-actor-pos':
            if not 'positive_actors' in all_outputs.keys():
                all_outputs['positive_actors'] = []
            if not 'positive_actors' in tmp_dict.keys():
                tmp_dict['positive_actors'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-actor-pos'):
                tmp_entity += ' ' + test_sentences[i][k]
                k +=1 
            all_outputs['positive_actors'].append(tmp_entity)
            if tmp_entity not in tmp_dict['positive_actors']:
                tmp_dict['positive_actors'].append(tmp_entity)
            
        if predictions[i][j] == 'B-actor-neg':
            if not 'negative_actors' in all_outputs.keys():
                all_outputs['negative_actors'] = []
            if not 'negative_actors' in tmp_dict.keys():
                tmp_dict['negative_actors'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-actor-neg'):
                tmp_entity += ' ' + test_sentences[i][k]
                k +=1 
            all_outputs['negative_actors'].append(tmp_entity)
            if tmp_entity not in tmp_dict['negative_actors']:
                tmp_dict['negative_actors'].append(tmp_entity)
            
        if predictions[i][j] == 'B-gen-pos':
            if not 'positive_genres' in all_outputs.keys():
                all_outputs['positive_genres'] = []
            if not 'positive_genres' in tmp_dict.keys():
                tmp_dict['positive_genres'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-gen-pos'):
                tmp_entity += ' ' + test_sentences[i][k]
                k +=1 
            all_outputs['positive_genres'].append(tmp_entity)
            if tmp_entity not in tmp_dict['positive_genres']:
                tmp_dict['positive_genres'].append(tmp_entity)
            
        if predictions[i][j] == 'B-gen-neg':
            if not 'negative_genres' in all_outputs.keys():
                all_outputs['negative_genres'] = []
            if not 'negative_genres' in tmp_dict.keys():
                tmp_dict['negative_genres'] = []
            tmp_entity = test_sentences[i][j]
            k = j+1
            while(predictions[i][k] == 'I-gen-neg'):
                tmp_entity += ' ' + test_sentences[i][k]
                k +=1 
            all_outputs['negative_genres'].append(tmp_entity)
            if tmp_entity not in tmp_dict['negative_genres']:
                tmp_dict['negative_genres'].append(tmp_entity)

    if i < len(test_sents):
        all_outputs_per_sentence[test_sents[i][0][0]] = tmp_dict

In [16]:
def split_keyphrases(dict):
    for key in dict.keys():
        if "positive_keywords" in dict[key].keys():
            tmp_pos_keys = []
            for keyphrase in dict[key]["positive_keywords"]:
                keywords = keyphrase.split(" ")
                tmp_pos_keys.extend(keywords)
            dict[key]["positive_keywords"] = list(set(tmp_pos_keys))
        if "negative_keywords" in dict[key].keys():
            tmp_neg_keys = []
            for keyphrase in dict[key]["negative_keywords"]:
                keywords = keyphrase.split(" ")
                tmp_neg_keys.extend(keywords)
            dict[key]["negative_keywords"] = list(set(tmp_neg_keys))
    return dict

In [17]:
all_outputs_per_sentence = split_keyphrases(all_outputs_per_sentence)
    
with open("xlm-roberta_large_best_umatched_format_1.json", "w") as outfile:
    json.dump(all_outputs_per_sentence, outfile)

In [19]:
report = classification_report(test_labels_str, all_predictions_list)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   actor-neg       0.00      0.00      0.00         7
   actor-pos       0.00      0.00      0.00       131
     gen-neg       0.75      0.06      0.11        53
     gen-pos       0.33      0.07      0.11       639
 keyword-neg       0.10      0.01      0.02       200
 keyword-pos       0.19      0.04      0.06      2475
   movie-neg       0.29      0.03      0.06        64
   movie-pos       0.11      0.05      0.07      4599

   micro avg       0.13      0.04      0.07      8168
   macro avg       0.22      0.03      0.05      8168
weighted avg       0.15      0.04      0.07      8168



In [20]:
all_true_labels_bukvi = [idx2tag[tag] for tag in all_true_labels]
for token, pred_label, true_label in zip(test_sentences[5], all_predictions_list[5], test_labels[5]):
    print("{}\t{}\t{}".format(token, pred_label, idx2tag[true_label]))

Movies	O	O
surrounding	O	O
characters	O	O
who	O	O
suddenly	O	O
stop	O	O
giving	O	O
a	O	O
fuck	O	O
Examples	O	O
;	O	O
NEW_LINE	O	O
NEW_LINE	O	O
*	O	O
[	O	O
Office	B-movie-pos	B-movie-pos
Space	B-movie-pos	I-movie-pos
]	O	O
(	O	O
NEW_LINE	O	O
NEW_LINE	O	O
*	O	O
[	O	O
Fight	B-movie-pos	B-movie-pos
Club	B-movie-pos	I-movie-pos
]	O	O
(	O	O
NEW_LINE	O	O
NEW_LINE	O	O
*	O	O
[	O	O
Falling	O	B-movie-pos
Down	O	I-movie-pos
]	O	O
(	O	O
NEW_LINE	O	O
NEW_LINE	O	O
*	O	O
God	B-movie-pos	B-movie-pos
Bless	B-movie-pos	I-movie-pos
America	O	I-movie-pos


In [21]:
model_name = 'xlm_roberta_best'

dict_save = open("t2idx_xlmroberta.json", "w")
json.dump(tag2idx, dict_save)
dict_save.close()

dict_save = open("idx2t_xlmroberta.json", "w")
json.dump(idx2tag, dict_save)
dict_save.close()