In [None]:
# First, install the following software requirements:

#!/opt/conda/bin/python3.7 -m pip install --upgrade pip
#!pip install seqeval
#!pip install tensorflow-addons

In [None]:
import sys
sys.path.append('libs/')

import datetime, os
import random
import time

from tqdm import tqdm
from tabulate import tabulate
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from IPython.core.display import display, HTML

# ****** PARAMETER DEFINITIONS *********
MAX_LEN        = 348   # fixed length every encoded sequence is padded to
NUM_LABELS     = 12    # label-set size handed to the BERT config (special labels included)

# Label ids reserved for non-word positions (they mirror the '[CLS]', '[SEP]'
# and '[PAD]' entries of the tag dictionary defined further down).
ESPECIAL_TOKEN = 9     # label put at both ends of each label sequence
SEP_TOKEN      = 10    # label given to the non-initial word pieces of a word
PAD_TOKEN      = 11    # label used to pad label sequences up to MAX_LEN
WORD_PAD_TOKEN = 0     # input id used to pad token-id sequences up to MAX_LEN

# Name of the pretrained checkpoint used for the config and the tokenizer.
BERT_MODEL = "bert-base-multilingual-cased"
# Default-initialised BERT configuration object.
configuration = BertConfig()

# Directories for the exported model and its logs.
save_dir = "saved_model/model/"
log_dir = "saved_model/logs/model/"

# Tag name -> label id.
le_dicti = {
    'B-NEG': 0, 'B-NSCO': 1, 'B-UNC': 2, 'B-USCO': 3,
    'I-NEG': 4, 'I-NSCO': 5, 'I-UNC': 6, 'I-USCO': 7,
    'O': 8,
    '[CLS]': 9, '[SEP]': 10, '[PAD]': 11,
}

# Label id -> tag name (inverse of le_dicti), used to decode id matrices.
le_dict = {label_id: tag_name for tag_name, label_id in le_dicti.items()}

In [None]:
# Load the CSV data
import pandas as pd
from sklearn import preprocessing

def process_csv(data_path):
    """Read a one-token-per-row CSV and group it into sentences with integer tags.

    The file must provide the columns ``Sentence #``, ``Word`` and ``Tag``.
    Missing ``Sentence #`` cells are forward-filled from the last non-empty
    value above them before grouping.

    Args:
        data_path: Path (or any source accepted by ``pd.read_csv``) of the
            UTF-8 encoded CSV file.

    Returns:
        A tuple ``(sentences, tag, enc_tag)``:
        ``sentences`` is a list with one list of words per sentence,
        ``tag`` is the parallel list of integer-encoded tag lists, and
        ``enc_tag`` is the fitted ``LabelEncoder``.

    NOTE(review): the encoder is fitted on the tags found in *this* file, so
    the integer ids depend on which tags occur in it. They presumably only line
    up with the notebook's fixed tag dictionary when every tag is present in
    the file -- confirm for new datasets.
    """
    df = pd.read_csv(data_path, encoding="utf-8")

    # Series.ffill() replaces fillna(method="ffill"), which is deprecated
    # since pandas 2.1.
    df["Sentence #"] = df["Sentence #"].ffill()

    enc_tag = preprocessing.LabelEncoder()
    df["Tag"] = enc_tag.fit_transform(df["Tag"])

    # Group once and reuse it so words and tags share the same sentence order.
    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).tolist()
    tag = by_sentence["Tag"].apply(list).tolist()
    return sentences, tag, enc_tag

In [None]:

# CSV with the evaluation split; process_csv returns the word lists, the
# encoded tag lists and the fitted tag encoder.
test_data_csv = "sentences_test.csv"
X_test, y_test, enc_tag_test = process_csv(test_data_csv)

In [None]:
# First sentence, its encoded tags, and the number of sentences (one per line).
print(X_test[0], y_test[0], len(X_test), sep="\n")

In [None]:
def convert_to_input(sentences, tags, in_ou_put):
    """Encode pre-tokenized sentences (and optionally their labels) for BERT.

    Every word is split into word pieces with the module-level ``tokenizer``.
    The pieces are wrapped with the tokenizer's special tokens and all
    sequences are right-padded to ``MAX_LEN``.

    Args:
        sentences: Sequence of sentences, each a sequence of words (every word
            is passed through ``str`` before tokenizing).
        tags: Sized sequence parallel to ``sentences``; each item holds one
            integer label id per word. It is iterated even when labels are not
            requested, so it must always be supplied.
        in_ou_put: ``1`` to also build the label matrix; any other value
            skips label handling.

    Returns:
        ``(input_ids, token_type_ids, attention_masks, label_ids)``. The first
        three are int32 arrays produced by ``pad_sequences`` with
        ``maxlen=MAX_LEN``. ``label_ids`` is built the same way (padded with
        ``PAD_TOKEN``) when ``in_ou_put == 1``; otherwise it is the integer 0.
    """
    with_labels = in_ou_put == 1
    # Word pieces that fit once the two special tokens are added.
    max_pieces = MAX_LEN - 2

    input_id_list       = []
    attention_mask_list = []
    token_type_id_list  = []
    label_id_list       = [] if with_labels else 0

    for x, y in tqdm(zip(sentences, tags), total=len(tags)):
        tokens    = []
        label_ids = []

        for word, label in zip(x, y):
            word_tokens = tokenizer.tokenize(str(word))
            if not word_tokens:
                # A word can tokenize to nothing (e.g. a bare space). Emitting
                # its label anyway would shift every following label by one
                # position relative to the tokens, so the word is skipped.
                continue
            tokens.extend(word_tokens)
            if with_labels:
                # The real label goes on the first piece of the word; the
                # remaining pieces receive SEP_TOKEN.
                label_ids.extend([label] + [SEP_TOKEN] * (len(word_tokens) - 1))

        # Truncate tokens and labels together, from the end, *before* the
        # special positions are added. Leaving this to the tokenizer and to
        # pad_sequences (which truncates from the front by default) would cut
        # tokens and labels at different ends and misalign them.
        tokens = tokens[:max_pieces]

        if with_labels:
            # NOTE(review): both ends receive ESPECIAL_TOKEN (the '[CLS]' id in
            # the tag dictionary), including the closing position. Kept as in
            # the original -- presumably it matches how the saved model was
            # trained; confirm before changing.
            label_ids = [ESPECIAL_TOKEN] + label_ids[:max_pieces] + [ESPECIAL_TOKEN]

        inputs = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_LEN)

        input_id_list.append(inputs["input_ids"])
        token_type_id_list.append(inputs["token_type_ids"])
        attention_mask_list.append(inputs["attention_mask"])

        if with_labels:
            label_id_list.append(label_ids)

    # truncating="post" is only a safety net: sequences were already cut above.
    input_id_list       = pad_sequences(maxlen=MAX_LEN, sequences=input_id_list,       dtype="int32", padding="post", truncating="post", value=WORD_PAD_TOKEN)
    token_type_id_list  = pad_sequences(maxlen=MAX_LEN, sequences=token_type_id_list,  dtype="int32", padding="post", truncating="post")
    attention_mask_list = pad_sequences(maxlen=MAX_LEN, sequences=attention_mask_list, dtype="int32", padding="post", truncating="post")

    if with_labels:
        label_id_list   = pad_sequences(maxlen=MAX_LEN, sequences=label_id_list, dtype="int32", padding="post", truncating="post", value=PAD_TOKEN)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list

In [None]:

# (config, model, tokenizer) classes per architecture; only "bert" is registered.
MODEL_CLASSES = {
    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES["bert"]

# Checkpoint configuration with the classification head sized to NUM_LABELS.
config = config_class.from_pretrained(BERT_MODEL, num_labels=NUM_LABELS)

# The checkpoint is a cased one, so lowercasing is switched off.
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL, do_lower_case=False)


In [None]:
# Restore the exported model from the SavedModel directory `save_dir`.
print("\nLoading the model for negation and speculation detection ...")
new_model = tf.saved_model.load(save_dir)
print("\nModel loaded ...")

In [None]:
#print(len(X_test))

In [None]:
# Note: sentences must already be tokenized (e.g. using spaCy for Spanish).
# The model receives a tokenized sentence and returns a negation or speculation label for each token.
# The sentences used below have been tokenized beforehand.
# Using the spaCy tokenizer for Spanish is recommended.

    
#demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train, label_ids_train = convert_to_input(X_test, y_test, 1)
#demo_input_ids_train1, demo_token_ids_train1, demo_attention_masks_train1, label_ids_train1 = convert_to_input(X_test[0:100], y_test[0:100], 1)
#demo_input_ids_train2, demo_token_ids_train2, demo_attention_masks_train2, label_ids_train2 = convert_to_input(X_test[100:200], y_test[100:200], 1)
#demo_input_ids_train3, demo_token_ids_train3, demo_attention_masks_train3, label_ids_train3 = convert_to_input(X_test[200:300], y_test[200:300], 1)
#demo_input_ids_train4, demo_token_ids_train4, demo_attention_masks_train4, label_ids_train4 = convert_to_input(X_test[300:400], y_test[300:400], 1)
#demo_input_ids_train5, demo_token_ids_train5, demo_attention_masks_train5, label_ids_train5 = convert_to_input(X_test[400:500], y_test[400:500], 1)
#demo_input_ids_train6, demo_token_ids_train6, demo_attention_masks_train6, label_ids_train6 = convert_to_input(X_test[500:600], y_test[500:600], 1)
#demo_input_ids_train7, demo_token_ids_train7, demo_attention_masks_train7, label_ids_train7 = convert_to_input(X_test[600:700], y_test[600:700], 1)
#demo_input_ids_train8, demo_token_ids_train8, demo_attention_masks_train8, label_ids_train8 = convert_to_input(X_test[700:776], y_test[700:776], 1)

demo_input_ids_train1, demo_token_ids_train1, demo_attention_masks_train1, label_ids_train1 = convert_to_input(X_test[0:150], y_test[0:150], 1)
demo_input_ids_train2, demo_token_ids_train2, demo_attention_masks_train2, label_ids_train2 = convert_to_input(X_test[150:300], y_test[150:300], 1)
demo_input_ids_train3, demo_token_ids_train3, demo_attention_masks_train3, label_ids_train3 = convert_to_input(X_test[300:450], y_test[300:450], 1)
demo_input_ids_train4, demo_token_ids_train4, demo_attention_masks_train4, label_ids_train4 = convert_to_input(X_test[450:600], y_test[450:600], 1)
demo_input_ids_train5, demo_token_ids_train5, demo_attention_masks_train5, label_ids_train5 = convert_to_input(X_test[600:750], y_test[600:750], 1)
demo_input_ids_train6, demo_token_ids_train6, demo_attention_masks_train6, label_ids_train6 = convert_to_input(X_test[750:900], y_test[750:900], 1)
demo_input_ids_train7, demo_token_ids_train7, demo_attention_masks_train7, label_ids_train7 = convert_to_input(X_test[900:1050], y_test[900:1050], 1)
demo_input_ids_train8, demo_token_ids_train8, demo_attention_masks_train8, label_ids_train8 = convert_to_input(X_test[1050:1200], y_test[1050:1200], 1)
demo_input_ids_train9, demo_token_ids_train9, demo_attention_masks_train9, label_ids_train9 = convert_to_input(X_test[1200:1350], y_test[1200:1350], 1)
demo_input_ids_train10, demo_token_ids_train10, demo_attention_masks_train10, label_ids_train10 = convert_to_input(X_test[1350:1500], y_test[1350:1500], 1)
demo_input_ids_train11, demo_token_ids_train11, demo_attention_masks_train11, label_ids_train11 = convert_to_input(X_test[1500:1587], y_test[1500:1587], 1)



In [None]:
print (X_test[0])
print(y_test[0])

In [None]:
#Make predictions
#demo_prediction = new_model([demo_input_ids_train, demo_token_ids_train, demo_attention_masks_train])
#demo_pred_tags = np.argmax(demo_prediction, -1)

demo_prediction1 = new_model([demo_input_ids_train1, demo_token_ids_train1, demo_attention_masks_train1])
demo_prediction2 = new_model([demo_input_ids_train2, demo_token_ids_train2, demo_attention_masks_train2])
demo_prediction3 = new_model([demo_input_ids_train3, demo_token_ids_train3, demo_attention_masks_train3])
demo_prediction4 = new_model([demo_input_ids_train4, demo_token_ids_train4, demo_attention_masks_train4])
demo_prediction5 = new_model([demo_input_ids_train5, demo_token_ids_train5, demo_attention_masks_train5])
demo_prediction6 = new_model([demo_input_ids_train6, demo_token_ids_train6, demo_attention_masks_train6])
demo_prediction7 = new_model([demo_input_ids_train7, demo_token_ids_train7, demo_attention_masks_train7])
demo_prediction8 = new_model([demo_input_ids_train8, demo_token_ids_train8, demo_attention_masks_train8])

demo_prediction9 = new_model([demo_input_ids_train9, demo_token_ids_train9, demo_attention_masks_train9])
demo_prediction10 = new_model([demo_input_ids_train10, demo_token_ids_train10, demo_attention_masks_train10])
demo_prediction11 = new_model([demo_input_ids_train11, demo_token_ids_train11, demo_attention_masks_train11])



In [None]:
print(demo_prediction1.shape)
print(demo_prediction2.shape)
print(demo_prediction3.shape)
print(demo_prediction4.shape)
print(demo_prediction5.shape)
print(demo_prediction6.shape)
print(demo_prediction7.shape)
print(demo_prediction8.shape)
print(demo_prediction9.shape)
print(demo_prediction10.shape)
print(demo_prediction11.shape)

In [None]:
demo_pred_tags1 = np.argmax(demo_prediction1, -1)
demo_pred_tags2 = np.argmax(demo_prediction2, -1)
demo_pred_tags3 = np.argmax(demo_prediction3, -1)
demo_pred_tags4 = np.argmax(demo_prediction4, -1)
demo_pred_tags5 = np.argmax(demo_prediction5, -1)
demo_pred_tags6 = np.argmax(demo_prediction6, -1)
demo_pred_tags7 = np.argmax(demo_prediction7, -1)
demo_pred_tags8 = np.argmax(demo_prediction8, -1)
demo_pred_tags9 = np.argmax(demo_prediction9, -1)
demo_pred_tags10 = np.argmax(demo_prediction10, -1)
demo_pred_tags11 = np.argmax(demo_prediction11, -1)

In [None]:
print(demo_pred_tags1.shape)
print(demo_pred_tags2.shape)
print(demo_pred_tags3.shape)
print(demo_pred_tags4.shape)
print(demo_pred_tags5.shape)
print(demo_pred_tags6.shape)
print(demo_pred_tags7.shape)
print(demo_pred_tags8.shape)
print(demo_pred_tags9.shape)
print(demo_pred_tags10.shape)
print(demo_pred_tags11.shape)

In [None]:
demo_pred_tags_0 = np.vstack((demo_pred_tags1, demo_pred_tags2))
demo_pred_tags_1 = np.vstack((demo_pred_tags_0, demo_pred_tags3))
demo_pred_tags_2 = np.vstack((demo_pred_tags_1, demo_pred_tags4))
demo_pred_tags_3 = np.vstack((demo_pred_tags_2, demo_pred_tags5))
demo_pred_tags_4 = np.vstack((demo_pred_tags_3, demo_pred_tags6))
demo_pred_tags_5 = np.vstack((demo_pred_tags_4, demo_pred_tags7))
demo_pred_tags_6 = np.vstack((demo_pred_tags_5, demo_pred_tags8))
demo_pred_tags_7 = np.vstack((demo_pred_tags_6, demo_pred_tags9))
demo_pred_tags_8 = np.vstack((demo_pred_tags_7, demo_pred_tags10))
demo_pred_tags   = np.vstack((demo_pred_tags_8, demo_pred_tags11))

In [None]:
print(demo_pred_tags_0.shape)
print(demo_pred_tags_1.shape)
print(demo_pred_tags.shape)

In [None]:
demo_y_pred = logits_to_tokens(demo_pred_tags, le_dict)
print(demo_y_pred)

In [None]:
#demo_y_pred = logits_to_tokens(demo_pred_tags, le_dict)
#print(demo_y_pred)


In [None]:
print(type(label_ids_train1))
print(label_ids_train1.shape)

In [None]:
label_ids_train_0 = np.vstack((label_ids_train1, label_ids_train2))
label_ids_train_1 = np.vstack((label_ids_train_0, label_ids_train3))
label_ids_train_2 = np.vstack((label_ids_train_1, label_ids_train4))
label_ids_train_3 = np.vstack((label_ids_train_2, label_ids_train5))
label_ids_train_4 = np.vstack((label_ids_train_3, label_ids_train6))
label_ids_train_5 = np.vstack((label_ids_train_4, label_ids_train7))
label_ids_train_6 = np.vstack((label_ids_train_5, label_ids_train8))
label_ids_train_7 = np.vstack((label_ids_train_6, label_ids_train9))
label_ids_train_8 = np.vstack((label_ids_train_7, label_ids_train10))
label_ids_train   = np.vstack((label_ids_train_8, label_ids_train11))
print(label_ids_train.shape)

In [None]:
# Decode the gold label-id matrix into tag names with the id -> tag dictionary.
y_true = logits_to_tokens(label_ids_train, le_dict)

print(y_true[0])

In [None]:
# Flatten the per-sentence tag lists into one flat list each. A single
# comprehension is linear in the number of tags, whereas sum(list_of_lists, [])
# re-copies the accumulated list on every step.
li1 = [tag_name for sentence_tags in y_true for tag_name in sentence_tags]
li2 = [tag_name for sentence_tags in demo_y_pred for tag_name in sentence_tags]

# One row per position: gold tag next to predicted tag.
results = pd.DataFrame({'Expected': li1, 'Predicted': li2})

In [None]:
# seqeval scores of the predicted tag sequences against the gold ones, each
# printed as a percentage with one decimal.
for template, metric in (
    ("precision: {:.1%}", precision_score),
    ("   recall: {:.1%}", recall_score),
    (" accuracy: {:.1%}", accuracy_score),
    (" F1-score: {:.1%}", f1_score),
):
    print(template.format(metric(y_true, demo_y_pred)))

In [None]:
# scikit-learn classification report over the flattened tags, converted by the
# project helper report_to_df before printing.
expected_tags = results['Expected']
predicted_tags = results['Predicted']
report = eskclarep(expected_tags, predicted_tags)
print(report_to_df(report))