# Detecting Negation and Uncertainty Using BERT

In [None]:
import sys
sys.path.append('../input/libsutils')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
import requests
import tarfile
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard
import tensorflow.keras.backend as kb

from IPython.core.display import display, HTML
from crfta import CRF as crf4

import datetime, os
import random
import time
from tqdm import tqdm

# Seed every random number generator used by the notebook with one value.
SEED = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ.update({
    'PYTHONHASHSEED': str(SEED),
    'TF_CUDNN_DETERMINISTIC': '1',  # TF 2.1+
})

for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

MODEL         = 'model-08-bert'
logs_base_dir = "./logs"
log_dir       = logs_base_dir + "/" + MODEL
save_base_dir = './model-save'
save_dir      = save_base_dir + "/" + MODEL

os.makedirs(logs_base_dir, exist_ok=True)
os.makedirs(log_dir,       exist_ok=True)
os.makedirs(save_base_dir, exist_ok=True)
os.makedirs(save_dir,      exist_ok=True)

%load_ext tensorboard

# ****** PARAMETER DEFINITIONS ******
LEN_SENTS = 190
MAX_LEN = 348  # 2860; length every model input/label sequence is padded to
# NOTE(review): presumably 9 dataset tags + 3 special labels - confirm.
NUM_TAGS = 9 + 3

# ****** HYPERPARAMETER DEFINITIONS ******
_EPOCHS, _DROPOUT, _BACH_SIZE = 5, 0.4, 64

configuration = BertConfig()

BERT_MODEL = "bert-base-multilingual-cased"

# Extra label ids appended after the dataset tags:
#   ESPECIAL_TOKEN - label placed at both ends of every label sequence
#   SEP_TOKEN      - label given to the non-initial word pieces of a word
#   PAD_TOKEN      - value used to pad label sequences
ESPECIAL_TOKEN, SEP_TOKEN, PAD_TOKEN = 9, 10, 11
# Value used to pad the input id sequences.
WORD_PAD_TOKEN = 0

# Ask TensorFlow to grow GPU memory on demand on every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on all GPUs.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is changed after GPU initialisation.
        print(err)

### Function to load data

In [None]:
def process_csv(data_path):
    """Load a one-token-per-row CSV and group it into sentences and tag ids.

    The file must provide the columns ``"Sentence #"``, ``"Word"`` and
    ``"Tag"``. Rows with a missing sentence id inherit the id of the closest
    preceding row. Tags are converted to integer ids with a freshly fitted
    ``LabelEncoder``.

    Args:
        data_path: path of the CSV file (read as UTF-8).

    Returns:
        A tuple ``(sentences, tag, enc_tag)``: a list of word lists, the
        parallel list of tag-id lists, and the fitted encoder. Sentences are
        returned in the sorted order of their ``"Sentence #"`` key (the
        default ``groupby`` behaviour), not necessarily in file order.
    """
    df = pd.read_csv(data_path, encoding="utf-8")

    # `Series.ffill()` replaces `fillna(method="ffill")`, which pandas 2.1+
    # deprecates.
    df["Sentence #"] = df["Sentence #"].ffill()

    # NOTE(review): the encoder is fitted on this file only, so ids are only
    # comparable across files that contain exactly the same set of tags.
    enc_tag = preprocessing.LabelEncoder()
    df["Tag"] = enc_tag.fit_transform(df["Tag"])

    # Group once and reuse it for both columns so words and tags stay aligned.
    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).tolist()
    tag = by_sentence["Tag"].apply(list).tolist()
    return sentences, tag, enc_tag

## Loading the datasets

In [None]:
# Locations of the three dataset splits.
train_data_csv = "../input/vectors/sentences_train.csv"
test_data_csv = "../input/vectors/sentences_test.csv"
dev_data_csv = "../input/vectors/sentences_dev.csv"

# NOTE(review): every split gets its own fitted encoder; the tag ids only
# agree across splits if each file contains the same set of tags - confirm.
X_train, y_train, enc_tag_train = process_csv(train_data_csv)
X_test, y_test, enc_tag_test = process_csv(test_data_csv)
X_dev, y_dev, enc_tag_dev = process_csv(dev_data_csv)

## Print the loaded data

In [None]:
# Show the first training sentence, its tag ids, and the test-split encoder.
for _sample in (X_train[0], y_train[0], enc_tag_test):
    print(_sample)

#print(X_train[0][0])
#print(X_train[0][1])
#print(X_train[0][2])

In [None]:
# Tag-id sequences of all three splits, concatenated.
list1 = y_train + y_test + y_dev

# Distinct tag ids in order of first appearance (dict keys keep insertion
# order, so this matches an append-if-unseen loop).
tag_list = list(dict.fromkeys(
    tag_id for sentence_tags in list1 for tag_id in sentence_tags
))

# Extra label ids used for '[CLS]', '[SEP]' and '[PAD]'.
tag_list.extend((9, 10, 11))

In [None]:
print(tag_list)

# id -> tag name, taken from the encoder fitted on the test split, extended
# with the three extra labels.
le_dict = dict(zip(enc_tag_test.transform(enc_tag_test.classes_), enc_tag_test.classes_))
le_dict.update({9: '[CLS]', 10: '[SEP]', 11: '[PAD]'})

# Reverse lookup: tag name -> id.
le_dicti = {tag_name: tag_id for tag_id, tag_name in le_dict.items()}
print(le_dicti)

In [None]:
#print(np.shape(tag_list))
#label_map = {label: i for i, label in enumerate(tag_list)}
#num_labels = len(label_map)
# Number of distinct label ids collected in tag_list.
num_labels = len(tag_list)
print(num_labels)


## Loading the BERT configuration and tokenizer

In [None]:
# (config class, model class, tokenizer class) per supported architecture.
MODEL_CLASSES = {
    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']

# Pretrained configuration sized for our label set, and a case-preserving
# tokenizer for the same checkpoint.
config = config_class.from_pretrained(BERT_MODEL, num_labels=num_labels)
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL, do_lower_case=False)

## Function that converts the input data into BERT inputs

In [None]:
def convert_to_input(sentences, tags, in_ou_put):
    """Convert word-level sentences (and tags) into padded BERT input arrays.

    Each word is split into word pieces with the module-level ``tokenizer``.
    The first piece of a word keeps the word's label; the remaining pieces
    get ``SEP_TOKEN``. ``ESPECIAL_TOKEN`` is placed at both ends of every
    label sequence, at the positions of the special tokens added by
    ``encode_plus``. All arrays are post-padded to ``MAX_LEN``.

    Args:
        sentences: sequence of sentences, each a sequence of words.
        tags: sequence parallel to ``sentences`` holding one label id per
            word. It is iterated even when labels are not built.
        in_ou_put: ``1`` to build the label array; any other value skips it.

    Returns:
        ``(input_id_list, token_type_id_list, attention_mask_list,
        label_id_list)``. The first three are int32 arrays with ``MAX_LEN``
        columns; ``label_id_list`` is an array of the same shape when
        ``in_ou_put == 1`` and the integer ``0`` otherwise.
    """
    build_labels = in_ou_put == 1
    # Room left for word pieces once the two special tokens are accounted for.
    max_pieces = MAX_LEN - 2

    input_id_list = []
    attention_mask_list = []
    token_type_id_list = []
    label_id_list = [] if build_labels else 0
    len_tokens = []
    len_sentences = []

    for x, y in tqdm(zip(sentences, tags), total=len(tags)):
        tokens = []
        label_ids = []

        for word, label in zip(x, y):
            # A word can tokenize to nothing (e.g. only whitespace/control
            # characters). Emit the unknown token for it so the word still
            # occupies a position and labels stay aligned with tokens.
            word_tokens = tokenizer.tokenize(str(word)) or [tokenizer.unk_token]
            tokens.extend(word_tokens)
            if build_labels:
                # Real label on the first piece, SEP_TOKEN on the others.
                label_ids.extend([label] + [SEP_TOKEN] * (len(word_tokens) - 1))

        # Untruncated length including the two special tokens. It is derived
        # from `tokens` so it is also defined when labels are not built.
        len_tokens.append(len(tokens) + 2)
        len_sentences.append(len(x))

        # Truncate tokens and labels identically (keeping the beginning) so
        # neither encode_plus nor pad_sequences has to cut anything and the
        # two sequences cannot drift apart.
        tokens = tokens[:max_pieces]
        inputs = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=MAX_LEN)

        input_id_list.append(inputs["input_ids"])
        token_type_id_list.append(inputs["token_type_ids"])
        attention_mask_list.append(inputs["attention_mask"])

        if build_labels:
            label_id_list.append(
                [ESPECIAL_TOKEN] + label_ids[:max_pieces] + [ESPECIAL_TOKEN]
            )

    input_id_list       = pad_sequences(maxlen=MAX_LEN, sequences=input_id_list,       dtype="int32", padding="post", value=WORD_PAD_TOKEN)
    token_type_id_list  = pad_sequences(maxlen=MAX_LEN, sequences=token_type_id_list,  dtype="int32", padding="post")
    attention_mask_list = pad_sequences(maxlen=MAX_LEN, sequences=attention_mask_list, dtype="int32", padding="post")

    # Longest tokenized / word-level sentence seen; useful to choose MAX_LEN.
    print(">>> :", max(len_tokens, default=0))
    print(">>>> :", max(len_sentences, default=0))

    if build_labels:
        label_id_list = pad_sequences(maxlen=MAX_LEN, sequences=label_id_list, dtype="int32", padding="post", value=PAD_TOKEN)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list

In [None]:
# Encode every split; the trailing 1 asks for the label array as well.
(input_ids_train, token_ids_train,
 attention_masks_train, label_ids_train) = convert_to_input(X_train, y_train, 1)

(input_ids_test, token_ids_test,
 attention_masks_test, label_ids_test) = convert_to_input(X_test, y_test, 1)

(input_ids_dev, token_ids_dev,
 attention_masks_dev, label_ids_dev) = convert_to_input(X_dev, y_dev, 1)

In [None]:
#input_ids_train.shape, token_ids_train.shape, attention_masks_train.shape, label_ids_train.shape

In [None]:
#input_ids_dev.shape,   token_ids_dev.shape, attention_masks_dev.shape,     label_ids_dev.shape

## Model definition

In [None]:
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# is rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=5e-5)

# Sparse variant: targets are integer label ids rather than one-hot vectors.
# from_logits=True means the loss expects raw, un-normalised scores.
loss      = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
def create_model():
    """Build and compile the BERT-based token classification model.

    The network takes three int32 inputs of length ``MAX_LEN`` (input ids,
    token type ids, attention mask), runs them through the pretrained
    encoder, applies dropout and a final dense layer with ``num_labels``
    units, and is compiled with the module-level ``optimizer`` and ``loss``.

    Returns:
        The compiled ``keras.Model`` producing one score per label for every
        token position.
    """
    encoder = model_class.from_pretrained(
        BERT_MODEL,
        from_pt=bool(".bin" in BERT_MODEL),
        config=config,
    )

    input_ids      = Input(shape=(MAX_LEN,), dtype=tf.int32)
    token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32)
    attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32)

    # First element of the encoder output.
    # NOTE(review): for a token-classification head this is presumably the
    # per-token class scores rather than hidden states - confirm.
    token_features = encoder(
        input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]
    token_features = Dropout(_DROPOUT)(token_features)

    # No activation here: the module-level loss is built with
    # from_logits=True and applies the softmax itself. Emitting probabilities
    # would apply softmax twice and flatten the gradients. argmax-based
    # predictions are unaffected by this choice.
    tag_logits = Dense(num_labels)(token_features)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_logits],
    )
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    return model

## Model creation

In [None]:
# Pick the accelerator (TPU > GPU > CPU), build the model under the matching
# distribution strategy and set `factor_b`, the multiplier applied to
# _BACH_SIZE when training.
use_tpu  = None
use_gpu  = None
factor_b = None

try:
    # Fails when no TPU is reachable; that is the expected path on GPU/CPU
    # machines, so the error is deliberately not reported.
    tpu     = tf.distribute.cluster_resolver.TPUClusterResolver()
    use_tpu = True
except Exception:
    use_tpu = False

try:
    gpus = tf.config.experimental.list_logical_devices('GPU')
except Exception as err:
    print(err)
    gpus = []
# Decide from the list contents: the query is not expected to raise when
# there is simply no GPU, so "no exception" must not be read as "GPU found".
use_gpu = bool(gpus)

if use_tpu:
    print('tpu')
    factor_b = 8
    # Create distribution strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
elif use_gpu:
    print('gpu')
    factor_b = 0.25
    # Create distribution strategy
    if len(gpus) > 1:
        strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    else:
        strategy = tf.distribute.get_strategy()

    # Create model
    with strategy.scope():
        model = create_model()
else:
    print('cpu')
    # Same factor the GPU branch uses: CPU-only runs previously fell into
    # that branch, so the effective batch size is unchanged, and `factor_b`
    # is never left as None for the training call.
    factor_b = 0.25
    model = create_model()

model.summary()

## Training the model

In [None]:
# Train on the training split and validate on the dev split after each epoch.
train_inputs = [input_ids_train, token_ids_train, attention_masks_train]
dev_inputs = [input_ids_dev, token_ids_dev, attention_masks_dev]

history = model.fit(
    train_inputs,
    label_ids_train,
    validation_data=(dev_inputs, label_ids_dev),
    # Base batch size scaled by the device-dependent factor.
    batch_size=int(_BACH_SIZE * factor_b),
    epochs=_EPOCHS,
    verbose=1,
)

In [None]:
#tf.saved_model.save(model, export_dir=save_dir)

In [None]:
# Evaluate on the held-out test split; index 1 is the first compiled metric.
scores = model.evaluate(
    [input_ids_test, token_ids_test, attention_masks_test],
    label_ids_test,
)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

In [None]:
# Plot the recorded training / validation curves; a missing metric falls
# back to an empty list.
_curves = history.history
plot_model_performance(
    train_loss=_curves.get('loss', []),
    train_acc=_curves.get('accuracy', []),
    train_val_loss=_curves.get('val_loss', []),
    train_val_acc=_curves.get('val_accuracy', []),
)

In [None]:
# Per-token label scores for every test sentence.
prediction = model.predict(
    [input_ids_test, token_ids_test, attention_masks_test]
)

In [None]:
# Highest-scoring label id at each position (reduce over the last axis).
pred_tags = np.argmax(prediction, axis=-1)

print(pred_tags[0])

#le_dict = dict(zip(enc_tag_test.transform(enc_tag_test.classes_), enc_tag_test.classes_))
#print('predicted tags')
#print([le_dict.get(_, '[pad]') for _ in pred_tags])

In [None]:
# Convert the predicted label ids with the project helper `logits_to_tokens`.
# NOTE(review): presumably it maps every id to its tag name through le_dict
# and returns one list per sentence - confirm against utils.
y_pred = logits_to_tokens(pred_tags, le_dict)

# Inspect the converted prediction for the first test sentence.
print(y_pred[0])

In [None]:
# Convert the gold label ids of the test split with the same project helper
# used for the predictions, so both sides share one representation.
# NOTE(review): presumably ids are mapped to tag names via le_dict - confirm.
y_true = logits_to_tokens(label_ids_test, le_dict)

# Inspect the converted gold labels for the first test sentence.
print(y_true[0])

In [None]:
# Flatten the per-sentence tag sequences into one long list each.
# chain.from_iterable runs in linear time, whereas sum(list_of_lists, [])
# re-copies the growing accumulator on every step (quadratic).
li1 = list(chain.from_iterable(y_true))
li2 = list(chain.from_iterable(y_pred))

# One row per token position: gold tag next to predicted tag.
results = pd.DataFrame({'Expected': li1, 'Predicted': li2})

In [None]:
# Sequence-level metrics from seqeval, printed as percentages with one decimal.
for _template, _metric in (
    ("precision: {:.1%}", precision_score),
    ("   recall: {:.1%}", recall_score),
    (" accuracy: {:.1%}", accuracy_score),
    (" F1-score: {:.1%}", f1_score),
):
    print(_template.format(_metric(y_true, y_pred)))

In [None]:
# Token-level sklearn classification report over the flattened tags, then
# passed through the project helper report_to_df before printing.
report = eskclarep(
    y_true=results['Expected'],
    y_pred=results['Predicted'],
)
print(report_to_df(report))