In [None]:
try:
    import seqeval
    import sklearn
    import transformers
except ModuleNotFoundError as err:
    !pip install seqeval
    !pip install -U scikit-learn
    !pip install transformers

In [None]:
import os
import shutil
 
# Empty the working directory so every run starts from a clean slate.
# WARNING: destructive -- every file and folder directly under WORK_DIR is removed.
WORK_DIR = './'
for entry in os.listdir(WORK_DIR):
    # Always resolve the entry against WORK_DIR; a bare name is only correct
    # while WORK_DIR happens to be the current working directory.
    entry_path = os.path.join(WORK_DIR, entry)
    # Symlinks (even to directories) are unlinked, not followed: shutil.rmtree
    # refuses to operate on a symbolic link.
    if os.path.isdir(entry_path) and not os.path.islink(entry_path):
        shutil.rmtree(entry_path)
    else:
        os.remove(entry_path)

print(os.listdir(WORK_DIR))

In [None]:
import sys
sys.path.append('../input/libsutils')

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
import requests
import tarfile
from itertools import islice

from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report as eskclarep
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from seqeval.metrics import classification_report as seqclarep
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from itertools import chain

from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, TimeDistributed
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from sklearn import preprocessing
#from tokenizers import BertWordPieceTokenizer
from utils import build_matrix_embeddings as bme, plot_model_performance, logits_to_tokens, report_to_df
from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    TFBertForTokenClassification,
    create_optimizer)

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard
import tensorflow.keras.backend as kb

from IPython.core.display import HTML
#from crfta import CRF as crf4

import datetime, os, warnings
import random
import time
from tqdm import tqdm

In [None]:
# Silence Python warnings and reduce TensorFlow's native (C++) log output.
# NOTE(review): tensorflow is already imported by the time this cell runs; the
# log-level variable is normally read at import time -- confirm it has an effect here.
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Seed every random number generator used by the notebook for reproducibility.
# NOTE(review): PYTHONHASHSEED set at runtime presumably only affects child
# processes, not the hashing of the current interpreter -- confirm.
SEED = 42
os.environ['PYTHONHASHSEED']=str(SEED)
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'  # TF 2.1+
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Run name and the output folders derived from it (logs and saved models).
MODEL         = 'model-bert-breast_sin'
logs_base_dir = "./logs"
log_dir       = logs_base_dir + "/" + MODEL
save_base_dir = './model-save'
save_dir      = save_base_dir + "/" + MODEL

# Create the output folders; exist_ok makes the cell safe to re-run.
os.makedirs(logs_base_dir, exist_ok=True)
os.makedirs(log_dir,       exist_ok=True)
os.makedirs(save_base_dir, exist_ok=True)
os.makedirs(save_dir,      exist_ok=True)

%load_ext tensorboard

# ****** PARAMETER DEFINITIONS *********
# MAX_LEN is the fixed sequence length every BERT input/label array is padded to
# and the shape of the Keras Input layers.
# LEN_SENTS and NUM_TAGS are not referenced by the visible cells.
# NOTE(review): 45 + 3 presumably means 45 entity tags plus the 3 special label
# ids defined below (45, 46, 47) -- confirm against the dataset.
LEN_SENTS   = 267
MAX_LEN     = 486
NUM_TAGS    = 45 + 3

# ****** HYPERPARAMETER DEFINITIONS *********
# _BACH_SIZE is the base batch size; the training cell multiplies it by factor_b.
# VAL_SPLIT is passed to model.fit as validation_split; NUM_FOLDS is the KFold split count.
_EPOCHS      = 15
_DROPOUT     = 0.4
_BACH_SIZE   = 64
VAL_SPLIT    = 0.3
NUM_FOLDS    = 10

# Accumulators filled by the K-fold loop: per-fold summary metrics and
# per-fold classification-report tables.
prime_data_fold = []
total_data_fold = []

# Default BERT configuration; not referenced by the visible cells (the model
# uses the `config` loaded from BERT_MODEL instead).
configuration = BertConfig()

# Pretrained checkpoint name and the special ids used when building the arrays:
#   ESPECIAL_TOKEN (45) -> label placed at the first and last position of every sequence ('[CLS]' in le_dict)
#   SEP_TOKEN      (46) -> label given to the non-first sub-word pieces of a word ('[SEP]' in le_dict)
#   PAD_TOKEN      (47) -> label used to pad label sequences up to MAX_LEN ('[PAD]' in le_dict)
#   WORD_PAD_TOKEN (0)  -> value used to pad input id sequences up to MAX_LEN
BERT_MODEL = "bert-base-multilingual-cased"
ESPECIAL_TOKEN = 45
SEP_TOKEN      = 46
PAD_TOKEN      = 47
WORD_PAD_TOKEN = 0

# tf.data autotune constant; not referenced by the visible cells.
AUTO = tf.data.experimental.AUTOTUNE

### Se definen las funciones de carga de datos

In [None]:
def process_csv(data_path, enc_tag):
    """Load a token-per-row CSV and group it into sentences and encoded tags.

    The CSV must contain the columns ``Sentence #``, ``Word`` and ``Tag``.
    ``Sentence #`` may be present only on the first row of each sentence; the
    missing values are forward-filled before grouping.

    Args:
        data_path: Path of the CSV file (read as UTF-8).
        enc_tag: Label encoder exposing ``fit_transform`` / ``transform``.
            Pass ``''`` (or ``None``) to create a new ``LabelEncoder`` and fit
            it on this file. An encoder that is already fitted (it has a
            ``classes_`` attribute) is only applied, never re-fitted, so the
            tag -> id mapping learnt on the training data is preserved.

    Returns:
        Tuple ``(sentences, tag, enc_tag)``: a list of word lists, the
        parallel list of tag-id lists, and the encoder that was used.

    Raises:
        ValueError: raised by scikit-learn's ``LabelEncoder.transform`` when a
            fitted encoder meets a tag it was not fitted on.
    """
    df = pd.read_csv(data_path, encoding="utf-8")
    # `fillna(method="ffill")` is deprecated in recent pandas; `ffill()` is equivalent.
    df["Sentence #"] = df["Sentence #"].ffill()

    if enc_tag is None or (isinstance(enc_tag, str) and enc_tag == ''):
        enc_tag = preprocessing.LabelEncoder()

    if hasattr(enc_tag, "classes_"):
        # Already fitted: re-fitting here would silently renumber the tags
        # whenever this file holds a different set of labels.
        df["Tag"] = enc_tag.transform(df["Tag"])
    else:
        df["Tag"] = enc_tag.fit_transform(df["Tag"])

    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).tolist()
    tag = by_sentence["Tag"].apply(list).tolist()
    return sentences, tag, enc_tag

## Se cargan los datasets

In [None]:
# Load the training sentences. The empty string tells process_csv to create
# its own label encoder, which is returned as enc_tag_data.
X_data, y_data, enc_tag_data = process_csv("../input/braset-cancer/train_sentences_sin.csv", "")

## Pruebas de carga de datos

In [None]:
# Sanity check of the loaded data: the first sentence, its encoded tags and
# the label encoder object.
print(X_data[0])
print(y_data[0])
print(enc_tag_data)

#print(X_train[0][0])
#print(X_train[0][1])
#print(X_train[0][2])

In [None]:
# Gather every distinct tag id that occurs in the training labels.
list1 = y_data
tag_list = sorted({tag_id for sentence_tags in list1 for tag_id in sentence_tags})

# Add the three special label ids (mapped to '[CLS]', '[SEP]' and '[PAD]' in
# le_dict) and keep the list in ascending order.
tag_list.extend([45, 46, 47])
tag_list.sort()

In [None]:
print(tag_list)

# id -> tag name, taken from the fitted label encoder ...
le_dict = {
    tag_id: tag_name
    for tag_id, tag_name in zip(enc_tag_data.transform(enc_tag_data.classes_),
                                enc_tag_data.classes_)
}
# ... extended with the three special label ids.
le_dict.update({45: '[CLS]', 46: '[SEP]', 47: '[PAD]'})

# tag name -> id (inverse lookup of le_dict).
le_dicti = {tag_name: tag_id for tag_id, tag_name in le_dict.items()}
print(le_dicti)

In [None]:
# Number of output classes: the distinct tag ids found in the data plus the
# three special ids appended to tag_list. It is passed to the BERT config as
# num_labels and used as the width of the final Dense layer.
#print(np.shape(tag_list))
#label_map = {label: i for i, label in enumerate(tag_list)}
#num_labels = len(label_map)
num_labels = len(tag_list)
#print(label_map)
print(num_labels)

## Se carga el embedding de bert

In [None]:
# Registry of (config class, model class, tokenizer class) per architecture;
# only the "bert" entry exists and it is the one unpacked below.
MODEL_CLASSES = {"bert": (BertConfig, TFBertForTokenClassification, BertTokenizer)}
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
# Pretrained configuration, sized for this task's number of labels.
config = config_class.from_pretrained(BERT_MODEL, num_labels=num_labels)

# The checkpoint is a cased model, so the tokenizer must not lower-case the text.
tokenizer = tokenizer_class.from_pretrained(BERT_MODEL, do_lower_case=False)

## Función que convierte las entradas en entradas BERT

In [None]:
def convert_to_input(sentences, tags, in_ou_put):
    """Turn word-level sentences (and tags) into fixed-length BERT input arrays.

    Each word is split into sub-word tokens with the module-level
    ``tokenizer``. The tokens of a sentence are encoded with the special
    tokens added, truncated to ``MAX_LEN`` and post-padded to ``MAX_LEN``.

    When labels are requested, the first sub-word of every word keeps the
    word's label and the remaining sub-words receive ``SEP_TOKEN``. The first
    and last position of the sequence receive ``ESPECIAL_TOKEN`` and the
    padding positions receive ``PAD_TOKEN``.

    Args:
        sentences: Sequence of sentences, each a sequence of words.
        tags: Sequence parallel to ``sentences`` holding one label id per
            word. It is iterated together with ``sentences`` even when labels
            are not built, so it must always be supplied.
        in_ou_put: ``1`` to build the label array as well; any other value
            skips the labels.

    Returns:
        Tuple ``(input_id_list, token_type_id_list, attention_mask_list,
        label_id_list)``. The first three are int32 arrays padded to
        ``MAX_LEN``. ``label_id_list`` is an int32 array padded to ``MAX_LEN``
        when ``in_ou_put == 1`` and the integer ``0`` otherwise.
    """
    build_labels = in_ou_put == 1

    input_id_list       = []
    attention_mask_list = []
    token_type_id_list  = []
    len_tokens          = []
    len_sentences       = []
    label_id_list       = [] if build_labels else 0

    for words, word_tags in tqdm(zip(sentences, tags), total=len(tags)):
        tokens    = []
        label_ids = []

        for word, label in zip(words, word_tags):
            # A word the tokenizer reduces to nothing would still receive a
            # label and shift every following label by one position; give it
            # the unknown token so tokens and labels stay aligned.
            word_tokens = tokenizer.tokenize(str(word)) or [tokenizer.unk_token]
            tokens.extend(word_tokens)
            # Real label id for the first sub-word of the word, SEP_TOKEN for
            # the remaining sub-words.
            if build_labels:
                label_ids.extend([label] + [SEP_TOKEN] * (len(word_tokens) - 1))

        # +2 for the two special tokens added around the sentence. Computed
        # from the tokens so it also works when no labels are built.
        len_tokens.append(len(tokens) + 2)
        len_sentences.append(len(words))

        # Ask for truncation explicitly instead of relying on the tokenizer's
        # implicit fallback when only max_length is given.
        inputs = tokenizer.encode_plus(tokens,
                                       add_special_tokens=True,
                                       max_length=MAX_LEN,
                                       truncation=True)

        input_id_list.append(inputs["input_ids"])
        token_type_id_list.append(inputs["token_type_ids"])
        attention_mask_list.append(inputs["attention_mask"])

        if build_labels:
            label_ids = [ESPECIAL_TOKEN] + label_ids + [ESPECIAL_TOKEN]
            # Mirror the tokenizer's truncation (keep the head, re-append the
            # closing special position). pad_sequences would otherwise drop
            # the *start* of an over-long label sequence.
            if len(label_ids) > MAX_LEN:
                label_ids = label_ids[:MAX_LEN - 1] + [ESPECIAL_TOKEN]
            label_id_list.append(label_ids)

    input_id_list       = pad_sequences(maxlen=MAX_LEN, sequences=input_id_list,       dtype="int32", padding="post", value=WORD_PAD_TOKEN)
    token_type_id_list  = pad_sequences(maxlen=MAX_LEN, sequences=token_type_id_list,  dtype="int32", padding="post")
    attention_mask_list = pad_sequences(maxlen=MAX_LEN, sequences=attention_mask_list, dtype="int32", padding="post")

    # Longest tokenized sequence and longest sentence (in words) seen; useful
    # to judge whether MAX_LEN is large enough. default=0 covers empty input.
    print(">>> :", max(len_tokens, default=0))
    print(">>>> :", max(len_sentences, default=0))

    if build_labels:
        label_id_list = pad_sequences(maxlen=MAX_LEN, sequences=label_id_list, dtype="int32", padding="post", value=PAD_TOKEN)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list

In [None]:
# Build the padded BERT input arrays and the label array (in_ou_put=1) for the
# whole training set.
input_ids, token_ids, attention_masks, label_ids = convert_to_input(X_data, y_data, 1)

In [None]:
#input_ids_train.shape, token_ids_train.shape, attention_masks_train.shape, label_ids_train.shape

## Definición del Modelo

In [None]:
# Optimizer and loss shared by every model built with create_model().
#optimizer = keras.optimizers.Adam(learning_rate=5e-5)
optimizer = Adam(learning_rate=5e-5)
    
# Labels are integer class ids (not one-hot), hence the sparse loss.
# from_logits=True means the loss expects raw, un-normalized scores and applies
# the softmax itself.
#loss      = tf.keras.losses.BinaryCrossentropy(from_logits = True)
#loss      = tf.keras.losses.CategoricalCrossentropy(from_logits = True)
loss      = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
def create_model():
    """Build and compile the BERT-based token-classification model.

    The pretrained encoder (``model_class`` loaded from ``BERT_MODEL`` with the
    module-level ``config``) receives three int32 inputs of length ``MAX_LEN``:
    input ids, token type ids and attention mask. Its first output goes
    through dropout and a final ``Dense`` layer with ``num_labels`` units.

    The model is compiled with the module-level ``optimizer`` and ``loss``.

    Returns:
        A compiled ``keras.Model`` whose single output holds, for every token
        position, ``num_labels`` raw scores (logits).
    """
    encoder = model_class.from_pretrained(BERT_MODEL,
                                          from_pt=bool(".bin" in BERT_MODEL),
                                          config=config)

    input_ids      = Input(shape=(MAX_LEN,), dtype=tf.int32)
    token_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32)
    attention_mask = Input(shape=(MAX_LEN,), dtype=tf.int32)

    # NOTE(review): model_class is TFBertForTokenClassification, whose first
    # output is presumably already a per-token classification output; the Dense
    # layer below then stacks a second classifier on top -- confirm this is intended.
    embedding = encoder(
        input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    embedding = Dropout(_DROPOUT)(embedding)
    # No activation here: the loss is created with from_logits=True and applies
    # the softmax itself. A softmax layer would make the loss normalize twice,
    # which flattens the gradients. argmax-based decoding is unaffected.
    tag_logits = Dense(num_labels)(embedding)

    model = keras.Model(
        inputs  = [input_ids, token_type_ids, attention_mask],
        outputs = [tag_logits],
    )

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    return model

## Creación del modelo

In [None]:
# Detect the available accelerator, pick the matching distribution strategy
# and build the model under it. `factor_b` is the multiplier applied to
# _BACH_SIZE by the training cell and is set on every path.
use_tpu  = None
use_gpu  = None
factor_b = None
tpu      = None
gpus     = []

try:
    #tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect(tpu="local")
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')
    use_tpu  = True
except Exception:
    # No usable TPU; Exception (not a bare except) lets Ctrl-C through.
    use_tpu  = False

try:
    gpus     = tf.config.experimental.list_logical_devices('GPU')
    # Listing devices succeeds with an empty list on machines without a GPU,
    # so the list itself has to be checked.
    use_gpu  = len(gpus) > 0
except Exception:
    gpus     = []
    use_gpu  = False

if use_tpu:
    print('tpu')
    #factor_b = 4
    # Create distribution strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    #tf.distribute.experimental.TPUStrategy(tpu)

    #print("All devices: ", tf.config.list_logical_devices('TPU'))

    strategy = tf.distribute.TPUStrategy(tpu)

    tf.keras.backend.clear_session()
    # Create model
    with strategy.scope():
        # Scale the batch with the number of replicas.
        factor_b = strategy.num_replicas_in_sync
        model = create_model()
elif use_gpu:
    print('gpu')
    factor_b = 0.25
    # Create distribution strategy
    if len(gpus) > 1:
        strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    else:
        strategy = tf.distribute.get_strategy()

    # Create model
    with strategy.scope():
        model = create_model()
else:
    print('cpu')
    # Same reduced batch factor as the single-GPU path, which is the path a
    # CPU-only machine used to take because use_gpu was always True.
    factor_b = 0.25
    strategy = tf.distribute.get_strategy()
    with strategy.scope():
        model = create_model()

model.summary()

### Funcion de Plot

In [None]:
# ### Convertir el reporte a tabla
def report_to_df2(report, name_model):
    """Convert a text classification report into a DataFrame of per-class rows.

    The first line of ``report`` supplies the metric column names. The last
    five lines (the blank separator, the three summary rows and the trailing
    empty line of a scikit-learn report) are skipped, as are blank lines.

    Args:
        report: Report text as returned by ``classification_report``.
        name_model: Value stored in the ``model`` column of every row.

    Returns:
        DataFrame with the columns ``Class Name``, the metric names found in
        the header, and ``model``. All cells are strings.
    """
    lines = [line.split() for line in report.split('\n')]
    metric_names = lines[0]
    header = ['Class Name'] + metric_names + ['model']
    n_metrics = len(metric_names)

    values = []
    for tokens in lines[1:-5]:
        # Skip blank lines *before* appending the model name; otherwise they
        # turn into junk rows containing only the model name.
        if not tokens:
            continue
        # A class name containing spaces is split into several tokens; glue
        # everything in front of the metric values back together.
        if n_metrics and len(tokens) > n_metrics + 1:
            tokens = [' '.join(tokens[:-n_metrics])] + tokens[-n_metrics:]
        values.append(tokens + [name_model])

    return pd.DataFrame(data=values, columns=header)

## Entrenamiento del modelo

In [None]:
# K-fold cross-validation: train on each training split, evaluate on the
# held-out fold, and store the fold's summary metrics and per-class report.
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

# Snapshot of the freshly built model. Every fold restarts from these weights;
# without the reset the model keeps training across folds and each held-out
# fold has already been seen during earlier folds, inflating the scores.
# NOTE(review): the optimizer's internal state is still carried over between folds.
initial_weights = model.get_weights()

# factor_b can be None if hardware detection did not set it; fall back to the
# unscaled batch size instead of failing on None arithmetic.
batch_size = max(1, int(_BACH_SIZE * (factor_b if factor_b is not None else 1)))

fold_no = 1
for train_index, test_index in kfold.split(input_ids):
    print('*'* 100)
    print('*', ' '*40, '--fold--', fold_no, ' '*44, '*')
    print('*'* 100)

    model.set_weights(initial_weights)

    test_inputs = [input_ids[test_index], token_ids[test_index], attention_masks[test_index]]

    history = model.fit([input_ids[train_index],
                         token_ids[train_index],
                         attention_masks[train_index]],
                        label_ids[train_index],
                        validation_split = VAL_SPLIT,
                        batch_size       = batch_size,
                        epochs           = _EPOCHS,
                        verbose          = 1)

    scores = model.evaluate(test_inputs, label_ids[test_index])

    # scores[0] is the loss (reported as-is), scores[1] the accuracy (as a percentage).
    print("\neval: ",
          f"{model.metrics_names[0]}: {scores[0]}",
          f"{model.metrics_names[1]}: {scores[1] * 100}","\n")

    # Learning curves are plotted for the first fold only.
    if fold_no == 1:
        print('>>>>>>', fold_no)
        plot_model_performance(
            train_loss     = history.history.get('loss', []),
            train_acc      = history.history.get('accuracy', []),
            train_val_loss = history.history.get('val_loss', []),
            train_val_acc  = history.history.get('val_accuracy', [])
        )

    prediction = model.predict(test_inputs)

    # Most likely class id for every token position.
    pred_tags = np.argmax(prediction,-1)

    # logits_to_tokens comes from the project's utils module; presumably it
    # maps the id arrays to tag-name sequences via le_dict -- verify there.
    y_pred = logits_to_tokens(pred_tags, le_dict)
    y_true = logits_to_tokens(label_ids[test_index], le_dict)

    # Flatten in linear time; sum(list_of_lists, []) is quadratic.
    li1 = list(chain.from_iterable(y_true))
    li2 = list(chain.from_iterable(y_pred))

    results = pd.DataFrame({'Expected': li1, 'Predicted': li2})

    # Compute each sequence-level metric once and reuse it for both the
    # printed output and the stored summary.
    info = [
        "precision: {:.1%}".format(precision_score(y_true, y_pred)),
        "   recall: {:.1%}".format(recall_score(y_true,    y_pred)),
        " accuracy: {:.1%}".format(accuracy_score(y_true,  y_pred)),
        " F1-score: {:.1%}".format(f1_score(y_true,        y_pred)),
    ]
    print(*info[:-1], sep="\n")
    print(info[-1], "\n")

    prime_data_fold.append(info)

    # Token-level per-class report, converted to a table once.
    report = eskclarep(results['Expected'], results['Predicted'])
    fold_report = report_to_df2(report, '-fol' + str(fold_no))

    total_data_fold.append(fold_report)

    print(fold_report, "\n")

    # Increase fold number
    fold_no += 1

### Se almacenan las estadísticas

In [None]:
# Persist the per-fold statistics. `with` guarantees the files are closed even
# if pickling fails midway.
with open('./prime_data_fold-10k-model-08_sin.pkl', 'wb') as output1:
    pickle.dump(prime_data_fold, output1)

with open('./total_data_fold-10k-model-08_sin.pkl', 'wb') as output2:
    pickle.dump(total_data_fold, output2)

## se almacena el modelo

In [None]:
# Save options that route the save I/O through the local host.
# NOTE(review): presumably needed so the export works when the model runs on a
# remote accelerator such as a TPU -- confirm.
save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')

#tf.saved_model.save(model, export_dir=save_dir)
#tf.saved_model.save(model, export_dir=save_locally)

# The model is written to './model'; the save_dir created in the configuration
# cell is not used here.
model.save('./model', options=save_locally)

In [None]:
# Load the validation sentences, passing in the label encoder obtained from the
# training data.
X_data_val, y_data_val, enc_tag_data_val = process_csv("../input/braset-cancer/sentences_validation_sin.csv", enc_tag_data)

In [None]:
# Build the padded BERT input arrays and the label array (in_ou_put=1) for the
# validation set.
input_ids_val, token_ids_val, attention_masks_val, label_ids_val = convert_to_input(X_data_val, y_data_val, 1)

In [None]:
# Predict on the validation set and keep the most likely class id per position.
prediction_val = model.predict([input_ids_val, token_ids_val, attention_masks_val])

pred_tags_val = np.argmax(prediction_val, -1)

# logits_to_tokens comes from the project's utils module; presumably it maps
# the id arrays to tag-name sequences via le_dict -- verify there.
y_pred_val = logits_to_tokens(pred_tags_val, le_dict)
y_true_val = logits_to_tokens(label_ids_val, le_dict)

# Flatten to one tag per position in linear time; sum(list_of_lists, [])
# copies the growing list at every step and is quadratic.
li1_val = list(chain.from_iterable(y_true_val))
li2_val = list(chain.from_iterable(y_pred_val))

In [None]:
# Expected vs. predicted tag for every position of the validation set.
results_val = pd.DataFrame({
    'Expected':  li1_val,
    'Predicted': li2_val,
})

In [None]:
# Write the validation expected/predicted table to a CSV file in the working directory.
results_val.to_csv('file_bert_val.csv')

In [None]:
# Predict on the complete training set and keep the most likely class id per position.
prediction_total = model.predict([input_ids,
                                  token_ids,
                                  attention_masks])

pred_tags_total = np.argmax(prediction_total, -1)

# logits_to_tokens comes from the project's utils module; presumably it maps
# the id arrays to tag-name sequences via le_dict -- verify there.
y_pred_total = logits_to_tokens(pred_tags_total, le_dict)
y_true_total = logits_to_tokens(label_ids, le_dict)

# Flatten to one tag per position in linear time; sum(list_of_lists, [])
# copies the growing list at every step and is quadratic.
li1_total = list(chain.from_iterable(y_true_total))
li2_total = list(chain.from_iterable(y_pred_total))

In [None]:
# Expected vs. predicted tag for every position of the full training set.
results_total = pd.DataFrame({
    'Expected':  li1_total,
    'Predicted': li2_total,
})

In [None]:
# Write the full-training-set expected/predicted table to a CSV file in the working directory.
results_total.to_csv('file_bert2_val_sin.csv')