# TRAIN & SAVE / LOAD & RETRAIN BERT-NER model

This notebook loads a BIO-annotated dataset and performs all the data preparation required for BERT-NER model training.
It can also be used to load a previously trained model in order to continue training it.


input parameters:
- filename_sa => annotated dataset to be used as 'supervised annotation'
- filename_da => annotated dataset to be used as 'distant annotation'
- training_source => defines what file will be used for training. Possible values: 'sa' or 'da'.
- validation_source => defines what file will be used for validation. Possible values: 'sa' or 'da'.
- save_model => defines whether the model checkpoint will be saved in a local folder. Possible values: 'yes' or 'no'.
- MODEL_SOURCE => defines whether the model will be loaded from the Transformers library or from a saved model.
Possible values: 'Transformers' or 'Saved_model'. If 'Saved_model', the parameter 'selected_model' must also be set (e.g. 'modelname.checkpoint').

output:
- model.checkpoint file
- hyperparameters file
- evaluation metrics

## Set execution parameters:

In [None]:
# EXECUTION PARAMETERS:
# All user-tunable settings for a run live in this cell; the rest of the
# notebook reads them as module-level names.

#-----------------------------
# base folder:
#-----------------------------
# Folder prefix that is concatenated with the dataset filenames below.
base_folder = 'training-datasets/'


#-----------------------------
# dataset source files:
#-----------------------------
# Supervised-annotation dataset (CSV).
filename_sa = 'scw_01-23_sa_v6.csv'


# Distant-annotation dataset (CSV). Alternatives are kept commented out;
# exactly one assignment should be active.
# filename_da = '2021-03-21_14-02-50_01-272_INFER_PL1.csv'
# filename_da = 'scw_01_23_da.csv'
# filename_da = 'scw_24_49_da.csv'
# filename_da = 'scw_50_99_da.csv'
# filename_da = 'scw_100_149_da.csv'
# filename_da = 'scw_1-149_220-272_da.csv'
# filename_da = 'scw_nhve_1-149_220-272_da.csv'
# filename_da = 'scw_220_272_da.csv'
filename_da = '2021-03-22_01-22-44_nhve_scw_220_272_INFER.csv'

#-----------------------------
# select datasources for current execution:
#-----------------------------
# Possible values for both: 'sa' (supervised) or 'da' (distant).
training_source = 'sa'
validation_source = 'da'

#-----------------------------
# SAVE MODEL
#-----------------------------
# 'yes' or 'no'
save_model = 'no'
# save_model = 'yes'

#-----------------------------
# TRAINING PARAMETERS
#-----------------------------
max_len = 256   # Max sequence length (in word pieces) after padding/truncation
bs = 16         # Batch size: 2, 4, 8, 16
batch_num = bs  # Alias of the batch size
test_size = 0.3 # Fraction of each dataset held out for validation
lr = 5e-5       # Learning rate: 1e-5, 2e-5, 3e-5, 5e-5
eps = 1e-8      # EPS (presumably the optimizer epsilon; confirm where the optimizer is built)
epochs = 3     # Epochs: 2,3,4,5

#-----------------------------
# Set fine-tuning depth:
#-----------------------------
FULL_FINETUNING = True   # True: fine tuning all the layers
# FULL_FINETUNING = False # False: only fine tuning the classifier layers

#-----------------------------
# MODEL TRAIN CODE VERSION:
#-----------------------------
# train_code = 'OLD_CODE'
train_code = 'NEW_CODE'

#-----------------------------
# SET MODEL SOURCE
#-----------------------------
# 'Transformers' or 'Saved_model'; selected_model is only relevant for
# 'Saved_model'.
MODEL_SOURCE = 'Transformers'
selected_model = ''

# MODEL_SOURCE = 'Saved_model'
# selected_model = '2021-03-21_14-02-50.checkpoint'

#-----------------------------
# TRAIN MODEL:
#-----------------------------
# 'yes' or 'no'
TRAIN_MODEL = 'yes'
# TRAIN_MODEL = 'no'

#-----------------------------
# GET TRAINING DATA
#-----------------------------
# 'yes' makes the dataset-loading cells further down run.
get_training_data = 'yes'
# get_training_data = 'no'

### In case of loading a saved model, select the model to be loaded:

In [None]:
# Only relevant when continuing training from a locally saved model.
# NOTE(review): get_latest_saved_model() is defined in a later cell of this
# notebook, so on a fresh kernel that cell must be run before this one.
if MODEL_SOURCE == 'Saved_model':
    # LAST MODEL SAVED:
    last_saved_model = get_latest_saved_model()
    # Keep the first 19 characters of the filename (the length of a
    # 'YYYY-MM-DD_HH-MM-SS' timestamp) and append the checkpoint extension.
    last_saved_model = last_saved_model[:19]+'.checkpoint'
    print('Last saved model: ')
    print(last_saved_model)

    # ALL MODELS SAVED:
    MODELS_FOLDER = 'models_saved/'
    print('All saved models: ')
    # IPython shell escape: lists the content of MODELS_FOLDER.
    !ls "{MODELS_FOLDER}"

### Print execution parameters:

In [None]:
# Print a summary of the execution parameters chosen for this run.
import datetime as _dt

print("Execution Summary:")
# The timestamp is computed inline on purpose: this cell sits above the cell
# that defines get_current_datetime(), so calling that helper here raised a
# NameError on a fresh top-to-bottom run. The format is the same
# 'YYYY-MM-DD HH:MM:SS' string the helper returns.
print("Datetime: ", _dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print("SUPERVISED Annotated Dataset: ", filename_sa)
print("DISTANT Annotated Dataset: ", filename_da)
print("Training data source: ", training_source)
print("Validation data source: ", validation_source)
print('train/valid. split: ',test_size)
print('save_model: ', save_model)
print('Max length:' , max_len)
print('Batch Size: ', bs )
# Report the variable that the label names (it was printing `bs` before).
print('batch_num: ', batch_num)
print('learning_rate: ', lr)
print('eps: ', eps)
print('Epochs: ', epochs)
print('FULL_FINETUNING', FULL_FINETUNING)
print('MODEL_SOURCE', MODEL_SOURCE)
print('selected_model', selected_model)
print('Train_code: ', train_code)
print('Train model: ', TRAIN_MODEL)
print('get_training_data', get_training_data)

## 1) INSTALL LIBRARIES 

In [4]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'

Keras                      2.4.3
Keras-Preprocessing        1.1.2
pytorch-pretrained-bert    0.6.2
pytorch-transformers       1.2.0
sagemaker-pytorch-training 1.3.3
torch                      1.4.0
torchvision                0.5.0
transformers               4.2.2


In [3]:
# pip -q install -r requirements_train.txt

Note: you may need to restart the kernel to use updated packages.


#### If all required packages are already installed, set 'pip_install' = 'no'; otherwise set it to 'yes' to install them

In [9]:
# pip_install = 'yes' 
pip_install = 'yes'

#### Install required packages

In [17]:
if pip_install == 'yes': 
    print('installing seqeval')
    !pip install seqeval --quiet
    print('installing matplotlib')
    !pip install matplotlib --quiet
    print('installing seaborn')
    !pip install seaborn --quiet
    

import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report,accuracy_score,f1_score
import matplotlib.pyplot as plt
import seaborn as sns
# % matplotlib inline

if pip_install == 'yes': 
    print('installing transformers')
#     !pip install transformers  #transformers v3.x
    !pip install --upgrade transformers==4.2.2 --quiet
    !pip install transformers[sentencepiece] --quiet #transformers v4.x

    # INSTALLS FOR AWS SAGEMAKER STUDIO PY
    print('installing tensorflow')
    !pip install tensorflow --quiet
    print('installing keras')
    !pip install keras --quiet
    
import torch
import os
from tqdm import tqdm,trange
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
        # from transformers import BertTokenizer, BertConfig
        # from transformers import BertForTokenClassification, AdamW

# Check library version
print('')
print('Packages installed: ')
!pip list | grep -E 'transformers|torch|Keras'

installing seqeval
installing matplotlib
installing seaborn
installing transformers
installing tensorflow
installing keras

Packages installed: 
Keras                      2.4.3
Keras-Preprocessing        1.1.2
pytorch-pretrained-bert    0.6.2
pytorch-transformers       1.2.0
sagemaker-pytorch-training 1.3.3
torch                      1.4.0
torchvision                0.5.0
transformers               4.2.2


In [15]:
if pip_install == 'yes': 
    print('installing pytorch-pretrained-bert')
    !pip install pytorch-pretrained-bert --quiet
    print('installing pytorch-transformers')
    !pip install pytorch-transformers --quiet
    
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# from pytorch_pretrained_bert import BertTokenizer, BertConfig
# from pytorch_pretrained_bert import BertForTokenClassification, BertAdam, BertConfig
from pytorch_pretrained_bert import BertAdam
# from transformers import BertTokenizer, BertConfig
# from pytorch_transformers import BertModel, BertTokenizer, BertForTokenClassification, BertConfig
# from transformers import BertModel, BertTokenizer, BertForTokenClassification, BertConfig
import logging

if pip_install == 'yes': 
    print('installing ipywidgets')
    !pip install ipywidgets --quiet
    print('installing ipython notebook')
    !pip install ipython notebook --quiet 
    !jupyter nbextension enable --py widgetsnbextension
    
from ipywidgets import IntProgress

installing pytorch-pretrained-bert
installing pytorch-transformers
installing ipywidgets
installing ipython notebook
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [16]:
# RUN THIS CELL TWICE IF ERROR (IF STILL ERROR RESTART THE ENVIRONMENT)
from transformers import BertModel, BertTokenizer, BertForTokenClassification, BertConfig
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# 2) RUN FUNCTIONS

In [None]:
def execution_with_breakpoints(yesno):
    """Translate a 'yes'/'no' switch into the notebook's execution-mode string.

    Returns 'with_breakpoints' only for the exact value 'yes'; every other
    value yields 'no_breakpoints'.
    """
    return 'with_breakpoints' if yesno == 'yes' else 'no_breakpoints'

In [None]:
class ExecutionBreakpoint(NameError):
    """Raised by check_breakpoint() to halt a notebook run at a marked cell.

    It derives from NameError because the previous implementation stopped the
    run by evaluating an undefined name; any existing ``except NameError``
    handler therefore keeps working.
    """


def check_breakpoint(execution_mode):
    """Abort execution at this point when breakpoints are enabled.

    Args:
        execution_mode: 'with_breakpoints' to stop here; any other value
            makes the call a no-op.

    Raises:
        ExecutionBreakpoint: when ``execution_mode == 'with_breakpoints'``.
    """
    if execution_mode == 'with_breakpoints':
        # Raise explicitly instead of relying on an undefined name, which
        # would silently stop working if a variable called `stop` ever existed.
        raise ExecutionBreakpoint(
            "stop: breakpoint reached (execution_mode='with_breakpoints')")

In [None]:
class SentenceGetter(object):
    """Group a token-per-row BIO DataFrame into sentences.

    ``data`` must provide the columns "Sentence", "Word", "POS" and "Tag".
    After construction:

    - ``grouped`` maps each "Sentence" value to its list of
      ``(word, pos, tag)`` tuples;
    - ``sentences`` is the list of those per-sentence lists, in group order.
    """

    def __init__(self, data):
        self.n_sent = 1  # 1-based cursor used by get_next()
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence keyed "Sentence: <n>", or None when absent.

        The cursor only advances when a sentence was found.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and must not be swallowed.
            return None
        self.n_sent += 1
        return s

In [None]:
def turn_BIO_data_into_sentences(data):
    """Convert a BIO DataFrame into parallel sentence and label lists.

    Returns a tuple ``(sentences, sentences_sbw, labels)``:

    - ``sentences``: each sentence as one space-joined string;
    - ``sentences_sbw``: each sentence as a list of its words;
    - ``labels``: each sentence as a list of its tags.
    """
    grouped_sentences = SentenceGetter(data).sentences

    # Each grouped entry is a (word, pos, tag) tuple.
    sentences_sbw = [[entry[0] for entry in sent] for sent in grouped_sentences]
    labels = [[entry[2] for entry in sent] for sent in grouped_sentences]
    sentences = [" ".join(words) for words in sentences_sbw]

    return sentences, sentences_sbw, labels

In [19]:
def tag_values_tag2idx_idx2tag_tag2name(data):
    """Build the tag vocabulary and its lookup tables from ``data["Tag"]``.

    The distinct tags are sorted and followed by four extra entries:
    'X' (word-piece support), '[CLS]' and '[SEP]' (needed by BERT) and 'PAD'.

    Returns ``(tags_vals, tag_values, tag2idx, idx2tag, tag2name)`` where
    ``tag_values`` is the same list object as ``tags_vals`` and ``tag2name``
    maps index -> tag, like ``idx2tag``.
    """
    tags_vals = sorted(set(data["Tag"].values))
    tags_vals += ['X', '[CLS]', '[SEP]', "PAD"]

    tag2idx = {tag: position for position, tag in enumerate(tags_vals)}
    idx2tag = dict(enumerate(tags_vals))

    print('tags_vals: ', tags_vals)
    print('tag2idx: ', tag2idx)
    print('idx2tag: ', idx2tag)

    tag_values = tags_vals

    # Mapping tag index to tag name
    tag2name = {position: tag for tag, position in tag2idx.items()}
    print('tag2name: ', tag2name)

    return tags_vals, tag_values, tag2idx, idx2tag, tag2name

In [25]:
## unit testing
# tags_vals, tag_values, tag2idx, idx2tag, tag2name = tag_values_tag2idx_idx2tag_tag2name(data)
# tags_vals, tag_values, tag2idx, idx2tag, tag2name

In [None]:
# TEXT TOKENIZATION and EXTENSION OF LABELS FOR SPLITTED TOKENS
def tokenize_texts_extend_labels(sentences, labels, word_tokenizer=None):
    """Split every word into word pieces and repeat its label per piece.

    Args:
        sentences: iterable of whitespace-separated sentence strings.
        labels: iterable of per-sentence label lists, one label per word.
        word_tokenizer: object exposing ``tokenize(word) -> list``. When
            omitted, the module-level ``tokenizer`` is used (the previous
            behaviour).

    Returns:
        ``(tokenized_texts, tokenized_labels)``: two lists with one entry per
        sentence, holding the word pieces and their matching labels.
    """
    if word_tokenizer is None:
        word_tokenizer = tokenizer

    tokenized_texts = []
    tokenized_labels = []
    # `sentence_labels` deliberately does not reuse the name of the `labels`
    # parameter, which the earlier version rebound inside this loop.
    for sent, sentence_labels in zip(sentences, labels):
        tokenized_sentence = []
        extended_labels = []

        for word, label in zip(sent.split(), sentence_labels):
            # Tokenize the word; it may be broken into several subwords.
            tokenized_word = word_tokenizer.tokenize(word)

            tokenized_sentence.extend(tokenized_word)
            # Repeat the word's label once per subword.
            extended_labels.extend([label] * len(tokenized_word))

        tokenized_texts.append(tokenized_sentence)
        tokenized_labels.append(extended_labels)

    return tokenized_texts, tokenized_labels

# tokenized_texts, tokenized_labels = tokenize_texts_extend_labels(sentences, labels)

In [None]:
def _split_into_word_pieces(tokenizer, sentences, labels):
    """Tokenize each word with ``tokenizer`` and repeat its label per piece."""
    tokenized_texts, tokenized_labels = [], []
    for sent, sentence_labels in zip(sentences, labels):
        pieces, piece_labels = [], []
        for word, label in zip(sent.split(), sentence_labels):
            word_pieces = tokenizer.tokenize(word)
            pieces.extend(word_pieces)
            piece_labels.extend([label] * len(word_pieces))
        tokenized_texts.append(pieces)
        tokenized_labels.append(piece_labels)
    return tokenized_texts, tokenized_labels


def tokenize_texts_and_labels(tokenizer, max_len, sentences, labels, tag2idx):
    """Tokenize sentences and build padded model inputs.

    Args:
        tokenizer: tokenizer exposing ``tokenize`` and
            ``convert_tokens_to_ids``. It is now actually used; previously
            the argument was discarded and 'bert-base-cased' was reloaded on
            every call. Passing ``None`` still loads 'bert-base-cased'.
        max_len: length every sequence is padded or truncated to.
        sentences: whitespace-separated sentence strings.
        labels: per-sentence label lists, one label per word.
        tag2idx: mapping from label to integer id; must contain "PAD".

    Returns:
        ``(tokenized_texts, tokenized_labels, word_piece_labels, input_ids,
        tags, attention_masks)``. ``word_piece_labels`` is the same object as
        ``tokenized_labels``.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

    # TOKENIZE TEXTS and LABELS with the same tokenizer that converts the
    # tokens to ids below, so both steps share one vocabulary.
    tokenized_texts, tokenized_labels = _split_into_word_pieces(
        tokenizer, sentences, labels)

    # Alias kept for compatibility with previous versions of the code.
    word_piece_labels = tokenized_labels

    # INPUT IDs: pad with 0 after the real tokens.
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=max_len, dtype="long", value=0.0,
                              truncating="post", padding="post")

    # TAGS: pad with the id of the "PAD" tag.
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in tokenized_labels],
                         maxlen=max_len, value=tag2idx["PAD"], padding="post",
                         dtype="long", truncating="post")

    # ATTENTION MASKS: 1.0 for real tokens, 0.0 for padding (id 0).
    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

    return tokenized_texts, tokenized_labels, word_piece_labels, input_ids, tags, attention_masks


In [None]:
# PRINT TOKENS OF A GIVEN SENTENCE:

def print_tokens_of_a_given_sentence(n):
    """Print every intermediate representation of sentence number ``n``.

    Reads the module-level ``tokenized_texts``, ``labels``,
    ``word_piece_labels``, ``input_ids``, ``tags`` and ``attention_masks``.
    For each of them it prints a heading, the length of entry ``n``, the
    entry itself and a 160-character separator line.
    """
    # Each source is looked up lazily, right after its heading is printed.
    sections = (
        ("TOKENIZED TEXT:", lambda: tokenized_texts),
        ("TOKEN LABELS:", lambda: labels),
        ('- word_piece_labels:', lambda: word_piece_labels),
        ("input_ids:", lambda: input_ids),
        ('tags', lambda: tags),
        ('attention_masks', lambda: attention_masks),
    )

    for heading, fetch in sections:
        print(heading)
        source = fetch()
        print(len(source[n]))
        print(source[n])
        print("-"*160)

    return

In [None]:
### Define Validation Metrics:

# Import f1_score from sequeval.metrics
from seqeval.metrics import f1_score, accuracy_score

# define function to flatten accuracy
def flat_accuracy(preds, labels):
    """Return the fraction of positions where the argmax prediction matches.

    ``preds`` is reduced with argmax over axis 2 and compared element-wise
    with ``labels`` after both are flattened. Every position counts.
    """
    predicted = np.argmax(preds, axis=2).reshape(-1)
    expected = labels.reshape(-1)
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [None]:
def plot_training_validation_loss(loss_values, validation_loss_values):
    """Plot the training and validation loss curves on one figure.

    Args:
        loss_values: sequence of training losses (plotted in blue).
        validation_loss_values: sequence of validation losses (plotted in red).

    The x axis is labelled "Epoch", so one value per epoch is presumably
    expected (confirm against the training loop).

    NOTE(review): the ``%matplotlib inline`` line below is an IPython magic,
    so this function only parses inside a notebook/IPython session.
    """
    import matplotlib.pyplot as plt
    %matplotlib inline

    import seaborn as sns

    # Use plot styling from seaborn.
    sns.set(style='darkgrid')

    # Increase the plot size and font size.
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12,6)

    # Plot the learning curve.
    plt.plot(loss_values, 'b-o', label="training loss")
    plt.plot(validation_loss_values, 'r-o', label="validation loss")

    # Label the plot.
    plt.title("Learning curve")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    plt.show()
    
    return

In [None]:
# PLOT CONFUSION MATRIX:
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix'):
    """Draw the confusion matrix ``cm`` as an annotated image.

    Args:
        cm: 2-D array of counts; rows are drawn as true labels and columns
            as predicted labels.
        classes: tick labels used on both axes.
        title: figure title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=None)
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=0)
    plt.yticks(tick_positions, classes)

    # Cells above half of the maximum get black text, the others white.
    half_of_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            if cm[row, col] > half_of_max:
                text_color = "black"
            else:
                text_color = "white"
            plt.text(col, row, cm[row, col],
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

def plot_matrix(y_true, y_pred):
    """Compute the confusion matrix of ``y_true`` vs ``y_pred`` and plot it.

    NOTE(review): the tick labels are fixed to B-PROD / I-PROD / O, while the
    matrix is built from whatever labels occur in the inputs; they only line
    up when exactly those three labels are present. Confirm with callers.
    """
    from sklearn.metrics import confusion_matrix as compute_confusion_matrix

    class_names = ["B-PROD", "I-PROD", "O"]
    matrix = compute_confusion_matrix(y_true, y_pred)
    plot_confusion_matrix(matrix, classes=class_names, title='Confusion matrix')
    
def get_confusion_matrix(y_true, y_pred):
    """Return ``(confusion_matrix, class_names)`` for the given label lists.

    ``class_names`` is the fixed list B-PROD / I-PROD / O.
    NOTE(review): the matrix itself is built from the labels found in the
    inputs, so it only matches ``class_names`` when exactly those three
    labels occur. Confirm with callers.
    """
    from sklearn.metrics import confusion_matrix as compute_confusion_matrix

    class_names = ["B-PROD", "I-PROD", "O"]
    return compute_confusion_matrix(y_true, y_pred), class_names

In [None]:
def flatten_true_pred_labels(y_true, y_pred):
    """Flatten two lists of per-sentence label lists into two flat lists.

    Returns ``(y_true_flat, y_pred_flat)``; the order of the labels is kept.
    """
    y_true_flat = [tag for sentence in y_true for tag in sentence]
    y_pred_flat = [tag for sentence in y_pred for tag in sentence]
    return y_true_flat, y_pred_flat
    

In [None]:
def get_current_datetime():
    """Return the current local date and time as 'YYYY-MM-DD HH:MM:SS'."""
    from datetime import datetime as _datetime

    # Seconds precision: the fractional part is dropped.
    return _datetime.now().isoformat(sep=" ", timespec="seconds")

In [None]:
# SAVE TRAINED MODEL

def save_trained_model(BASE_FOLDER):
    """Save the module-level ``model`` and ``tokenizer`` into a timestamped folder.

    The target folder is ``BASE_FOLDER`` concatenated with the current
    timestamp (':' replaced by '-'). It receives ``pytorch_model.bin``
    (state dict), ``config.json`` and the tokenizer vocabulary.

    Args:
        BASE_FOLDER: path prefix for the output folder; it is concatenated
            as-is, so it should end with a path separator.

    Returns:
        None. A confirmation line naming the output folder is printed.
    """
    stamp = get_current_datetime().replace(':', '-')

    # Output folder for this save.
    bert_out_address = BASE_FOLDER + str(stamp)

    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(bert_out_address, exist_ok=True)

    # Unwrap the model when it has a `.module` attribute so only the model
    # itself is saved.
    model_to_save = model.module if hasattr(model, 'module') else model

    # Using the predefined names allows loading with `from_pretrained`.
    output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
    output_config_file = os.path.join(bert_out_address, "config.json")

    torch.save(model_to_save.state_dict(), output_model_file)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(bert_out_address)

    # Report the folder actually written instead of a hard-coded name.
    print('model ', stamp, ' saved to folder ', bert_out_address)

In [None]:
# def save_trained_model_checkpoint(BASE_FOLDER, model, epoch, loss):
#     datetime = get_current_datetime()
#     datetime = datetime.replace(':', '-')

#     # print(ts)
#     # print(datetime)

#     #set bert_out_address 
#     PATH = BASE_FOLDER+str(datetime)

#     # Make dir if not exits
#     if not os.path.exists(bert_out_address):
#             os.makedirs(bert_out_address)
    
    
#     torch.save({
#             'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'loss': loss,
#             ...
#             }, PATH)

#     # Save a trained model, configuration and tokenizer
#     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

#     # If we save using the predefined names, we can load using `from_pretrained`
#     output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
#     output_config_file = os.path.join(bert_out_address, "config.json")

#     # Save model into file
#     torch.save(model_to_save.state_dict(), output_model_file)
#     model_to_save.config.to_json_file(output_config_file)
#     tokenizer.save_vocabulary(bert_out_address)

#     return print('model ', datetime, ' saved to folder models_saved/')

In [None]:
def get_latest_saved_model(models_folder='models_saved/',
                           pattern='[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]*'):
    """Return the name of the most recently changed saved model.

    Args:
        models_folder: folder to search (default 'models_saved/').
        pattern: glob pattern for model entries inside that folder. The
            default matches names starting with a 'YYYY-MM-DD' date of any
            year; the search used to be hard-wired to names starting '2021'.

    Returns:
        The file or folder name, without its directory, of the matching
        entry with the greatest ``os.path.getctime``.

    Raises:
        ValueError: when nothing matches. The exception type is unchanged
            from the earlier ``max()`` failure; the message now explains it.
    """
    import glob
    import os.path

    # Escape the folder so glob metacharacters in its path are taken literally.
    candidates = glob.glob(os.path.join(glob.escape(models_folder), pattern))
    if not candidates:
        raise ValueError(
            "no saved model matching {!r} found in {!r}".format(pattern, models_folder))

    latest_file = max(candidates, key=os.path.getctime)
    return os.path.basename(latest_file)

In [None]:
def load_model(BASE_FOLDER, model_name, num_labels=7):
    """Load a saved BertForTokenClassification model.

    Args:
        BASE_FOLDER: path prefix; concatenated as-is with ``model_name``.
        model_name: name of the saved model inside ``BASE_FOLDER``.
        num_labels: size of the classification head. The default of 7 is the
            value that used to be hard-coded (presumably the three data tags
            plus 'X', '[CLS]', '[SEP]' and 'PAD'; confirm against the tag set).

    Returns:
        The model. It is moved to the GPU when CUDA is available and wrapped
        in ``torch.nn.DataParallel`` when more than one GPU is visible;
        otherwise it stays on the CPU.
    """
    bert_out_address = BASE_FOLDER + model_name

    model = BertForTokenClassification.from_pretrained(bert_out_address,
                                                       num_labels=num_labels)

    # Query CUDA here rather than reading the notebook-level `n_gpu`, and do
    # not call .cuda() on a machine that has no GPU.
    if torch.cuda.is_available():
        model.cuda()
        if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)

    return model

In [None]:
def inference_sap_bert(test_sentence, inference_model, tokenizer=None):
    """Run NER inference on one sentence and return a Token/Label DataFrame.

    Args:
        test_sentence: raw sentence; it is lower-cased before tokenization.
        inference_model: token-classification model to run.
        tokenizer: optional tokenizer to reuse across calls. When omitted,
            'bert-base-cased' is loaded, as it used to be on every call.

    Returns:
        DataFrame with columns "Token" and "Label", one row per word
        (word pieces are merged back into their word) with the '[CLS]' and
        '[SEP]' rows removed. A word carries the label of its first piece.
        Labels are looked up in the module-level ``tag_values``.

    NOTE(review): the input is lower-cased although the tokenizer is the
    *cased* BERT vocabulary; confirm this matches how the training data was
    prepared.
    """
    model = inference_model

    test_sentence = test_sentence.lower()

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    tokenized_sentence = tokenizer.encode(test_sentence)

    # Put the input on the model's own device instead of forcing CUDA, so
    # inference also works on a CPU-only machine.
    device = next(model.parameters()).device
    input_ids = torch.tensor([tokenized_sentence]).to(device)

    with torch.no_grad():
        output = model(input_ids)
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

    # Join the '##' continuation pieces back onto the preceding token.
    tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices[0]):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(tag_values[label_idx])
            new_tokens.append(token)

    df = pd.DataFrame({"Token": new_tokens, "Label": new_labels})

    # Drop the special tokens added by the tokenizer.
    prediction_df = df[~df['Token'].isin(['[CLS]', '[SEP]'])]

    return prediction_df

def inference_sap_bert_to_list(test_sentence, inference_model):
    """Return the inference_sap_bert() result as a list of [token, label] rows."""
    return inference_sap_bert(test_sentence, inference_model).values.tolist()
    

In [None]:
# # UNIT TESTING (SGD)
# # OTHER SAMPLE SENTENCES:
# test_sentence = 'should be presented sap cloud platform sap_hana backup data ssp backup data etc databackup sstadm sap hana backup prod stop hybris process on all hybris app servers incl'
# test_sentence = 'ng div sap_hana div li li href shop by cloud platform sb cloud img class solution tile icon emptyprimaryimage src medias sap hana_cloud_platform icon'

# test_sentence = 'as sap gold partner delaware consulting and sap work together to implement sap s4hana cloud for rehau which is the first s/4hana live customer in china' 

# # test_sentence = 'the company says that the combination of sap cloud platform with sap business technology platform is all about connecting business processes and experiences so asug members can make confident decisions with integrity'

# inference_sap_bert(test_sentence, model)   

### Language detection:

In [None]:
# # LANGUAGE DETECTION WITH SPACY:
# !pip install spacy-langdetect
# !python -m spacy download en

# import spacy
# from spacy_langdetect import LanguageDetector
# nlp = spacy.load('en')
# nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [None]:
# def language_detection(text):
#     doc = nlp(text)
#     # document level language detection. Think of it like average language of the document!
#     #print(doc._.language)
#     language_score = doc._.language
#     # sentence level language detection
# #     for sent in doc.sents:
# #         print(sent, sent._.language)
#     return language_score

# text = 'This is an english text.'
# language_score = language_detection(text)
# language_score

In [None]:
# def english_yes_no(language_score):
#     language = language_score['language']
#     score = language_score['score']
    
#     if language == 'en' and score > 0.80:
#         veredict = 'yes'
#     else:
#         veredict = 'no'
#     return veredict

# print(english_yes_no(language_score))

In [None]:
# UNIT TESTING:

# filename='scw_01-23_sa_v6.csv'
# max_len=256
# bs=16
# test_size=0.2
# base_folder='training-datasets/'
# BASE_FOLDER='training-datasets/'

# data = pd.read_csv(BASE_FOLDER+filename,sep=",",encoding="latin1").fillna(method='ffill')    
    
# print('data loaded from: ', filename)
# print(data.describe())
# print(data.Tag.value_counts())
# print('-'*20)
# sentences, sentences_sbw, labels = turn_BIO_data_into_sentences(data)
# tags_vals, tag_values, tag2idx, idx2tag, tag2name = tag_values_tag2idx_idx2tag_tag2name(data)

#### Get Train/Validation Dataset (Tokenized, Tensors and DataLoaders)

In [24]:
## unit testing:
# BASE_FOLDER = 'training-datasets/'
# filename = 'scw_01-23_sa_v6.csv'

# data = pd.read_csv(BASE_FOLDER+filename,sep=",",encoding="latin1").fillna(method='ffill')    
# data[:5]

In [None]:
# BASE_FOLDER = 'training-datasets/'
# filename = '2021-03-21_14-02-50_01-272_INFER_PL1.csv'

# data = pd.read_csv(BASE_FOLDER+filename,sep=",",encoding="latin1").fillna(method='ffill')    
# data
# data.describe()
# print('data loaded from: ', filename)
# print(data.describe())
# print(data.Tag.value_counts())

In [None]:
def get_train_validation_dataset(filename, max_len, bs, test_size, base_folder):
    """Load a BIO CSV and build the train/validation tensors and DataLoaders.

    Args:
        filename: CSV file name inside ``base_folder``.
        max_len: length every sequence is padded or truncated to. It is now
            honoured; it used to be overwritten with 256.
        bs: batch size of both DataLoaders. It is now honoured; it used to be
            overwritten with 16.
        test_size: fraction of the sentences held out for validation.
        base_folder: folder prefix, concatenated as-is with ``filename``. An
            empty value falls back to 'training-datasets/'.

    Returns:
        A 16-tuple: ``(train_data, train_sampler, train_dataloader,
        valid_data, valid_sampler, valid_dataloader, tag2idx, tag_values,
        input_ids, tags, attention_masks, val_inputs, tag2name, idx2tag,
        val_masks, val_tags)``.
    """
    BASE_FOLDER = base_folder or 'training-datasets/'

    # Forward-fill empty cells from the row above.
    data = pd.read_csv(BASE_FOLDER + filename, sep=",", encoding="latin1").ffill()

    print('data loaded from: ', filename)
    print(data.describe())
    print(data.Tag.value_counts())

    sentences, sentences_sbw, labels = turn_BIO_data_into_sentences(data)

    tags_vals, tag_values, tag2idx, idx2tag, tag2name = tag_values_tag2idx_idx2tag_tag2name(data)

    # TOKENIZE TEXTS AND LABELS:
    # No GPU is queried here any more: the old call to
    # torch.cuda.get_device_name(0) failed on CPU-only machines and its
    # result was never used.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

    tokenized_texts, tokenized_labels, word_piece_labels, input_ids, tags, attention_masks = tokenize_texts_and_labels(tokenizer, max_len, sentences, labels, tag2idx)

    # SPLIT TRAINING / VALIDATION DATASET:
    # One call splits the three arrays with the same indices. This matches
    # the two separate calls used before, which shared the same random_state.
    (tr_inputs, val_inputs,
     tr_tags, val_tags,
     tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                             random_state=2018, test_size=test_size)

    # CONVERT TO TORCH TENSORS
    tr_inputs = torch.tensor(tr_inputs)
    val_inputs = torch.tensor(val_inputs)
    tr_tags = torch.tensor(tr_tags)
    val_tags = torch.tensor(val_tags)
    tr_masks = torch.tensor(tr_masks)
    val_masks = torch.tensor(val_masks)

    # DEFINE DATALOADERS:
    # Training data is shuffled with the RandomSampler; validation data is
    # passed sequentially with the SequentialSampler.
    train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

    valid_data = TensorDataset(val_inputs, val_masks, val_tags)
    valid_sampler = SequentialSampler(valid_data)
    valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

    return train_data, train_sampler, train_dataloader, valid_data, valid_sampler, valid_dataloader, tag2idx, tag_values, input_ids, tags, attention_masks, val_inputs, tag2name, idx2tag, val_masks, val_tags

In [None]:
# STOP

# EXECUTION

In [None]:
execution_mode = execution_with_breakpoints('no')  # 'yes'or 'no'

In [None]:
check_breakpoint(execution_mode) # insert to set breakpoint check

#### Import libraries for retraining

In [None]:
# !pip install tensorflow --quiet
# !pip install keras --quiet
# from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# from keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split
# import pandas as pd
# import math
# import numpy as np
# from seqeval.metrics import f1_score
# from seqeval.metrics import classification_report,accuracy_score,f1_score
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
# from tqdm import tqdm,trange

#### Set GPUs:

In [None]:
# Set GPUs: use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

n_gpu = torch.cuda.device_count()
# Cell output: the name of the first GPU. Only ask for it when a GPU exists,
# because torch.cuda.get_device_name(0) raises on a CPU-only machine.
torch.cuda.get_device_name(0) if n_gpu > 0 else 'cpu (no CUDA device available)'

### Set Execution parameters (moved to the top of the notebook)

In [None]:
# # EXECUTION PARAMETERS:

# #-----------------------------
# # base folder:
# #-----------------------------
# base_folder = 'training-datasets/'


# #-----------------------------
# # dataset source files:
# #-----------------------------
# filename_sa = 'scw_01-23_sa_v6.csv'


# # filename_da = '2021-03-21_14-02-50_01-272_INFER_PL1.csv'
# # filename_da = 'scw_01_23_da.csv'
# # filename_da = 'scw_24_49_da.csv'
# # filename_da = 'scw_50_99_da.csv'
# # filename_da = 'scw_100_149_da.csv'
# # filename_da = 'scw_1-149_220-272_da.csv'
# # filename_da = 'scw_nhve_1-149_220-272_da.csv'
# # filename_da = 'scw_220_272_da.csv'
# filename_da = '2021-03-22_01-22-44_nhve_scw_220_272_INFER.csv'

# #-----------------------------
# # select datasources for current execution:
# #-----------------------------
# training_source = 'sa'
# validation_source = 'da'

# #-----------------------------
# # SAVE MODEL 
# #-----------------------------
# save_model = 'no'
# # save_model = 'yes'

# #-----------------------------
# # TRAINING PARAMETERS
# #-----------------------------
# max_len = 256   # Max length:
# bs = 16         # Batch size: 2, 4, 8, 16
# batch_num = bs  # Batch Size:
# test_size = 0.3 # Validation/Test Split:
# lr = 5e-5       # Learning rate: 1e-5, 2e-5, 3e-5, 5e-5 
# eps = 1e-8      # EPS
# epochs = 3     # Epochs: 2,3,4,5

# #-----------------------------
# # Set Finetunning depth:
# #-----------------------------
# FULL_FINETUNING = True   # True: fine tuning all the layers 
# # FULL_FINETUNNING = False # False: only fine tuning the classifier layers

# #-----------------------------
# # MODEL TRAIN CODE VERSION: 
# #-----------------------------
# # train_code = 'OLD_CODE'
# train_code = 'NEW_CODE'

# #-----------------------------
# # SET MODEL SOURCE
# #-----------------------------
# MODEL_SOURCE = 'Transformers'
# selected_model = ''

# # MODEL_SOURCE = 'Saved_model'
# # selected_model = '2021-03-21_14-02-50.checkpoint'

# #-----------------------------
# # TRAIN MODEL:
# #-----------------------------
# TRAIN_MODEL = 'yes'
# # TRAIN_MODEL = 'no'

# #-----------------------------
# # GET TRAINING DATA
# #-----------------------------
# get_training_data = 'yes'
# # get_training_data = 'no'

### In case of loading a saved model, select the model to be loaded:

In [None]:
# if MODEL_SOURCE == 'Saved_model':
#     # LAST MODEL SAVED:
#     last_saved_model = get_latest_saved_model()
#     last_saved_model = last_saved_model[:19]+'.checkpoint'
#     print('Last saved model: ')
#     print(last_saved_model)

#     # ALL MODELS SAVED:
#     MODELS_FOLDER = 'models_saved/'
#     print('All saved models: ')
#     !ls "{MODELS_FOLDER}"

### Print execution parameters:

In [None]:
# print("Execution Summary:")
# print("Datetime: ", get_current_datetime())
# print("SUPERVISED Annotated Dataset: ", filename_sa)
# print("DISTANT Annotated Dataset: ", filename_da)
# print("Training data source: ", training_source)
# print("Validation data source: ", validation_source)
# print('train/valid. split: ',test_size)
# print('save_model: ', save_model)
# print('Max length:' , max_len)
# print('Batch Size: ', bs )
# print('batch_num: ', bs)
# print('learning_rate: ', lr)
# print('eps: ', eps)
# print('Epochs: ', epochs)
# print('FULL_FINETUNING', FULL_FINETUNING)
# print('MODEL_SOURCE', MODEL_SOURCE)
# print('selected_model', selected_model)
# print('Train_code: ', train_code)
# print('Train model: ', TRAIN_MODEL)
# print('get_training_data', get_training_data)

# 1) LOAD TRAINING/VALIDATION DATASET

## 1.a) from SUPERVISED Annotated Dataset

In [None]:
# Load the train/validation data built from the SUPERVISED annotated dataset.
# The helper returns 16 values; the first six are kept under "_sa" names, the
# remaining ten are bound to shared (un-suffixed) notebook names.
if get_training_data == 'yes':
    (
        train_data_sa, train_sampler_sa, train_dataloader_sa,
        valid_data_sa, valid_sampler_sa, valid_dataloader_sa,
        tag2idx, tag_values, input_ids, tags, attention_masks,
        val_inputs, tag2name, idx2tag, val_masks, val_tags,
    ) = get_train_validation_dataset(
        filename_sa, max_len, bs, test_size, base_folder
    )

In [None]:
# Notebook echo: display the arguments passed to get_train_validation_dataset
# for the supervised dataset.
filename_sa, max_len, bs, test_size, base_folder

## 1.b) from DISTANT Annotated Dataset

In [None]:
# Load the train/validation data built from the DISTANT annotated dataset.
# NOTE: the ten un-suffixed names (tag2idx, tag_values, input_ids, ...) are
# rebound here, replacing the values set by the supervised-dataset cell.
if get_training_data == 'yes':
    (
        train_data_da, train_sampler_da, train_dataloader_da,
        valid_data_da, valid_sampler_da, valid_dataloader_da,
        tag2idx, tag_values, input_ids, tags, attention_masks,
        val_inputs, tag2name, idx2tag, val_masks, val_tags,
    ) = get_train_validation_dataset(
        filename_da, max_len, bs, test_size, base_folder
    )

In [None]:
# Breakpoint hook. check_breakpoint is defined outside this section; presumably
# it stops the notebook run here depending on execution_mode — TODO confirm.
check_breakpoint(execution_mode) # insert to set breakpoint check

# 2) COMBINE FINAL DATASET

In [None]:
# COPY THE CORRESPONDING DATASETS TO THE FINAL CONTAINERS/DATALOADERS:
# training_source selects the training split and validation_source selects the
# validation split; each must be 'sa' (supervised) or 'da' (distant).
if get_training_data == 'yes':

    if training_source == 'da':
        train_data = train_data_da
        train_sampler = train_sampler_da
        train_dataloader = train_dataloader_da
    elif training_source == 'sa':
        train_data = train_data_sa
        train_sampler = train_sampler_sa
        train_dataloader = train_dataloader_sa
    else:
        print('please enter a valid training_source, either da or sa')

    # The validation split is chosen by validation_source. It was previously
    # keyed on training_source, which made validation_source a no-op and always
    # validated on the same dataset that was used for training.
    if validation_source == 'da':
        valid_data = valid_data_da
        valid_sampler = valid_sampler_da
        valid_dataloader = valid_dataloader_da
    elif validation_source == 'sa':
        valid_data = valid_data_sa
        valid_sampler = valid_sampler_sa
        valid_dataloader = valid_dataloader_sa
    else:
        print('please enter a valid validation_source, either da or sa')

In [None]:
# Breakpoint hook. check_breakpoint is defined outside this section; presumably
# it stops the notebook run here depending on execution_mode — TODO confirm.
check_breakpoint(execution_mode) # insert to set breakpoint check

# 3) IMPORT MODEL (from transformers library or a Saved Model)

In [None]:
# Build the token-classification model, either from the pretrained
# "bert-base-cased" weights or from a checkpoint saved by this notebook.
from transformers import BertForTokenClassification, AdamW

if MODEL_SOURCE == 'Transformers':
    model = BertForTokenClassification.from_pretrained(
        "bert-base-cased",
        num_labels=len(tag2idx),
        output_attentions=False,
        output_hidden_states=False)

elif MODEL_SOURCE == 'Saved_model':
    import pickle

    # The tag mappings are stored next to the checkpoint and share its first
    # 19 characters (the timestamp prefix of the file name).
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by this notebook or come from a trusted source.
    _mapping_prefix = MODELS_FOLDER + selected_model[:19] + '_'

    # tag2idx
    with open(_mapping_prefix + 'tag2idx', 'rb') as file:
        tag2idx = pickle.load(file)
    print(tag2idx)

    # idx2tag
    with open(_mapping_prefix + 'idx2tag', 'rb') as file:
        idx2tag = pickle.load(file)
    print(idx2tag)

    # tag2name
    with open(_mapping_prefix + 'tag2name', 'rb') as file:
        tag2name = pickle.load(file)
    print(tag2name)

    # The classifier size is taken from the saved tag2idx (loaded above)
    # instead of a hard-coded 7, so it matches whatever label set the
    # checkpoint was trained with.
    model = BertForTokenClassification.from_pretrained(
        "bert-base-cased",
        num_labels=len(tag2idx),
        output_attentions=False,
        output_hidden_states=False)

    # Overwrite the pretrained weights with the saved state_dict.
    PATH = MODELS_FOLDER + selected_model
    checkpoint = torch.load(PATH)
    model.load_state_dict(checkpoint)

else:
    # No model is created in this case; the following cells will fail.
    print('please enter a valid MODEL_SOURCE, either Transformers or Saved_model')

In [None]:
# # Function for loading model checkpoint, optimizer and loss:
# def load_ckp(checkpoint_fpath, model, optimizer):
#     """
#     checkpoint_path: path to save checkpoint
#     model: model that we want to load checkpoint parameters into       
#     optimizer: optimizer we defined in previous training
#     """
#     # load check point
#     checkpoint = torch.load(checkpoint_fpath)
#     # initialize state_dict from checkpoint to model
#     model.load_state_dict(checkpoint['state_dict'])
#     # initialize optimizer from checkpoint to optimizer
#     optimizer.load_state_dict(checkpoint['optimizer'])
#     # initialize valid_loss_min from checkpoint to valid_loss_min
#     valid_loss_min = checkpoint['valid_loss_min']
#     # return model, optimizer, epoch value, min validation loss 
#     return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

In [None]:
# Pass the model parameters to the GPU.
# NOTE(review): .cuda() needs a CUDA device, so this cell presumably fails on a
# CPU-only machine — confirm whether a CPU fallback is wanted here.
# The trailing ';' suppresses the notebook echo of the returned model.
model.cuda();

In [None]:
# Breakpoint hook. check_breakpoint is defined outside this section; presumably
# it stops the notebook run here depending on execution_mode — TODO confirm.
check_breakpoint(execution_mode) # insert to set breakpoint check

# 4) Set-up Transformer Optimization:
https://huggingface.co/transformers/_modules/transformers/optimization.html

### a) Set Full_finetuning

In [None]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
# FULL_FINETUNING = True

### b) Set parameters and Instantiate optimizer

In [None]:
# Build the optimizer parameter groups and instantiate AdamW.
# lr and eps come from the execution-parameters cell.

if FULL_FINETUNING:
    # Fine-tune every layer. Biases and LayerNorm weights are excluded from
    # weight decay; all other parameters are decayed.
    param_optimizer = list(model.named_parameters())
    # 'LayerNorm.weight' is the parameter name used by the Hugging Face BERT
    # implementation; 'gamma'/'beta' are kept for older checkpoints that still
    # use the original TensorFlow-style names.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # The per-group key must be 'weight_decay': AdamW does not read
        # 'weight_decay_rate', so with that key no decay was ever applied.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Only fine-tune the classification head.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# Instantiate Optimizer:
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=lr,
    eps=eps
)

### c) Add Linear LR Schedule with warm-up:

In [None]:
# Scheduler to linearly reduce the learning rate throughout the epochs
from transformers import get_linear_schedule_with_warmup

# get_linear_schedule_with_warmup: creates a schedule in which the LR
# (learning rate) first increases linearly from 0 to the initial LR set in the
# optimizer during the warm-up period, and then decreases linearly to 0 over
# the remaining training steps. With num_warmup_steps=0 (below) there is no
# warm-up phase, so the LR only decays.

# Maximum gradient norm; read by the gradient-clipping call in the training loop.
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

### d) Get Model Parameters:

In [None]:
# Collect the model's (name, tensor) parameter pairs and print the name and
# shape of three slices of that list under the headers below.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

for header, section in (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
):
    print(header)
    for p in section:
        shape_text = str(tuple(p[1].size()))
        print("{:<55} {:>12}".format(p[0], shape_text))

In [None]:
# Breakpoint hook. check_breakpoint is defined outside this section; presumably
# it stops the notebook run here depending on execution_mode — TODO confirm.
check_breakpoint(execution_mode) # insert to set breakpoint check

In [None]:
# stop

## 4) TRAIN/ Fine-Tune BERT model for NER

In [None]:
# Fine-tune the model: one training pass and one validation pass per epoch.
# See the Transformers model-output documentation:
# https://huggingface.co/transformers/main_classes/output.html

# Average loss per epoch, kept for the learning-curve plot. Initialised outside
# the TRAIN_MODEL guard so the plotting cell still finds both lists when
# training is skipped.
loss_values, validation_loss_values = [], []

if TRAIN_MODEL == 'yes':

    for _ in trange(epochs, desc="Epoch"):
        # ========================================
        #               Training
        # ========================================
        # Put the model into training mode and reset this epoch's loss.
        model.train()
        total_loss = 0

        for batch in train_dataloader:
            # Move the batch to the selected device.
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            # Always clear previously calculated gradients before a backward pass.
            model.zero_grad()

            # Forward pass. Because labels are provided, the output carries
            # the classification loss in addition to the logits.
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss

            # Backward pass to calculate the gradients.
            loss.backward()
            total_loss += loss.item()

            # Clip the gradient norm to help prevent "exploding gradients".
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

            # Update the parameters, then the learning rate.
            optimizer.step()
            scheduler.step()

        # Average loss over the training batches of this epoch.
        avg_train_loss = total_loss / len(train_dataloader)
        print("Average train loss: {}".format(avg_train_loss))
        loss_values.append(avg_train_loss)

        # ========================================
        #               Validation
        # ========================================
        # After each training epoch, measure performance on the validation set.
        model.eval()
        eval_loss = 0
        predictions, true_labels = [], []

        for batch in valid_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            # No gradients are needed for validation; this saves memory and time.
            with torch.no_grad():
                # Labels are provided, so the output contains both the loss
                # and the logits.
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)

            # Move logits and labels to CPU.
            logits = outputs.logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            eval_loss += outputs.loss.mean().item()
            # Predicted tag index per token = argmax over the label axis.
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.extend(label_ids)

        eval_loss = eval_loss / len(valid_dataloader)
        validation_loss_values.append(eval_loss)
        print("Validation loss: {}".format(eval_loss))

        # Token-level accuracy, ignoring positions whose true tag is "PAD".
        pred_tags = [tag_values[p_i]
                     for pred_row, true_row in zip(predictions, true_labels)
                     for p_i, l_i in zip(pred_row, true_row)
                     if tag_values[l_i] != "PAD"]
        valid_tags = [tag_values[l_i]
                      for true_row in true_labels
                      for l_i in true_row
                      if tag_values[l_i] != "PAD"]
        print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
        print()

In [None]:
# Print one line per execution parameter, as "<caption> <value>".
print("Execution Summary:")
for _caption, _setting in (
    ("Datetime: ", get_current_datetime()),
    ("SUPERVISED Annotated Dataset: ", filename_sa),
    ("DISTANT Annotated Dataset: ", filename_da),
    ("Training data source: ", training_source),
    ("Validation data source: ", validation_source),
    ('train/valid. split: ', test_size),
    ('save_model: ', save_model),
    ('Max length:', max_len),
    ('Batch Size: ', bs),
    ('batch_num: ', bs),
    ('learning_rate: ', lr),
    ('eps: ', eps),
    ('Epochs: ', epochs),
    ('FULL_FINETUNING', FULL_FINETUNING),
    ('MODEL_SOURCE', MODEL_SOURCE),
    ('selected_model', selected_model),
    ('Train_code: ', train_code),
    ('get_training_data', get_training_data),
):
    print(_caption, _setting)

In [None]:
# print("Execution Summary:")
# print("Datetime: ")
# print("SUPERVISED Annotated Dataset: ")
# print("DISTANT Annotated Dataset: ")
# print("Training data source: ")
# print("Validation data source: ")
# print('train/valid. split: ')
# print('save_model: ')
# print('Max length:' )
# print('Batch Size: ')
# print('batch_num: ')
# print('learning_rate: ')
# print('eps: ')
# print('Epochs: ')
# print('MODEL_SOURCE')
# print('selected_model')
# print('Train_code: ')
# print('get_training_data')

In [None]:
# print("Execution Summary:")
# print(get_current_datetime())
# print(filename_sa)
# print(filename_da)
# print(training_source)
# print(validation_source)
# print(test_size)
# print(save_model)
# print(max_len)
# print(bs )
# print(bs)
# print(lr)
# print(eps)
# print(epochs)
# print(FULL_FINETUNING)
# print(MODEL_SOURCE)
# print(selected_model)
# print(train_code)
# print(get_training_data)
# print(train_code)

### Plot Training Loss

In [None]:
# Plot the per-epoch training and validation losses collected by the training
# cell (plot_training_validation_loss is defined outside this section).
plot_training_validation_loss(loss_values, validation_loss_values)

In [None]:
# Breakpoint hook. check_breakpoint is defined outside this section; presumably
# it stops the notebook run here depending on execution_mode — TODO confirm.
check_breakpoint(execution_mode) # insert to set breakpoint check

# 5) EVALUATE Model

In [None]:
# MODEL EVALUATION: (FROM V.2)
# Runs the model over the validation dataloader and collects, per sentence, the
# true and predicted tag names (y_true / y_pred) for the sequence-level report.

model.eval()

# Not used in this cell; kept as notebook-level names.
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

bert_out_address = "models_evaluation/"
datetime = get_current_datetime()
filename_prefix = str(datetime)+'_'

print("***** Running evaluation *****")
# The dataloaders are built with `bs`, so that is the batch size reported here
# (the former `batch_num` name is not defined in this notebook section).
print("  Batch size = {}".format(bs))

# Positions whose TRUE tag is one of these are left out of the comparison.
_ignored_tags = ("X", "[CLS]", "[SEP]")

for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
                        attention_mask=input_mask,)
        # No labels are passed, so the first element of outputs is the logits.
        logits = outputs[0]

    # Predicted tag index per token. Softmax is monotonic, so the argmax of the
    # raw logits equals the argmax of their log-softmax.
    logits = torch.argmax(logits, dim=2)
    logits = logits.detach().cpu().numpy()

    # True tag indices.
    label_ids = label_ids.to('cpu').numpy()

    # Attention mask: a zero marks a padding position.
    input_mask = input_mask.to('cpu').numpy()

    for i, mask in enumerate(input_mask):
        sentence_true = []
        sentence_pred = []

        for j, m in enumerate(mask):
            # Stop at the first masked-out (padding) position.
            if not m:
                break
            true_tag = tag2name[label_ids[i][j]]
            if true_tag not in _ignored_tags:
                sentence_true.append(true_tag)
                sentence_pred.append(tag2name[logits[i][j]])

        y_true.append(sentence_true)
        y_pred.append(sentence_pred)

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)

### Print Evaluation Results

In [None]:
# Print Evaluation Results
print("***** Evaluation results *****")
print("\ndatetime: ", datetime)
print("  Num eval. predictions ={}".format(len(y_pred)))        
print("f1 score: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
print("\n%s"%(report))

In [None]:
# FLATTEN PREDICTIONS AND TRUE LABELS:
y_true_flat, y_pred_flat = flatten_true_pred_labels(y_true, y_pred)

In [None]:
# Per-tag report on the flattened (token-level) labels.
target_names = ['B-PROD', 'I-PROD', 'O']

# Import under an alias: the evaluation cell calls `classification_report` on
# nested per-sentence tag lists, and rebinding that name to scikit-learn's
# function here would break that cell when it is re-run.
from sklearn.metrics import classification_report as sk_classification_report

print(sk_classification_report(y_true_flat, y_pred_flat, target_names=target_names, digits=6))

### GET CONFUSION MATRIX:

In [None]:

# Plot the confusion matrix of the flattened labels
# (plot_matrix is defined outside this section).
plot_matrix(y_true_flat, y_pred_flat)

In [None]:
# Compute the confusion matrix and its class labels from the flattened labels
# (get_confusion_matrix is defined outside this section), then print both.
# conf_matrix is reused by the cells that save and print the evaluation metrics.
conf_matrix, class_names = get_confusion_matrix(y_true_flat, y_pred_flat)

print(class_names)

print(conf_matrix)

In [None]:
# Breakpoint hook. check_breakpoint is defined outside this section; presumably
# it stops the notebook run here depending on execution_mode — TODO confirm.
check_breakpoint(execution_mode) # insert to set breakpoint check

### Understanding 'y_pred' and 'y_true'

In [None]:
# Show the true and predicted tag sequences of one evaluated sentence (index n).
n = 5
print('y_true: ',y_true[n])
print('y_pred: ', y_pred[n])

# 6) SAVE Fine-Tuned Model, parameters, eval. results

In [None]:
# Breakpoint hook. check_breakpoint is defined outside this section; presumably
# it stops the notebook run here depending on execution_mode — TODO confirm.
check_breakpoint(execution_mode) # insert to set breakpoint check

### 6.1) SAVE  Trained Model checkpoint to file

We want to save a checkpoint that lets us resume model training later. A complete checkpoint needs the following information (note that the cell below currently stores only the model's `state_dict`):
- epoch: a measure of the number of times all of the training vectors are used once to update the weights.
- valid_loss_min: the minimum validation loss, this is needed so that when we continue the training, we can start with this rather than np.Inf value.
- state_dict: model architecture information. It includes the parameter matrices for each of the layers.
- optimizer: You need to save optimizer parameters especially when you are using Adam as your optimizer. Adam is an adaptive learning rate method, which means, it computes individual learning rates for different parameters which we would need if we want to continue our training from where we left off [2].

In [None]:
# Write the model's state_dict to models_saved/<timestamp>.checkpoint.
# The timestamp is made file-name friendly: ':' becomes '-' and ' ' becomes '_'.
BASE_FOLDER = 'models_saved/'
datetime = get_current_datetime().replace(':', '-').replace(' ', '_')

# PATH (without extension) is reused by the cells that write the parameter and
# evaluation files.
PATH = BASE_FOLDER + str(datetime)

torch.save(model.state_dict(), PATH + '.checkpoint')

In [None]:
# Notebook echo: display the checkpoint path prefix (without extension).
PATH

### 6.2) SAVE tag2idx, idx2tag, tag2name to file

In [None]:
# SAVE tag2idx, idx2tag, tag2name to file
# Each mapping is pickled to models_saved/<datetime>_<mapping name>.

import pickle

MODELS_FOLDER = 'models_saved/'

for _mapping_name, _mapping in (
    ('tag2idx', tag2idx),
    ('idx2tag', idx2tag),
    ('tag2name', tag2name),
):
    # The context manager closes the file even if pickling fails.
    with open(MODELS_FOLDER+datetime+'_'+_mapping_name, 'wb') as file:
        pickle.dump(_mapping, file)

In [None]:
# OPEN saved file (test): read tag2idx back from disk and display it.
# The context manager closes the file handle, which was previously left open.
with open(MODELS_FOLDER+datetime+'_'+'tag2idx', 'rb') as file:
    tag2idx = pickle.load(file)
tag2idx

### 6.3) SAVE Training Parameters to file

In [None]:
# Write the execution parameters of this run to
# models_saved/<datetime>_parameters as plain text.
BASE_FOLDER = 'models_saved/'

file = os.path.join(BASE_FOLDER, datetime+'_parameters')
with open(file, "w") as writer:
    # The pieces are written in order, exactly as listed; every value that is
    # not already text is converted with str().
    writer.writelines([
        '\n--------------------------------------------------------------',
        "\nExecution Summary:   ",
        '\n--------------------------------------------------------------',
        "\nDatetime: ",
        get_current_datetime(),
        "\nModel_name (generated): ",
        PATH,
        '\n--------------------------------------------------------------',
        '\nMODEL_SOURCE: ',
        MODEL_SOURCE,
        '\nselected_model: (if source= saved_model):  ',
        selected_model,
        "\n\nSupervised Annotated Dataset: ",
        filename_sa,
        "\nDistant Annotated Dataset: ",
        filename_da,
        "\n\nTraining data source: ",
        training_source,
        "\nValidation data source: ",
        validation_source,
        '\n--------------------------------------------------------------',
        '\nMax length: ',
        str(max_len),
        '\nBatch Size: ',
        str(bs),
        '\nLearning_rate: ',
        str(lr),
        '\nEPS: ',
        str(eps),
        '\nEpochs: ',
        str(epochs),
        '\n--------------------------------------------------------------',
    ])

### 6.4) SAVE Evaluation Results to file

In [None]:
# Write the evaluation metrics of this run to
# models_saved/<datetime>_eval_metrics as plain text: model name, F1 and
# accuracy scores, the classification report and the first three rows of the
# confusion matrix. (The "f1 socre" label is kept exactly as it has always
# been written to the file.)
file = os.path.join(BASE_FOLDER, datetime+'_eval_metrics')
with open(file, "w") as writer:
    writer.writelines([
        "\nModel_name: ",
        PATH,
        "\n\nEVALUATION METRICS: ",
        "\nf1 socre:  ",
        str(f1_score(y_true, y_pred)),
        "\nAccuracy score:  ",
        str(accuracy_score(y_true, y_pred)),
        "\n\n",
        report,
        "\nConfusion Matrix:  ",
        '\nB-PROD: B-I-O ',
        str(conf_matrix[0]),
        '\nI-PROD: B-I-O ',
        str(conf_matrix[1]),
        '\nO     : B-I-O ',
        str(conf_matrix[2]),
    ])

In [None]:
# Notebook echo: display the checkpoint path prefix (without extension).
PATH

In [None]:
# Print the first three confusion-matrix rows, one per true tag.
for _row_index, _row_caption in enumerate(
    ('B-PROD: B-I-O ', 'I-PROD: B-I-O ', 'O     : B-I-O ')
):
    print(_row_caption, str(conf_matrix[_row_index]))
      

# 7) INFERENCE model (unit testing)

In [None]:
# Select the compute device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Querying the GPU name raises on a CPU-only machine, so only do it when a
# CUDA device is actually present.
if torch.cuda.is_available():
    print('Using GPU:', torch.cuda.get_device_name(0))

# Move the model to the selected device. model.cuda() would fail on a CPU-only
# machine even though `device` above already falls back to "cpu".
model.to(device);

In [None]:
# Inference model: run the model on a single hand-written sentence.
# The commented-out lines are alternative test sentences; un-comment one (and
# comment the active one) to try a different input.
# test_sentence = 'you can get a complete overview of all applications delivered with mss wda in the sap library for sap erp on sap help portal at sap erp enhancement packages erp central component shared services manager self service manager self service wda applications'
# test_sentence = 'common object layer brim billing and revenue innovation management ccm cross catalog mapping sap cc sap convergent charging system sap ci sap convergent invoicing smt subscriber mapping table srt subscriber range table odi order distribution'
# test_sentence = 'this article is related to hana'
test_sentence = 'sap erp is the best erp'

# inference_sap_bert is defined outside this section; presumably it tokenizes
# the sentence and shows the predicted tag per token — TODO confirm.
inference_sap_bert(test_sentence, model)