# LOAD saved model & INFERENCE

This notebook loads a trained sapxBERT_NER model and runs inference on the sentences of a selected annotated dataset.

input parameters:
- last_saved_model => the checkpoint file of the saved model to run inference with
- filename => a BIO annotated dataset

output parameters:
- .csv file ("models_inference_true_pred/" + model_name + filename_main + "_true_pred.csv")
- fields: true_token, true_label, pred_token, pred_label, sentence


# 0) EXECUTION PARAMETERS:

### a) Run to see all models available in the folder:

In [None]:
# ALL MODELS SAVED:
# Folder holding the saved checkpoints; later cells build file paths by
# plain string concatenation, so the trailing slash is required.
MODELS_FOLDER = 'models_saved/'
print('All saved models: ')
# Notebook shell escape: list the folder contents.
!ls "{MODELS_FOLDER}"

### b) Execution parameters:

In [None]:
# EXECUTION PARAMETERS:

# Checkpoint file (inside MODELS_FOLDER) of the model to load.
last_saved_model = '2021-03-22_01-22-44.checkpoint' #=2021-03-18_02-35-03 retrained 
# last_saved_model = '2021-03-21_14-02-50.checkpoint'

# BIO-annotated dataset to run through the model.
# filename = 'scw_1-149_220-272_da.csv'
# filename = 'scw_01-23_sa_v6.csv'
# filename = 'scw_220_272_da.csv'
filename = 'nhve_scw_220_272_da.csv'

# Short dataset tag used in the name of the output CSV: the first 12
# characters of `filename` (the rest, including the extension, is cut off).
filename_main = filename[:12]
filename, filename_main

# 1) LOAD Model Checkpoint

### Import Libraries

In [None]:
print('installing transformers')
!pip install --upgrade transformers==4.2.2 --quiet
# NOTE(review): this second install carries no version pin; presumably it
# only adds the sentencepiece extra - confirm it does not replace 4.2.2.
!pip install transformers[sentencepiece] --quiet #transformers v4.x --> --quiet

print('importing packages')
import torch
from transformers import BertModel, BertTokenizer, BertForTokenClassification, BertConfig
from transformers import AutoTokenizer
# Notebook-level tokenizer; several functions further down read this global.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

import numpy as np
import pandas as pd

from transformers import BertForTokenClassification, AdamW
# Base model with a 7-way token-classification head; its weights are
# overwritten later by the state dict loaded from the selected checkpoint.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    #num_labels=len(tag2idx),
    num_labels= 7,
    output_attentions = False,
    output_hidden_states = False)
print('done')

In [None]:
def get_latest_saved_model():
    """Return the file name of the most recently created saved-model file.

    Scans ``models_saved/`` for entries whose name starts with ``2021`` and
    picks the one with the greatest ``os.path.getctime`` value.

    Returns:
        str: Base name (without the folder part) of the newest matching entry.

    Raises:
        ValueError: If nothing in ``models_saved/`` matches the pattern.
    """
    import glob
    import os.path

    candidates = glob.glob('models_saved/2021*')
    if not candidates:
        # max() on an empty list raises an opaque ValueError; keep the
        # exception type but say what is actually missing.
        raise ValueError("no saved model matching 'models_saved/2021*' was found")

    latest_file = max(candidates, key=os.path.getctime)
    return os.path.basename(latest_file)

### Set GPUs 

In [None]:
# Set GPUs
# Use CUDA when it is available and fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    # get_device_name(0) raises when no CUDA device exists, so only query it here.
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    print('CUDA not available, running on CPU')

# Pass the model parameters to the selected device.
model.to(device);

### Get list of models available:

In [None]:
# LAST MODEL SAVED:  ** COMMENTED BECAUSE WE ARE GETTING THE VALUE FROM THE BEGINNING **
# last_saved_model = get_latest_saved_model()
# last_saved_model = last_saved_model[:19]+'.checkpoint'
# print('Last saved model: ')
# print(last_saved_model)

# # ALL MODELS SAVED:
# MODELS_FOLDER = 'models_saved/'
# print('All saved models: ')
# !ls "{MODELS_FOLDER}"

In [None]:
# stop

### Load model checkpoint

In [None]:
# selected_model = '2021-03-11_23-52-31'
selected_model = last_saved_model
PATH = MODELS_FOLDER+selected_model

# PATH = MODELS_FOLDER+selected_model+'/'+selected_model+'.checkpoint'
# map_location moves the stored tensors onto the device the model currently
# lives on, so a checkpoint saved on a GPU can also be loaded on a CPU-only host.
checkpoint = torch.load(PATH, map_location=next(model.parameters()).device)
model.load_state_dict(checkpoint)

# Switch the model to evaluation mode before running inference.
model.eval()
# # - or -
# #model.train()

In [None]:
PATH

### Get Model Parameters

In [None]:
# Get Model Parameters
# Derive every name from MODELS_FOLDER / selected_model instead of slicing
# PATH at fixed character offsets, which silently breaks as soon as the
# folder name or the checkpoint name changes length.
checkpoint_suffix = '.checkpoint'
if selected_model.endswith(checkpoint_suffix):
    model_name = selected_model[:-len(checkpoint_suffix)]
else:
    model_name = selected_model

folder = MODELS_FOLDER
file_param = model_name + '_parameters'
param_path = folder + file_param

folder, model_name, file_param

In [None]:
# with open(param_path, 'r') as reader:
#     parameters = reader.read()
    
# valid_source = parameters[241:243]
# valid_filenm = parameters[174:183]
# valid_source, valid_filenm

In [None]:
# model_name = parameters[76:95]

### LOAD tag2idx, idx2tag, tag2name from file: 

In [None]:
selected_model

In [None]:
# LOAD tag2idx, idx2tag, tag2name from file: 
def tag_values_tag2idx_idx2tag_tag2name_from_model(model):
    """Load the label mappings that were pickled next to a saved model.

    Args:
        model: Name of the saved model (a string, e.g. the checkpoint file
            name). Only its first 19 characters are used; together with
            ``MODELS_FOLDER`` they form the prefix of the three pickle files
            ``<prefix>_tag2idx``, ``<prefix>_idx2tag`` and ``<prefix>_tag2name``.

    Returns:
        tuple: ``(tags_vals, tag_values, tag2idx, idx2tag, tag2name)`` where
        ``tags_vals`` and ``tag_values`` are the same list holding the keys
        of ``tag2idx`` in dictionary order.
    """
    import pickle

    prefix = MODELS_FOLDER + model[:19] + '_'

    def _load(suffix):
        # NOTE: unpickling can execute arbitrary code - only point this at
        # files produced by a trusted training run.
        with open(prefix + suffix, 'rb') as handle:
            return pickle.load(handle)

    tag2idx = _load('tag2idx')
    idx2tag = _load('idx2tag')
    tag2name = _load('tag2name')

    tag_values = list(tag2idx)
    tags_vals = tag_values

    return tags_vals, tag_values, tag2idx, idx2tag, tag2name



# Load the label mappings stored alongside the selected model and print them
# for a quick visual check.
tags_vals, tag_values, tag2idx, idx2tag, tag2name = tag_values_tag2idx_idx2tag_tag2name_from_model(selected_model)
print('tags_vals: ', tags_vals)
print('tag_values: ', tag_values)
print('tag2idx: ', tag2idx)
print('idx2tag: ', idx2tag)
print('tag2name: ', tag2name)

In [None]:
# tag_values = []


# for key in tag2idx.keys():
#     tag_values.append(key)

# tag_values

# 2) Inference functions:

In [None]:
def inference_sap_bert(test_sentence, inference_model):
    """Predict one label per word of a sentence.

    The sentence is lower-cased, encoded with the notebook-level
    ``tokenizer`` and passed through ``inference_model``. Word pieces that
    start with ``##`` are glued back onto the preceding token, which keeps
    the label predicted for its first piece. Label ids are translated with
    the notebook-level ``tag2name`` mapping.

    Args:
        test_sentence: Raw sentence text.
        inference_model: Token-classification model to run.

    Returns:
        pandas.DataFrame: Columns ``Token`` and ``Label`` without the
        ``[CLS]`` / ``[SEP]`` rows. The row index is not reset.
    """
    test_sentence = test_sentence.lower()

    # Reuse the notebook-level tokenizer (same 'bert-base-cased' vocabulary)
    # instead of re-loading it from disk on every single call.
    token_ids = tokenizer.encode(test_sentence)

    # Run on whatever device the model lives on rather than assuming CUDA.
    model_device = next(inference_model.parameters()).device
    input_ids = torch.tensor([token_ids], device=model_device)

    with torch.no_grad():
        output = inference_model(input_ids)
    # Batch of one sentence: keep the label ids of that single sentence.
    label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)[0]

    # join bpe split tokens
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, label_indices):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(tag2name[label_idx])
            new_tokens.append(token)

    df = pd.DataFrame({"Token": new_tokens, "Label": new_labels})

    # Drop the special tokens added by the tokenizer.
    prediction_df = df[~df['Token'].isin(['[CLS]', '[SEP]'])]

    return prediction_df

def inference_sap_bert_to_list(test_sentence, inference_model):
    """Run ``inference_sap_bert`` and return its rows as a list of lists.

    Each inner list holds the values of one row of the prediction frame.
    """
    return inference_sap_bert(test_sentence, inference_model).values.tolist()

#### Unit test Inference:

In [None]:
# Inference model
# test_sentence = 'you can get a complete overview of all applications delivered with mss wda in the sap library for sap erp on sap help portal at sap erp enhancement packages erp central component shared services manager self service manager self service wda applications'

# test_sentence = 'common object layer brim billing and revenue innovation management ccm cross catalog mapping sap cc sap convergent charging system sap ci sap convergent invoicing smt subscriber mapping table srt subscriber range table odi order distribution'
# test_sentence = 'this article is related to hana'
test_sentence = 'sap erp is the best erp'

inference_sap_bert(test_sentence, model)

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so
# this cell presumably raises NameError on purpose to halt "Run All" at this
# point - confirm before expecting the cells below to run unattended.
stop

### Adapt Labels if necessary

In [None]:
# original values:
# tags_vals:  ['I-PROD', 'B-PROD', 'O', 'X', '[CLS]', '[SEP]', 'PAD']
# tag_values:  ['I-PROD', 'B-PROD', 'O', 'X', '[CLS]', '[SEP]', 'PAD']
# tag2idx:  {'I-PROD': 0, 'B-PROD': 1, 'O': 2, 'X': 3, '[CLS]': 4, '[SEP]': 5, 'PAD': 6}
# idx2tag:  {0: 'I-PROD', 1: 'B-PROD', 2: 'O', 3: 'X', 4: '[CLS]', 5: '[SEP]', 6: 'PAD'}
# tag2name:  {0: 'I-PROD', 1: 'B-PROD', 2: 'O', 3: 'X', 4: '[CLS]', 5: '[SEP]', 6: 'PAD'}

# # adapted:
# tags_vals =  ['B-PROD', 'I-PROD', 'O', 'X', '[CLS]', '[SEP]', 'PAD']
# tag_values =  ['B-PROD', 'I-PROD', 'O', 'X', '[CLS]', '[SEP]', 'PAD']
# tag2idx =  {'B-PROD': 0, 'I-PROD': 1, 'O': 2, 'X': 3, '[CLS]': 4, '[SEP]': 5, 'PAD': 6}
# idx2tag =   {0: 'B-PROD', 1: 'I-PROD', 2: 'O', 3: 'X', 4: '[CLS]', 5: '[SEP]', 6: 'PAD'}
# tag2name  =   {0: 'B-PROD', 1: 'I-PROD', 2: 'O', 3: 'X', 4: '[CLS]', 5: '[SEP]', 6: 'PAD'}

# 3) Inference with probability scores

In [None]:
def inference_with_prob(input_sentence, inference_model=None):
    """Predict one label per word together with the probability of that label.

    The sentence is lower-cased, encoded with the notebook-level
    ``tokenizer`` and passed through the model. A softmax over the last axis
    of the first model output gives per-label probabilities; the highest one
    is reported for each token. Word pieces starting with ``##`` are glued
    onto the preceding token, which keeps the label and probability of its
    first piece. Label ids are translated by indexing the notebook-level
    ``tag_values`` list.

    Args:
        input_sentence: Raw sentence text.
        inference_model: Model to run; defaults to the notebook-level ``model``.

    Returns:
        pandas.DataFrame: Columns ``Token``, ``Label`` and ``Probability``
        without the ``[CLS]`` / ``[SEP]`` rows. The row index is not reset.
    """
    net = model if inference_model is None else inference_model

    # Encode the sentence that was actually passed in (the previous version
    # accidentally encoded the notebook-level `test_sentence`).
    input_sentence = input_sentence.lower()
    token_ids = tokenizer.encode(input_sentence)

    # Run on whatever device the model lives on rather than assuming CUDA.
    model_device = next(net.parameters()).device
    input_ids = torch.tensor([token_ids], device=model_device)

    with torch.no_grad():
        output = net(input_ids)

    # Batch of one sentence: take its score matrix. No fixed label count is
    # assumed; the second dimension is whatever the model emits.
    scores = output[0][0].to('cpu')
    label_indices = scores.numpy().argmax(axis=-1)
    best_probs = torch.softmax(scores, dim=-1).numpy().max(axis=-1)

    # join the split tokens
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    new_tokens, new_labels, new_probs = [], [], []
    for token_, label_, prob_ in zip(tokens, label_indices, best_probs):
        if token_.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token_[2:]
        else:
            new_labels.append(tag_values[label_])
            new_tokens.append(token_)
            new_probs.append(prob_)

    df = pd.DataFrame({"Token": new_tokens, "Label": new_labels, "Probability": new_probs})

    # Drop the special tokens added by the tokenizer.
    prediction_df = df[~df['Token'].isin(['[CLS]', '[SEP]'])]
    return prediction_df

In [None]:
#test_sentence = 'you can get a complete overview of all applications delivered with mss wda in the sap library for sap erp on sap help portal at sap erp enhancement packages erp central component shared services manager self service manager self service wda applications'
test_sentence = 'common object layer brim billing and revenue innovation management ccm cross catalog mapping sap cc sap convergent charging system sap ci sap convergent invoicing smt subscriber mapping table srt subscriber range table odi order distribution'
#test_sentence = 'this article is related to hana'
# test_sentence = 'sap erp is the best erp'

inference_with_prob(test_sentence)

In [None]:
# stop

# 4) INFERENCE: Apply model to a TRAINING SET

## Define functions:

In [None]:
print('installing tensorflow')
!pip install tensorflow --quiet
print('installing keras')
!pip install keras --quiet
print('importing packages')
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
print('done')

In [None]:
def tag_values_tag2idx_idx2tag_tag2name_from_data(data):
    """Build the label vocabulary and its index mappings from a dataset.

    Args:
        data: Table with a ``Tag`` column holding the label of every token.

    Returns:
        tuple: ``(tags_vals, tag_values, tag2idx, idx2tag, tag2name)``.
        ``tags_vals`` / ``tag_values`` are the same list: the distinct tags
        in sorted order followed by ``X``, ``[CLS]``, ``[SEP]`` and ``PAD``.
        ``tag2idx`` maps tag -> position, ``idx2tag`` and ``tag2name`` map
        position -> tag.
    """
    # Sort the distinct tags: the iteration order of a set of strings differs
    # between interpreter runs, which would give every run a different
    # label <-> index mapping.
    tags_vals = sorted(set(data["Tag"].values), key=str)

    # Add some additional tags:
    # X  tag for word piece support
    # [CLS] and [SEP] as BERT need
    tags_vals.extend(['X', '[CLS]', '[SEP]', 'PAD'])

    tag2idx = {t: i for i, t in enumerate(tags_vals)}
    idx2tag = {i: t for i, t in enumerate(tags_vals)}

    print('tags_vals: ', tags_vals)
    print('tag2idx: ', tag2idx)
    print('idx2tag: ', idx2tag)

    tag_values = tags_vals

    # Mapping tag index to name
    tag2name = {idx: tag for tag, idx in tag2idx.items()}
    print('tag2name: ', tag2name)

    return tags_vals, tag_values, tag2idx, idx2tag, tag2name


In [None]:
def turn_BIO_data_into_sentences(data):
    """Split a BIO-annotated table into sentences, word lists and label lists.

    Returns:
        tuple: ``(sentences, sentences_sbw, labels)`` - each sentence as one
        space-joined string, the same sentence as a list of words, and the
        list of tags belonging to those words.
    """
    grouped_sentences = SentenceGetter(data).sentences

    sentences_sbw = [[entry[0] for entry in sent] for sent in grouped_sentences]
    sentences = [" ".join(words) for words in sentences_sbw]
    labels = [[entry[2] for entry in sent] for sent in grouped_sentences]
    return sentences, sentences_sbw, labels

class SentenceGetter(object):
    """Group a token table into per-sentence lists of (word, POS, tag) tuples.

    ``data`` needs the columns ``Sentence``, ``Word``, ``POS`` and ``Tag``.
    After construction ``grouped`` holds one list of tuples per distinct
    ``Sentence`` value and ``sentences`` is the plain list of those lists,
    in the order produced by ``groupby``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the sentence.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or None when there is no such sentence.

        Sentences are looked up under the key ``"Sentence: <n>"`` with
        ``n`` starting at 1; the counter only advances on a successful lookup.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing sentence means "no more data"; any other error
            # is a real problem and must not be swallowed.
            return None
        self.n_sent += 1
        return s
        
# TEXT TOKENIZATION and EXTENSION OF LABELS FOR SPLITTED TOKENS
def tokenize_texts_extend_labels(sentences, labels):
    """Split every word into word pieces and repeat its label for each piece.

    Uses the notebook-level ``tokenizer``. Sentences are split on
    whitespace and paired word by word with the corresponding label list.

    Returns:
        tuple: ``(tokenized_texts, tokenized_labels)`` - per sentence, the
        list of word pieces and the equally long list of labels.
    """
    tokenized_texts, tokenized_labels = [], []
    for sentence, word_labels in zip(sentences, labels):
        pieces, piece_labels = [], []
        for word, label in zip(sentence.split(), word_labels):
            word_pieces = tokenizer.tokenize(word)
            pieces.extend(word_pieces)
            # Every piece of a word inherits the label of that word.
            piece_labels.extend([label] * len(word_pieces))
        tokenized_texts.append(pieces)
        tokenized_labels.append(piece_labels)

    return tokenized_texts, tokenized_labels

# tokenized_texts, tokenized_labels = tokenize_texts_extend_labels(sentences, labels)

def tokenize_texts_and_labels(tokenizer, max_len, sentences, labels, tag2idx):
    """Turn sentences and labels into padded id arrays plus attention masks.

    Args:
        tokenizer: Tokenizer used to convert word pieces to ids. When None,
            the 'bert-base-cased' tokenizer is loaded. (Splitting words into
            pieces is done by ``tokenize_texts_extend_labels``, which uses
            the notebook-level tokenizer.)
        max_len: Length every sequence is padded / truncated to.
        sentences: Sentences as space-separated strings.
        labels: One list of labels per sentence.
        tag2idx: Mapping label -> id; must contain ``"PAD"``.

    Returns:
        tuple: ``(tokenized_texts, tokenized_labels, word_piece_labels,
        input_ids, tags, attention_masks)``. ``word_piece_labels`` is the
        same object as ``tokenized_labels``.
    """
    # Honour the tokenizer handed in by the caller instead of re-loading one
    # from disk on every call; only load it when none was supplied.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

    #TOKENIZE TEXTS and LABELS:
    tokenized_texts, tokenized_labels = tokenize_texts_extend_labels(sentences, labels)

    # SGD (added to comply with previous versions of code)
    word_piece_labels = tokenized_labels

    # INPUT IDs:
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=max_len, dtype="long", value=0.0,
                              truncating="post", padding="post")

    # TAGS:
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in tokenized_labels],
                         maxlen=max_len, value=tag2idx["PAD"], padding="post",
                         dtype="long", truncating="post")

    # ATTENTION MASKS: 1.0 for every position whose id is not the padding value 0.
    attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

    return tokenized_texts, tokenized_labels, word_piece_labels, input_ids, tags, attention_masks

In [None]:
# stop

## I. Compare True Labels vs Predicted Labels

### a) (load sample data , tokenize, split train/validation dataset, convert to tensors, prepare dataloader) - execute only if you load a saved model

In [None]:
BASE_FOLDER = 'training-datasets/'
filename = filename
# filename = 'scw_24_49_da'
# # filename = 'sapner_tds_836_1.csv'
# # filename = 'scw_sup_annot_1-23.csv'
# # filename = 'scw_sup_annot_1-23_v5.csv'
# # filename = 'scw_41_50_da.csv'
# filename = 'scw_01-23_sa_v6.csv'

In [None]:
filename

In [None]:
# Forward-fill empty cells with the last value seen above them.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
data = pd.read_csv(BASE_FOLDER+filename,sep=",",encoding="latin1").ffill()
data.head(5)

In [None]:
data

In [None]:
# tag_values_tag2idx_idx2tag_tag2name(data)
sentences, sentences_sbw, labels = turn_BIO_data_into_sentences(data)

# Get tags_vals, tag_values, tag2idx, idx2tag, tag2name:
# tags_vals, tag_values, tag2idx, idx2tag, tag2name = tag_values_tag2idx_idx2tag_tag2name_from_data(data)
tags_vals, tag_values, tag2idx, idx2tag, tag2name = tag_values_tag2idx_idx2tag_tag2name_from_model(selected_model)

# Set GPUs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    # get_device_name(0) raises when no CUDA device exists, so only query it here.
    print('Using GPU:', torch.cuda.get_device_name(0))

# TOKENIZE TEXTS AND LABELS:
max_len = 256
MAX_LEN = max_len
model_max_length = max_len

tokenized_texts, tokenized_labels, word_piece_labels, input_ids, tags, attention_masks = tokenize_texts_and_labels(tokenizer, max_len, sentences, labels, tag2idx)

# n = 15
# print(tokenized_texts[n])
# print(tokenized_labels[n])
# print(word_piece_labels[n])
# print(input_ids[n])
# print(tags[0])
# print(attention_masks[0])

# SPLIT TRAINING/ VALIDATION DATASET:
# Split ids, tags and masks in one call so the three always share the same
# shuffle; with the same random_state this yields the same partition as
# splitting them in two separate calls.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks = train_test_split(
    input_ids, tags, attention_masks, random_state=2018, test_size=0.2)

# CONVERT TO TORCH TENSORS (since we are operating in Pytorch)
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)


# SET BATCH-SIZE (BS): val_inputs, tag2name
bs = 16
batch_num = bs

# DEFINE DATALOADERS:
# We shuffle the data at training time with the RandomSampler
# and at test time we just pass them sequentially with the SequentialSampler.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

### b) Detokenize Tokens/Labels from given sentence of Training/Validation Dataset

#### b.1)  detokenize sentence into tokens labels

In [None]:
# b.1) detokenize sentence into tokens labels

def labels_from_label_ids(label_ids, idx2tag):
    """Translate a sequence of label ids into label names via ``idx2tag``."""
    return [idx2tag[label_id] for label_id in label_ids]

def detokenize_sentence_into_tokens_labels(inputs_i, masks_i, tags_i, idx2tag):
    """Turn one encoded sentence back into [token, label] pairs.

    Token ids are converted with the notebook-level ``tokenizer`` and tag
    ids with ``idx2tag``; rows whose token is ``[PAD]`` are dropped.
    ``masks_i`` is accepted but not used.

    Returns:
        tuple: ``(tkn_lbl, tkn_lbl_clean)`` - all remaining [token, label]
        pairs, and the same pairs without the tokens that contain ``##``.
    """
    frame = pd.DataFrame({
        'Token': tokenizer.convert_ids_to_tokens(inputs_i),
        'Label': labels_from_label_ids(tags_i.tolist(), idx2tag),
    })

    # Padding positions carry no information.
    unpadded = frame[frame['Token'] != '[PAD]']
    # Keep only rows whose token is not a '##' continuation piece.
    without_pieces = unpadded[~unpadded.Token.str.contains("##")]

    return unpadded.values.tolist(), without_pieces.values.tolist()

# UNIT TESTING:
# # # # Execution
# n = 15

# inputs_i = val_inputs[n]
# masks_i = val_masks[n]
# tags_i = val_tags[n]

# tkn_lbl, tkn_lbl_clean = detokenize_sentence_into_tokens_labels(inputs_i, masks_i, tags_i, idx2tag)
# print('tkn_lbl: ', tkn_lbl)
# print('-'*50)
# print('tkn_lbl_clean: ', tkn_lbl_clean)
# print('-'*50)

#### b.2) Get entity List from Tokens/Labels list

In [None]:
# b.2) Get entity List from Tokens/Labels list

def get_entities_from_token_label_list(tkn_lbl):
    """Collect the product entities described by a [token, label] list.

    Tokens labelled ``B-PROD`` / ``I-PROD`` are assembled into entity
    strings: a ``B-PROD`` token opens an entity, following ``I-PROD`` tokens
    extend it, and an ``O`` token closes it. Word pieces (tokens containing
    ``##``) are appended without a space, whole words with one. Rows with
    any other label are skipped and do not close the current entity.

    Args:
        tkn_lbl: Sequence of rows whose first item is the token and whose
            second item is its label.

    Returns:
        list: Entity strings in order of appearance.
    """
    entity_list = []
    entity = ''
    prev_lbl = 'O'

    for row in tkn_lbl:
        token, label = row[0], row[1]

        if label == 'B-PROD':
            piece = token.replace('##', '')
            if prev_lbl == 'B-PROD' and '##' in token:
                # Further word piece of the word that opened the entity.
                entity = entity + piece
            else:
                # A new entity begins. This also covers a B-PROD word that
                # directly follows another B-PROD word: the two are separate
                # entities and must not be glued together.
                if entity != '':
                    entity_list.append(entity)
                entity = piece
            prev_lbl = 'B-PROD'

        elif label == 'I-PROD':
            piece = token.replace('##', '')
            if prev_lbl == 'B-PROD':
                entity = entity + ' ' + piece
            elif prev_lbl == 'I-PROD':
                if '##' in token:
                    entity = entity + piece
                else:
                    entity = entity + ' ' + piece
            else:
                # I-PROD without a preceding B-PROD: treat it as an entity start.
                if entity != '':
                    entity_list.append(entity)
                entity = piece
            prev_lbl = 'I-PROD'

        elif label == 'O':
            if prev_lbl != 'O':
                entity_list.append(entity)
                entity = ''
            prev_lbl = 'O'

    # Flush an entity that runs up to the end of the sentence.
    if entity != '':
        entity_list.append(entity)

    return entity_list

# UNIT TESTING:
# # # Execution
# entity_list = get_entities_from_token_label_list(tkn_lbl)
# print('entity_list: ', entity_list)
# print('tkn_lbl: ', tkn_lbl)

#### b.3) Return list with separated fields for Tokens/Labels

In [None]:
def return_list_with_separated_fields_for_Tokens_Labels(tkn_lbl):
    """Split [token, label] rows into a token list and a label list.

    Returns:
        tuple: ``(tkns, lbls, tkn_lbl_joined)`` where ``tkn_lbl_joined`` is
        the two-element list ``[tkns, lbls]``.
    """
    tkns = [row[0] for row in tkn_lbl]
    lbls = [row[1] for row in tkn_lbl]
    tkn_lbl_joined = [tkns, lbls]

    return tkns, lbls, tkn_lbl_joined

# ## Execution
# tkns, lbls, tkn_lbl_joined = return_list_with_separated_fields_for_Tokens_Labels(tkn_lbl)

# print('tkns: ', tkns)
# print('-'*50)
# print('lbls: ',lbls )
# print('-'*50)
# print('tkn_lbl_joined: ',tkn_lbl_joined)
# print('-'*50)

#### b.4) Return clean list of Tokens/Labels (joining split words '##')

In [None]:
def return_clean_list_of_tokens_labels(tkn_lbl_clean, tkns):
    """Build the final word / label table of one sentence.

    Labels are taken from the second item of every ``tkn_lbl_clean`` row.
    Words are rebuilt from ``tkns`` by joining the tokens with spaces,
    removing every `` ##`` so word pieces attach to the word before them,
    and splitting on whitespace again.

    Returns:
        tuple: ``(token_label_final, final_tokens, final_labels)`` - a
        DataFrame with columns ``Token`` and ``Label`` plus the two lists
        it was built from.
    """
    final_labels = [pair[1] for pair in tkn_lbl_clean]
    final_tokens = ' '.join(tkns).replace(' ##', '').split()

    token_label_final = pd.DataFrame({'Token': final_tokens, 'Label': final_labels})

    return token_label_final, final_tokens, final_labels

# UNIT TESTING:
# # # Execution
# token_label_final, final_tokens, final_labels = return_clean_list_of_tokens_labels(tkn_lbl_clean, tkns)
# print(token_label_final)
# print(final_tokens)
# print(final_labels)

#### b.5) From Train/Valid sentence to tokens/labels (1 shot execution)

In [None]:
def final_tokens_labels_from_train_validation_sentence(n, inputs, masks, tags):
    """Recover the true words and labels of one encoded sentence.

    Chains ``detokenize_sentence_into_tokens_labels`` (with the
    notebook-level ``idx2tag``),
    ``return_list_with_separated_fields_for_Tokens_Labels`` and
    ``return_clean_list_of_tokens_labels``.

    Args:
        n: Position of the sentence in its dataset; accepted for
            compatibility with existing callers but not used.
        inputs: Token ids of the sentence.
        masks: Attention mask of the sentence (passed through unchanged).
        tags: Tag ids of the sentence.

    Returns:
        tuple: ``(token_label_final, final_tokens, final_labels)`` as
        returned by ``return_clean_list_of_tokens_labels``.
    """
    tkn_lbl, tkn_lbl_clean = detokenize_sentence_into_tokens_labels(inputs, masks, tags, idx2tag)

    # Only the token list is needed here. The entity list that used to be
    # computed at this point was never used, so that call was dropped.
    tkns, _, _ = return_list_with_separated_fields_for_Tokens_Labels(tkn_lbl)

    token_label_final, final_tokens, final_labels = return_clean_list_of_tokens_labels(tkn_lbl_clean, tkns)

    return token_label_final, final_tokens, final_labels

In [None]:
# UNITE TEST INFERENCE
sentce = 'sap erp is the best product'
inference_sap_bert(sentce, model)

inference_with_prob(sentce)

### b.6) GET TRUE/PREDICTED LABELS FOR EVERY SENTENCE IN TRAIN/VALIDATION DATASET:

In [None]:
## GET TRUE/PREDICTED LABELS FOR EVERY SENTENCE IN TRAIN/VALIDATION DATASET:

#--- set parameters:
take_from = 'validation_dataset'
# take_from = 'training_dataset'

#--- execute
# Resolve the split once, so the number of iterations always matches the
# chosen dataset (the loop used to run over len(val_inputs) even when the
# training split was selected).
if take_from == 'validation_dataset':
    split_inputs, split_masks, split_tags = val_inputs, val_masks, val_tags
elif take_from == 'training_dataset':
    split_inputs, split_masks, split_tags = tr_inputs, tr_masks, tr_tags
else:
    raise ValueError(
        "take_from must be 'validation_dataset' or 'training_dataset', got {!r}".format(take_from))

frames = []

for n in range(len(split_inputs)):
    inputs_i = split_inputs[n]
    masks_i = split_masks[n]
    tags_i = split_tags[n]

    # detokenize sentence: dataframe with 'true' token/labels of that sentence
    token_label_final, final_tokens, final_labels = final_tokens_labels_from_train_validation_sentence(n, inputs_i, masks_i, tags_i)

    # compose sentence from tokens (every token is followed by one space)
    sentce = ''.join(token + ' ' for token in final_tokens)

    # Inference sentence (predict labels based on trained model)
    s_pred = inference_sap_bert(sentce, model)
    s_pred = s_pred.reset_index(drop=True)

    # join true and predicted rows by position. Add Sentence column (1-based)
    s_true_pred_df = token_label_final.join(s_pred, lsuffix='true_labels', rsuffix='pred_labels')
    s_true_pred_df['Sentence'] = n + 1

    frames.append(s_true_pred_df)

# Concatenate once at the end instead of copying the growing frame on every
# iteration; an empty split yields an empty frame as before.
true_pred_final_df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
true_pred_final_df

### SAVE to .CSV

In [None]:
# model_data = 'tr_da_1_23_val_sa'
# model_data = filename_main

import os

file_path_to_a = "models_inference_true_pred/"+model_name+filename_main+"_true_pred.csv"
# to_csv does not create missing folders; make sure the target folder exists.
os.makedirs(os.path.dirname(file_path_to_a), exist_ok=True)
true_pred_final_df.to_csv(file_path_to_a, sep=',',index=False)

In [None]:
file_path_to_a

### SAVE Inference Parameters to file

In [None]:
def get_current_datetime():
    """Return the current local date and time as 'YYYY-MM-DD HH:MM:SS'."""
    from datetime import datetime

    # Second resolution: the fractional part of the timestamp is left out.
    return datetime.now().isoformat(sep=' ', timespec='seconds')

In [None]:
# SAVE Inference Parameters and Evaluation Results
import os
# NOTE(review): this rebinds BASE_FOLDER, which the data-loading cell uses as
# the folder of the training datasets; re-running that cell after this one
# would read from the inference output folder instead - confirm whether a
# separate name is wanted.
BASE_FOLDER = "models_inference_true_pred/"

# One parameters file per model name; a later run for the same model
# overwrites it because the file is opened in "w" mode.
file = os.path.join(BASE_FOLDER, model_name+'_parameters')
with open(file, "w") as writer:
    writer.write('\n--------------------------------------------------------------')
    writer.write("\nExecution Parameters:   ")
    writer.write('\n--------------------------------------------------------------')
    writer.write("\nDatetime: ") 
    writer.write(get_current_datetime())
    writer.write("\nModel Inferenced:  ") 
    writer.write(model_name)
    writer.write("\nSource data:  ") 
    writer.write(filename)
    writer.write("\nData split used:  ") 
    writer.write(take_from )
    writer.write('\n--------------------------------------------------------------')
  

In [None]:
print("Model Inferenced: ", model_name) 
print("Source data: " , filename) 

### Execution (FINAL)

In [None]:
# # Execution (final):

# n=15

# take_from = 'validation_dataset'
# # take_from = 'training_dataset'

# if take_from == 'validation_dataset':
#     # a) take from 'Validation data'
#     inputs_i = val_inputs[n]
#     masks_i = val_masks[n]
#     tags_i = val_tags[n]

# elif take_from == 'training_dataset':
#     # b) take from 'Training data'
#     inputs_i = tr_inputs[n]
#     masks_i = tr_masks[n]
#     tags_i = tr_tags[n]

# token_label_final, final_tokens, final_labels = final_tokens_labels_from_train_validation_sentence(n, inputs_i, masks_i, tags_i)

# # b) take from 'Training data'
# # token_label_final = final_tokens_labels_from_train_validation_sentence(n, tr_inputs, tr_masks, tr_tags)

# # Final result
# s_true = token_label_final
# s_true