In [1]:
import pickle

# Load the two pickled lookup dictionaries, closing each file handle once read.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files from a trusted source.
with open('dict/dict_cont', 'rb') as dict_file:
    dict_cont = pickle.load(dict_file)
with open('dict/dict_abb', 'rb') as dict_file:
    dict_abb = pickle.load(dict_file)
import os
# These variables are set in the first cell, before torch / tensorflow are
# imported in the next one. "-1" is presumably meant to hide every GPU so the
# notebook runs on CPU (the model is later moved to torch.device("cpu")).
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

In [2]:
import logging
import math
import os
import pandas as pd
import numpy as np
import time
import datetime
import string
import glob
import tensorflow as tf
# 
# One TensorBoard run directory per launch, named after the launch timestamp.
current_time = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
log_dir = f"runs/{current_time}"
writer = tf.summary.create_file_writer(log_dir)
print("Log Dir", log_dir)
load_model = True

import torch
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split


import transformers
from transformers import BertTokenizer, BertModel, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup


from transformers.activations import gelu, gelu_new
from transformers.file_utils import add_start_docstrings
from transformers.modeling_utils import PreTrainedModel, prune_linear_layer


import pickle
from keras.preprocessing.sequence import pad_sequences

def rank_fn(logs):
    """Return, for every row of ``logs``, the rank of column 0.

    Each row is ordered by descending score along dim 1; the result is a
    NumPy array holding the position that column index 0 occupies in that
    ordering (0 means column 0 had the highest score in its row).
    """
    descending_order = logs.sort(dim=1, descending=True)[1]
    hits_candidate_zero = descending_order.cpu().numpy() == 0
    _, rank_positions = np.nonzero(hits_candidate_zero)
    return rank_positions



def tokenize_sent_map_to_ids(sentence, MAX_LEN = 128 ):
    """Encode one sentence into fixed-length input ids plus an attention mask.

    The sentence is encoded with the module-level ``tokenizer`` (special
    tokens added), then right-padded with 0 / right-truncated to ``MAX_LEN``.

    Args:
        sentence: text passed to ``tokenizer.encode``.
        MAX_LEN: length of the returned sequences.

    Returns:
        ``(input_ids, attention_mask)``: two tensors of shape ``(1, MAX_LEN)``.
        The mask is 1 wherever the token id is greater than 0 and 0 on padding.
    """
    encoded_sent = tokenizer.encode(
        sentence,                   # Sentence to encode.
        add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
    )

    # pad_sequences works on a batch, so wrap the single sentence in a list.
    padded = pad_sequences([encoded_sent], maxlen=MAX_LEN, dtype="long",
                           value=0, truncating="post", padding="post")

    input_ids = torch.tensor(padded)
    # Padding uses id 0, so any id > 0 is a real token. `.long()` keeps the
    # mask as int64, matching a mask built from Python ints.
    attention_masks = (input_ids > 0).long()

    return input_ids, attention_masks

def tokenize_sent1_sent2_map_to_ids(sentence1,sentence2=None, MAX_LEN = 512 ):
    """Encode a batch of sentences (or sentence pairs) into padded input ids and masks.

    Every item is converted with ``str()`` and encoded with the module-level
    ``tokenizer`` (special tokens added). When ``sentence2`` is given and
    non-empty, items of ``sentence1`` and ``sentence2`` are paired with
    ``zip`` and encoded as sentence pairs; otherwise ``sentence1`` is encoded
    on its own. Sequences are right-padded with 0 / right-truncated to
    ``min(MAX_LEN, longest encoded sequence)``.

    Args:
        sentence1: iterable of first sentences.
        sentence2: optional iterable of second sentences; ``None`` or an empty
            sequence selects single-sentence encoding.
            NOTE(review): ``zip`` stops at the shorter input, so a length
            mismatch silently drops items - confirm callers pass equal lengths.
        MAX_LEN: upper bound on the returned sequence length.

    Returns:
        ``(input_ids, attention_mask)``: two tensors with one row per encoded
        item. The mask is 1 wherever the token id is greater than 0.

    Raises:
        ValueError: if there is nothing to encode.
    """
    if sentence2 is not None and len(sentence2) != 0:
        input_ids = [
            tokenizer.encode(
                str(sent1),                 # Sentence to encode.
                str(sent2),
                add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
            )
            for sent1, sent2 in zip(sentence1, sentence2)
        ]
    else:
        input_ids = [
            tokenizer.encode(
                str(sent1),                 # Sentence to encode.
                add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
            )
            for sent1 in sentence1
        ]

    if not input_ids:
        # Same exception type as the bare max() on an empty list, with a
        # message that names the actual problem.
        raise ValueError("tokenize_sent1_sent2_map_to_ids received no sentences to encode")

    # Never pad beyond the longest sequence actually present in the batch.
    MAX_LEN = min(MAX_LEN, max(len(ids) for ids in input_ids))

    padded = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                           value=0, truncating="post", padding="post")

    input_ids = torch.tensor(padded)
    # Padding uses id 0, so any id > 0 is a real token.
    attention_masks = (input_ids > 0).long()

    return input_ids, attention_masks

#### LOAD BERT Tokenizer
# The tokenizer is built from the local file 'tokenizer.txt' and is used as a
# module-level global by the tokenize_* helpers above.
# NOTE(review): the cell output shows transformers warning that calling
# from_pretrained() with a path to a single file is deprecated.

tokenizer = BertTokenizer.from_pretrained('tokenizer.txt')

# Module-level logger; load_tf_weights_in_bert reports its progress through it.
logger = logging.getLogger(__name__)

# Model shortcut name -> URL of the corresponding PyTorch weight file.
# Assigned to BertPreTrainedModel.pretrained_model_archive_map below.
BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "bert-base-uncased": "https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin",
    "bert-large-uncased": "https://cdn.huggingface.co/bert-large-uncased-pytorch_model.bin",
    "bert-base-cased": "https://cdn.huggingface.co/bert-base-cased-pytorch_model.bin",
    "bert-large-cased": "https://cdn.huggingface.co/bert-large-cased-pytorch_model.bin",
    "bert-base-multilingual-uncased": "https://cdn.huggingface.co/bert-base-multilingual-uncased-pytorch_model.bin",
    "bert-base-multilingual-cased": "https://cdn.huggingface.co/bert-base-multilingual-cased-pytorch_model.bin",
    "bert-base-chinese": "https://cdn.huggingface.co/bert-base-chinese-pytorch_model.bin",
    "bert-base-german-cased": "https://cdn.huggingface.co/bert-base-german-cased-pytorch_model.bin",
    "bert-large-uncased-whole-word-masking": "https://cdn.huggingface.co/bert-large-uncased-whole-word-masking-pytorch_model.bin",
    "bert-large-cased-whole-word-masking": "https://cdn.huggingface.co/bert-large-cased-whole-word-masking-pytorch_model.bin",
    "bert-large-uncased-whole-word-masking-finetuned-squad": "https://cdn.huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
    "bert-large-cased-whole-word-masking-finetuned-squad": "https://cdn.huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
    "bert-base-cased-finetuned-mrpc": "https://cdn.huggingface.co/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
    "bert-base-german-dbmdz-cased": "https://cdn.huggingface.co/bert-base-german-dbmdz-cased-pytorch_model.bin",
    "bert-base-german-dbmdz-uncased": "https://cdn.huggingface.co/bert-base-german-dbmdz-uncased-pytorch_model.bin",
    "bert-base-japanese": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese/pytorch_model.bin",
    "bert-base-japanese-whole-word-masking": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/pytorch_model.bin",
    "bert-base-japanese-char": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese-char/pytorch_model.bin",
    "bert-base-japanese-char-whole-word-masking": "https://cdn.huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/pytorch_model.bin",
    "bert-base-finnish-cased-v1": "https://cdn.huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
    "bert-base-finnish-uncased-v1": "https://cdn.huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
    "bert-base-dutch-cased": "https://cdn.huggingface.co/wietsedv/bert-base-dutch-cased/pytorch_model.bin",
}


def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.

    Every variable stored in the TensorFlow checkpoint at
    ``tf_checkpoint_path`` is read into memory, its slash-separated name is
    walked as an attribute path starting from ``model``, and the array is
    copied into the ``.data`` of the parameter reached.

    Args:
        model: the PyTorch module whose parameters are overwritten in place.
        config: not referenced in the body of this function.
        tf_checkpoint_path: path to the TensorFlow checkpoint.

    Returns:
        The same ``model`` object.

    Raises:
        ImportError: re-raised (after logging) if ``re``, ``numpy`` or
            ``tensorflow`` cannot be imported.
        AssertionError: if a target parameter's shape differs from the
            checkpoint array's shape; both shapes are appended to the
            exception's args.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # Variables whose path contains one of these components are skipped
        # entirely and never copied into the model.
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        # Walk the path one component at a time, starting at the model root.
        pointer = model
        for m_name in name:
            # A component such as "layer_3" is split into its base name and a
            # numeric index; the index is applied to the pointer further down.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            # Translate TF variable names to the PyTorch attribute names.
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    # NOTE(review): this `continue` belongs to the inner
                    # `for m_name` loop, so only the current path component is
                    # skipped; the walk resumes from the same pointer and the
                    # variable still reaches the shape check below. Confirm
                    # whether skipping the whole variable was intended.
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        # `m_name` is the last path component once the walk has finished.
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            # Kernels are transposed before the shape check and copy.
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            # Attach both shapes so the mismatch is visible in the traceback.
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model


def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``, applied element-wise."""
    gate = torch.tanh(nn.functional.softplus(x))
    return x * gate


# Activation name -> callable lookup table. `gelu` / `gelu_new` come from
# transformers.activations; `mish` is the function defined just above.
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "gelu_new": gelu_new, "mish": mish}


# Alias for torch's LayerNorm; BertPreTrainedModel._init_weights checks
# modules against it.
BertLayerNorm = torch.nn.LayerNorm

Log Dir runs/20210514-141819


Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [3]:

class BertPreTrainedModel(PreTrainedModel):
    """Abstract base for the BERT models in this notebook.

    Supplies the BERT-specific class attributes (config class, archive map,
    TF-checkpoint loader, base-model prefix) and the per-module weight
    initialisation routine.
    """

    config_class = BertConfig
    pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
    load_tf_weights = load_tf_weights_in_bert
    base_model_prefix = "bert"

    def _init_weights(self, module):
        """Initialise the parameters of a single sub-module in place.

        Linear and Embedding weights are drawn from a normal distribution
        with std ``config.initializer_range``; LayerNorm is reset to weight 1
        and bias 0; Linear biases, when present, are zeroed.
        """
        is_linear = isinstance(module, nn.Linear)

        if is_linear or isinstance(module, nn.Embedding):
            # Slightly different from the TF version, which uses truncated_normal
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()

        if is_linear and module.bias is not None:
            module.bias.data.zero_()



import torch.nn as nn
import torch.nn.functional as F


    
class MLP(nn.Module):
    """Feed-forward head: ``num_hid_layers`` square Linear+ReLU layers, then a Linear projection.

    Args:
        h_size: input width, also the width of every hidden layer.
        out_size: width of the output projection.
        num_hid_layers: number of hidden Linear layers.
    """

    def __init__(self, h_size, out_size, num_hid_layers):
        super().__init__()
        # Hidden layers are created before the output layer.
        self.hidden = nn.ModuleList(
            [nn.Linear(h_size, h_size) for _ in range(num_hid_layers)]
        )
        # Output layer
        self.out = nn.Linear(h_size, out_size)

    def forward(self, x):
        """Apply each hidden layer followed by ReLU, then the output projection."""
        activations = x
        for hidden_layer in self.hidden:
            activations = F.relu(hidden_layer(activations))
        return self.out(activations)

class MLP_1(nn.Module):
    """Single linear projection head.

    ``hidden`` layers are constructed, and therefore appear in the module's
    state dict, but ``forward`` never applies them: only ``out`` is used.
    NOTE(review): confirm whether skipping the hidden layers is intentional;
    removing them would change the state-dict keys.
    """

    def __init__(self, h_size, out_size, num_hid_layers):
        super().__init__()
        square_layers = [nn.Linear(h_size, h_size) for _ in range(num_hid_layers)]
        self.hidden = nn.ModuleList(square_layers)
        # Output layer
        self.out = nn.Linear(h_size, out_size)

    def forward(self, x):
        """Return the output projection of ``x``."""
        return self.out(x)
  
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdMSoftmaxLoss(nn.Module):
    """AM Softmax Loss computed over cosine similarities.

    Args:
        in_features: accepted for interface compatibility; not used.
        out_features: accepted for interface compatibility; not used.
        m: margin, subtracted from the similarity wherever ``labels`` is non-zero
            (scaled by the label value).
        s: scale applied to the margin-adjusted similarities before ``exp``.
    """

    def __init__(self, in_features, out_features,m=.8, s = 30):
        super(AdMSoftmaxLoss, self).__init__()
        self.m = m
        self.s = s

    def forward(self, sentence, alternatives, labels):
        """Score ``alternatives`` against ``sentence`` and compute the loss.

        Args:
            sentence: query vectors; ``sentence.shape[0]`` sets how many times
                ``labels`` is repeated along the first axis.
            alternatives: candidate vectors, broadcast against ``sentence``
                for the cosine similarity over the last dimension.
            labels: one margin indicator per candidate (tensor or sequence).

        Returns:
            ``(loss, numerator, logits)``:
            ``loss`` is ``-log(numerator / sum(numerator, dim 1))`` averaged
            over dim 1; ``numerator`` is ``exp(s * (cos - m * labels))``;
            ``logits`` is the raw cosine similarity.
        """
        # Computed once and reused for both the returned logits and the loss.
        logits = torch.cosine_similarity(sentence, alternatives, dim=-1)

        # as_tensor avoids the copy-construct warning that torch.tensor() emits
        # for tensor input, and places the margins on the logits' device/dtype.
        margin_mask = torch.as_tensor(labels, dtype=logits.dtype, device=logits.device)
        margin_mask = margin_mask.repeat(sentence.shape[0], 1)

        numerator = torch.exp(self.s * (logits - self.m * margin_mask))

        # keepdim lets the row sums broadcast across the candidate axis.
        denominator = torch.sum(numerator, 1, keepdim=True)

        loss = -torch.log(numerator / denominator)

        return torch.mean(loss, 1), numerator, logits
    
    
class BertForSequenceClassification_notes(BertPreTrainedModel):
    """BERT encoder that scores candidate alternatives against marked sentence positions.

    The sentence is encoded once. The hidden states at every position whose
    input id equals 1 are projected through ``classifier2`` and
    ``personalization_layer``; candidates are then scored against those
    vectors with ``AdMSoftmaxLoss`` (cosine similarity).
    """

    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(.1)
        self.classifier2 = MLP_1(config.hidden_size,config.hidden_size,1)
        self.init_weights()
        self.hidden_size = config.hidden_size
        # NOTE(review): this layer is created after init_weights(), so it
        # presumably keeps torch's default initialisation. The construction
        # order is kept as-is; confirm whether that is intended.
        self.personalization_layer = MLP_1(config.hidden_size,config.hidden_size,1)

    def _encode_cls_in_chunks(self, ids, mask, alt_batch,
                              token_type_ids, position_ids, head_mask, inputs_embeds):
        """Run BERT over ``ids`` in row chunks of ``alt_batch`` and return the concatenated position-0 hidden states."""
        cls_chunks = []
        for start in range(0, ids.shape[0], alt_batch):
            chunk_out = self.bert(
                input_ids=ids[start:start + alt_batch, :],
                attention_mask=mask[start:start + alt_batch, :],
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds)
            cls_chunks.append(chunk_out[0][:, 0, :])
        return torch.cat(cls_chunks, dim=0)

    def forward(
        self,
        input_ids=None,
        input_ids_alt = None,
        attention_mask=None,
        attention_mask_alt = None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        train = True,
        index_of_abb = None,
        num_of_options=5,
        alt_batch=10,
        all_options=False,
        all_alt_input_ids=False,
        all_abb_bert=False,
        just_cls=False,
        all_alt_input_ids_vectors=[],
        all_alt_abb_bert_vectors=[]
    ):
        """Encode the sentence and score candidates in one of several modes.

        Modes (checked in this order):
          * ``just_cls=True``: return only ``classifier2`` applied to the
            dropped-out position-0 hidden state of the sentence.
          * ``all_options=False`` (default): encode ``input_ids_alt`` in chunks
            of ``alt_batch`` rows, group the resulting vectors in consecutive
            blocks of ``num_of_options`` (one block per marked position) and
            score them against the sentence vectors with ``labels``.
          * ``all_alt_input_ids`` truthy: for each ``((ids, mask), label)`` in
            ``all_alt_input_ids_vectors``, encode the ids and score them.
          * ``all_abb_bert`` truthy: for each ``(vectors, label)`` in
            ``all_alt_abb_bert_vectors``, score the precomputed vectors against
            the sentence vector with the same index.

        ``train`` and ``index_of_abb`` are accepted but not used. The two list
        defaults are only iterated, never mutated.

        NOTE(review): marked positions are found on the *flattened*
        ``input_ids`` and then used as sequence positions, and the projected
        vectors are reshaped to ``(n_marked, 1, hidden_size)``; this appears
        to assume a batch of exactly one sentence - confirm with callers.

        Returns:
            For ``just_cls``: the projected position-0 vector.
            Otherwise ``(loss, logits_list, logits_list_test, sent, sent_int2)``
            where ``logits_list`` collects the cosine similarities returned by
            every loss call and ``logits_list_test`` is the last of them.

        Raises:
            ValueError: if no scoring mode is selected or the selected mode
                was given no candidates, so no loss could be computed.
        """
        outputs_1 = self.bert(
            input_ids = input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds
        )

        if just_cls:
            sent_int = self.dropout(outputs_1[0][:,0,:])
            return self.classifier2(sent_int)

        # Positions whose token id is 1 (presumably the abbreviation marker
        # token - verify against the tokenizer vocabulary).
        abb_indices = (input_ids.flatten() == 1).nonzero().flatten().tolist()
        abb_output = torch.stack([outputs_1[0][:,l,:] for l in abb_indices])
        sent_int = self.dropout(abb_output)
        sent_int2 = self.classifier2(sent_int)
        # Use the configured width instead of a hard-coded 768.
        sent_int2 = sent_int2.reshape((len(abb_indices),1,self.hidden_size))
        sent = self.personalization_layer(sent_int2)

        logits_list = []
        # Sentinels so that a mode which computes nothing is reported clearly
        # instead of failing with an unbound-variable error at the return.
        loss = None
        logits_list_test = None

        if not all_options:
            cls_output = self._encode_cls_in_chunks(
                input_ids_alt, attention_mask_alt, alt_batch,
                token_type_ids, position_ids, head_mask, inputs_embeds)

            criteria = AdMSoftmaxLoss(self.hidden_size,self.hidden_size)
            # One block of `num_of_options` candidates per marked position.
            altx = torch.stack([cls_output[i*num_of_options:(i+1)*num_of_options] for i in range(len(abb_indices))])

            # NOTE(review): unlike the all_alt_input_ids branch, candidates
            # are not passed through personalization_layer here - confirm.
            altx = self.classifier2(altx)
            loss, _, logits_list_test = criteria(sent, altx, labels)
            logits_list.append(logits_list_test)

        elif all_alt_input_ids: # When input ids are present
            for (all_alt_input_ids_vector,all_attention_mask_alt),label in all_alt_input_ids_vectors:
                cls_output = self._encode_cls_in_chunks(
                    all_alt_input_ids_vector, all_attention_mask_alt, alt_batch,
                    token_type_ids, position_ids, head_mask, inputs_embeds)

                criteria = AdMSoftmaxLoss(self.hidden_size,self.hidden_size)
                altx = self.classifier2(cls_output)
                altx = self.personalization_layer(altx)

                loss, _, logits_list_test = criteria(sent, altx, label)
                logits_list.append(logits_list_test)

        elif all_abb_bert: # when vectors from ABB_BERT is present
            for num,(all_alt_abb_bert_vector,label) in enumerate(all_alt_abb_bert_vectors):
                criteria = AdMSoftmaxLoss(self.hidden_size,self.hidden_size)
                altx = self.personalization_layer(all_alt_abb_bert_vector)
                print('altx shape',altx.shape)
                print('sent shape',sent[num].shape)
                loss, _, logits_list_test = criteria(sent[num], altx, label)
                print('numerator',logits_list_test.shape)
                logits_list.append(logits_list_test)

        if loss is None:
            raise ValueError(
                "forward() computed no loss: select a scoring mode "
                "(all_options=False, all_alt_input_ids or all_abb_bert) and "
                "provide at least one set of candidates"
            )

        return (loss), logits_list, logits_list_test,sent,sent_int2  # (loss), logits, (hidden_states), (attentions)

# Everything runs on CPU.
device = torch.device("cpu")

# Build the architecture from the public checkpoint, then overwrite its
# weights with the fine-tuned ones stored in "model/570000".
model = BertForSequenceClassification_notes.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load("model/570000", map_location=device))
model = model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification_notes: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification_notes from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification_notes from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification_notes were not initialized from

In [4]:
import os
import random
# Test shards are the entries of dataset/ whose name contains 'test_'.
# os.listdir returns entries in arbitrary order, so sort to make the shard
# order (and therefore the seeded evaluation run) reproducible.
files_list = sorted(name for name in os.listdir('dataset/') if 'test_' in name)

In [8]:
import random

import progressbar

# --- Reproducibility: seed every RNG used by the notebook -------------------
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# --- Optimizer --------------------------------------------------------------
# Note: AdamW is the class imported from the huggingface library (as opposed
# to pytorch); the 'W' is believed to stand for the "Weight Decay fix".
optimizer = AdamW(
    model.parameters(),
    lr=5e-6,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon  - default is 1e-8.
)

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

# --- Candidate counts per abbreviation --------------------------------------
num_of_options_val = 50
num_of_options_train = 50

# --- Metric accumulators ----------------------------------------------------
# Store the average loss after each epoch so we can plot them.
loss_values = []

loss_values_x_batch, loss_values_x_batch_means = [], []
logs_diff_values_x_batch, logs_diff_values_x_batch_means = [], []
ratio_less_10, ratio_less_50, ratio_less_100 = [], [], []

val_rank_list, rank_list = [], []

logs_diff_values_validate, correct_rank_validate = [], []
logs_diff_values_x_batch_validate, correct_rank_val = [], []

# For each epoch, run the model over every test shard and log running
# validation metrics to TensorBoard.
step=0

for epoch_i in range(0, epochs):
    with writer.as_default():
        tf.summary.scalar('epoch', epoch_i,step)

    # Measure how long the epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # This loop only evaluates: switch to eval mode once instead of toggling
    # train()/eval() around every sample.
    model.eval()

    for file_index, file in enumerate(files_list):
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # shards from a trusted source.
        with open('dataset/'+file, 'rb') as shard_file:
            val_df = pickle.load(shard_file)
        total_steps = len(val_df) * epochs
        print('''
        ################################
        ##### running validation #######
        ################################
        ''')
        # Visit every sample of the shard exactly once, in random order.
        for ind in progressbar.progressbar(np.random.choice(len(val_df), len(val_df), replace=False)):

            step+=1
            # Each sample holds (input ids, input mask, alternative ids, alternative mask).
            b_input_ids = val_df[ind][0].to(device)
            b_input_mask = val_df[ind][1].to(device)
            b_input_ids_alt = val_df[ind][2].to(device)
            b_input_mask_alt = val_df[ind][3].to(device)
            print("valid alt_size", b_input_ids_alt.shape)
            # All-zero labels: no margin is applied to any candidate.
            b_labels = torch.zeros((num_of_options_val), dtype=torch.float).to(device)

            # No gradients are needed for evaluation; skipping autograd avoids
            # building a graph for every forward pass.
            with torch.no_grad():
                outputs = model(input_ids = b_input_ids,
                                input_ids_alt = b_input_ids_alt,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                attention_mask_alt = b_input_mask_alt,
                                labels=b_labels,
                                num_of_options=num_of_options_val
                               )
            logs = outputs[2]
            # Gap between candidate 0's score and the mean of the remaining candidates.
            logs_diff_values_x_batch_validate.extend(
                (row[0] - torch.mean(row[1:])).item() for row in logs
            )

            for predictions in logs:
                sorted_pred = torch.sort(predictions, descending = True).indices.tolist()
                # Rank of candidate 0 in the descending ordering (0 = top).
                correct_rank = sorted_pred.index(0)
                print(correct_rank)
                correct_rank_val.append(correct_rank)

            with writer.as_default():
                tf.summary.scalar('validation mean rank',np.mean(correct_rank_val),step)
                tf.summary.scalar('validation log diff validation',np.mean(logs_diff_values_x_batch_validate),step)

    print("")
    print("Validation complete!")

N/A% (0 of 1000) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--


        ################################
        ##### running validation #######
        ################################
        
valid alt_size torch.Size([50, 8])


  0% (1 of 1000) |                       | Elapsed Time: 0:00:00 ETA:   0:06:12

0
valid alt_size torch.Size([100, 13])


KeyboardInterrupt: 