In [1]:
!pip install transformers
!pip install seqeval
!pip install torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 32.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.8 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
Building wh

In [2]:
import os
import pandas as pd
import math
import numpy as np
from tqdm import tqdm, trange
from seqeval.metrics import classification_report, accuracy_score, f1_score
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers import AdamW
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification

In [3]:
import tensorflow as tf
import torch

# Get the GPU device name.
# TensorFlow is used in this cell only to report the device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')

# If there's a GPU available...
# `device` is the torch device object that later cells move the model and tensors to.
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
Use the GPU: Tesla T4


In [5]:
import torch

SEED = 1111

# Seed every random number generator the pipeline can touch, not only PyTorch.
# torchtext's legacy iterators take their shuffling state from Python's
# `random` module, so leaving it unseeded makes the batch order differ from
# run to run even though the torch seed is fixed.
import random
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

from transformers import BertTokenizer
# Load the tokenizer published with the Bio_ClinicalBERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
print(len(tokenizer))
# Sanity check on a sample sentence. Note from the printed tokens that the
# '[entity]' marker is not kept whole: it comes out as '[', 'entity', ']'.
tokens = tokenizer.tokenize('He had no [entity] cardiac murmur [entity] .')
print(tokens)
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

28996
['he', 'had', 'no', '[', 'entity', ']', 'cardiac', 'murmur', '[', 'entity', ']', '.']
[1119, 1125, 1185, 164, 9127, 166, 17688, 22895, 164, 9127, 166, 119]


In [6]:
# Special-token strings exactly as the tokenizer defines them.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [7]:
# Vocabulary ids of the same special tokens. `pad_token_idx` and
# `unk_token_idx` are passed to the torchtext fields defined further down.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [13]:
# Length cap read by split_and_cut, which keeps the first max_input_length - 1 items.
max_input_length = 512

In [8]:
def tokenize_bert(sentence):
    """Tokenize ``sentence`` with the module-level ``tokenizer``.

    The call passes no length, truncation or padding arguments, so the
    result is simply whatever ``tokenizer.tokenize`` returns for the text.
    """
    return tokenizer.tokenize(sentence)

def split_and_cut(sentence):
    """Split a space-separated string and keep its first ``max_input_length - 1`` items.

    Leading and trailing whitespace is stripped before splitting on single
    spaces. The cap is read from the module-level ``max_input_length`` at
    call time.
    """
    keep = max_input_length - 1
    return sentence.strip().split(" ")[:keep]

def trim_sentence(sent, max_words=128):
    """Keep at most ``max_words`` whitespace-separated words of ``sent``.

    Args:
        sent: the text to shorten. Runs of whitespace are collapsed to single
            spaces as a side effect of splitting and re-joining.
        max_words: maximum number of words to keep (default 128, the value
            that used to be hard-coded).

    Returns:
        The shortened string, or ``sent`` itself, unchanged, when it is not a
        string (for example ``None`` or a float NaN from a missing cell).
    """
    # An explicit type check replaces the former bare ``except:``, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if not isinstance(sent, str):
        return sent
    return " ".join(sent.split()[:max_words])

In [9]:
# Raw NLI splits, tab-separated. Every column is cast to str right after
# loading, so the string operations in later cells only ever see str values.
data_path_train_url = "https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_NLI_train.tsv"
df_train = pd.read_csv(data_path_train_url, sep="\t").astype(str)

data_path_dev_url = "https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_NLI_dev.tsv"
df_dev = pd.read_csv(data_path_dev_url, sep="\t").astype(str)

data_path_test_url = "https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_NLI_test.tsv"
#data_path_test_url = "https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_NLI_test_mini.tsv"
df_test = pd.read_csv(data_path_test_url, sep="\t").astype(str)
print(len(df_test))

# Make sure the three output files exist. Append mode creates a missing file
# but, unlike the exclusive-create mode "x" used before, does not raise
# FileExistsError when this cell is run a second time.
for csv_name in ("train.csv", "dev.csv", "test.csv"):
    with open(csv_name, "a"):
        pass

59892


In [10]:
def get_sent1_token_type(sent):
    """Return a list of ``0`` ids, one for each element of ``sent``.

    Args:
        sent: any sized object (typically a list of tokens).

    Returns:
        ``[0] * len(sent)``, or ``[]`` when ``sent`` has no length
        (``None``, a float NaN, ...).
    """
    try:
        return [0] * len(sent)
    except TypeError:
        # len() raises TypeError for unsized objects. Catching only that keeps
        # the old "fall back to an empty list" behaviour without hiding
        # unrelated failures the way a bare ``except:`` did.
        return []

def get_sent2_token_type(sent):
    """Return a list of ``1`` ids, one for each element of ``sent``.

    Args:
        sent: any sized object (typically a list of tokens).

    Returns:
        ``[1] * len(sent)``, or ``[]`` when ``sent`` has no length
        (``None``, a float NaN, ...).
    """
    try:
        return [1] * len(sent)
    except TypeError:
        # len() raises TypeError for unsized objects. Catching only that keeps
        # the old "fall back to an empty list" behaviour without hiding
        # unrelated failures the way a bare ``except:`` did.
        return []
    
def combine_seq(seq):
    """Join an iterable of strings into one string separated by single spaces."""
    separator = " "
    return separator.join(seq)

def combine_mask(mask):
    """Render every element of ``mask`` with ``str`` and join them with single spaces."""
    return " ".join(map(str, mask))

In [11]:
# Empty placeholder; the preprocessing cell rebinds this name to a copy of the raw test rows.
df_test_original_label = pd.DataFrame()

In [12]:
import pandas as pd

def _build_bert_inputs(df):
    """Turn a raw NLI frame into the four string columns that are written to CSV.

    Args:
        df: frame with at least the string columns ``label``, ``premise``
            and ``hypothesis``.

    Returns:
        A new frame with the columns ``label``, ``sequence``,
        ``attention_mask`` and ``token_type``. The last three are
        space-joined strings with one item per token. ``df`` is not modified.
    """
    # Work on a copy so nothing is assigned into a slice of the caller's frame.
    pairs = df[['label', 'premise', 'hypothesis']].copy()

    # Cap both sentences with trim_sentence, then add the special tokens as
    # plain text so they pass through tokenize_bert with the rest of the sentence.
    premise = '[CLS] ' + pairs['premise'].apply(trim_sentence) + ' [SEP] '
    hypothesis = pairs['hypothesis'].apply(trim_sentence) + ' [SEP]'

    sent1_t = premise.apply(tokenize_bert)
    sent2_t = hypothesis.apply(tokenize_bert)

    # `+` on two object Series of lists concatenates the lists row by row.
    sequence = sent1_t + sent2_t
    # Segment ids: 0 for every premise token, 1 for every hypothesis token.
    token_type = sent1_t.apply(get_sent1_token_type) + sent2_t.apply(get_sent2_token_type)
    # Nothing is padded at this stage, so the mask is a 1 for every token.
    attention_mask = sequence.apply(get_sent2_token_type)

    return pd.DataFrame({
        'label': pairs['label'],
        'sequence': sequence.apply(combine_seq),
        'attention_mask': attention_mask.apply(combine_mask),
        'token_type': token_type.apply(combine_mask),
    })


# Keep an untouched copy of the test rows (raw premise/hypothesis text in the
# original row order) for the per-sentence analysis later in the notebook.
df_test_original_label = df_test[['label', 'premise', 'hypothesis']].copy()

# The same pipeline is applied to all three splits.
df_train = _build_bert_inputs(df_train)
df_dev = _build_bert_inputs(df_dev)
df_test = _build_bert_inputs(df_test)

df_train.to_csv('train.csv', index=False)
df_dev.to_csv('dev.csv', index=False)
df_test.to_csv('test.csv', index=False)

In [16]:
# The processed test frame and the untouched copy should have the same number of rows.
print(len(df_test))
print(len(df_test_original_label))

59892
59892


In [17]:
#print(df_train.head(3))
#print(df_dev.head(2))
#print(df_test.head(3))

In [18]:
# Distinct label strings present in the training split.
df_train['label'].unique()

array(['entailment', 'contradiction'], dtype=object)

In [19]:
# Show full cell contents when displaying frames (no column-width truncation).
pd.set_option('display.max_colwidth', None)

In [20]:
def convert_to_int(tok_ids):
    """Return a new list holding ``int(x)`` for every ``x`` in ``tok_ids``."""
    return list(map(int, tok_ids))

In [21]:
from torchtext.legacy import data 

# Token column: the CSV cell is split on spaces by split_and_cut, then the
# token strings are mapped to vocabulary ids by the BERT tokenizer.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Label column: its vocabulary is built later with LABEL.build_vocab.
LABEL = data.LabelField()

# Attention-mask column: space-separated digits converted to ints; padded
# positions are filled with pad_token_idx.
ATTENTION = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = convert_to_int,
                  pad_token = pad_token_idx)

# Segment-id column: same parsing as the mask; padded positions are filled with 1.
TTYPE = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = split_and_cut,
                  preprocessing = convert_to_int,
                  pad_token = 1)

In [22]:
# Field order must match the column order of the CSV files written earlier:
# label, sequence, attention_mask, token_type.
fields = [('label', LABEL), ('sequence', TEXT), ('attention_mask', ATTENTION), ('token_type', TTYPE)]

# Read the three CSV files from the current directory; the header row is skipped.
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '',
                                        train = 'train.csv',
                                        validation = 'dev.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)

In [23]:
# Report how many examples each split holds.
print(f"Number of training data: {len(train_data)}")
print(f"Number of validation data: {len(valid_data)}")
print(f"Number of testing data: {len(test_data)}")

# Kept for the warm-up step computation in the training cell.
train_data_len = len(train_data)

Number of training data: 38190
Number of validation data: 4248
Number of testing data: 59892


In [24]:
# Decode the first training example back to token strings (not used further in this cell).
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[0])['sequence'])
# Build the label vocabulary from the training split. The printed `itos` list
# gives the class index of each label string.
LABEL.build_vocab(train_data)
print(LABEL.vocab.stoi)
print(LABEL.vocab.freqs.most_common())
print(LABEL.vocab.itos)

defaultdict(None, {'contradiction': 0, 'entailment': 1})
[('contradiction', 31825), ('entailment', 6365)]
['contradiction', 'entailment']


In [25]:
BATCH_SIZE = 16

# Re-derive the device here so this cell does not depend on the GPU-check cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all with the same batch size. `sort_key` is the
# length of each example's token sequence.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.sequence),
    sort_within_batch = False, 
    device = device)

In [26]:
from transformers import BertModel

# Load the bare encoder (no task head) from the same checkpoint as the tokenizer.
bert_model = BertModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
# Show which device was selected.
print(device)

cuda


In [28]:
import torch.nn as nn

class BERTNLIModel(nn.Module):
    """Encoder followed by a single linear layer applied to output index 1.

    Args:
        bert_model: encoder module. It must expose
            ``config.to_dict()['hidden_size']`` and, when called with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``, return
            something indexable whose element ``[1]`` is the per-sequence
            vector fed to the classifier.
        hidden_dim: accepted for interface compatibility; this class does not
            use it.
        output_dim: number of classes, i.e. the size of the returned logit
            vector.
    """

    def __init__(self, bert_model, hidden_dim, output_dim):
        super().__init__()

        self.bert = bert_model

        # Input width of the classifier, read from the encoder's config.
        encoder_width = bert_model.config.to_dict()['hidden_size']
        self.out = nn.Linear(encoder_width, output_dim)

    def forward(self, sequence, attn_mask, token_type):
        """Return the unnormalised class scores for a batch."""
        encoder_outputs = self.bert(
            input_ids=sequence,
            attention_mask=attn_mask,
            token_type_ids=token_type,
        )
        sequence_vector = encoder_outputs[1]
        return self.out(sequence_vector)

In [29]:
# HIDDEN_DIM is passed through to the constructor, which accepts but does not use it.
HIDDEN_DIM = 512
# One output unit per label string in the vocabulary built from the training split.
OUTPUT_DIM = len(LABEL.vocab)

model = BERTNLIModel(bert_model,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                        ).to(device)

In [30]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 108,311,810 trainable parameters


In [31]:
import torch.optim as optim
from transformers import get_constant_schedule_with_warmup

#optimizer = optim.Adam(model.parameters())
# AdamW here is the class imported from `transformers` at the top of the
# notebook. All model parameters (encoder and classifier layer) are passed
# to this one optimizer with a single learning rate.
optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-6,correct_bias=False)

def get_scheduler(optimizer, warmup_steps):
    """Wrap ``optimizer`` in a constant-with-warmup learning-rate schedule.

    Args:
        optimizer: the optimizer whose learning rate is scheduled.
        warmup_steps: forwarded as ``num_warmup_steps``.

    Returns:
        The scheduler built by ``get_constant_schedule_with_warmup``.
    """
    return get_constant_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
    )



In [32]:
# Unweighted cross-entropy over the class scores, moved to the selected device.
criterion = nn.CrossEntropyLoss().to(device)

In [33]:
def categorical_accuracy(preds, y):
    """Fraction of rows of ``preds`` whose highest-scoring column equals ``y``.

    Args:
        preds: 2-D tensor of per-class scores, one row per example.
        y: 1-D tensor of target class indices.

    Returns:
        A 0-dim float tensor: number of matches divided by ``len(y)``.
    """
    predicted_class = preds.argmax(dim=1)  # index of the largest score in each row
    hits = predicted_class.eq(y).float()
    return hits.sum() / len(y)

In [34]:
max_grad_norm = 1

def train(model, iterator, optimizer, criterion, scheduler):
    """Run one pass over ``iterator`` in training mode.

    Each batch must expose ``sequence``, ``attention_mask``, ``token_type``
    and ``label``. The optimizer and the scheduler are both stepped once per
    batch. ``max_grad_norm`` is not referenced here, so no gradient clipping
    is applied.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over ``len(iterator)``
        batches.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()       # clear gradients first
        torch.cuda.empty_cache()    # releases all unoccupied cached memory

        targets = batch.label
        logits = model(batch.sequence, batch.attention_mask, batch.token_type)

        loss = criterion(logits, targets)
        batch_acc = categorical_accuracy(logits, targets)

        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [35]:
def evaluate(model, iterator, criterion):
    """Run one pass over ``iterator`` in eval mode with gradients disabled.

    Each batch must expose ``sequence``, ``attention_mask``, ``token_type``
    and ``label``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over ``len(iterator)``
        batches.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            targets = batch.label
            logits = model(batch.sequence, batch.attention_mask, batch.token_type)

            batch_loss = criterion(logits, targets)
            batch_acc = categorical_accuracy(logits, targets)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [36]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints, each obtained by truncating with ``int``.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
import math
N_EPOCHS = 6

# The scheduler is stepped once per batch, so the step budget is
# epochs * examples / batch size; the first 20% of it is warm-up.
warmup_percent = 0.2
total_steps = math.ceil(N_EPOCHS*train_data_len*1./BATCH_SIZE)
warmup_steps = int(total_steps*warmup_percent)
scheduler = get_scheduler(optimizer, warmup_steps)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, scheduler)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when validation loss improves, so 'bert-nli.pt' always
    # holds the weights of the best epoch seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert-nli.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

NameError: ignored

In [37]:
# Restore the checkpoint written by the training loop. `map_location` remaps
# the stored tensors onto whatever `device` is in use now, so a checkpoint
# saved during a GPU session also loads on a CPU-only runtime.
model.load_state_dict(torch.load('bert-nli.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.070 |  Test Acc: 98.20%


In [91]:
import torch.nn.functional as F
def test(model, iterator):
    model.eval()
    for batch in iterator:
      #print('batch ', batch)
      with torch.no_grad():
        #batch = tuple(t.to(device) for t in batch)
        sequence = batch.sequence
        attn_mask = batch.attention_mask
        token_type = batch.token_type
        label = batch.label
                        
        result = model(sequence, attn_mask, token_type)

        logits = result

        logits = logits.detach().cpu().numpy()
        label_ids = label.to('cpu').numpy()

        predictions.append(logits)
        true_labels.append(label_ids)

        #predictions.append(prediction)
        #true_labels.append(label)
        
    return predictions, true_labels

The **test_2** method iterates over the DataFrame directly instead of using a BucketIterator, and it also computes class probabilities. BucketIterator does not preserve the order of the data, but the order is needed to compare the probability of each test sentence with those of its 5 other generated sentences.

In [92]:
# test without BucketIterator
def test_2(model, df_test_original_label):
    """Score every (premise, hypothesis) row of a frame, one row at a time, in row order.

    Args:
        model: callable as ``model(input_ids, attn_mask, token_type)``; it is
            called on batches of size 1.
        df_test_original_label: frame with string columns ``premise`` and
            ``hypothesis``.

    Returns:
        ``(predictions, probs)``: two lists with one entry per row, holding
        the model output and its softmax over ``dim=1`` as NumPy arrays.
    """
    # Fresh lists on every call. Previously the function appended to
    # module-level `predictions` / `probs`, so it raised NameError unless
    # another cell had created them and accumulated results across calls.
    predictions, probs = [], []

    model.eval()
    with torch.no_grad():
        # zip walks the two columns positionally, so the result order follows
        # the row order whatever the frame's index labels are.
        for premise, hypothesis in zip(df_test_original_label['premise'],
                                       df_test_original_label['hypothesis']):
            premise = '[CLS] ' + premise + ' [SEP]'
            hypothesis = hypothesis + ' [SEP]'

            prem_t = tokenize_bert(premise)
            hypo_t = tokenize_bert(hypothesis)

            token_ids = tokenizer.convert_tokens_to_ids(prem_t + hypo_t)
            # Segment ids: 0 for premise tokens, 1 for hypothesis tokens.
            segment_ids = get_sent1_token_type(prem_t) + get_sent2_token_type(hypo_t)
            # A single unpadded sequence: every position is attended to.
            attn_mask = get_sent2_token_type(token_ids)

            # unsqueeze(0) adds the batch dimension (batch of one).
            token_ids = torch.LongTensor(token_ids).unsqueeze(0).to(device)
            segment_ids = torch.LongTensor(segment_ids).unsqueeze(0).to(device)
            attn_mask = torch.LongTensor(attn_mask).unsqueeze(0).to(device)

            logits = model(token_ids, attn_mask, segment_ids)

            probs.append(logits.softmax(dim=1).cpu().numpy())
            predictions.append(logits.detach().cpu().numpy())

    return predictions, probs

**Used test method**

In [93]:
# Reset every accumulator before running inference.
y_true = []
y_pred = []
predictions , true_labels, probs = [], [], []

predictions, true_labels = test(model, test_iterator)

# For each input batch...
# ...take the highest-scoring class per row and flatten both predictions and
# labels into one list entry per example.
for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten() 
  y_true.extend(true_labels[i])
  y_pred.extend(pred_labels_i) 


**Caution:** The `test` method is required to calculate the F1 score for these 2 labels.

Label in order: ['contradiction', 'entailment']

In [94]:
from sklearn.metrics import f1_score,accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

# The metric functions used here are the scikit-learn ones imported just above
# in this cell; they replace the same-named seqeval functions imported at the
# top of the notebook.
print("f1 socre: %f"%(f1_score(y_true, y_pred, average='micro')))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
report = classification_report(y_true, y_pred,digits=4)
print("***** Eval results *****")
print("\n%s"%(report))
# confusion_matrix is called with the true labels first, predictions second.
matrix = confusion_matrix(y_true, y_pred)
print('***** Confusion Matrix *****')
print(matrix)

f1 socre: 0.982001
Accuracy score: 0.982001
***** Eval results *****

              precision    recall  f1-score   support

           0     0.9887    0.9898    0.9892     49910
           1     0.9485    0.9432    0.9459      9982

    accuracy                         0.9820     59892
   macro avg     0.9686    0.9665    0.9675     59892
weighted avg     0.9820    0.9820    0.9820     59892

***** Confusion Matrix *****
[[49399   511]
 [  567  9415]]


**Used test_2 method**

In [95]:
# Reset every accumulator before running inference.
y_true = []
y_pred = []
y_prob = []
y_entail_prob = []
predictions , true_labels, probs = [], [], []

print('df_test_original_label ::', len(df_test_original_label))
predictions, probs = test_2(model, df_test_original_label)

# Encode the gold labels as ints: 'entailment' -> 1, anything else -> 0.
for i in range (len(df_test_original_label)): 
  label = df_test_original_label['label'][i]
  if label == 'entailment': label = 1
  else: label = 0
  true_labels.append(label)

print(len(true_labels))
# For each input batch...
# (test_2 returns one entry per row, so each "batch" here is a single example.)
for i in range(len(predictions)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten() 
  #y_true.extend(true_labels[i])
  y_pred.extend(pred_labels_i) 
  y_prob.extend(probs[i])

for i in range (len(true_labels)):
  y_true.append(true_labels[i])

#print(len(y_pred[0]))
# Keep one probability per example: the smaller of the two class
# probabilities when class 0 was predicted, the larger when class 1 was.
# NOTE(review): with two classes this looks equivalent to always taking the
# probability at index 1 ('entailment' per the printed label vocabulary) — confirm.
for i in range(len(y_pred)):
  pred_value = y_pred[i]
  #print(y_prob[i])
  if pred_value == 0:
    y_entail_prob.append(min(y_prob[i]))
  elif pred_value == 1:
    y_entail_prob.append(max(y_prob[i]))
  

df_test_original_label :: 59892
59892


In [96]:
print('No of total test sentecne : ',len(y_true))
# The total is divided by 6: the cells below treat each base sentence as
# having 5 additional generated rows (1 + 5 rows per sentence).
test_data_len = int(len(y_true)/6)
print('No of actual test sentence : ', test_data_len)

No of total test sentecne :  59892
No of actual test sentence :  9982


In [97]:
# Consistency check: every list below should have one entry per test row.
print('predictions :: ', len(predictions))
print('true_labels :: ', len(true_labels))
print('df_test_original_label :: ', len(df_test_original_label))
print('y_true :: ', len(y_true))
print('y_pred :: ', len(y_pred))
print('y_prob :: ', len(y_prob))
print('probs :: ', len(probs))
print('max_prob :: ', len(y_entail_prob))
# Peek at a few of the stored per-example probabilities.
print('slice ', y_entail_prob[11736: 11740])

predictions ::  59892
true_labels ::  59892
df_test_original_label ::  59892
y_true ::  59892
y_pred ::  59892
y_prob ::  59892
probs ::  59892
max_prob ::  59892
slice  [0.00040664885, 0.00037196692, 0.0002589777, 0.0002357064]


In [99]:
# Assertion classes plus an extra 'multiple_label' bucket used by the next cell.
label_names = ['present', 'absent', 'possible', 'conditional', 'hypothetical', 'associated_with_someone_else', 'multiple_label']

For each label, more than one hypothesis may be predicted as entailment. The following code counts how many sentences have multiple entailments.

In [100]:
# Per-class counters. The `*_true` values are printed below as "single" and
# the `*_false` values as "multiple".
present_true = 0
present_false = 0
absent_true = 0
absent_false = 0
possible_true = 0
possible_false = 0 
conditional_true = 0 
conditional_false = 0
hypothetical_true = 0
hypothetical_false = 0
associated_true = 0
associated_false = 0

true_label = []
pred_label = []

# Row layout this loop relies on: rows 0 .. test_data_len-1 are the base rows,
# and the 5 companion rows of base row i sit at test_data_len + i*5 + j for
# j in 0..4.
for i in range(test_data_len):
  sentence = df_test_original_label['premise'][i]
  hypothesis = df_test_original_label['hypothesis'][i]
  # The class name is taken as the last space-separated token of the hypothesis.
  label = hypothesis.split(' ')[-1]
  label_index = label_names.index(label)
  true_label.append(label_index)
  
  y_true_label = y_true[i]
  is_pred_label_set = False
  y_pred_label_list = []
  for j in range(5):
    next_sentence = df_test_original_label['premise'][i*5+test_data_len+j]
    y_pred_label = y_pred [i*5+test_data_len+j]
    # A companion row predicted with the same class as the base row's gold
    # label marks this sentence as 'multiple_label'; stop at the first hit.
    if y_pred_label == y_true_label:
      pred_label_index = label_names.index('multiple_label')
      is_pred_label_set = True
      break

    y_pred_label_list.append(y_pred_label)
  
  # NOTE(review): only values different from y_true_label are appended above,
  # so with two classes this set seems unable to hold more than one distinct
  # value; the branch looks unreachable — confirm.
  if len(set(y_pred_label_list)) > 1:
    pred_label_index = label_names.index('multiple_label')
    is_pred_label_set = True

  # Not flagged: count the sentence as "single" for its class.
  if is_pred_label_set == False:
    pred_label.append(label_index)
    if label == 'present': 
      present_true = present_true +1 
    if label == 'absent':
      absent_true = absent_true + 1
    if label == 'possible':
      possible_true = possible_true + 1
    if label == 'conditional':
      conditional_true = conditional_true + 1
    if label == 'hypothetical':
      hypothetical_true = hypothetical_true + 1
    if label == 'associated_with_someone_else':
      associated_true = associated_true + 1
  # Flagged: count the sentence as "multiple" for its class.
  else:
    pred_label.append(pred_label_index)
    if label == 'present': 
      present_false = present_false + 1 
    if label == 'absent':
      absent_false = absent_false + 1
    if label == 'possible':
      possible_false = possible_false + 1
    if label == 'conditional':
      conditional_false = conditional_false + 1
    if label == 'hypothetical':
      hypothetical_false = hypothetical_false + 1
    if label == 'associated_with_someone_else':
      associated_false = associated_false + 1

#print(len(true_label))
#print(len(pred_label))
print('present - single ', present_true)
print('present - multiple ', present_false)
print('absent - single ', absent_true)
print('absent - multiple ', absent_false)
print('possible - single ', possible_true)
print('possible - multiple ', possible_false)
print('conditional - single ', conditional_true)
print('conditional - multiple ', conditional_false)
print('hypothetical - single ', hypothetical_true)
print('hypothetical - multiple ', hypothetical_false)
print('associated_with_someone_else - single ', associated_true)
print('associated_with_someone_else - multiple ', associated_false)



present - single  6512
present - multiple  152
absent - single  2210
absent - multiple  76
possible - single  349
possible - multiple  151
conditional - single  58
conditional - multiple  67
hypothetical - single  254
hypothetical - multiple  39
associated_with_someone_else - single  90
associated_with_someone_else - multiple  24


In [101]:
# Rebind label_names without the 'multiple_label' bucket for the six-class evaluation below.
label_names = ['present', 'absent', 'possible', 'conditional', 'hypothetical', 'associated_with_someone_else']

Calculation based on probability: the 6 generated sentences for each test sentence are merged into a single prediction to calculate the F1 score.

In [102]:
true_label = []
pred_label = []
prob_by_group = []
#print (y_max_prob)
# Same row layout as the previous analysis cell: base row i, then its 5
# companion rows at test_data_len + i*5 + j for j in 0..4.
for i in range(test_data_len):
  sentence = df_test_original_label['premise'][i]
  hypothesis = df_test_original_label['hypothesis'][i]
  # The gold class is the last space-separated token of the base hypothesis.
  label = hypothesis.split(' ')[-1]
  label_index = label_names.index(label)
  true_label.append(label_index)

  # Start with the base row as the current best candidate.
  prob_single_group = []
  max_probability = y_entail_prob [i]
  prob_single_group.append(max_probability)
  for j in range(5):
    next_sentence = df_test_original_label['premise'][i*5+test_data_len+j]
    prob = y_entail_prob [i*5+test_data_len+j]
    
    prob_single_group.append(prob)
    # A companion row with a strictly higher stored probability takes over,
    # and the predicted class becomes the class named in its hypothesis.
    # On ties the earlier candidate is kept.
    if max_probability < prob : 
      #print('index :: ', i*5+test_data_len+j)
      #print('prob :: ', prob)
      max_probability = prob
      next_hypothesis = df_test_original_label['hypothesis'][i*5+test_data_len+j]
      label = next_hypothesis.split(' ')[-1]
      label_index = label_names.index(label)

    
  # label_index now refers to the highest-probability candidate of the group.
  pred_label.append(label_index)
  prob_by_group.append(prob_single_group)
  

#print(prob_by_group)
#print(len(true_label))
#print(len(pred_label))
print(pred_label[:10])


[0, 0, 1, 0, 0, 0, 4, 0, 0, 0]


Label in order : ['present', 'absent', 'possible', 'conditional', 'hypothetical', 'associated_with_someone_else']

In [103]:
from sklearn.metrics import f1_score,accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

# Six-class evaluation: class indices follow the order of label_names.
print("f1 socre: %f"%(f1_score(true_label, pred_label, average='micro')))
print("Accuracy score: %f"%(accuracy_score(true_label, pred_label)))
report = classification_report(true_label, pred_label,digits=4)
print("***** Eval results *****")
print("\n%s"%(report))
# confusion_matrix is called with the true labels first, predictions second.
matrix = confusion_matrix(true_label, pred_label)
print('***** Confusion Matrix *****')
print(matrix)

f1 socre: 0.945602
Accuracy score: 0.945602
***** Eval results *****

              precision    recall  f1-score   support

           0     0.9537    0.9763    0.9649      6664
           1     0.9648    0.9602    0.9625      2286
           2     0.7813    0.6860    0.7306       500
           3     0.6835    0.4320    0.5294       125
           4     0.9299    0.8601    0.8936       293
           5     0.9271    0.7807    0.8476       114

    accuracy                         0.9456      9982
   macro avg     0.8734    0.7825    0.8214      9982
weighted avg     0.9432    0.9456    0.9437      9982

***** Confusion Matrix *****
[[6506   49   72   23   11    3]
 [  72 2195   14    1    0    4]
 [ 141   12  343    1    3    0]
 [  63    2    1   54    5    0]
 [  29    4    8    0  252    0]
 [  11   13    1    0    0   89]]


In [104]:
def predict_inference(premise, hypothesis, model, device):
    """Classify one premise/hypothesis pair and return the predicted label string.

    Args:
        premise: premise text without special tokens.
        hypothesis: hypothesis text without special tokens.
        model: callable as ``model(input_ids, attn_mask, token_type)``.
        device: torch device the input tensors are moved to.

    Returns:
        The entry of ``LABEL.vocab.itos`` at the highest-scoring class index.
    """
    model.eval()

    premise = '[CLS] ' + premise + ' [SEP]'
    hypothesis = hypothesis + ' [SEP]'

    prem_t = tokenize_bert(premise)
    hypo_t = tokenize_bert(hypothesis)

    # Segment ids: 0 for premise tokens, 1 for hypothesis tokens.
    prem_type = get_sent1_token_type(prem_t)
    hypo_type = get_sent2_token_type(hypo_t)

    indexes = tokenizer.convert_tokens_to_ids(prem_t + hypo_t)
    indexes_type = prem_type + hypo_type

    # A single unpadded sequence: every position is attended to.
    attn_mask = get_sent2_token_type(indexes)

    # unsqueeze(0) adds the batch dimension (batch of one).
    indexes = torch.LongTensor(indexes).unsqueeze(0).to(device)
    indexes_type = torch.LongTensor(indexes_type).unsqueeze(0).to(device)
    attn_mask = torch.LongTensor(attn_mask).unsqueeze(0).to(device)

    # Inference only: disabling autograd avoids building a graph for the
    # forward pass and keeping its activations alive.
    with torch.no_grad():
        prediction = model(indexes, attn_mask, indexes_type)

    prediction = prediction.argmax(dim=-1).item()

    return LABEL.vocab.itos[prediction]

In [105]:
# Spot check: the entity is wrapped in '[entity]' markers in the premise and
# the hypothesis asserts the class 'possible'.
premise = 'She had [entity] a functioning arteriovenous fistula [entity] with a thrill and a bruit in her left arm .'
hypothesis = 'a functioning arteriovenous fistula is possible'

predict_inference(premise, hypothesis, model, device)

'contradiction'

In [106]:
# Spot check: hypothesis asserts the class 'possible'.
premise = 'The differential diagnosis included [entity] viral pneumonitis [entity] , Mycoplasma , Chlamydia .'
hypothesis = 'viral pneumonitis is possible'

predict_inference(premise, hypothesis, model, device)

'entailment'

In [57]:
# Spot check: hypothesis asserts the class 'present'.
premise = 'Son died at 50 of [entity] diabetes [entity] and myocardial infarction .'
hypothesis = 'diabetes is present'

predict_inference(premise, hypothesis, model, device)

'contradiction'

In [58]:
# Spot check: hypothesis asserts the class 'conditional'.
premise = 'She is [entity] allergic [entity] to Augmentin which gives her a rash .'
hypothesis = 'allergic is conditional'

predict_inference(premise, hypothesis, model, device)

'entailment'

In [59]:
# Spot check for the "associated with someone else" class.
# NOTE(review): the analysis cells read the class name as the last
# space-separated token of each dataset hypothesis and look it up as
# 'associated_with_someone_else' (underscored). This hand-written hypothesis
# spells it with spaces, which may not match the wording the model was
# trained on — confirm.
premise = 'She is suffering from [entity] fever [entity] .'
hypothesis = 'fever is associated with someone else'

predict_inference(premise, hypothesis, model, device)

'contradiction'

In [60]:
# Spot check: hypothesis asserts the class 'absent'.
premise = 'There was [entity] an initial murmur [entity] on admission likely secondary to severe anemia which has since resolved .'
hypothesis = 'an initial murmur is absent'

predict_inference(premise, hypothesis, model, device)

'contradiction'