In [None]:
!pip install seqeval
!pip install transformers

In [2]:
import os
import pandas as pd
import math
import numpy as np
from tqdm import tqdm, trange
from seqeval.metrics import classification_report, accuracy_score, f1_score
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers import AdamW
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification

In [3]:
import tensorflow as tf
import torch

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
Use the GPU: Tesla T4


In [None]:
# no of classifier: present, possible, conditional, not-present
num_labels = 4
MODEL_CLASSES = {
  'bert': (AutoConfig, BertForSequenceClassification, AutoTokenizer),
}
MODEL_ADDRESS = 'emilyalsentzer/Bio_ClinicalBERT'
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
model_config = config_class.from_pretrained(MODEL_ADDRESS, num_labels=num_labels)
tokenizer = tokenizer_class.from_pretrained(MODEL_ADDRESS, do_lower_case=False)
model = model_class.from_pretrained(MODEL_ADDRESS, config=model_config)

In [5]:
def modify_label(label):
    if label == 'present':
        return int(0)
    elif label == 'possible':
        return int(1)
    elif label == 'conditional':
        return int(2)
    elif label == 'not-present':
        return int(3)

In [6]:
data_path_train_url = "https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_4_label_modified_train.tsv"
#data_path_train_url = 'https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_label_modified_train_small.tsv'
df_data_train = pd.read_csv(data_path_train_url, sep="\t").astype(str)

df_data_train['label'] = df_data_train['label'].apply(modify_label)

sentences_train = df_data_train.sentence.values
labels_train = df_data_train.label.values
#print(sentences_train[0])
#print(labels_train[:10])
print(df_data_train.shape)
df_data_train.sample(3)

(6365, 3)


Unnamed: 0.1,Unnamed: 0,sentence,label
1711,1711,Abdomen soft and [entity] non-tender [entity] .,3
5045,5045,"No [entity] m,r,g [entity] .",3
3092,3092,[entity] Hypotension [entity] - On the night o...,0


In [7]:
# Print the original sentence.
print('Original: ', sentences_train[0])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences_train[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences_train[0])))

Original:  Her 05-27 CXR post procedure w/o PNX , [entity] effusion [entity] much improved , yet her 05-28 CXR worse than 05-27 with some reaccumulation of right pleural effusion .
Tokenized:  ['Her', '05', '-', '27', 'C', '##X', '##R', 'post', 'procedure', 'w', '/', 'o', 'P', '##N', '##X', ',', '[', 'entity', ']', 'e', '##ff', '##usion', '[', 'entity', ']', 'much', 'improved', ',', 'yet', 'her', '05', '-', '28', 'C', '##X', '##R', 'worse', 'than', '05', '-', '27', 'with', 'some', 're', '##ac', '##cum', '##ulation', 'of', 'right', 'p', '##le', '##ural', 'e', '##ff', '##usion', '.']
Token IDs:  [1430, 4991, 118, 1765, 140, 3190, 2069, 2112, 7791, 192, 120, 184, 153, 2249, 3190, 117, 164, 9127, 166, 174, 3101, 17268, 164, 9127, 166, 1277, 4725, 117, 1870, 1123, 4991, 118, 1743, 140, 3190, 2069, 4146, 1190, 4991, 118, 1765, 1114, 1199, 1231, 7409, 19172, 6856, 1104, 1268, 185, 1513, 12602, 174, 3101, 17268, 119]


In [None]:
input_ids = []
attention_masks = []

for sent in sentences_train:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels_train)

# print('Original: ', sentences_train[0])
# print('Token IDs:', input_ids[0])

In [9]:
train_dataset = TensorDataset(input_ids, attention_masks, labels)

In [10]:
data_path_dev_url = "https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_4_label_modified_dev.tsv"
# data_path_dev_url = 'https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_label_modified_test_small.tsv'
df_data_dev = pd.read_csv(data_path_dev_url, sep="\t").astype(str)

df_data_dev['label'] = df_data_dev['label'].apply(modify_label)

sentences_dev = df_data_dev.sentence.values
labels_dev = df_data_dev.label.values
#print(sentences_dev[0])
#print(labels_dev[:10])
print(df_data_dev.shape)
df_data_dev.sample(3)

(708, 3)


Unnamed: 0.1,Unnamed: 0,sentence,label
319,319,Psychiatry Service saw her on 6/28 and recomme...,0
430,430,[entity] Mild swelling [entity] was noted in t...,0
46,46,The pt continued to have [entity] low urine ou...,0


In [None]:
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences_dev:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])

    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels_dev)

# print('Original: ', sentences_test[0])
# print('Token IDs:', input_ids[0])

In [12]:
val_dataset = TensorDataset(input_ids, attention_masks, labels)

In [13]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [None]:
# Tell pytorch to run this model on the GPU.
model.cuda()

In [15]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5.
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [16]:
from transformers import get_linear_schedule_with_warmup
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [17]:
import numpy as np

# Function to calculate the accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [18]:
import time
import datetime

def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

**Model train and validation**

In [19]:
import random
import numpy as np

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure performance on validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():        

            result = model(b_input_ids, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # predictions.append(logits)
        # true_labels.append(label_ids)

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    199.    Elapsed: 0:00:24.
  Batch    80  of    199.    Elapsed: 0:00:47.
  Batch   120  of    199.    Elapsed: 0:01:11.
  Batch   160  of    199.    Elapsed: 0:01:35.

  Average training loss: 0.42
  Training epcoh took: 0:01:59

Running Validation...
  Accuracy: 0.93
  Validation Loss: 0.26
  Validation took: 0:00:05

Training...
  Batch    40  of    199.    Elapsed: 0:00:25.
  Batch    80  of    199.    Elapsed: 0:00:50.
  Batch   120  of    199.    Elapsed: 0:01:15.
  Batch   160  of    199.    Elapsed: 0:01:40.

  Average training loss: 0.16
  Training epcoh took: 0:02:04

Running Validation...
  Accuracy: 0.95
  Validation Loss: 0.21
  Validation took: 0:00:05

Training...
  Batch    40  of    199.    Elapsed: 0:00:25.
  Batch    80  of    199.    Elapsed: 0:00:51.
  Batch   120  of    199.    Elapsed: 0:01:16.
  Batch   160  of    199.    Elapsed: 0:01:42.

  Average training loss: 0.10
  Training epcoh took: 0:02:06

Running Validation...
  Accu

In [20]:
import pandas as pd

pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.42,0.26,0.93,0:01:59,0:00:05
2,0.16,0.21,0.95,0:02:04,0:00:05
3,0.1,0.2,0.95,0:02:06,0:00:05
4,0.06,0.2,0.95,0:02:06,0:00:05


In [21]:
data_path_test_url = "https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_4_label_modified_test.tsv"
# data_path_test_url = 'https://raw.githubusercontent.com/sajjadIslam2619/sample_files/main/processed/merged/assertion_label_modified_dev_small.tsv'
df_data_test = pd.read_csv(data_path_test_url, sep="\t").astype(str)

df_data_test['label'] = df_data_test['label'].apply(modify_label)

sentences_test = df_data_test.sentence.values
labels_test = df_data_test.label.values
#print(sentences_test[0])
#print(labels_test[:10])
print(df_data_test.shape)
df_data_test.sample(3)

(11118, 3)


Unnamed: 0.1,Unnamed: 0,sentence,label
1864,1864,"Pt is a 78 yo male with h/o A fib , s/p multip...",0
3353,3353,4. [entity] Paroxysmal atrial fibrillation [en...,0
4684,4684,Chief Complaint : [entity] abdominal pain [ent...,0


In [None]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences_test:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
      
    input_ids.append(encoded_dict['input_ids'])

    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels_test)

# Print sentence 0, now as a list of IDs.
# print('Original: ', sentences_dev[0])
# print('Token IDs:', input_ids[0])

In [26]:
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [27]:
# To calculate F1 score and accurecy and generate classification report.
y_true = []
y_pred = []
predictions , true_labels = [], []

In [28]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))
model.eval()
predictions , true_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
      result = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)

print('DONE.')

Predicting labels for 11,118 test sentences...
DONE.


In [29]:
print('Positive samples: %d of %d (%.2f%%)' % (df_data_test.label.sum(), len(df_data_test.label), (df_data_test.label.sum() / len(df_data_test.label) * 100.0)))

Positive samples: 9203 of 11118 (82.78%)


In [30]:
from sklearn.metrics import matthews_corrcoef
import numpy as np
from sklearn.metrics import f1_score,accuracy_score, classification_report

matthews_set = []
# Calculating Matthews Corr. Coef. for each batch...
# For each input batch...
for i in range(len(true_labels)):
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten() 
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
  y_true.extend(true_labels[i])
  y_pred.extend(pred_labels_i)         
  matthews_set.append(matthews)


print("f1 socre: %f"%(f1_score(y_true, y_pred, average='micro')))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
report = classification_report(y_true, y_pred,digits=4)
print("***** Eval results *****")
print("\n%s"%(report))

f1 socre: 0.951700
Accuracy score: 0.951700
***** Eval results *****

              precision    recall  f1-score   support

           0     0.9576    0.9798    0.9685      7622
           1     0.8137    0.7230    0.7657       574
           2     0.9000    0.2628    0.4068       137
           3     0.9614    0.9558    0.9586      2785

    accuracy                         0.9517     11118
   macro avg     0.9082    0.7304    0.7749     11118
weighted avg     0.9504    0.9517    0.9487     11118



In [31]:
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('Total MCC: %.3f' % mcc)

Total MCC: 0.894


**Save model**

In [32]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

Saving model to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json',
 './model_save/tokenizer.json')

In [33]:
!ls -l --block-size=K ./model_save/

total 424052K
-rw-r--r-- 1 root root      1K May 14 06:13 config.json
-rw-r--r-- 1 root root 423169K May 14 06:13 pytorch_model.bin
-rw-r--r-- 1 root root      1K May 14 06:13 special_tokens_map.json
-rw-r--r-- 1 root root      1K May 14 06:13 tokenizer_config.json
-rw-r--r-- 1 root root    654K May 14 06:13 tokenizer.json
-rw-r--r-- 1 root root    209K May 14 06:13 vocab.txt


In [34]:
!ls -l --block-size=M ./model_save/pytorch_model.bin

-rw-r--r-- 1 root root 414M May 14 06:13 ./model_save/pytorch_model.bin


In [35]:
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
!cp -r ./model_save/ "./drive/My Drive/MU/NMDSI/ast_model_save/4_label_model/"

In [38]:
# Load a trained model and vocabulary that you have fine-tuned
output_dir = './trained_models/4_label_model/'
# output_dir = './model_save/'
model = model_class.from_pretrained(output_dir)
tokenizer = tokenizer_class.from_pretrained(output_dir)

# Copy the model to the GPU.
# model.to(device)

**Predict with model**

In [39]:
sentence = 'This could be due to internal hernia or could be [entity] stricture [entity] related .'
sentences = []
sentences.append(sentence)
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 128,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
      
    input_ids.append(encoded_dict['input_ids'])

    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# labels = torch.tensor(labels)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [40]:
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

  """Entry point for launching an IPython kernel.
  


In [43]:
model.eval()

with torch.no_grad():
    result = model(input_ids, token_type_ids=None, attention_mask=attention_masks, return_dict=True)

logits = result.logits
logits = logits.detach().cpu().numpy()
predictions.append(logits)

print('sentence: ', sentence)
pred_labels_i = np.argmax(logits, axis=1).flatten()
print('Label prediction: ', pred_labels_i) 

if pred_labels_i[0] == 0:
  print ('Present')
elif pred_labels_i[0] == 1:
  print ('Possible')
elif pred_labels_i[0] == 2:
  print ('Conditional')
elif pred_labels_i[0] == 3:
  print ('Not-present')



sentence:  This could be due to internal hernia or could be [entity] stricture [entity] related .
Label prediction:  [1]
Possible
