#Sentence Classification using BERT

In [None]:
import tensorflow as tf
# Checking for the GPU
device_name = tf.test.gpu_device_name()
print(device_name)
import torch
device = torch.device("cuda")

/device:GPU:0


In [None]:
#!pip install transformers

In [None]:
#from google.colab import files
#uploaded = files.upload()
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

# Load the dataset into a pandas dataframe
other = "/content/drive/MyDrive/nlp/lung_cancer/model/other.csv"
pat = "/content/drive/MyDrive/nlp/lung_cancer/model/nlp_progression_without_date.csv"

# data wiht progression 
d1 = pd.read_csv(pat)
# delete unnecessary columns
values = ['E_keywords', 'E_keywords_span', 'between_text', 'note_date','I_keywords','I_keywords_span','note_type']
d1.drop(values, axis=1, inplace=True)
d1['Response']= 'progression'

# data with no-progression
d2 = pd.read_csv(other)
d2['Response']= 'other'

# concat two frames
frames = [d1, d2]
d = pd.concat(frames)

# shuffling
d = d.sample(frac=1).reset_index(drop=True)

# cleaning a text
d["clean_report_text"] = d["note_content"].apply(lambda text: re.sub('\W+', ' ', text).lower().strip() + str(' '))

# delete empty mrn
d = d[d["MRN"] != None]

d["note_content"] = d["note_content"].apply(lambda text: re.sub('\W+', ' ', text).lower().strip() + str(' '))
d['Response'] = LE.fit_transform(d['Response'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(d.shape[0]))

# Display 2 random rows from the data.
d.sample(2)

Number of training sentences: 1,671



Unnamed: 0,MRN,Visit ID,note_content,Response,clean_report_text
1284,KH1000213306,10467360.0,discharge summary discharge date 04 10 2013 di...,1,discharge summary discharge date 04 10 2013 di...
1503,KH1000024530,107182400000.0,______department of radiology and imaging_____...,0,______department of radiology and imaging_____...


In [None]:
# split data into training and testing
df = d.iloc[:1200,:]
df2 = d.iloc[1201:,:]

# Report the number of sentences in train data
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Report the number of sentences in test data
print('Number of test sentences: {:,}\n'.format(df2.shape[0]))

#df.loc[df.Response == 0].sample(5)[['note_content', 'Response']]

Number of training sentences: 1,200

Number of test sentences: 470



In [None]:
from transformers import BertTokenizer
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Get the lists of sentences and their labels.
sentences = df.note_content.values
labels = df.Response.values

# # Print the original sentence.
# print(' Original: ', sentences[0])

# # Print the sentence split into tokens.
# print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# # Print the sentence mapped to token ids.
# print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(sent)
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# delete token which has len less than 100
for i in input_ids:
    if (len(i) < 100) :
        #print(len(i))
        input_ids.remove(i)      
# len(input_ids)

# Print sentence 0, now as a list of IDs.
# print('Original: ', sentences[0])
# print('Token IDs:', input_ids[0])
print('Max sentence length: ', max([len(sen) for sen in input_ids]))
print('Min sentence length: ', min([len(sen) for sen in input_ids]))

# We will use some utility function from tensorflow(Tensorflow was my first crush)
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 512

#Padding the input to the max length that is 64
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

# Creating the attention masks
attention_masks = []

# For each sentence...
for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

Token indices sequence length is longer than the specified maximum sequence length for this model (1128 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  2440
Min sentence length:  9


# define model

In [None]:
# We will call the train_test_split() function from sklearn
from sklearn.model_selection import train_test_split


train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
# Performing same steps on the attention masks
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1)

#Converting the input data to the tensor , which can be feeded to the model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#Creating the DataLoader which will help us to load data into the GPU/CPU
batch_size = 4

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

#Loading the pre-trained BERT model from huggingface library

from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2,   
    output_attentions = False, 
    output_hidden_states = False, )

# Teeling the model to run on GPU
model.cuda()




Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# AdamW is an optimizer which is a Adam Optimzier with weight-decay-fix
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 )

from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 30

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

#Creating the helper function to have a watch on elapsed time

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# training

In [None]:
#Let's start the training process

import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    10  of    270.    Elapsed: 0:00:09.
  Batch    20  of    270.    Elapsed: 0:00:17.
  Batch    30  of    270.    Elapsed: 0:00:25.
  Batch    40  of    270.    Elapsed: 0:00:34.
  Batch    50  of    270.    Elapsed: 0:00:42.
  Batch    60  of    270.    Elapsed: 0:00:50.
  Batch    70  of    270.    Elapsed: 0:00:59.
  Batch    80  of    270.    Elapsed: 0:01:07.
  Batch    90  of    270.    Elapsed: 0:01:15.
  Batch   100  of    270.    Elapsed: 0:01:24.
  Batch   110  of    270.    Elapsed: 0:01:32.
  Batch   120  of    270.    Elapsed: 0:01:40.
  Batch   130  of    270.    Elapsed: 0:01:49.
  Batch   140  of    270.    Elapsed: 0:01:57.
  Batch   150  of    270.    Elapsed: 0:02:05.
  Batch   160  of    270.    Elapsed: 0:02:13.
  Batch   170  of    270.    Elapsed: 0:02:22.
  Batch   180  of    270.    Elapsed: 0:02:30.
  Batch   190  of    270.    Elapsed: 0:02:38.
  Batch   200  of    270.    Elapsed: 0:02:47.
  Batch   210  of    270.    Elapsed: 0:02:55.


In [None]:
print(loss_values) #Having a view of stored loss values in the list

[0.5661747155642068, 0.3960072541212732, 0.3055690651194675, 0.26302900841821814, 0.16781585372982252, 0.14606743125617935, 0.15211791193508328, 0.1761285431783849, 0.12067243127002493, 0.11125086160787355, 0.10778593740472354, 0.09798838024842015, 0.08669434563786059, 0.09592765132124273, 0.06760315302988737, 0.07779148366319498, 0.07640336515861337, 0.06945125548957515, 0.0721589995649184, 0.07252684559776686, 0.06968163325573523, 0.06599207266530424, 0.06534342271829195, 0.07433880876320742, 0.06173013786519732, 0.06017409281672304, 0.060823806072585286, 0.06373538341890876, 0.06291634248530129, 0.06318677654973928]


# model evaluation

In [None]:
#Loading the test data and applying the same preprocessing techniques which we performed on the train data
import pandas as pd

# Load the dataset into a pandas dataframe.
df = df2
# df = df2.head(1)

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists
sentences = df.note_content.values
labels = df.Response.values

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
    
    input_ids.append(encoded_sent)

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 4

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Number of test sentences: 470



In [None]:
#Evaluating our model on the test set

# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch

  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]
  #print('logits' , logits)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  pred_labels = np.argmax(logits, axis=1).flatten()
  #print(logits)
  #print(pred_labels)
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('Positive samples: %d of %d (%.2f%%)' % (df.Response.sum(), len(df.Response), (df.Response.sum() / len(df.Response) * 100.0)))

from sklearn.metrics import matthews_corrcoef

matthews_set = []

# Evaluate each test batch using Matthew's correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')

# For each input batch...
for i in range(len(true_labels)):
  
  # The predictions for this batch are a 2-column ndarray (one column for "0" 
  # and one column for "1"). Pick the label with the highest value and turn this
  # in to a list of 0s and 1s.
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
  # Calculate and store the coef for this batch.  
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
  matthews_set.append(matthews)

# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]

# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)

Predicting labels for 470 test sentences...
Positive samples: 248 of 470 (52.77%)
Calculating Matthews Corr. Coef. for each batch...
MCC: 0.886


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


# Testing on single note

In [None]:
t = '''".WHOLE BODY18FDG PET-CT SCANHISTORY-Known case of Ca left lung with brain metastases, post 13 cycles of chemotherapy (last on 07/08/20) and 12 cycles of immunotherapy (last in August 2020), recurrence, received 5 cycles of radiotherapy (last on 09/09/20), on oral chemotherapy, for evaluationPROCEDURE:After atleast 6 hrs fasting 7.8 mCi of18FDG was administered intravenously and whole body Positron Emission Tomography acquired from vertex to mid thigh after 1 hour. SUV calculated based on body weight. Plain CT scan &amp; contrast CT of the same region was performed after oral and intravenous contrast for PET-CT fusion. Additional breath hold CT of lungs also acquired.FINDINGS:As compared to previous PET-CT scan dated 02/03/21, present study shows:Mild increase in FDG uptake (from SUVmax- 5.2 to 7.8) and size (from 1.0 x 3.0 to 1.3 x 3.5 cm) of previously demonstrated mass lesion is seen involving anterior segment of upper lobe of left lung. Few tiny perilesional nodules are seen. Fibrotic scaring noted in left lung.No significant change in FDG uptake (from SUVmax- 4.9 to 5.4) of left hilar lymph node noted.Liver appears normal in size and shape. No significant change in size (1.1 x 1.2 cm) with no appreciable focal FDG uptake noted in poorly enhancing hypodensity involving segment III/IVb of liver. No abnormal focal increased FDG uptake or other morphologically abnormal lesion is seen in the liver.Mild increase in size (from 2.0 x 2.3 to 2.3 x 3.0 cm) of previously demonstrated non FDG avid cyst noted in left adnexal region.Increased FDG uptake noted in endometrial cavity (could be physiological)…suggests USG correlation.There is no FDG avid or size criteria wise significant lymphadenopathy in neck, axillary, mediastinum, retroperitoneum, iliac group and inguinal regions.Spleen is normal in size, attenuation and enhancement. No abnormal focal increased FDG uptake or morphological lesion is seen within.Adrenals appear unremarkable.Pancreas appears normal in size and shape. No focal FDG avid parenchymal lesion is seen within. Pancreatic duct appears normal.Kidneys appear normal in size and showing normal contrast enhancement. Non FDG avid simple cyst noted in inferior pole of left kidney.No abnormal focal FDG uptake or morphological lesion is seen in brain **.Increase FDG uptake noted in sclerotic lesion noted in left acetabulum posteriorly (SUVmax- 5.3).No significant change in non FDG avid sclerotic lesion noted in iliac bone of acetabulum.No significant change in previously demonstrated sclerotic lesion is seen involving Left 7thrib.Rest of the PET-CT is unremarkable with physiological distribution of FDG.                                               IMPRESSION:Known case of Ca left lung with brain metastases, post 13 cycles of chemotherapy (last on 07/08/20) and 12 cycles of immunotherapy (last in August 2020), recurrence, received 5 cycles of radiotherapy (last on 09/09/20), on oral chemotherapy, for evaluationAs compared to previous PET-CT scan dated 02/03/21, present study shows:Mild increase in metabolic activity and size of mass lesion involving anterior segment of upper lobe of left lung. Few tiny perilesional nodules observed.No significant change in metabolic activity of left hilar lymph node.No significant change in size with no appreciable focal metabolic activity in poorly enhancing hypodensity involving segment III/IVb of liver….suggests USG and SOS FNAC correlation.Mild increase in size of metabolically inactive left adnexal cyst.No appreciable metabolically active or morphological lesion in brain…suggests MRI correlation for accurate evaluation of brain**.Increase metabolic activity in sclerotic lesion in left acetabulum posteriorly.No significant change in metabolically inactive sclerotic lesion in iliac bone of acetabulum and left 7thrib (old healed lesions).No evidence of metabolically active disease elsewhere within the body.Suggested: clinical correlation and corroboration with other investigation reports.         Dr. Hemant Khandare                        Consultant -Nuclear Medicine                                     Note:For direct PET/CT appointment call(022)-42699914Kindly bring previous relevant clinical details and previous scan CD for next follow up study.**PET/CT is less sensitive for brain lesion.MRI is recommended for accurate evaluation of brain lesion.Not all tumours are FDG avid and also non all FDG avid lesions are malignant. If current study is showing no FDG uptake but there is other evidence of presence of disease then further investigations like histopathological examination may be required. PET-CT help in diagnosing the disease in correlation to clinical symptoms and other related investigations. Please interpret accordingly."'''

In [None]:
text = '''portable 2d echoindications reason for echocardiogram st p cabg 2d echo study done at a heart rate of100 bpm morphological datamv mitral anular calcification ias intactav tricuspid sclerotic ivs normaltv normal pa normalpv normal ao normalcardiac dimensions la normal lv normal ra normal rv normalno regional wall motion abnormality seen good biventricular systolic function lvef 55 no clot vegetation ascending aorta arch could not be assessed due to tracheostomy doppler amp colour flow mapping diastolic dysfunction grade i pulmonary artery systolic pressure 57 mmhg estimated by tr jet mild mr mild ar mild tr mild pr impressiongood biventricular systolic function no regional wall motion abnormality seen lvef 55 tapse 15 mm mild mitral regurgitation mild aortic regurgitation moderate pulmonary hypertension ivc normal collapsing dr purabi koch specialist non invasive cardiology '''
import pandas as pd
df = pd.DataFrame()
df['note_content'] =[t]
df

Unnamed: 0,note_content
0,""".WHOLE BODY18FDG PET-CT SCANHISTORY-Known cas..."


In [None]:
sentences = df.note_content.values
input_ids = []
for sent in sentences:
    encoded_sent = tokenizer.encode(sent,add_special_tokens = True)
    input_ids.append(encoded_sent)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=1)

#Evaluating our model on the test set
model.eval()
pred = []
for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask = batch
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  pred_labels = np.argmax(logits, axis=1).flatten()
  print(logits)
  print(pred_labels)
  pred.append(pred_labels)

[[-2.1431503  2.05267  ]]
[1]


# save model

In [None]:
import os
output_dir = '/content/drive/MyDrive/nlp/lung_cancer/model_path/'
save_path = output_dir + "bert_model1.pth"

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

torch.save(model.state_dict(), save_path)

#!cp -r ./model_save/ "/content/drive/MyDrive/nlp/lung_cancer/model"

In [None]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#model = BertForSequenceClassification("bert-base-uncased",  num_labels=2)
model.load_state_dict(torch.load(save_path, device))

<All keys matched successfully>

In [None]:
# for param in model.parameters():
#     param.requires_grad = False
# model.eval()

# End

In [None]:
sentence = df.note_content.values
# sent = sentence[0]
# sentences = np.array(sent)
# sentences

sent = sentence[0]
ids = tokenizer.encode(sent, add_special_tokens = True)
att_mask = [int(i > 0) for i in ids]

prediction_inputs = torch.tensor(ids)
prediction_masks = torch.tensor(att_mask)

prediction_inputs = prediction_inputs.to(device)
prediction_masks = prediction_masks.to(device)

# with torch.no_grad():
#   outputs = model(prediction_inputs, token_type_ids=None, attention_mask=prediction_masks)

In [None]:
labels = df.Response.values
labels = labels[0]
labels = np.array(labels)
print(type(labels))
print(labels)

prediction_labels = torch.tensor(labels)
prediction_labels

<class 'numpy.ndarray'>
0


tensor(0)

In [None]:
# Load the dataset into a pandas dataframe.
df = df2
df = df2.head(1)

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists
sentences = df.note_content.values


input_ids = []
# For every sentence...
for sent in sentences:
    encoded_sent = tokenizer.encode(sent,add_special_tokens = True)
    input_ids.append(encoded_sent)

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 


# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

# Set the batch size.  
batch_size = 4
# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

NameError: ignored

In [None]:
#Evaluating our model on the test set
model.eval()
# Tracking variables 
pred = []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  logits = outputs[0]
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  pred_labels = np.argmax(logits, axis=1).flatten()
  print(logits)
  print(pred_labels)
  pred.append(pred_labels)

[[ 0.42729354 -0.10209928]]
[0]


In [None]:
input_ids = tokenizer.encode(sent, add_special_tokens = True)
attention_masks = [int(i > 0) for i in input_ids]

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

# Set the batch size.  
batch_size = 4
# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
prediction_inputs


tensor([[  101, 12109, 14134,  ...,     0,     0,     0],
        [  101,  2708, 10821,  ...,     0,     0,     0],
        [  101,  1035,  1035,  ...,     0,     0,     0]])

In [None]:
prediction_masks

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])

In [None]:
logits = outputs[0]

# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()

In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


Saving model to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json')

In [None]:
!cp -r ./model_save/ "/content/drive/MyDrive/nlp/lung_cancer/model"


In [None]:
# Load a trained model and vocabulary that you have fine-tuned
model = model.from_pretrained(output_dir)
tokenizer = tokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
sentence = df.note_content.values
sent = sentence[0]
sentences = np.array(sent)
sentences

array('______department of radiology and imaging______radiograph of the chest pa viewfindings surgical clips noted over the left hilum reticular opacity in left upper zone remain unchanged as compared with previous film dated 11 09 2015 both the lung fields are clear both the costophrenic angles are clear hilar shadows appear normal cardiothoracic ratio is within normal limits hemidiaphragms are normal in position and contour bony thorax under view is unremarkable dr kalgaonkar sameerjr consultant radiology amp imaging ',
      dtype='<U518')

In [None]:
sent = sentence[0]
ids = tokenizer.encode(sent, add_special_tokens = True)
att_mask = [int(i > 0) for i in ids]

prediction_inputs = torch.tensor(ids)
prediction_masks = torch.tensor(att_mask)

outputs = model(prediction_inputs, token_type_ids=None, attention_mask=prediction_masks)

#outputs = model(ids, token_type_ids=None)

ValueError: ignored