In [1]:
!pip install transformers==4.10.1 --quiet

In [2]:
!pip install pytorch_lightning



In [3]:
!pip install "torchmetrics<0.7"



In [4]:
#testing gpu prescence
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [5]:

#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import json
from tqdm.auto import tqdm

#pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel as BM, AdamW as Adam

#pytorch scheduler
from torch.optim.lr_scheduler import ExponentialLR

#pytorch lightning - lots of features like model checkpoints, logging, metrics, etc
import transformers
import pytorch_lightning as pl
#from torchmetrics import Accuracy
from torchmetrics.functional import auroc 
#going to create a confusion matrix
from sklearn.metrics import classification_report, multilabel_confusion_matrix

In [6]:
#IMP Params
EPOCHS = 4
BATCH_SIZE = 16
MAX_TOKEN_NUM = 256

In [7]:
#creating a training and testing dataset
labeledFile = open('news_blobs_labeled.json')
testingFile = open('news_blobs_test.json')
#loading data from file
labeledData = json.load(labeledFile)
testingData = json.load(testingFile)
labeledSize = len(labeledData)
testingSize = len(testingData)
print("Labeled data shape:", len(labeledData))
print("Testing data shape: ", len(testingData))


Labeled data shape: 738
Testing data shape:  81


In [8]:
#conversion into dataframes - training + validation
tdLabels = pd.DataFrame([labeledData[i]['labels'] for i in range(labeledSize)])
tdText= pd.DataFrame([{"text": labeledData[i]['text']} for i in range(labeledSize)])
totaldf = pd.concat([tdText, tdLabels], axis=1)
totaldf.head()

Unnamed: 0,text,REGULATORY,CLINICAL_TRIAL,COLLAB,FINANCING,PRESENTATION,OTHER
0,Cytokinetics to Participate in Upcoming Invest...,0,0,0,0,1,0
1,"Immunic, Inc. Reports Positive Top-line Data f...",0,1,0,0,0,0
2,FDA Accepts and Grants Priority Review of Vand...,1,0,0,0,0,0
3,Members Of The COVID R&D Alliance And Quantum ...,0,1,0,0,0,0
4,Lannett Announces Launch Of FDA Approved Levot...,0,0,0,0,0,1


In [41]:
COLUMNS = totaldf.columns.tolist()[1:]
COLUMNS

['REGULATORY',
 'CLINICAL_TRIAL',
 'COLLAB',
 'FINANCING',
 'PRESENTATION',
 'OTHER']

In [44]:
#what is the distribution of the labeled data given
samples = dict()
for col in COLUMNS:
  samples[col] = sum(totaldf[col])
samples

{'CLINICAL_TRIAL': 128,
 'COLLAB': 43,
 'FINANCING': 50,
 'OTHER': 415,
 'PRESENTATION': 46,
 'REGULATORY': 79}

In [None]:
#we may want fine-tune the training dataset


In [9]:
#use of validation data will help with accuracy prediction 
from sklearn.model_selection import train_test_split
traindf, valdf = train_test_split(totaldf, test_size=0.1)
traindf.shape, valdf.shape

((664, 7), (74, 7))

In [10]:
#conversion into dataframes - testing  
testdf = pd.DataFrame([{"text": testingData[i]['text']} for i in range(testingSize)])
testdf.head(20)

Unnamed: 0,text
0,RLF-100 (aviptadil) clinical trial showed rap...
1,Bio-Techne And Kantaro Biosciences To Launch A...
2,AIVITA Biomedical Publishes Review of GM-CSF H...
3,Biohaven Announces Enrollment Of First Patient...
4,Mirum Pharmaceuticals Reports Inducement Grant...
5,Enzychem Lifesciences Announces FDA Acceptance...
6,Celltrion to Launch Both Antigen and Antibody ...
7,Personalis Announces Pricing of Public Offerin...
8,Mammography Equipment Market Analysis and Fore...
9,Additional analysis of real-world data confirm...


In [11]:
#initial look into the data
print('Research text sample:', traindf['text'].iloc[0])

Research text sample: Company Profile for NMS Labs. NMS Labs is a leading bioanalytical toxicology and forensic sciences laboratory providing esoteric clinical and forensic services to physicians, attorneys, the criminal justice system, clinical reference labs, pharmaceutical companies, and consumer products manufacturers. As part of our services, NMS Labs’ professionals interpret our laboratory testing results to resolve client-specific issues and provide expert witness testimony and consulting support for both civil and criminal judicial proceedings.


In [12]:
#clean titles
for i in range(traindf.shape[0]):
  traindf['text'].iloc[i].strip().lower()
for i in range(valdf.shape[0]):
  valdf['text'].iloc[i].strip().lower()
for i in range(testingSize):
  testdf['text'].iloc[i].strip().lower()

In [14]:
#before processing data, create tokenizer instance
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [15]:
#creating a pytorch dataset class
class MedicalDataset(Dataset):
  def __init__(self, data,  tokenizer):
    super().__init__()
    self.tokenizer = tokenizer
    self.data = data
    self.dataLen = len(self.data)

  def __len__(self):
    return self.dataLen

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]

    medical_text = data_row['text']
    labels = data_row[COLUMNS]

    #creating an encoding 
    encoding = self.tokenizer.encode_plus(
      medical_text,
      add_special_tokens=True,
      max_length=MAX_TOKEN_NUM,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      medical_text=medical_text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels=torch.FloatTensor(labels)
    )

In [16]:
class MedicalTextDataModule(pl.LightningDataModule):

  def __init__(self, train_df, val_df, test_df, tokenizer):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.val_df = val_df
    self.tokenizer = tokenizer

  #datasets setup - training, validation, testing  
  def setup(self, stage=None):
    self.train_dataset = MedicalDataset(
      self.train_df,
      self.tokenizer
    )
    self.val_dataset = MedicalDataset(
        self.val_df, 
        self.tokenizer)
    
    self.test_dataset = MedicalDataset(
      self.test_df,
      self.tokenizer
    )

  #The num_workers attribute tells the data loader instance how many sub-processes to use for data loading.
  def train_dataloader(self):
    return DataLoader(
      self.train_dataset,
      batch_size= BATCH_SIZE,
      shuffle=True,
      num_workers=2)
    
  def val_dataloader(self):
    return DataLoader(
      self.val_dataset,
      batch_size= BATCH_SIZE,
      shuffle=True,
      num_workers=2)

  def test_dataloader(self):
    return DataLoader(
      self.test_dataset,
      batch_size=BATCH_SIZE,
      num_workers=2)

In [17]:
#load a pre-trained BERT model - transfer learning
bert_model = BM.from_pretrained(BERT_MODEL_NAME, return_dict=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
#create a data module with all datasets in place
data_module = MedicalTextDataModule(traindf, valdf, testdf, tokenizer)

In [19]:
class MedicalTextLabelClassifier(pl.LightningModule):

  def __init__(self, bert_model, n_classes, criterion, n_training_steps=None):
    super().__init__()
    #pre-trained bert_model
    self.bert = bert_model
    #classifier linear model
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    #how many times step is called
    self.n_training_steps = n_training_steps
    self.criterion = criterion

  #the forward function enables us to define how the model goes from input to output
  def forward(self, input_ids, attention_mask, labels=None):
    output = self.bert(input_ids, attention_mask=attention_mask)
    #last layer to a classifier output
    output = self.classifier(output.pooler_output)
    #our activation function
    output = torch.sigmoid(output)    
    loss = 0
    if labels is not None:
        loss = self.criterion(output, labels)
    return loss, output

  def training_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    return loss

  def test_step(self, batch, batch_idx):
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    loss, outputs = self(input_ids, attention_mask, labels)
    return loss

  def training_epoch_end(self, outputs):
    #collecting the labels and predictions from the output
    labels = []
    predictions = []
    for output in outputs:
      for output_labels in output["labels"].detach().cpu():
        labels.append(output_labels)
      for output_predictions in output["predictions"].detach().cpu():
        predictions.append(output_predictions)

    labels = torch.stack(labels).int()
    predictions = torch.stack(predictions)


  def configure_optimizers(self):

    optimizer = Adam(self.parameters(), lr=1e-4)

    #the scheduler enables us to adjust the learning rate
    scheduler = ExponentialLR(optimizer, gamma=0.9)

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )

In [20]:
#criterion - measuring the Binary Cross Entropy between the target and the input probabilities
criterion = nn.BCELoss()

In [21]:
#training
#calculating number of train steps
total_training_steps = BATCH_SIZE * EPOCHS

model = MedicalTextLabelClassifier(
    bert_model,
  n_classes=len(COLUMNS),
  criterion = criterion,
  n_training_steps=total_training_steps 
)

#lightning module training configs
trainer = pl.Trainer(
  logger= None,
  checkpoint_callback= None,
  callbacks= None,
  gpus=1,
  max_epochs=EPOCHS
)



GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [22]:
#fitting 
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 108 M 
1 | classifier | Linear    | 4.6 K 
2 | criterion  | BCELoss   | 0     
-----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.260   Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [23]:
#evaluation mode
model.eval()
model.freeze()

In [25]:
#connect to cuda
device = torch.device('cpu')
trained_model = model
#evaluation
val_dataset = MedicalDataset(valdf, tokenizer)

predictions = []
labels = []

#wrap the iterable with tqdm
for item in tqdm(val_dataset):
  _, prediction = trained_model(
    item["input_ids"].unsqueeze(dim=0).to(device), 
    item["attention_mask"].unsqueeze(dim=0).to(device)
  )
  predictions.append(prediction.flatten())
  labels.append(item["labels"].int())

predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

  0%|          | 0/74 [00:00<?, ?it/s]

In [26]:
THRESHOLD = 0.5
#fscore, precision, recall, etc
y_pred = predictions.numpy()
y_true = labels.numpy()
y_pred = np.where(y_pred > THRESHOLD, 1, 0)

print(classification_report(
  y_true, 
  y_pred, 
  target_names= COLUMNS, 
  zero_division=0
))

                precision    recall  f1-score   support

    REGULATORY       0.00      0.00      0.00         6
CLINICAL_TRIAL       0.00      0.00      0.00        15
        COLLAB       0.00      0.00      0.00         7
     FINANCING       0.00      0.00      0.00         5
  PRESENTATION       0.00      0.00      0.00         6
         OTHER       0.73      0.88      0.80        40

     micro avg       0.73      0.44      0.55        79
     macro avg       0.12      0.15      0.13        79
  weighted avg       0.37      0.44      0.40        79
   samples avg       0.47      0.47      0.47        79



In [27]:
#confusion matrices
cm = multilabel_confusion_matrix(y_true, y_pred)
cm

array([[[68,  0],
        [ 6,  0]],

       [[59,  0],
        [15,  0]],

       [[67,  0],
        [ 7,  0]],

       [[69,  0],
        [ 5,  0]],

       [[68,  0],
        [ 6,  0]],

       [[21, 13],
        [ 5, 35]]])

In [28]:
#accuracy
accDict = dict()
for i, label in enumerate(COLUMNS):
  row = cm[i]
  tp = row[0][0]
  tn = row[1][1]
  fp = row[0][1]
  fn = row[1][0]
  accDict[label] = ((tp+tn)/(tp+tn+fp+fn))
accDict
  

{'CLINICAL_TRIAL': 0.7972972972972973,
 'COLLAB': 0.9054054054054054,
 'FINANCING': 0.9324324324324325,
 'OTHER': 0.7567567567567568,
 'PRESENTATION': 0.918918918918919,
 'REGULATORY': 0.918918918918919}

In [29]:
print("AUROC per label")
for i, name in enumerate(COLUMNS):
  tag_auroc = auroc(predictions[:, i], labels[:, i], pos_label=1)
  print(f"{name}: {tag_auroc}")

AUROC per label
REGULATORY: 0.8210784196853638
CLINICAL_TRIAL: 0.8723164200782776
COLLAB: 0.6226012706756592
FINANCING: 0.3536231815814972
PRESENTATION: 0.6225489974021912
OTHER: 0.8661764860153198


In [45]:
#testing 
def run_tests(path):
    results = []
    resultsFilePath = path
    for i in range(testingSize):
      test_text = testdf.iloc[i]['text']
      print(test_text)
      encoding = tokenizer.encode_plus(
        test_text,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors='pt',
      )
    
      #use our trained model
      _, test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
      
      test_prediction = test_prediction.flatten().numpy()
      print(sum(test_prediction))
      predDict = {}
      for label, prediction in zip(COLUMNS, test_prediction):
        if prediction < THRESHOLD:
          predDict[label] = 0
        else:
          
          predDict[label] = 1
      results.append(predDict)
    resultsDF = pd.DataFrame(results)  
    resultsDF.to_csv(path, index=False)
    return
    

In [None]:
run_tests("text_classification.csv")