In [4]:
# Mount Google Drive at /content/drive; the model, tokenizer and data paths used
# in the cells below all live under this mount point (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# path where the model and tokenizer were saved
model_save_path = '/content/drive/MyDrive/MSDS Capstone/FUS_handover/fusBERT.pt'
tokenizer_save_path = '/content/drive/MyDrive/MSDS Capstone/FUS_handover/tokenizer'

# initialize the model and tokenizer
# The base checkpoint ships without a classification head, so transformers warns that
# 'classifier.weight' / 'classifier.bias' are newly initialized. That warning is expected:
# load_state_dict below is strict by default, so it only succeeds if the saved state_dict
# supplies (and therefore overwrites) those weights.
model = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

# load the model's state_dict
# map_location="cpu" lets a checkpoint that was saved from a GPU session load on a
# CPU-only runtime as well; predict_abstract moves the model to the GPU when one exists.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source (consider weights_only=True on PyTorch versions that support it).
state_dict = torch.load(model_save_path, map_location="cpu")
model.load_state_dict(state_dict)

# Switch to inference mode (disables dropout); as the last expression of the cell this
# also displays the model architecture.
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
def predict_abstract(abstract):
    """Classify a single abstract with the notebook-level `model` and `tokenizer`.

    Parameters
    ----------
    abstract
        Text to classify. It is tokenized with truncation at 512 tokens.

    Returns
    -------
    tuple
        ``(prediction, confidence, logits)`` where ``prediction`` is the index of the
        most probable class, ``confidence`` is that class's softmax probability as a
        percentage, and ``logits`` is the raw model output as a NumPy array.
    """
    # Run on the GPU when one is visible, otherwise fall back to the CPU
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(target)

    # Encode the text, then place every resulting tensor on the same device as the model
    encoded = tokenizer(abstract, padding=True, truncation=True, max_length=512, return_tensors="pt")
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(target)

    # Forward pass with gradient tracking disabled
    with torch.no_grad():
        output = model(**batch)
    logits = output.logits

    # Softmax across the class dimension, then take the most probable class
    class_probs = torch.softmax(logits, dim=1)
    top_prob, top_class = class_probs.max(dim=1)

    # .item() needs single-element tensors, so this handles one abstract per call
    return top_class.cpu().item(), top_prob.cpu().item() * 100, logits.cpu().numpy()


In [None]:
# Uncomment and run if openpyxl (used by pandas to read/write .xlsx files) is not installed:
#!pip install openpyxl

In [8]:
# read in excel sheet
# The sheet must contain an 'Abstract' column; the prediction cell below reads it.
import pandas as pd
# openpyxl is not called directly in this notebook; presumably it is imported here so a
# missing .xlsx engine fails early with a clear ImportError - TODO confirm.
import openpyxl

file_path = '/content/drive/MyDrive/MSDS Capstone/FUS_handover/Sample Data.xlsx' #update file path to relevant file
df = pd.read_excel(file_path)

In [10]:
# Prediction and confidence assignment
predictions, confidences, raw_logits = [], [], []

# Clean the abstracts before scoring:
#   1. Drop rows without an abstract. This has to be done on the frame: assigning
#      `df['Abstract'].dropna()` back to the column re-aligns on the index and restores
#      the NaNs, which `astype(str)` would then turn into the literal text 'nan' and
#      send to the model as if it were a real abstract.
#   2. Normalise every abstract to a lower-case string.
#   3. Keep only the first row for each distinct (lower-cased) abstract.
df = (
    df.dropna(subset=['Abstract'])
      .assign(Abstract=lambda frame: frame['Abstract'].astype(str).str.lower())
      .drop_duplicates(subset='Abstract', keep='first')
)

# Score each abstract one at a time; the three lists stay aligned with the rows of df
for abstract in df['Abstract']:
    prediction, confidence, logits = predict_abstract(abstract)
    predictions.append(prediction)
    confidences.append(confidence)
    raw_logits.append(logits.tolist())  # Convert numpy array to list for DataFrame compatibility

# Add the new data to the DataFrame
df['Prediction'] = predictions
df['Confidence'] = confidences  # percentage, as returned by predict_abstract
df['Logits'] = raw_logits

# Save to a new Excel file
output_file_path = '/content/drive/MyDrive/MSDS Capstone/FUS_handover/double_updated_test.xlsx'
df.to_excel(output_file_path, index=False)