# Comparing FusBERT Models

In [23]:
import pandas as pd
import numpy as np
import torch
import random
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, set_seed, AutoTokenizer, AutoModel

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
torch.cuda.is_available()

True

## Setting the Seed for Reproducibility

In [4]:
set_seed(6013)

## Helper Functions

In [5]:
class AbstractsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # Note the dtype change for binary classification
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=0.001, 
    report_to=None,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    logging_strategy="epoch"  # Log at the end of each epoch
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=1)  # Adjust zero_division here
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Creating fusBERT with pretrained bio_ClinicalBERT

In [20]:
dat = pd.read_csv('zotero_data.csv')
dat['abstract'] = dat['abstract'].astype(str)

In [21]:
labels = dat['fus_related'].tolist()
print("Label distribution:", np.bincount(labels))

Label distribution: [1794 1794]


In [22]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
tokenized_data = tokenizer(list(dat['abstract']), padding=True, truncation=True, max_length=512, return_tensors="pt")

In [23]:
dataset = AbstractsDataset(tokenized_data, labels)

In [24]:
# Split indices into train and test sets first
train_idx, test_idx = train_test_split(list(range(len(dataset))), test_size=0.1, random_state=42)

# Split train indices into train and validation sets
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

# Create Subset datasets
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)
test_dataset = torch.utils.data.Subset(dataset, test_idx)

In [25]:
model = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.create_optimizer_and_scheduler(num_training_steps=1000)
trainer.optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
trainer.lr_scheduler = get_linear_schedule_with_warmup(trainer.optimizer, num_warmup_steps=500, num_training_steps=1000)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3934,0.179084,0.947368,0.951009,0.921788,0.982143


TrainOutput(global_step=364, training_loss=0.3934280060149811, metrics={'train_runtime': 50.6908, 'train_samples_per_second': 57.328, 'train_steps_per_second': 7.181, 'total_flos': 764600726876160.0, 'train_loss': 0.3934280060149811, 'epoch': 1.0})

In [28]:
trainer.evaluate()

{'eval_loss': 0.17908445000648499,
 'eval_accuracy': 0.9473684210526315,
 'eval_f1': 0.9510086455331412,
 'eval_precision': 0.9217877094972067,
 'eval_recall': 0.9821428571428571,
 'eval_runtime': 1.3973,
 'eval_samples_per_second': 231.16,
 'eval_steps_per_second': 15.029,
 'epoch': 1.0}

In [29]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.31911733746528625,
 'eval_accuracy': 0.9080779944289693,
 'eval_f1': 0.9119999999999999,
 'eval_precision': 0.8423645320197044,
 'eval_recall': 0.9941860465116279,
 'eval_runtime': 1.5904,
 'eval_samples_per_second': 225.727,
 'eval_steps_per_second': 14.462,
 'epoch': 1.0}

## Creating fusBERT with pretrained tinyBERT

In [37]:
zotero = pd.read_csv('zotero_data.csv')
zotero['abstract'] = zotero['abstract'].astype(str)

In [38]:
tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
tokenized_data = tokenizer(list(zotero['abstract']), padding=True, truncation=True, max_length=512, return_tensors="pt")

In [39]:
labels = zotero['fus_related'].tolist()
print("Label distribution:", np.bincount(labels))

Label distribution: [1794 1794]


In [40]:
dataset = AbstractsDataset(tokenized_data, labels)

In [41]:
# Split indices into train and test sets first
train_idx, test_idx = train_test_split(list(range(len(dataset))), test_size=0.1, random_state=42)

# Split train indices into train and validation sets
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

# Create Subset datasets
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)
test_dataset = torch.utils.data.Subset(dataset, test_idx)

In [42]:
model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.create_optimizer_and_scheduler(num_training_steps=1000)
trainer.optimizer = torch.optim.AdamW(model.parameters(), lr= 5e-5)
trainer.lr_scheduler = get_linear_schedule_with_warmup(trainer.optimizer, num_warmup_steps=500, num_training_steps=1000)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [44]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5478,0.382863,0.873065,0.873846,0.904459,0.845238


TrainOutput(global_step=364, training_loss=0.5477914076585036, metrics={'train_runtime': 10.6541, 'train_samples_per_second': 272.76, 'train_steps_per_second': 34.165, 'total_flos': 41669123026944.0, 'train_loss': 0.5477914076585036, 'epoch': 1.0})

In [45]:
trainer.evaluate()

{'eval_loss': 0.3828626871109009,
 'eval_accuracy': 0.8730650154798761,
 'eval_f1': 0.8738461538461538,
 'eval_precision': 0.9044585987261147,
 'eval_recall': 0.8452380952380952,
 'eval_runtime': 0.3079,
 'eval_samples_per_second': 1049.103,
 'eval_steps_per_second': 68.208,
 'epoch': 1.0}

In [46]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.4655163288116455,
 'eval_accuracy': 0.8328690807799443,
 'eval_f1': 0.8333333333333334,
 'eval_precision': 0.7978723404255319,
 'eval_recall': 0.872093023255814,
 'eval_runtime': 0.3698,
 'eval_samples_per_second': 970.805,
 'eval_steps_per_second': 62.196,
 'epoch': 1.0}

## Creating fusBERT with pretrained distilBERT

In [18]:
dat = pd.read_csv('zotero_data.csv')
dat['abstract'] = dat['abstract'].astype(str)

In [19]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenized_data = tokenizer(list(dat['abstract']), padding=True, truncation=True, max_length=512, return_tensors="pt")

In [20]:
labels = dat['fus_related'].tolist()
print("Label distribution:", np.bincount(labels))

Label distribution: [1794 1794]


In [21]:
dataset = AbstractsDataset(tokenized_data, labels)

In [22]:
# Split indices into train and test sets first
train_idx, test_idx = train_test_split(list(range(len(dataset))), test_size=0.1, random_state=42)

# Split train indices into train and validation sets
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

# Create Subset datasets
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)
test_dataset = torch.utils.data.Subset(dataset, test_idx)

In [23]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.create_optimizer_and_scheduler(num_training_steps=1000)
trainer.optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
trainer.lr_scheduler = get_linear_schedule_with_warmup(trainer.optimizer, num_warmup_steps=500, num_training_steps=1000)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3697,0.15982,0.95356,0.95702,0.922652,0.994048


TrainOutput(global_step=364, training_loss=0.3696788326724545, metrics={'train_runtime': 26.9732, 'train_samples_per_second': 107.736, 'train_steps_per_second': 13.495, 'total_flos': 384950260494336.0, 'train_loss': 0.3696788326724545, 'epoch': 1.0})

In [26]:
trainer.evaluate()

{'eval_loss': 0.1598198562860489,
 'eval_accuracy': 0.9535603715170279,
 'eval_f1': 0.9570200573065902,
 'eval_precision': 0.9226519337016574,
 'eval_recall': 0.9940476190476191,
 'eval_runtime': 0.7358,
 'eval_samples_per_second': 438.993,
 'eval_steps_per_second': 28.541,
 'epoch': 1.0}

In [27]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.29590103030204773,
 'eval_accuracy': 0.8997214484679665,
 'eval_f1': 0.9047619047619048,
 'eval_precision': 0.8300970873786407,
 'eval_recall': 0.9941860465116279,
 'eval_runtime': 0.8358,
 'eval_samples_per_second': 429.539,
 'eval_steps_per_second': 27.519,
 'epoch': 1.0}

## Creating fusBERT with pretrained sciBERT

In [13]:
dat = pd.read_csv('zotero_data.csv')
dat['abstract'] = dat['abstract'].astype(str)

In [14]:
labels = dat['fus_related'].tolist()
print("Label distribution:", np.bincount(labels))

Label distribution: [1794 1794]


In [15]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
tokenized_data = tokenizer(list(dat['abstract']), padding=True, truncation=True, max_length=512, return_tensors="pt")

In [16]:
dataset = AbstractsDataset(tokenized_data, labels)

In [17]:
# Split indices into train and test sets first
train_idx, test_idx = train_test_split(list(range(len(dataset))), test_size=0.1, random_state=42)

# Split train indices into train and validation sets
train_idx, val_idx = train_test_split(train_idx, test_size=0.1, random_state=42)

# Create Subset datasets
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)
test_dataset = torch.utils.data.Subset(dataset, test_idx)

In [18]:
model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=2)

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.create_optimizer_and_scheduler(num_training_steps=1000)
trainer.optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
trainer.lr_scheduler = get_linear_schedule_with_warmup(trainer.optimizer, num_warmup_steps=500, num_training_steps=1000)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.363,0.158074,0.956656,0.95977,0.927778,0.994048


TrainOutput(global_step=364, training_loss=0.3630312825297261, metrics={'train_runtime': 52.5498, 'train_samples_per_second': 55.3, 'train_steps_per_second': 6.927, 'total_flos': 764600726876160.0, 'train_loss': 0.3630312825297261, 'epoch': 1.0})

In [21]:
trainer.evaluate()

{'eval_loss': 0.15807422995567322,
 'eval_accuracy': 0.9566563467492261,
 'eval_f1': 0.9597701149425287,
 'eval_precision': 0.9277777777777778,
 'eval_recall': 0.9940476190476191,
 'eval_runtime': 1.4531,
 'eval_samples_per_second': 222.287,
 'eval_steps_per_second': 14.452,
 'epoch': 1.0}

In [22]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.3061637282371521,
 'eval_accuracy': 0.9164345403899722,
 'eval_f1': 0.9193548387096774,
 'eval_precision': 0.855,
 'eval_recall': 0.9941860465116279,
 'eval_runtime': 1.5956,
 'eval_samples_per_second': 224.997,
 'eval_steps_per_second': 14.415,
 'epoch': 1.0}

## Traditional Machine Learning Models

In [24]:
df = pd.read_csv('zotero_data.csv')
df['abstract'] = df['abstract'].astype(str)

In [25]:
X = df['abstract']
y = df['fus_related']

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.1,random_state=42)

tfidf = TfidfVectorizer(stop_words='english')


#kinda like scaling here
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

X_tfidf = tfidf.fit_transform(X)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')

matrix = cv.fit_transform(df[df['fus_related']==0]['abstract'])
freqs = zip(cv.get_feature_names_out(), matrix.sum(axis=0).tolist()[0])
# sort from largest to smallest
print("Top 20 words used for Non-FUS Articles.")
print(sorted(freqs, key=lambda x: -x[1])[:40])

Top 20 words used for Non-FUS Articles.
[('patients', 2718), ('care', 2349), ('delirium', 2301), ('hpv', 1595), ('study', 1427), ('icu', 1336), ('health', 1255), ('results', 1231), ('ultrasound', 1162), ('nurses', 1106), ('methods', 939), ('19', 931), ('data', 845), ('patient', 835), ('intensive', 831), ('vaccination', 822), ('vaccine', 811), ('using', 805), ('used', 787), ('covid', 786), ('clinical', 776), ('risk', 775), ('use', 742), ('based', 734), ('social', 700), ('studies', 689), ('treatment', 678), ('associated', 639), ('95', 625), ('high', 612), ('knowledge', 571), ('critical', 562), ('assessment', 558), ('women', 557), ('intervention', 545), ('cancer', 544), ('ci', 540), ('unit', 540), ('screening', 516), ('group', 510)]


In [27]:
cv = CountVectorizer(stop_words='english')

matrix = cv.fit_transform(df[df['fus_related']==1]['abstract'])
freqs = zip(cv.get_feature_names_out(), matrix.sum(axis=0).tolist()[0])
# sort from largest to smallest
print("Top 20 words used for FUS Articles")
print(sorted(freqs, key=lambda x: -x[1])[:40])

Top 20 words used for FUS Articles
[('ultrasound', 3458), ('treatment', 2866), ('focused', 2169), ('hifu', 2101), ('patients', 2020), ('fus', 1655), ('brain', 1653), ('results', 1432), ('high', 1327), ('study', 1198), ('using', 1169), ('ablation', 1120), ('clinical', 1074), ('intensity', 1066), ('imaging', 1043), ('tumor', 980), ('methods', 958), ('used', 924), ('therapy', 901), ('tissue', 782), ('effects', 772), ('group', 770), ('cancer', 758), ('non', 734), ('based', 715), ('bbb', 712), ('time', 712), ('delivery', 694), ('studies', 678), ('guided', 668), ('acoustic', 666), ('mrgfus', 660), ('model', 644), ('tremor', 616), ('mri', 612), ('therapeutic', 612), ('compared', 598), ('blood', 594), ('low', 588), ('method', 583)]


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
def test_model(model):

    y_preds = model.predict(X_test_tfidf)
    
    accuracy = accuracy_score(y_test, y_preds)
    precision = precision_score(y_test, y_preds, average='weighted')
    recall = recall_score(y_test, y_preds, average='weighted')
    f1 = f1_score(y_test, y_preds, average='weighted')

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

### Naive Bayes

In [28]:
#NAIVE BAYES
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

nb.fit(X_train_tfidf,y_train)

In [30]:
#NAIVE BAYES
test_model(nb)

Accuracy: 0.8746518105849582
Precision: 0.8921217452341922
Recall: 0.8746518105849582
F1 Score: 0.87385713916597


### Logistic Regression

In [31]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=10000000)

log_model.fit(X_train_tfidf,y_train)

In [32]:
#LOGISTIC REGRESSION
test_model(log_model)

Accuracy: 0.8997214484679665
Precision: 0.9102384331038349
Recall: 0.8997214484679665
F1 Score: 0.89943785880455


### Support Vector Machine

In [33]:
#SUPPORT VECTOR MACHINE
from sklearn.svm import SVC,LinearSVC

rbf_svc = SVC(probability=True)
rbf_svc.fit(X_train_tfidf,y_train)

In [34]:
#SUPPORT VECTOR MACHINE
test_model(rbf_svc)

Accuracy: 0.8885793871866295
Precision: 0.8975011101691494
Recall: 0.8885793871866295
F1 Score: 0.888331901202903


### Linear Support Vector Machine

In [35]:
linear_svc = LinearSVC(dual=True)
linear_svc.fit(X_train_tfidf,y_train)

In [36]:
#LINEAR SUPPORT VECTOR MACHINE
test_model(linear_svc)

Accuracy: 0.8774373259052924
Precision: 0.8817610772162892
Recall: 0.8774373259052924
F1 Score: 0.8773859696417046
