# Imports

In [1]:
import pandas as pd
import numpy as np
import os

# For Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, fbeta_score
from sklearn.model_selection import train_test_split

# For DistilBERT
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [2]:
# Custom functions
from functions import load_parquet_as_df, is_sensitive, get_keywords_from_xml

# Load Data

In [3]:
# Adjust paths accordingly:
test_path = "data/OHSUMED/test-00000-of-00001.parquet"
train_path = "data/OHSUMED/train-00000-of-00001.parquet"

df_test = load_parquet_as_df(test_path)
df_train = load_parquet_as_df(train_path)

df = pd.concat([df_test, df_train], axis=0)

In [4]:
print(len(df))
df.columns

348564


Index(['seq_id', 'medline_ui', 'mesh_terms', 'title', 'publication_type',
       'abstract', 'author', 'source'],
      dtype='object')

**View an example mesh_terms entry**

In [5]:
# Set max column width to display long strings fully
pd.set_option('display.max_colwidth', None)

# Now, accessing the entry will show the full string
df['mesh_terms'][92]

92    Abortion, Habitual/*CO/PC; Adult; Case Report; Chromosome Abnormalities/*CO/GE; Female; Fibrinogen/*BL/TU; Human; Karyotyping; Pedigree; Pregnancy; Support, Non-U.S. Gov't; Support, U.S. Gov't, P.H.S.; Translocation (Genetics).
92                                             Blood Pressure; Catheters, Indwelling/*ST; Hemodialysis/*ST; Human; Kidney Failure, Acute/*PP/TH; Kidney Failure, Chronic/*PP/TH; Polyurethanes; Quality Control; Support, Non-U.S. Gov't.
Name: mesh_terms, dtype: object

# Sensitivity Labeling

In [6]:
MESH_XML_FILE = "data/nlm/mesh/medit/ascii_xml/output/desc2022.xml"

c12_terms, c13_terms = get_keywords_from_xml(MESH_XML_FILE)

In [7]:
# Create a binary label based on if a match can be found
df['sensitive_label'] = df['mesh_terms'].apply(
    lambda x: is_sensitive(x, c12_terms, c13_terms)
)

In [8]:
percentage_sensitive = 100 * df['sensitive_label'].mean()
print(f"{percentage_sensitive:.2f}% of the rows are sensitive")

7.72% of the rows are sensitive


# Data Processing

In [9]:
# Only save relevant columns to make file smaller
relevant_columns = ['title', 'abstract', 'sensitive_label']
df = df[relevant_columns]

In [10]:
save_csv = False  # Set this to True if you want to save the current df

if save_csv:
    save_path = "data/OHSUMED/full_ohsumed_sensitivity_labeled.csv"
    
    # Check if the file already exists
    if not os.path.exists(save_path):
        df.to_csv(save_path, index=False)
        print(f"File saved to {save_path}")
    else:
        print(f"File already exists at {save_path}, skipping save.")

In [11]:
# Combine title and abstract
df['text'] = df['title'] + " " + df['abstract']

df = df[['text', 'sensitive_label']]  # we only need these two columns for the ML part

In [12]:
# A list which contains indices of rows which have a 1 as label
# We can use it to find records which have been labeled as sensitive
indices = df.index[df['sensitive_label'] == 1].tolist()
indices[0:5]

[24, 92, 146, 177, 196]

In [13]:
# An example of two rows, one from each group
row_selection = df.iloc[23:25]
row_selection

Unnamed: 0,text,sensitive_label
23,"Onset and recovery of atracurium and suxamethonium-induced neuromuscular blockade with simultaneous train-of-four and single twitch stimulation. Single twitch and train-of-four stimulation were applied at 0.08 Hz to each ulnar nerve and the force of contraction of the adductor pollicis was recorded during onset of and recovery from neuromuscular blockade by suxamethonium 1 mg kg-1 or atracurium 0.4 mg kg-1. Times to 90% first twitch blockade of train-of-four were (mean +/- SEM) 0.82 +/- 0.08 and 1.98 +/- 0.18 min for suxamethonium and atracurium, respectively, compared with times to 90% single twitch blockade of 1.00 +/- 0.07 and 3.35 +/- 0.37 min, respectively (P less than 0.05 in both cases). Apparent onset time also depended on how long train-of-four stimulation had been applied before injection of atracurium. The mode of stimulation had little effect on time to 10% recovery. The results are consistent with stimulation-induced augmentation in muscle blood flow, which increased delivery of the drug to the neuromuscular junction.",0
24,"Atracurium, vecuronium and pancuronium in end-stage renal failure. Dose-response properties and interactions with azathioprine. Dose-response relations for atracurium, vecuronium and pancuronium were determined in patients in end-stage renal failure for the initial neuromuscular blockade (using three cumulative doses) and for the maintenance of stable 90% response (during continuous infusion). All measurements were during renal transplant surgery, and the interaction of azathioprine on neuromuscular blockade was estimated. Mean ED95 doses were (microgram kg-1): atracurium 375.6, vecuronium 67.2, pancuronium 86.6; the initial blockade required significantly larger doses than in normal patients (37%, 20% and 45%, respectively, using ED50 values). Mean infusion rates for 90% sustained blockade in renal failure were (microgram kg-1 h-1): atracurium 409.4, vecuronium 78.3, pancuronium 14.2. The atracurium dose was not influenced by renal function, whereas vecuronium and pancuronium requirements were significantly reduced by 23.2% and 61.5%, respectively, compared with normal patients (previous study). Azathioprine was injected at the rate of 1 mg kg-1 min-1 for 3 min at stable 90% neuromuscular blockade with constant-rate infusion of the neuromuscular blocking drug. This produced a relatively small and transient antagonism of blockade--probably of negligible clinical significance.",1


In [14]:
# Split the data into training, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text'], df['sensitive_label'], test_size=0.15, random_state=123, stratify=df['sensitive_label']
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=123, stratify=temp_labels
)

# Logistic Regression

For the following, there was little information in the paper. We made basic assumptions on how to implement the LR. While not mentioned in the paper, TF-IDF is commonly used for obtaining features. 

**Note: the following cell typically takes a few minutes to run.**

In [15]:
# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

In [16]:
# Train the logistic regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=123)
model.fit(X_train, train_labels)

In [17]:
# Get probabilities for the validation set
val_probs = model.predict_proba(X_val)[:, 1]

In [18]:
# As mentioned in the paper, use "a grid search in the range [0, 1] with step size 0.01 to find the threshold that optimized the F1 measure.

thresholds = np.linspace(0, 1, 101)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    val_preds = (val_probs >= threshold).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Optimal Threshold: {best_threshold}")

Optimal Threshold: 0.76


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Evaluate on the test set using the optimal threshold
test_probs = model.predict_proba(X_test)[:, 1]
test_preds = (test_probs >= best_threshold).astype(int)

precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='binary')
accuracy = accuracy_score(test_labels, test_preds)

# Calculate F2 score
f2 = fbeta_score(test_labels, test_preds, beta=2, average='binary')

print("Intrinsic sensitivity classification results:")
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}, F2: {f2:.2f}, Accuracy: {accuracy:.2f}")

Intrinsic sensitivity classification results:
Precision: 0.57, Recall: 0.64, F1: 0.60, F2: 0.62, Accuracy: 0.93


# DistilBERT

In [21]:
# Prepare datasets
train_data = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_data = Dataset.from_dict({"text": val_texts, "label": val_labels})
test_data = Dataset.from_dict({"text": test_texts, "label": test_labels})

In [24]:
# Heavily shorten the training data for now, otherwise model training takes forever
n = 50
train_data = train_data.select(range(n))  # only take first n rows

In [25]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
print("DistilBERT is ready!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT is ready!


**Note: the following cell typically takes a few minutes to run.**

In [26]:
# Tokenize data
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/26142 [00:00<?, ? examples/s]

Map:   0%|          | 0/26143 [00:00<?, ? examples/s]

In [27]:
# Set data format for PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [28]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True
)



In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data
)

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
results = trainer.evaluate(test_data)
print(results)

In [32]:
# Get the necessary metrics

# Combined

# Extrinsic Evaluation