# Imports

In [1]:
import pandas as pd
import numpy as np
import os

# For Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, fbeta_score
from sklearn.model_selection import train_test_split

# For DistilBERT
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [1]:
# Custom functions
from functions import load_parquet_as_df, is_sensitive, get_keywords_from_xml, parse_ohsumed_file, parse_judged_data, compute_metrics

# Load Data

In [3]:
#data is stored in 5 files
files = ["ohsumed.87.txt", "ohsumed.88.txt", "ohsumed.89.txt", "ohsumed.90.txt","ohsumed.91.txt"]
#parses first file and converts it to a df
df2= parse_ohsumed_file(files[0])
#parses the rest of the files an concats them
for file in files[1::]:
    df2 = pd.concat([df2, parse_ohsumed_file(file)])
#print(df2.info())
#display(df2[df2["mesh_terms"].isnull()])

#fills na values
df2["mesh_terms"] = df2["mesh_terms"].fillna("")
df2["Author"] = df2["Author"].fillna("")
df2["Abstract"] = df2["Abstract"].fillna("")

#print(df2.isnull().sum())

# Relevance Labeling

In [4]:
df_judged = parse_judged_data("judged.txt").drop_duplicates()
display(df_judged)

Unnamed: 0,query,Medline ID,document-i,relevance
0,1,87097544,40626,1
1,1,87153566,11852,0
2,1,87157536,12693,1
3,1,87157537,12694,1
4,1,87184723,15450,0
...,...,...,...,...
16135,106,91354564,337888,1
16136,106,91354570,348231,0
16137,106,91356830,338251,0
16138,106,91359739,338619,0


In [5]:
#print(len(df_judged.drop(["query"], axis = 1).drop_duplicates()))
df_judged = df_judged[df_judged["relevance"] > -1]
#display(df_judged[df_judged["document-i"].duplicated() ==1])

#remove duplicats in df_judged. if the duplicates relevance is judged diffrently, keep the one judged as relevent
duplicates = df_judged[df_judged['document-i'].duplicated(keep=False)].sort_values(by=["document-i", "relevance"], ascending=False)

current_id = -1
for i, rows in duplicates.iterrows():
    if rows["document-i"] == current_id:
        df_judged =df_judged.drop(i)
    else:
        current_id = rows["document-i"]
display(duplicates)
print(len(df_judged))
# in paper just 4837 judged as relevant

Unnamed: 0,query,Medline ID,document-i,relevance
525,3,91374975,348498,1
7545,53,91374975,348498,1
5018,37,91347198,348095,0
12596,83,91347198,348095,0
1504,14,91309028,347515,0
...,...,...,...,...
15063,100,87058652,1136,0
11709,79,87055358,685,1
5021,38,87055358,685,0
5247,40,87051269,300,0


14430


In [6]:
#join Ohsumed and judged df
df_data = pd.merge(df2, df_judged[['Medline ID', 'relevance']], on='Medline ID', how='left')
df_data["relevance"] = df_data["relevance"].fillna(-1)

#outdated
if 1 == 2:
    #join Ohsumed data from Source 1 with juded df
    df_1 = df_1.rename(columns={"medline_ui": "Medline ID"})
    df_1 = pd.merge(df_1, df_judged[['Medline ID', 'relevance']], on='Medline ID', how='left')
    df_1["relevance"] = df_1["relevance"].fillna(0)
    #train / test split
    df_1_test = df_1[df_1["relevance"] == 1]
    df_1_train = df_1[df_1["relevance"] == 0]

#split at later place
#df_test = df_combined_2[df_combined_2["relevance"] == 1]
#df_train = df_combined_2[df_combined_2["relevance"] == 0]
#print(len(df_test))

**View an example mesh_terms entry**

In [7]:
# Set max column width to display long strings fully
pd.set_option('display.max_colwidth', None)

# Now, accessing the entry will show the full string
df_data['mesh_terms'][92]

"Blood Pressure; Catheters, Indwelling/*ST; Hemodialysis/*ST; Human; Kidney Failure, Acute/*PP/TH; Kidney Failure, Chronic/*PP/TH; Polyurethanes; Quality Control; Support, Non-U.S. Gov't."

# Sensitivity Labeling

In [8]:
#MESH_XML_FILE = "data/nlm/mesh/medit/ascii_xml/output/desc2022.xml"
#MESH_XML_FILE = "desc2022/usr/nlm/mesh/medit/ascii_xml/output/desc2022.xml"
MESH_XML_FILE = "desc2022.xml"

c12_terms, c13_terms = get_keywords_from_xml(MESH_XML_FILE)
# print(c12_terms)

In [9]:
# Create a binary label based on if a match can be found
df_data['sensitive_label'] = df_data['mesh_terms'].apply(
    lambda x: is_sensitive(x, c12_terms, c13_terms)
)

In [10]:
percentage_sensitive = 100 * df_data['sensitive_label'].mean()
print(f"{percentage_sensitive:.2f}% of the rows are sensitive")

7.72% of the rows are sensitive


# Data Processing

In [11]:
# Only save relevant columns to make file smaller
relevant_columns = ['Title', 'Abstract', 'sensitive_label', 'relevance']
df_data = df_data[relevant_columns]

In [12]:
save_csv = False  # Set this to True if you want to save the current df

if save_csv:
    save_path = "data/OHSUMED/full_ohsumed_sensitivity_labeled.csv"
    
    # Check if the file already exists
    if not os.path.exists(save_path):
        df_1_train.to_csv(save_path, index=False)
        print(f"File saved to {save_path}")
    else:
        print(f"File already exists at {save_path}, skipping save.")

In [13]:
# Combine title and abstract
df_data['text'] = df_data['Title'] + " " + df_data['Abstract']

df_data = df_data[['text', 'sensitive_label', "relevance"]]  # we only need these three columns for the ML part

In [14]:
# A list which contains indices of rows which have a 1 as label
# We can use it to find records which have been labeled as sensitive
indices = df_data.index[df_data['sensitive_label'] == 1].tolist()
indices[0:5]

[6, 22, 24, 25, 26]

In [15]:
# An example of two rows, one from each group
row_selection = df_data.iloc[23:25]
row_selection

Unnamed: 0,text,sensitive_label,relevance
23,Development of a small caliber vascular graft by a new crosslinking method incorporating slow heparin release collagen and natural tissue compliance.,0,-1.0
24,Strontium overload in uremic patients on regular dialytic treatment.,1,-1.0


In [16]:
# Split the data into training, validation, and test sets

#test data is all data judged as relevant, rest is train data
data_test = df_data[df_data["relevance"] >-1]
data_train =df_data[df_data["relevance"] == -1]
data_train = data_train.drop(["relevance"], axis=1)
print(len(data_test))
#split text and label of test data
test_texts = data_test["text"]
test_labels = data_test["sensitive_label"]


#15% of the training data is used for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_train["text"], data_train["sensitive_label"], test_size=0.85, random_state=123, stratify=data_train["sensitive_label"]
)

14430


In [17]:
print(len(test_texts))
print(test_texts.isna().sum())  # to check if there are missings

14430
0


# Logistic Regression

For the following, there was little information in the paper. We made basic assumptions on how to implement the LR. While not mentioned in the paper, TF-IDF is commonly used for obtaining features. 

**Note: the following cell typically takes a few minutes to run.**

In [45]:
# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

In [46]:
# Train the logistic regression model
model_lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=123)
model_lr.fit(X_train, train_labels)

In [47]:
# Get probabilities for the validation set
val_probs = model_lr.predict_proba(X_val)[:, 1]

In [48]:
# As mentioned in the paper, use "a grid search in the range [0, 1] with step size 0.01 to find the threshold that optimized the F1 measure.

thresholds = np.linspace(0, 1, 101)
best_threshold = 0
best_f1 = 0

for threshold in thresholds:
    val_preds = (val_probs >= threshold).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Optimal Threshold: {best_threshold}")

Optimal Threshold: 0.71


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
# Evaluate on the test set using the optimal threshold
test_probs = model_lr.predict_proba(X_test)[:, 1]
test_preds = (test_probs >= best_threshold).astype(int)

precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='binary')
accuracy = accuracy_score(test_labels, test_preds)

# Calculate F2 score
f2 = fbeta_score(test_labels, test_preds, beta=2, average='binary')

print("Intrinsic sensitivity classification results:")
print(f"Results of our code: Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}, F2: {f2:.2f}, Accuracy: {accuracy:.2f}")
print("Results in the paper: Presision: 76.72, Recall: 73.29, F1 74.96, F2: 73.95, Accuracy 94.01")

Intrinsic sensitivity classification results:
Results of our code: Precision: 0.71, Recall: 0.71, F1: 0.71, F2: 0.71, Accuracy: 0.93
Results in the paper: Presision: 76.72, Recall: 73.29, F1 74.96, F2: 73.95, Accuracy 94.01


![alt text](image.png)

# DistilBERT

In [23]:
# Prepare datasets
train_data = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_data = Dataset.from_dict({"text": val_texts, "label": val_labels})
test_data = Dataset.from_dict({"text": test_texts, "label": test_labels})

In [24]:
# Heavily shorten the training data for now, otherwise model training takes forever
n = 50
v = 10
t = 10
train_data = train_data.select(range(n))  # only take first n rows
val_data = val_data.select(range(v))
test_data = test_data.select(range(t))

In [25]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
print("DistilBERT is ready!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT is ready!


**Note: the following cell typically takes a few minutes to run.**

In [26]:
# Tokenize data
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [27]:
# Set data format for PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [28]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True
)



In [39]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics
)

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,F2,Accuracy
1,No log,0.325025,0.0,0.0,0.0,0.0,0.9
2,No log,0.310492,0.0,0.0,0.0,0.0,0.9
3,0.223100,0.309542,0.0,0.0,0.0,0.0,0.9


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=12, training_loss=0.2066141851246357, metrics={'train_runtime': 212.4387, 'train_samples_per_second': 0.706, 'train_steps_per_second': 0.056, 'total_flos': 19870109798400.0, 'train_loss': 0.2066141851246357, 'epoch': 3.0})

In [41]:
results = trainer.evaluate(test_data)
print(results)

{'eval_loss': 0.757250189781189, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_f2': 0.0, 'eval_accuracy': 0.7, 'eval_runtime': 2.5648, 'eval_samples_per_second': 3.899, 'eval_steps_per_second': 0.39, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Combined

In [42]:
distilbert_preds = trainer.predict(test_data)
distilbert_logits = distilbert_preds.predictions
test_preds_distilbert = np.argmax(distilbert_logits, axis=-1)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
test_preds_distilbert

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [50]:
test_probs_lr = model_lr.predict_proba(X_test)[:, 1]  # Probability of class 1
test_preds_lr = (test_probs_lr >= best_threshold).astype(int)

In [54]:
# Combined predictions: 1 if either model predicts 1
# test_preds_combined = (test_preds_lr | test_preds_distilbert).astype(int)

# Extrinsic Evaluation