In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [2]:
import sys
import os

# Make the repository root (the parent of the notebook's working directory)
# importable so project-local modules can be loaded from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the insert: without it every re-run of this cell would add another
# duplicate entry to sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Data preparation

In [3]:
# Location of the labeled OHSUMED CSV, relative to the notebook's directory.
data_path = "../data/OHSUMED/full_ohsumed_sensitivity_labeled.csv"

# Load the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Combine title and abstract
# Combine title and abstract into a single text field.
# A missing title or abstract is NaN in pandas, and NaN + str stays NaN, which
# would leave the whole row without usable text for the vectorizer/tokenizer.
# Treat missing parts as empty strings so the remaining part is kept.
title_text = df['title'].fillna("")
abstract_text = df['abstract'].fillna("")
df['text'] = title_text + " " + abstract_text

# Keep only the model input and the target column.
df = df[['text', 'sensitive_label']]

In [5]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0,text,sensitive_label
0,The binding of acetaldehyde to the active site...,0
1,Reductions in breath ethanol readings in norma...,0
2,Does the blockade of opioid receptors influenc...,0
3,Drinkwatchers--description of subjects and eva...,0
4,Platelet affinity for serotonin is increased i...,0


In [6]:
# Split the data into training, validation, and test sets
# Stratified 70 / 15 / 15 split.
# Step 1: hold out 30% of the rows as a temporary pool.
all_texts, all_labels = df['text'], df['sensitive_label']
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.3,
    random_state=123,
    stratify=all_labels,
)

# Step 2: cut the pool in half to obtain the validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=123,
    stratify=temp_labels,
)

# Logistic Regression

**NOTE**: the following cell usually takes a few minutes to run.

In [None]:
# Convert text to numerical features using TF-IDF
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at the 10,000 highest-ranked features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# Learn the vocabulary on the training texts only, then reuse it unchanged
# for the validation and test texts.
X_train = vectorizer.fit_transform(train_texts)
X_val, X_test = (
    vectorizer.transform(texts) for texts in (val_texts, test_texts)
)

In [None]:
# Train the logistic regression model
# Logistic regression with class weights balanced against label frequency.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=123,
)
# Kept as the final expression so the fitted estimator is displayed.
model.fit(X_train, train_labels)

In [None]:
# Get probabilities for the validation set
# predict_proba returns one column per class; column 1 is taken as the
# positive-class probability for each validation document.
val_class_probs = model.predict_proba(X_val)
val_probs = val_class_probs[:, 1]

In [None]:
# Determine the optimal threshold
# Candidate decision thresholds: 101 evenly spaced values from 0 to 1
# (a step of 0.01), plus the starting state for the best-F1 search.
thresholds = np.linspace(start=0, stop=1, num=101)
best_threshold, best_f1 = 0, 0

In [None]:
# Reset the search state inside this cell so it is self-contained: re-running
# it after the model (and therefore val_probs) has changed must not compare
# against a stale best_f1 left over from an earlier run.
best_threshold = 0
best_f1 = 0

# Pick the threshold that maximizes binary F1 on the validation set.
for threshold in thresholds:
    val_preds = (val_probs >= threshold).astype(int)
    # zero_division=0 gives the same F1 of 0 the default produces when no
    # positives are predicted, but without an UndefinedMetricWarning.
    _, _, f1, _ = precision_recall_fscore_support(
        val_labels, val_preds, average='binary', zero_division=0
    )
    # Strict '>' keeps the lowest threshold among ties.
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

print(f"Optimal Threshold: {best_threshold}")

In [None]:
# Evaluate on the test set using the optimal threshold
# Score the held-out test set and binarize with the threshold chosen on the
# validation set.
test_class_probs = model.predict_proba(X_test)
test_probs = test_class_probs[:, 1]
above_threshold = test_probs >= best_threshold
test_preds = above_threshold.astype(int)

# Accuracy and the binary precision / recall / F1 of the positive class.
accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels,
    test_preds,
    average='binary',
)

print(f"Test Results - Precision: {precision}, Recall: {recall}, F1: {f1}, Accuracy: {accuracy}")

# DistilBERT

In [7]:
# Load the pretrained tokenizer and a sequence-classification model from the
# same checkpoint. From here on `model` no longer refers to the logistic
# regression estimator; both names are assigned again in later cells.
tokenizer, model = (
    loader.from_pretrained("distilbert-base-uncased")
    for loader in (DistilBertTokenizer, DistilBertForSequenceClassification)
)
print("DistilBERT is ready!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT is ready!


In [11]:
# Prepare datasets
# Number of training examples kept for fine-tuning; set to None to use the
# full training split.
# NOTE(review): 500 looks like a quick-run/debug cap -- confirm before
# reporting final results.
TRAIN_SUBSET_SIZE = 500

# Wrap each split in a Hugging Face Dataset with "text" and "label" columns.
train_data = Dataset.from_dict({"text": train_texts, "label": train_labels})

if TRAIN_SUBSET_SIZE is not None:
    # Cap at the dataset size: select() raises IndexError for indices past
    # the end, which would happen with fewer than TRAIN_SUBSET_SIZE rows.
    subset_size = min(TRAIN_SUBSET_SIZE, len(train_data))
    train_data = train_data.select(range(subset_size))

val_data = Dataset.from_dict({"text": val_texts, "label": val_labels})
test_data = Dataset.from_dict({"text": test_texts, "label": test_labels})

**NOTE**: the following cell usually takes a few minutes to run.

In [12]:
# Tokenize data
# Tokenizer matching the DistilBERT checkpoint used for fine-tuning.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded examples end up with the same length.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(example["text"], **encode_options)


# Apply the tokenizer to each split in batches, replacing the raw datasets
# with their tokenized versions.
train_data, val_data, test_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, val_data, test_data)
)

Map:   0%|          | 0/243994 [00:00<?, ? examples/s]

Map:   0%|          | 0/52285 [00:00<?, ? examples/s]

Map:   0%|          | 0/52285 [00:00<?, ? examples/s]

In [13]:
# Set data format for PyTorch
# Return the model inputs and the label as PyTorch tensors for every split.
torch_columns = ["input_ids", "attention_mask", "label"]
for split in (train_data, val_data, test_data):
    split.set_format(type="torch", columns=torch_columns)

In [14]:
# Fresh DistilBERT with a two-class classification head; this is the model
# instance that gets fine-tuned below.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tuning configuration, grouped by concern and expanded into
# TrainingArguments below.
output_options = {
    "output_dir": "./results",
    "logging_dir": "./logs",
    "logging_steps": 10,
}
optimization_options = {
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
}
# Evaluate and checkpoint once per epoch, and reload the best checkpoint
# when training finishes.
checkpoint_options = {
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
}

training_args = TrainingArguments(
    **output_options,
    **optimization_options,
    **checkpoint_options,
)

In [16]:
def compute_metrics(eval_pred):
    """Compute classification metrics from a Trainer evaluation pass.

    Without this hook the Trainer reports only the loss, which cannot be
    compared with the precision / recall / F1 / accuracy printed for the
    logistic regression baseline.

    Args:
        eval_pred: the prediction object the Trainer passes in, exposing
            ``predictions`` (per-class logits) and ``label_ids``.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` as
        plain floats.
    """
    # The predicted class is the index of the largest logit.
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    # Same binary averaging as the baseline; zero_division=0 avoids a
    # warning when no positives are predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='binary', zero_division=0
    )
    return {
        "accuracy": float(accuracy_score(labels, predicted)),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }


# Fine-tune on the training subset; the validation set is evaluated according
# to the evaluation strategy set in training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Final evaluation of the fine-tuned model on the held-out test split.
results = trainer.evaluate(eval_dataset=test_data)
print(results)