1. Setup Environment

Install required packages.

In [None]:
%pip install -q transformers==4.20.1 datasets==2.10.0 pandas==1.4.2 numpy==1.22.4 scikit-learn==1.1.1 torch==1.11.0 nltk==3.7 imbalanced-learn==0.9.1


In [None]:
import os

import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE


In [None]:
# Fetch the NLTK stop-word corpus; preprocess_text reads it via stopwords.words('english').
nltk.download('stopwords')


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The model is moved to this device after it is constructed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')


2. Create and Preprocess drug_use_data.csv

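Load the train split of SetFit/ade_corpus_v2_classification, simulate substance and symptom labels by keyword matching, save the labelled text to drug_use_data.csv, then preprocess the text, encode the labels, and create a stratified train/test split.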

In [9]:
import pandas as pd
import re
import urllib.request

# File names of the dataset splits inside the Hugging Face dataset repository.
# Only the 'train' entry is read in this cell.
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}

# Read the JSON Lines train split straight from the Hub through the hf:// URL.
try:
    df = pd.read_json("hf://datasets/SetFit/ade_corpus_v2_classification/" + splits["train"], lines=True)
except Exception as e:
    # Best-effort fallback: report the failure, download the same file over
    # HTTPS into the working directory, and parse that local copy instead.
    print(f"hf:// loading failed: {e}")
    print("Falling back to direct URL...")
    url = "https://huggingface.co/datasets/SetFit/ade_corpus_v2_classification/resolve/main/train.jsonl"
    urllib.request.urlretrieve(url, "train.jsonl")
    df = pd.read_json("train.jsonl", lines=True)

# Keyword tables used to simulate labels from the raw text.
# substance_map: drug name searched for in the text -> substance class.
# Dict order matters: assign_labels stops at the first drug name that matches.
substance_map = {
    'morphine': 'opioid', 'oxycodone': 'opioid', 'fentanyl': 'opioid',
    'cocaine': 'stimulant', 'methamphetamine': 'stimulant',
    'placebo': 'none'
}
# Symptom keywords; every keyword found in a text becomes one of its symptom labels.
symptom_list = ['nausea', 'confusion', 'drowsiness', 'overdose']

def assign_labels(text):
    """Derive simulated labels for one text by case-insensitive keyword search.

    The text is converted with ``str`` and lower-cased, then scanned for the
    drug names in ``substance_map`` and the keywords in ``symptom_list``
    using substring matching.

    Args:
        text: The raw text to label; any value accepted by ``str``.

    Returns:
        tuple: ``(substance, symptoms)``. ``substance`` is the class mapped
        to the first drug name (in ``substance_map`` order) found in the
        text, or ``'none'`` when no drug name occurs. ``symptoms`` is the
        list of matching keywords in ``symptom_list`` order, or ``['none']``
        when no keyword occurs.
    """
    lowered = str(text).lower()

    # First matching drug name wins; fall back to 'none' when nothing matches.
    substance = next(
        (category for name, category in substance_map.items() if name in lowered),
        'none',
    )

    found = [keyword for keyword in symptom_list if keyword in lowered]
    if not found:
        found = ['none']
    return substance, found

# Label every row: assign_labels returns a (substance, symptoms) pair per text,
# and zip(*...) regroups those pairs into the two new columns.
df['substance_label'], df['symptom_labels'] = zip(*df['text'].apply(assign_labels))

# Persist the text together with its simulated labels.
# This is written before the text column is preprocessed, so the CSV holds the raw text.
df[['text', 'substance_label', 'symptom_labels']].to_csv('drug_use_data.csv', index=False)
print('Dataset saved as drug_use_data.csv')

# Preprocess text
# Lazily-filled cache of the NLTK English stop-word list, so the corpus is
# read once instead of on every call (the function is applied to every row).
_ENGLISH_STOP_WORDS = None


def preprocess_text(text, stop_words=None):
    """Normalise one text for modelling.

    Steps, in order: convert with ``str`` and lower-case; strip URLs,
    ``@mentions`` and ``#hashtags``; drop non-ASCII characters; drop every
    character that is neither a word character nor whitespace; remove stop
    words; re-join the remaining tokens with single spaces.

    Args:
        text: The raw text; any value accepted by ``str``.
        stop_words: Optional container of lower-case words to remove. When
            omitted, the NLTK English stop-word list is used and cached
            after the first call.

    Returns:
        str: The cleaned text (possibly empty).
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # URLs
    text = re.sub(r'@\w+', '', text)                     # @mentions
    text = re.sub(r'#\w+', '', text)                     # #hashtags
    text = re.sub(r'[^\x00-\x7F]+', '', text)            # non-ASCII characters
    text = re.sub(r'[^\w\s]', '', text)                  # punctuation / symbols
    return ' '.join(word for word in text.split() if word not in stop_words)

# Replace the text column with its cleaned form.
df['text'] = df['text'].apply(preprocess_text)

# Encode labels
# Substance classes receive integer ids in order of first appearance in the data.
substance_classes = df['substance_label'].unique()
substance2id = {label: idx for idx, label in enumerate(substance_classes)}
df['substance_label'] = df['substance_label'].map(substance2id)

# Turn each row's list of symptom labels into a multi-hot row, one column per class.
mlb = MultiLabelBinarizer()
symptom_encoded = mlb.fit_transform(df['symptom_labels'])
symptom_df = pd.DataFrame(symptom_encoded, columns=mlb.classes_)

# Column-wise concat aligns on the index; symptom_df has a fresh 0..n-1 index.
# NOTE(review): assumes df still carries the default 0..n-1 index from read_json — confirm.
df = pd.concat([df[['text', 'substance_label']], symptom_df], axis=1)

# Split data
# 80/20 split with a fixed seed, stratified on the substance class id.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['substance_label'])

print(f'Training samples: {len(train_df)}, Test samples: {len(test_df)}')

Dataset saved as drug_use_data.csv
Training samples: 14109, Test samples: 3528


3. Tokenize Data

Tokenize using BioBERT.

In [10]:
# Load the tokenizer of the same BioBERT checkpoint that the model class loads.
# NOTE(review): this is a cased checkpoint, but preprocess_text lower-cases the
# text before it reaches the tokenizer — confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')

def tokenize_data(texts, max_length=128):
    """Tokenize a collection of texts with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (the notebook passes a DataFrame
            text column).
        max_length: Maximum sequence length; longer inputs are truncated.

    Returns:
        The tokenizer output with padding enabled and PyTorch tensors
        (``return_tensors='pt'``).
    """
    sentences = texts.tolist()
    encoded = tokenizer(
        sentences,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return encoded

# Tokenize each split separately with the default max_length of 128.
train_encodings = tokenize_data(train_df['text'])
test_encodings = tokenize_data(test_df['text'])

class DrugUseDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with both label sets.

    Each item is a dict holding every encoding field sliced at the requested
    index, plus ``substance_labels`` (a ``torch.long`` tensor) and
    ``symptom_labels`` (a ``torch.float`` tensor).
    """

    def __init__(self, encodings, substance_labels, symptom_labels):
        """Store the encodings and the two index-aligned label containers.

        Args:
            encodings: Mapping of field name to an indexable batch of values.
            substance_labels: Indexable of per-example substance class ids.
            symptom_labels: Indexable of per-example symptom indicator rows.
        """
        self.encodings = encodings
        self.substance_labels = substance_labels
        self.symptom_labels = symptom_labels

    def __len__(self):
        """Return the number of examples, taken from the substance labels."""
        return len(self.substance_labels)

    def __getitem__(self, idx):
        """Build the feature/label dict for the example at ``idx``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample.update(
            substance_labels=torch.tensor(self.substance_labels[idx], dtype=torch.long),
            symptom_labels=torch.tensor(self.symptom_labels[idx], dtype=torch.float),
        )
        return sample

# Names of the multi-hot symptom columns, in the order MultiLabelBinarizer produced them.
symptom_columns = mlb.classes_
# Wrap each split: its encodings, the integer substance ids, and the symptom indicator matrix.
train_dataset = DrugUseDataset(
    train_encodings,
    train_df['substance_label'].values,
    train_df[symptom_columns].values
)
test_dataset = DrugUseDataset(
    test_encodings,
    test_df['substance_label'].values,
    test_df[symptom_columns].values
)

Downloading: 100%|██████████| 1.08k/1.08k [00:00<?, ?B/s]
Downloading: 100%|██████████| 208k/208k [00:00<00:00, 487kB/s] 


4. Define Custom Model

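BioBERT encoder with two classification heads for multi-task learning: a single-label substance classifier and a multi-label symptom classifier.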

In [None]:
class BioBERTMultiTask(torch.nn.Module):
    """BioBERT encoder with two linear heads fed by the first-token vector.

    The substance head is a single-label classifier trained with a
    class-weighted cross-entropy loss; the symptom head is a multi-label
    classifier trained with binary cross-entropy on logits. The training
    loss is the sum of the two.
    """

    def __init__(self, num_substance_classes, num_symptom_labels):
        """Load the pretrained encoder and create the two heads.

        Args:
            num_substance_classes: Number of substance classes.
            num_symptom_labels: Number of symptom indicator columns.
        """
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            'dmis-lab/biobert-base-cased-v1.2',
            output_hidden_states=True
        )
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.dropout = torch.nn.Dropout(0.3)
        self.substance_classifier = torch.nn.Linear(hidden_size, num_substance_classes)
        self.symptom_classifier = torch.nn.Linear(hidden_size, num_symptom_labels)

    @staticmethod
    def _inverse_frequency_weights(labels, num_classes):
        """Return per-class weights 1/count for the classes present in ``labels``.

        The result always has ``num_classes`` entries; classes absent from
        the batch get weight 0 (they contribute no samples to the loss).
        """
        counts = torch.bincount(labels, minlength=num_classes).to(torch.float)
        weights = torch.zeros_like(counts)
        present = counts > 0
        weights[present] = 1.0 / counts[present]
        return weights

    def forward(self, input_ids, attention_mask, substance_labels=None, symptom_labels=None):
        """Run the encoder and both heads; compute the loss when labels are given.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            substance_labels: Optional integer class ids, one per example.
            symptom_labels: Optional float indicator rows, one per example.

        Returns:
            dict: ``loss`` (``None`` unless both label tensors are given),
            ``substance_logits`` and ``symptom_logits``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # First-token vector of the last hidden layer, used as the sentence summary.
        pooled_output = outputs.hidden_states[-1][:, 0]
        pooled_output = self.dropout(pooled_output)

        substance_logits = self.substance_classifier(pooled_output)
        symptom_logits = self.symptom_classifier(pooled_output)

        loss = None
        if substance_labels is not None and symptom_labels is not None:
            # Per-batch inverse-frequency weights. The vector must cover every
            # class even when the batch lacks some, and must live on the same
            # device as the logits.
            class_weights = self._inverse_frequency_weights(
                substance_labels, substance_logits.size(-1)
            ).to(device=substance_logits.device, dtype=torch.float)
            substance_loss = torch.nn.CrossEntropyLoss(weight=class_weights)(substance_logits, substance_labels)
            symptom_loss = torch.nn.BCEWithLogitsLoss()(symptom_logits, symptom_labels)
            loss = substance_loss + symptom_loss

        return {'loss': loss, 'substance_logits': substance_logits, 'symptom_logits': symptom_logits}

# Size the two heads from the label sets built during preprocessing,
# then move the model to the selected device.
model = BioBERTMultiTask(num_substance_classes=len(substance_classes), num_symptom_labels=len(symptom_columns))
model.to(device)

Downloading:  94%|█████████▍| 390M/416M [01:22<00:22, 1.17MB/s] 

5. Train Model


In [None]:
def compute_metrics(pred):
    """Compute evaluation metrics for both heads.

    Args:
        pred: Prediction object whose ``predictions`` holds the substance
            logits at index 0 and the symptom logits at index 1, and whose
            ``label_ids`` holds the matching labels at the same indices.

    Returns:
        dict: ``substance_accuracy`` (accuracy of the arg-max class) and
        ``symptom_f1`` (micro-averaged F1 over all symptom indicators).
    """
    substance_preds = np.argmax(pred.predictions[0], axis=1)
    substance_labels = pred.label_ids[0]

    # The symptom head emits raw logits (it is trained with BCEWithLogitsLoss).
    # A probability threshold of 0.5 is sigmoid(x) > 0.5, i.e. x > 0;
    # thresholding the logits themselves at 0.5 would under-predict positives.
    symptom_preds = (pred.predictions[1] > 0).astype(int)
    symptom_labels = pred.label_ids[1]

    substance_accuracy = accuracy_score(substance_labels, substance_preds)
    symptom_f1 = f1_score(symptom_labels, symptom_preds, average='micro', zero_division=0)

    return {
        'substance_accuracy': substance_accuracy,
        'symptom_f1': symptom_f1
    }

# Training configuration: one epoch, evaluation and checkpointing at the end
# of each epoch, best checkpoint chosen by substance accuracy.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='substance_accuracy',
    # The dataset yields its labels under these two keys rather than the
    # default 'labels'. Without this, the Trainer collects no label_ids at
    # evaluation time, compute_metrics never runs, and the metric named in
    # metric_for_best_model does not exist.
    label_names=['substance_labels', 'symptom_labels'],
)

# Wire the model, datasets and metric function into the Trainer.
# The held-out split is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Run fine-tuning with the configuration above.
trainer.train()

6. Evaluate Model

Evaluate and print results.

In [None]:
# Aggregate metrics on the held-out split.
eval_results = trainer.evaluate()
print(f'Evaluation Results: {eval_results}')

# Raw logits for per-class reports.
predictions = trainer.predict(test_dataset)
substance_preds = np.argmax(predictions.predictions[0], axis=1)
# The symptom outputs are logits, so the 0.5-probability decision boundary is
# logit 0 (sigmoid(0) == 0.5); thresholding logits at 0.5 would be too strict.
symptom_preds = (predictions.predictions[1] > 0).astype(int)

print('Substance Classification Report:')
# Pin the label set so the report lines up with target_names even if a class
# is missing from both the test labels and the predictions.
print(classification_report(
    test_df['substance_label'],
    substance_preds,
    labels=list(range(len(substance_classes))),
    target_names=substance_classes,
    zero_division=0,
))

print('Symptom Classification Report:')
print(classification_report(test_df[symptom_columns], symptom_preds, target_names=symptom_columns, zero_division=0))

7. Save Model

In [None]:
# BioBERTMultiTask is a plain torch.nn.Module, so it has no save_pretrained();
# persist its weights (encoder and both heads) as a state dict instead.
save_dir = './biobert_drug_use_model'
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(save_dir, 'biobert_multitask_state_dict.pt'))
tokenizer.save_pretrained(save_dir)
print('Model and tokenizer saved!')