1. Setup Environment

Install required packages.

In [17]:
%pip install -q transformers==4.20.1 datasets==2.10.0 pandas==1.4.2 numpy==1.22.4 scikit-learn==1.1.1 torch==1.11.0 nltk==3.7 imbalanced-learn==0.9.1


Note: you may need to restart the kernel to use updated packages.


In [18]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
import nltk
from nltk.corpus import stopwords
from imblearn.over_sampling import SMOTE


In [19]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')


Using device: cpu


2. Create and Preprocess drug_use_data.csv

Load SetFit/ade_corpus_v2_classification train split, create CSV, and preprocess.

In [21]:
import pandas as pd
import re
import urllib.request

# Define splits
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}

# Load via hf:// protocol
try:
    df = pd.read_json("hf://datasets/SetFit/ade_corpus_v2_classification/" + splits["train"], lines=True)
except Exception as e:
    print(f"hf:// loading failed: {e}")
    print("Falling back to direct URL...")
    url = "https://huggingface.co/datasets/SetFit/ade_corpus_v2_classification/resolve/main/train.jsonl"
    urllib.request.urlretrieve(url, "train.jsonl")
    df = pd.read_json("train.jsonl", lines=True)

# Simulate substance and symptom labels
substance_map = {
    'morphine': 'opioid', 'oxycodone': 'opioid', 'fentanyl': 'opioid',
    'cocaine': 'stimulant', 'methamphetamine': 'stimulant',
    'placebo': 'none'
}
symptom_list = ['nausea', 'confusion', 'drowsiness', 'overdose']

def assign_labels(text):
    substance = 'none'
    symptoms = []
    text_lower = str(text).lower()
    for drug, subst in substance_map.items():
        if drug in text_lower:
            substance = subst
            break
    for symp in symptom_list:
        if symp in text_lower:
            symptoms.append(symp)
    return substance, symptoms if symptoms else ['none']

# Apply labels
df['substance_label'], df['symptom_labels'] = zip(*df['text'].apply(assign_labels))

# Save to CSV
df[['text', 'substance_label', 'symptom_labels']].to_csv('drug_use_data.csv', index=False)
print('Dataset saved as drug_use_data.csv')

# Preprocess text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df['text'] = df['text'].apply(preprocess_text)

# Encode labels
substance_classes = df['substance_label'].unique()
substance2id = {label: idx for idx, label in enumerate(substance_classes)}
df['substance_label'] = df['substance_label'].map(substance2id)

mlb = MultiLabelBinarizer()
symptom_encoded = mlb.fit_transform(df['symptom_labels'])
symptom_df = pd.DataFrame(symptom_encoded, columns=mlb.classes_)

df = pd.concat([df[['text', 'substance_label']], symptom_df], axis=1)

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['substance_label'])

print(f'Training samples: {len(train_df)}, Test samples: {len(test_df)}')

Dataset saved as drug_use_data.csv
Training samples: 14109, Test samples: 3528


3. Tokenize Data

Tokenize using BioBERT.

In [22]:
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2', use_fast=True, do_lower_case=False)

def tokenize_data(texts, max_length=64): 
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_df['text'])
test_encodings = tokenize_data(test_df['text'])

class DrugUseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, substance_labels, symptom_labels):
        self.encodings = encodings
        self.substance_labels = substance_labels
        self.symptom_labels = symptom_labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['substance_labels'] = torch.tensor(self.substance_labels[idx], dtype=torch.long)
        item['symptom_labels'] = torch.tensor(self.symptom_labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.substance_labels)

symptom_columns = mlb.classes_
train_dataset = DrugUseDataset(
    train_encodings,
    train_df['substance_label'].values,
    train_df[symptom_columns].values
)
test_dataset = DrugUseDataset(
    test_encodings,
    test_df['substance_label'].values,
    test_df[symptom_columns].values
)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/dmis-lab/biobert-base-cased-v1.2/resolve/main/config.json from cache at C:\Users\satvi/.cache\huggingface\transformers\ece5e89bab3b63a40e413c7f599e6081663cad06eb394e48d5023930733d15a3.ad895c9bc4687ffedea1a4cc498ac3f67ebd2083732981c2a06f548cde7d6582
Model config BertConfig {
  "_name_or_path": "dmis-lab/biobert-base-cased-v1.2",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache"

4. Define Custom Model

BioBERT for multi-task classification.

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Simple feedforward network instead of transformer
class SimpleMultiTaskModel(torch.nn.Module):
    def __init__(self, input_size, num_substance_classes, num_symptom_labels):
        super(SimpleMultiTaskModel, self).__init__()
        self.hidden1 = torch.nn.Linear(input_size, 256)
        self.hidden2 = torch.nn.Linear(256, 128)
        self.dropout = torch.nn.Dropout(0.3)
        self.substance_classifier = torch.nn.Linear(128, num_substance_classes)
        self.symptom_classifier = torch.nn.Linear(128, num_symptom_labels)
        self.num_substance_classes = num_substance_classes
        
    def forward(self, x, substance_labels=None, symptom_labels=None):
        # Forward pass through hidden layers
        hidden = torch.relu(self.hidden1(x))
        hidden = self.dropout(hidden)
        hidden = torch.relu(self.hidden2(hidden))
        hidden = self.dropout(hidden)
        
        # Classification heads
        substance_logits = self.substance_classifier(hidden)
        symptom_logits = self.symptom_classifier(hidden)
        
        loss = None
        if substance_labels is not None and symptom_labels is not None:
            # Compute class weights for substance labels
            class_counts = np.bincount(substance_labels.cpu().numpy(), minlength=self.num_substance_classes)
            class_weights = torch.tensor(1.0 / (class_counts + 1e-6), dtype=torch.float).to(substance_labels.device)
            substance_loss = torch.nn.CrossEntropyLoss(weight=class_weights)(substance_logits, substance_labels)
            symptom_loss = torch.nn.BCEWithLogitsLoss()(symptom_logits, symptom_labels)
            loss = substance_loss + symptom_loss
            
        return {'loss': loss, 'substance_logits': substance_logits, 'symptom_logits': symptom_logits}

# Create TF-IDF features from text data
print("Creating TF-IDF features...")
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(train_df['text']).toarray()
X_test_tfidf = vectorizer.transform(test_df['text']).toarray()

print(f"TF-IDF feature shape: {X_train_tfidf.shape}")

# Create custom dataset for TF-IDF features
class TFIDFDataset(torch.utils.data.Dataset):
    def __init__(self, features, substance_labels, symptom_labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.substance_labels = torch.tensor(substance_labels, dtype=torch.long)
        self.symptom_labels = torch.tensor(symptom_labels, dtype=torch.float32)

    def __getitem__(self, idx):
        return {
            'x': self.features[idx],
            'substance_labels': self.substance_labels[idx],
            'symptom_labels': self.symptom_labels[idx]
        }

    def __len__(self):
        return len(self.features)

# Create datasets
train_dataset = TFIDFDataset(
    X_train_tfidf,
    train_df['substance_label'].values,
    train_df[symptom_columns].values
)

test_dataset = TFIDFDataset(
    X_test_tfidf,
    test_df['substance_label'].values,
    test_df[symptom_columns].values
)

# Initialize model
model = SimpleMultiTaskModel(
    input_size=5000, 
    num_substance_classes=len(substance_classes), 
    num_symptom_labels=len(symptom_columns)
)
model.to(device)

print(f"Model created with {sum(p.numel() for p in model.parameters())} parameters")

Creating TF-IDF features...
TF-IDF feature shape: (14109, 5000)
Model created with 1314184 parameters


5. Train Model


In [24]:
def compute_metrics(pred):
    substance_preds = np.argmax(pred.predictions[0], axis=1)
    substance_labels = pred.label_ids[0]
    symptom_preds = (pred.predictions[1] > 0.5).astype(int)
    symptom_labels = pred.label_ids[1]

    substance_accuracy = accuracy_score(substance_labels, substance_preds)
    symptom_f1 = f1_score(symptom_labels, symptom_preds, average='micro')

    return {
        'substance_accuracy': substance_accuracy,
        'symptom_f1': symptom_f1
    }

# Reduce batch size to prevent OOM
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Reduced from 32
    per_device_eval_batch_size=8,   # Reduced from 32
    learning_rate=2e-5,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='substance_accuracy',
    dataloader_pin_memory=False,    # Disable pin memory to save RAM
    gradient_accumulation_steps=4   # Maintain effective batch size
)

import gc
torch.cuda.empty_cache() if torch.cuda.is_available() else None
gc.collect()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
  0%|          | 0/1323 [13:57<?, ?it/s]
***** Running training *****
  Num examples = 14109
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 1323
  4%|▍         | 54/1323 [00:01<00:42, 30.13it/s]

{'loss': 1.8132, 'learning_rate': 5e-06, 'epoch': 0.11}


  8%|▊         | 104/1323 [00:03<00:41, 29.05it/s]

{'loss': 1.8114, 'learning_rate': 1e-05, 'epoch': 0.23}


 12%|█▏        | 153/1323 [00:05<00:39, 29.53it/s]

{'loss': 1.8051, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.34}


 15%|█▌        | 205/1323 [00:06<00:35, 31.22it/s]

{'loss': 1.7953, 'learning_rate': 2e-05, 'epoch': 0.45}


 19%|█▉        | 253/1323 [00:08<00:36, 29.16it/s]

{'loss': 1.7817, 'learning_rate': 1.910952804986643e-05, 'epoch': 0.57}


 23%|██▎       | 304/1323 [00:10<00:35, 28.47it/s]

{'loss': 1.7644, 'learning_rate': 1.821905609973286e-05, 'epoch': 0.68}


 27%|██▋       | 355/1323 [00:11<00:30, 32.25it/s]

{'loss': 1.7446, 'learning_rate': 1.732858414959929e-05, 'epoch': 0.79}


 30%|███       | 403/1323 [00:13<00:29, 30.81it/s]

{'loss': 1.7207, 'learning_rate': 1.643811219946572e-05, 'epoch': 0.91}


 33%|███▎      | 439/1323 [00:14<00:26, 33.86it/s]***** Running Evaluation *****
  Num examples = 3528
  Batch size = 8
                                                  
 33%|███▎      | 441/1323 [00:17<00:26, 33.86it/s]Saving model checkpoint to ./results\checkpoint-441
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
 33%|███▎      | 443/1323 [00:17<03:28,  4.23it/s]

{'eval_loss': 1.6849604845046997, 'eval_substance_accuracy': 0.9943310657596371, 'eval_symptom_f1': 0.0, 'eval_runtime': 2.7014, 'eval_samples_per_second': 1305.991, 'eval_steps_per_second': 163.249, 'epoch': 1.0}


 34%|███▍      | 455/1323 [00:17<01:28,  9.84it/s]

{'loss': 1.695, 'learning_rate': 1.5547640249332147e-05, 'epoch': 1.02}


 38%|███▊      | 503/1323 [00:18<00:25, 32.76it/s]

{'loss': 1.6685, 'learning_rate': 1.4657168299198576e-05, 'epoch': 1.13}


 42%|████▏     | 555/1323 [00:20<00:22, 33.48it/s]

{'loss': 1.6362, 'learning_rate': 1.3766696349065006e-05, 'epoch': 1.25}


 46%|████▌     | 603/1323 [00:21<00:21, 33.55it/s]

{'loss': 1.6038, 'learning_rate': 1.2876224398931433e-05, 'epoch': 1.36}


 50%|████▉     | 655/1323 [00:23<00:21, 31.71it/s]

{'loss': 1.5779, 'learning_rate': 1.1985752448797864e-05, 'epoch': 1.47}


 53%|█████▎    | 703/1323 [00:24<00:19, 32.48it/s]

{'loss': 1.5389, 'learning_rate': 1.1095280498664294e-05, 'epoch': 1.59}


 57%|█████▋    | 755/1323 [00:26<00:17, 32.93it/s]

{'loss': 1.5067, 'learning_rate': 1.0204808548530723e-05, 'epoch': 1.7}


 61%|██████    | 803/1323 [00:28<00:17, 30.20it/s]

{'loss': 1.4721, 'learning_rate': 9.31433659839715e-06, 'epoch': 1.81}


 65%|██████▍   | 855/1323 [00:29<00:14, 32.66it/s]

{'loss': 1.4446, 'learning_rate': 8.42386464826358e-06, 'epoch': 1.93}


 66%|██████▋   | 879/1323 [00:30<00:13, 33.13it/s]***** Running Evaluation *****
  Num examples = 3528
  Batch size = 8
                                                  
 67%|██████▋   | 882/1323 [00:33<00:13, 33.13it/s]Saving model checkpoint to ./results\checkpoint-882
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
 67%|██████▋   | 883/1323 [00:33<01:46,  4.14it/s]

{'eval_loss': 1.4124690294265747, 'eval_substance_accuracy': 0.9943310657596371, 'eval_symptom_f1': 0.0, 'eval_runtime': 2.7691, 'eval_samples_per_second': 1274.066, 'eval_steps_per_second': 159.258, 'epoch': 2.0}


 68%|██████▊   | 903/1323 [00:33<00:27, 15.06it/s]

{'loss': 1.4124, 'learning_rate': 7.5333926981300095e-06, 'epoch': 2.04}


 72%|███████▏  | 955/1323 [00:35<00:11, 31.51it/s]

{'loss': 1.3923, 'learning_rate': 6.642920747996439e-06, 'epoch': 2.15}


 76%|███████▌  | 1003/1323 [00:37<00:09, 33.11it/s]

{'loss': 1.3694, 'learning_rate': 5.7524487978628674e-06, 'epoch': 2.27}


 80%|███████▉  | 1055/1323 [00:38<00:08, 32.78it/s]

{'loss': 1.3481, 'learning_rate': 4.861976847729297e-06, 'epoch': 2.38}


 83%|████████▎ | 1103/1323 [00:40<00:06, 32.63it/s]

{'loss': 1.3341, 'learning_rate': 3.971504897595726e-06, 'epoch': 2.49}


 87%|████████▋ | 1155/1323 [00:41<00:05, 33.45it/s]

{'loss': 1.3143, 'learning_rate': 3.081032947462155e-06, 'epoch': 2.61}


 91%|█████████ | 1203/1323 [00:43<00:03, 32.57it/s]

{'loss': 1.3056, 'learning_rate': 2.1905609973285845e-06, 'epoch': 2.72}


 95%|█████████▍| 1255/1323 [00:44<00:02, 31.92it/s]

{'loss': 1.3004, 'learning_rate': 1.3000890471950135e-06, 'epoch': 2.83}


 98%|█████████▊| 1303/1323 [00:46<00:00, 30.66it/s]

{'loss': 1.2897, 'learning_rate': 4.0961709706144254e-07, 'epoch': 2.95}


100%|██████████| 1323/1323 [00:46<00:00, 32.64it/s]***** Running Evaluation *****
  Num examples = 3528
  Batch size = 8
                                                   
100%|██████████| 1323/1323 [00:49<00:00, 32.64it/s]Saving model checkpoint to ./results\checkpoint-1323
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-441 (score: 0.9943310657596371).
100%|██████████| 1323/1323 [00:49<00:00, 26.58it/s]

{'eval_loss': 1.2903836965560913, 'eval_substance_accuracy': 0.9943310657596371, 'eval_symptom_f1': 0.0, 'eval_runtime': 2.7717, 'eval_samples_per_second': 1272.858, 'eval_steps_per_second': 159.107, 'epoch': 3.0}
{'train_runtime': 49.7833, 'train_samples_per_second': 850.225, 'train_steps_per_second': 26.575, 'train_loss': 1.5510746054097908, 'epoch': 3.0}





TrainOutput(global_step=1323, training_loss=1.5510746054097908, metrics={'train_runtime': 49.7833, 'train_samples_per_second': 850.225, 'train_steps_per_second': 26.575, 'train_loss': 1.5510746054097908, 'epoch': 3.0})

6. Evaluate Model

Evaluate and print results.

In [25]:
eval_results = trainer.evaluate()
print(f'Evaluation Results: {eval_results}')

predictions = trainer.predict(test_dataset)
substance_preds = np.argmax(predictions.predictions[0], axis=1)
symptom_preds = (predictions.predictions[1] > 0.5).astype(int)

print('Substance Classification Report:')
print(classification_report(test_df['substance_label'], substance_preds, target_names=substance_classes))

print('Symptom Classification Report:')
print(classification_report(test_df[symptom_columns], symptom_preds, target_names=symptom_columns, zero_division=0))

***** Running Evaluation *****
  Num examples = 3528
  Batch size = 8
100%|██████████| 441/441 [00:02<00:00, 167.00it/s]
***** Running Prediction *****
  Num examples = 3528
  Batch size = 8


Evaluation Results: {'eval_loss': 1.6849604845046997, 'eval_substance_accuracy': 0.9943310657596371, 'eval_symptom_f1': 0.0, 'eval_runtime': 2.6544, 'eval_samples_per_second': 1329.113, 'eval_steps_per_second': 166.139, 'epoch': 3.0}


 97%|█████████▋| 427/441 [00:02<00:00, 163.35it/s]

Substance Classification Report:
              precision    recall  f1-score   support

        none       0.99      1.00      1.00      3508
      opioid       0.00      0.00      0.00        18
   stimulant       0.00      0.00      0.00         2

    accuracy                           0.99      3528
   macro avg       0.33      0.33      0.33      3528
weighted avg       0.99      0.99      0.99      3528

Symptom Classification Report:
              precision    recall  f1-score   support

   confusion       0.00      0.00      0.00         3
  drowsiness       0.00      0.00      0.00         3
      nausea       0.00      0.00      0.00        19
        none       0.00      0.00      0.00      3485
    overdose       0.00      0.00      0.00        19

   micro avg       0.00      0.00      0.00      3529
   macro avg       0.00      0.00      0.00      3529
weighted avg       0.00      0.00      0.00      3529
 samples avg       0.00      0.00      0.00      3529



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


7. Save Model

In [26]:
model.save_pretrained('./biobert_drug_use_model')
tokenizer.save_pretrained('./biobert_drug_use_model')
print('Model and tokenizer saved!')

AttributeError: 'SimpleMultiTaskModel' object has no attribute 'save_pretrained'

100%|██████████| 441/441 [00:20<00:00, 163.35it/s]