# Drug Review Sentiment — Updated Pipeline
This notebook centralizes all outputs in `results/` and runs parameterized training via `src/train_model.py`.


In [18]:
# Optional: Colab setup — uncomment if running in Colab
# %pip install -U transformers datasets peft scikit-learn mlflow accelerate evaluate matplotlib seaborn kagglehub

import os, sys, pathlib, mlflow
import tensorboard
# Auto-detect repo root: if launched from notebooks/, go one level up
cwd = pathlib.Path().resolve()
repo_root = cwd.parent if cwd.name == 'notebooks' else cwd
# Make the repo root the working directory so the relative 'data' and
# 'results' paths used below resolve against it.
os.chdir(repo_root)
# Put the repo root first on sys.path so the `src` package import below
# is resolved from this checkout.
sys.path.insert(0, str(repo_root))
print('CWD:', os.getcwd())

# NOTE: this import must stay after the chdir/sys.path changes above.
from src import logging_config as logcfg, config
# Project helpers; presumably create the output directories and attach a
# logger writing to config.TRAIN_LOG — confirm in src/config.py and
# src/logging_config.py.
config.ensure_dirs()
logcfg.setup_logger(config.TRAIN_LOG)

# Create the input and output folders if they do not exist yet
# (exist_ok=True makes re-running this cell safe).
os.makedirs('data', exist_ok=True)
os.makedirs('results', exist_ok=True)

# Configure MLflow to track under results/mlruns via src.config
mlflow.set_tracking_uri(config.mlflow_uri())
print('MLflow tracking URI:', mlflow.get_tracking_uri())


CWD: /home/sandra/drug-review-sentiment
MLflow tracking URI: file:///home/sandra/drug-review-sentiment/results/mlruns


In [19]:
# 1. Mount Google Drive (for saving models/results)
#from google.colab import drive
#drive.mount('/content/drive')

# 2. Clone your GitHub repo
#git clone https://github.com/sandragodinhosilva/drug-review-sentiment
#%cd drug-review-sentiment

# 3. Install dependencies
# !pip install -r requirements.txt

# 4. Tensorboard
%load_ext tensorboard
# %tensorboard --logdir /content/drive/MyDrive/biobert_project/biobert_logs

# 5. Verify GPU availability
import torch

# Query CUDA once and reuse the answer for the device string and both prints.
has_cuda = torch.cuda.is_available()
device = "cuda" if has_cuda else "cpu"
print("CUDA available:", has_cuda)

if has_cuda:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "CPU"
print("Device:", gpu_label)

# Extra diagnostics, disabled by default (both require a CUDA device):
# print("Current device:", torch.cuda.current_device())
# print("Tensor test:", torch.rand(3,3).cuda())

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
CUDA available: False
Device: CPU


In [20]:
# Parameters
# One entry per run: the key is a short label, the value holds the settings
# read by the training cell ('model', 'subset_frac', 'epochs', 'batch_size', 'fp16').
models = {}

models["distilbert-base-uncased"] = dict(
    model="distilbert-base-uncased",  # or 'dmis-lab/biobert-base-cased-v1.1'
    subset_frac=0.005,  # use 0.0/None for full data
    epochs=1,
    batch_size=16,
    fp16=True,  # set False on CPU
)

models["biobert-base-cased-v1.1"] = dict(
    model="dmis-lab/biobert-base-cased-v1.1",
    subset_frac=0.005,
    epochs=1,
    batch_size=16,
    fp16=True,  # set False on CPU
)


In [29]:
# Preflight: check data files exist and have expected columns
import os
import shutil
import pandas as pd
from pathlib import Path

# Raw TSV file names expected for the train and test splits
train_file, test_file = "drugsComTrain_raw.tsv", "drugsComTest_raw.tsv"

# The working copy lives in ./data; the Drive folder is the fallback source on Colab
local_dir = Path("data")
drive_dir = Path("/content/drive/MyDrive/data")  # Adjust if your Drive path differs
local_dir.mkdir(parents=True, exist_ok=True)

# Resolve the same two file names under each directory
local_train, local_test = (local_dir.joinpath(fname) for fname in (train_file, test_file))
drive_train, drive_test = (drive_dir.joinpath(fname) for fname in (train_file, test_file))

def ensure_local(src: Path, dst: Path) -> bool:
    """Make sure ``dst`` exists, copying it from ``src`` when necessary.

    Args:
        src: Fallback location to copy from when ``dst`` is absent.
        dst: Local path the file should end up at.

    Returns:
        True if ``dst`` already existed or was copied from ``src``;
        False if neither path exists (a MISSING message is printed).
    """
    def _megabytes(path: Path) -> float:
        # File size expressed in decimal megabytes, used for the status lines.
        return path.stat().st_size / 1e6

    already_local = dst.exists()

    # Nothing local and nothing to copy from: report and give up.
    if not already_local and not src.exists():
        print(f"MISSING: {dst}. Place the TSV in data/ or {src.parent}/")
        return False

    if already_local:
        print(f"FOUND: {dst} ({_megabytes(dst):.1f} MB)")
    else:
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dst)
        print(f"COPIED from {src} -> {dst} ({_megabytes(dst):.1f} MB)")
    return True

# Check training and test files (each is copied from the Drive path when
# it is only present there; see ensure_local).
ok_train = ensure_local(drive_train, local_train)
ok_test = ensure_local(drive_test, local_test)

# Quick schema check: read a few rows of each file that is present and
# verify the required columns exist. A failure here fails the preflight,
# so "Preflight OK" is never printed for an unreadable or mis-shaped file.
need = {"review", "rating"}
schema_ok = True
for pth in [local_train, local_test]:
    if not pth.exists():
        # Already reported as MISSING by ensure_local.
        continue
    try:
        df = pd.read_csv(pth, sep="\t", nrows=5)
    except Exception as e:
        # Deliberately broad: any read/parse problem is reported and
        # recorded as a preflight failure instead of stopping the cell.
        print(f"Could not read sample from {pth}: {e}")
        schema_ok = False
        continue
    print(f"{pth.name} columns:", list(df.columns))
    missing = need - set(df.columns)
    if missing:
        print(f"WARNING: {pth.name} missing expected columns: {missing}")
        schema_ok = False

ok = ok_train and ok_test and schema_ok
print("Preflight OK" if ok else "Preflight FAILED")


FOUND: data/drugsComTrain_raw.tsv (84.3 MB)
FOUND: data/drugsComTest_raw.tsv (28.1 MB)
drugsComTrain_raw.tsv columns: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount']
drugsComTest_raw.tsv columns: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount']
Preflight OK


In [10]:
# Run training for each model in the dict
import torch
from src.train_model import train_model

# The parameter cell notes that fp16 must be False on CPU, so only request
# it when a CUDA device is actually present. train_model presumably forwards
# this flag to the underlying trainer — confirm in src/train_model.py.
cuda_available = torch.cuda.is_available()

for name, cfg in models.items():
    # Fall back to the dict key when no explicit checkpoint name is given.
    m = cfg.get('model', name)
    bs = cfg.get('batch_size', 16)
    eps = cfg.get('epochs', 1)
    subset = cfg.get('subset_frac', None)

    wants_fp16 = bool(cfg.get('fp16', False))
    use_fp16 = wants_fp16 and cuda_available

    print(f'=== Training {m} ===')
    if wants_fp16 and not use_fp16:
        print(f'NOTE: fp16 requested for {m} but CUDA is unavailable; disabling fp16.')

    train_model(
        model_name=m,
        batch_size=bs,
        num_train_epochs=eps,
        imbalance_strategy='weighted_loss',
        fp16=use_fp16,
        subset_frac=subset,
        use_tensorboard=True,
    )


=== Training distilbert-base-uncased ===


2025-09-14 19:11:47 - ipykernel_launcher - INFO - Loaded train=(161297, 7) from data/drugsComTrain_raw.tsv; test=(53766, 7) from data/drugsComTest_raw.tsv
2025-09-14 19:11:48 - ipykernel_launcher - INFO - Label counts — train: {1: 113209, 0: 48088}; test: {1: 37559, 0: 16207}


KeyboardInterrupt: 

In [11]:
# Compare latest runs (by run dir mtime) and show test metrics per run
import glob, os, json
# Take the 10 most recently modified entries under results/mlruns/<experiment>/.
runs = sorted(glob.glob('results/mlruns/*/*'), key=os.path.getmtime)[-10:]

# Each summary item is (run directory name, run name or None, {metric file name: value}).
summary = []
for rd in runs:
    test_metrics = {}
    for f in glob.glob(f'{rd}/metrics/test_*'):
        # A metric file holds one whitespace-separated record per line with
        # the value in the second field. Use the last record so a metric
        # logged more than once no longer breaks the parse.
        with open(f) as fh:
            records = [line.split() for line in fh if line.strip()]
        if not records:
            continue  # empty metric file: nothing to report
        test_metrics[os.path.basename(f)] = float(records[-1][1])
    if test_metrics:
        run_name = None
        tag = os.path.join(rd, 'tags', 'mlflow.runName')
        if os.path.exists(tag):
            # Context manager so the tag file handle is closed promptly.
            with open(tag) as fh:
                run_name = fh.read().strip()
        summary.append((os.path.basename(rd), run_name, test_metrics))

for run_id, run_name, tm in summary:
    print(f'Run: {run_id}  Name: {run_name}')
    print(json.dumps(tm, indent=2))


Run: a871100d092f49d887fbca4a0152e038  Name: distilbert-base-uncased-lora-128-weighted_loss
{
  "test_eval_f1": 0.8113695090439277,
  "test_eval_accuracy": 0.6983471074380165,
  "test_eval_steps_per_second": 0.913,
  "test_eval_runtime": 8.7661,
  "test_eval_loss": 0.6745060086250305,
  "test_eval_recall": 0.9289940828402367,
  "test_eval_precision": 0.7201834862385321,
  "test_eval_samples_per_second": 27.606,
  "test_epoch": 1.0
}
Run: 2e976997a8244ec5b4e998bc08d57ba9  Name: distilbert-base-uncased-lora-128-weighted_loss
{
  "test_eval_f1": 0.8113695090439277,
  "test_eval_accuracy": 0.6983471074380165,
  "test_eval_steps_per_second": 0.565,
  "test_eval_runtime": 14.1579,
  "test_eval_loss": 0.6745060086250305,
  "test_eval_recall": 0.9289940828402367,
  "test_eval_precision": 0.7201834862385321,
  "test_eval_samples_per_second": 17.093,
  "test_epoch": 1.0
}
Run: 12dcab98a908407eae2724292216703e  Name: distilbert-base-uncased-lora-128-weighted_loss
{
  "test_eval_f1": 0.81136950904

In [12]:
# Compare latest runs (table) with nicer formatting
import glob, os, pandas as pd, numpy as np

def _read_metrics(run_dir):
    """Read the most recent value of every ``test_*`` metric of one run.

    Metric files are looked up under ``<run_dir>/metrics/``. Each file is
    expected to hold one whitespace-separated record per line with the
    metric value in the second field; the last non-blank line is used, so
    metrics logged more than once are handled.

    Args:
        run_dir: Path to a run directory.

    Returns:
        dict mapping metric file name to its value as a float. Empty metric
        files are skipped; a missing ``metrics`` folder yields ``{}``.

    Raises:
        ValueError: if the last record has fewer than two fields or its
            value is not numeric.
    """
    m = {}
    for f in glob.glob(f'{run_dir}/metrics/test_*'):
        with open(f) as fh:
            lines = [ln for ln in fh if ln.strip()]
        if not lines:
            continue  # nothing was ever written to this metric file
        fields = lines[-1].split()
        if len(fields) < 2:
            raise ValueError(f'Unexpected metric record in {f}: {lines[-1]!r}')
        m[os.path.basename(f)] = float(fields[1])
    return m

def _extract_model(run_name):
    """Return the model part of a run name shaped '<model>-lora-<maxlen>-<imbalance>'.

    Everything before the first '-lora-' is returned; a name without that
    marker is returned unchanged. A missing or empty name gives None.
    """
    if run_name:
        model_part, _marker, _suffix = run_name.partition('-lora-')
        return model_part
    return None

# Take the 30 most recently modified entries under results/mlruns/<experiment>/.
runs = sorted(glob.glob('results/mlruns/*/*'), key=os.path.getmtime)[-30:]

# Metric columns of the table; each is read from 'test_eval_<name>' with
# 'test_<name>' as the fallback file name.
num_cols = ['accuracy', 'precision', 'recall', 'f1', 'loss']

rows = []
for rd in runs:
    m = _read_metrics(rd)
    if not m:
        continue  # not a run directory, or a run without test metrics
    run_id = os.path.basename(rd)
    run_name = None
    tag = os.path.join(rd, 'tags', 'mlflow.runName')
    if os.path.exists(tag):
        # Context manager so the tag file handle is closed promptly.
        with open(tag) as fh:
            run_name = fh.read().strip()
    row = {
        'run_id': run_id,
        'run_name': run_name,
        'model': _extract_model(run_name),
    }
    for col in num_cols:
        # dict.get with a default (instead of `or`) keeps a legitimate 0.0
        # metric rather than treating it as missing.
        row[col] = m.get(f'test_eval_{col}', m.get(f'test_{col}'))
    rows.append(row)

if rows:
    df = pd.DataFrame(rows)
    # Round numeric cols
    for col in num_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').round(4)
    # Rank by F1 when at least one run reports it, otherwise by accuracy.
    sort_col = 'f1' if df['f1'].notna().any() else 'accuracy'
    df = df.sort_values(by=sort_col, ascending=False, na_position='last').reset_index(drop=True)
    try:
        display(df.style.highlight_max(axis=0, subset=[sort_col], color='#d1ffd1'))
    except Exception:
        # Best-effort styling: fall back to the plain table if the styled
        # rendering is unavailable in this environment.
        display(df)
else:
    print('No runs with test metrics found in results/mlruns.')


Unnamed: 0,run_id,run_name,model,accuracy,precision,recall,f1,loss
0,abdbafc2361e489981e41636a3ea3b1a,distilbert-base-uncased-lora-128-weighted_loss,distilbert-base-uncased,0.7231,0.7237,0.9763,0.8312,0.6636
1,255739eca4994d8f9c5e8bc0da3a0f8a,distilbert-base-uncased-lora-128-weighted_loss,distilbert-base-uncased,0.7231,0.7237,0.9763,0.8312,0.6636
2,19480f2e08ff4011811d623a0759291b,distilbert-base-uncased-lora-128-weighted_loss,distilbert-base-uncased,0.7231,0.7237,0.9763,0.8312,0.6636
3,12dcab98a908407eae2724292216703e,distilbert-base-uncased-lora-128-weighted_loss,distilbert-base-uncased,0.6983,0.7202,0.929,0.8114,0.6745
4,2e976997a8244ec5b4e998bc08d57ba9,distilbert-base-uncased-lora-128-weighted_loss,distilbert-base-uncased,0.6983,0.7202,0.929,0.8114,0.6745
5,a871100d092f49d887fbca4a0152e038,distilbert-base-uncased-lora-128-weighted_loss,distilbert-base-uncased,0.6983,0.7202,0.929,0.8114,0.6745
6,7c40f803f5ad446cb36321523e103ccd,biobert-base-cased-v1.1-lora-128-weighted_loss,biobert-base-cased-v1.1,0.3099,0.625,0.0296,0.0565,0.7092
