# Start

In [1]:
import os
from pathlib import Path

# Root directory for this session; later cells clone the repository beneath it.
PROJECT_DIR = '/content'
os.chdir(PROJECT_DIR)
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content


In [2]:
!if [ -d "pdm" ]; then rm -rf pdm; fi

In [3]:
!git clone https://github.com/soroushdty/pdm.git
REPO_DIR = Path.cwd() / "pdm"
print(REPO_DIR)
os.chdir(REPO_DIR)
print("Current Working Directory:", Path.cwd())

Cloning into 'pdm'...
remote: Enumerating objects: 251, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 251 (delta 58), reused 2 (delta 2), pack-reused 160 (from 1)[K
Receiving objects: 100% (251/251), 108.04 KiB | 1.50 MiB/s, done.
Resolving deltas: 100% (139/139), done.
/content/pdm
Current Working Directory: /content/pdm


In [4]:
# Hand the repo's requirements.txt to the project helper and report what it
# returns (presumably the packages it had to install — confirm in
# funcs/requirements_utils).
from funcs.requirements_utils import install_missing
installed = install_missing(Path.cwd().joinpath("requirements.txt"), quiet=False)
print(f"Installed: {installed}")

Installed: []


In [5]:
# Read the project configuration from config.json in the working directory.
from funcs.config_loader import load_config
CONFIG_PATH = Path.cwd().joinpath("config.json")
cfg = load_config(CONFIG_PATH)
cfg  # bare last expression: rendered as the cell's output

{'DIR_INPUT': 'input',
 'DIR_OUTPUT': 'output',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other'],
 'other_class': ['Disabilities',
  'Infectious diseases',
  'Genetics',
  'Sexual and reproductive health',
  'Violence',
  'Social determinants of health'],
 'llms': ['sentence-transformers/all-MiniLM-L6-v2',
  'sentence-transformers/all-mpnet-base-v2',
  'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
  'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
  'sentence-transformers/biomed-mpnet-base',
  'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
  'bert-base-uncased',
  'roberta-base',
  'distilbert-base-uncased',
  'nreimers/MiniLM-L6-H384-uncased',
  'emilyalsentzer/Bio_ClinicalBERT',
  'monologg/biobert_v1.1_pubmed',
  'bionlp/

# Run

In [7]:
# Create the run directory, attach a file logger to it, then run the pipeline.
from funcs.logger_setup import setup_logger
from funcs.preprocessing import run

# Output root comes from the loaded config, resolved against the working directory.
run_dir = Path.cwd() / cfg["DIR_OUTPUT"]
# Create it explicitly so the log file always has a parent directory, rather
# than depending on setup_logger to do so; a no-op when it already exists.
run_dir.mkdir(parents=True, exist_ok=True)
log_path = run_dir / "log.txt"

print("Run directory:", run_dir)
logger = setup_logger(log_path)
print("Logger:", log_path)

# `p` holds whatever run() returns; the following cells display it and read
# p['paths'] from it.
p = run(cfg)

Run directory: /content/pdm/output
[23:38:42 02-01-26] INFO: Logging initialized at: /content/pdm/output/log.txt
Logger: /content/pdm/output/log.txt
[23:38:42 02-01-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[23:38:42 02-01-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[23:38:44 02-01-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[23:38:44 02-01-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[23:38:44 02-01-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[23:38:44 02-01-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[23:39:08 02-01-26] INFO: Saved i

In [8]:
# Display the value returned by run(cfg) in the pipeline cell.
p

{'merged_counts': {'train': 879, 'test': 120},
 'paths': {'output_dir': '/content/pdm/output/23-38_FEB-01',
  'train_map': '/content/pdm/output/23-38_FEB-01/merge_map_train.json',
  'test_map': '/content/pdm/output/23-38_FEB-01/merge_map_test.json',
  'train_csv': '/content/pdm/output/23-38_FEB-01/train.csv',
  'test_csv': '/content/pdm/output/23-38_FEB-01/test.csv'},
 'shapes': {'train_raw': (1758, 14),
  'test_raw': (240, 14),
  'train_final': (868, 13),
  'test_final': (89, 13)}}

In [12]:
# Pull the four artifact locations out of the pipeline summary and echo them,
# one per line, in the order: train CSV, test CSV, train map, test map.
train, test, train_map, test_map = (
    p['paths'][key]
    for key in ('train_csv', 'test_csv', 'train_map', 'test_map')
)
print(train, test, train_map, test_map, sep="\n")

/content/pdm/output/train.csv
/content/pdm/output/test.csv
/content/pdm/output/merge_map_train.json
/content/pdm/output/merge_map_test.json
