In [6]:
def backend_select(backend='colab'):
    """Return the project root directory for the chosen execution backend.

    Args:
        backend: 'colab' for the Google Drive location or 'sol' for the
            home-directory location. Defaults to 'colab'.

    Returns:
        The project path as a string.

    Raises:
        ValueError: If ``backend`` is neither 'sol' nor 'colab'.
    """
    # Reject unknown backends first so the remainder is a simple two-way choice.
    if not (backend == 'colab' or backend == 'sol'):
        raise ValueError("backend must be one of 'sol' or 'colab'")
    return '/content/drive/MyDrive/pdm' if backend == 'colab' else '/home/sdianaty/pdm'

In [7]:
import os
import sys
from pathlib import Path

backend = 'colab' # Switch backend

# On Colab the project directory lives on Google Drive, so the drive must be
# mounted before the directory can be entered on a fresh runtime.
if backend == 'colab':
  from google.colab import drive
  drive.mount('/content/drive')

# backend_select returns a plain string; wrap it in Path because the "/" join
# used for requirements.txt below is not defined for str operands.
PROJECT_DIR = Path(backend_select(backend))
os.chdir(PROJECT_DIR)
print("Current Working Directory:", Path.cwd())

# Put the project root on sys.path so its local packages can be imported.
# NOTE: the import below still requires funcs/ to exist directly under
# PROJECT_DIR; if the repository has not been placed there yet it will fail.
if str(PROJECT_DIR) not in sys.path:
  sys.path.insert(0, str(PROJECT_DIR))

from funcs.requirements_utils import install_missing
installed = install_missing(PROJECT_DIR / "requirements.txt", quiet=False)
print("Installed:", installed)

Current Working Directory: /content/drive/MyDrive/pdm
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


ModuleNotFoundError: No module named 'funcs'

In [None]:
!git clone https://github.com/soroushdty/pdm.git

Cloning into 'pdm'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 53 (delta 18), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (53/53), 24.51 KiB | 6.13 MiB/s, done.
Resolving deltas: 100% (18/18), done.


In [None]:
# Add project folder to PYTHONPATH
import os
import sys
from pathlib import Path

PROJECT_NAME = "pdm"

# Locate the cloned project relative to the current directory. This cell ends
# with os.chdir(PROJECT_DIR), so on a second run the kernel is already inside
# the clone and "<cwd>/pdm" no longer exists. In that case (recognised by the
# funcs/ folder being present in the current directory) reuse the current
# directory instead of failing on a non-existent nested path.
_cwd = Path.cwd()
_candidate = (_cwd / PROJECT_NAME).resolve()
if not _candidate.is_dir() and (_cwd / "funcs").is_dir():
    PROJECT_DIR = _cwd.resolve()
else:
    PROJECT_DIR = _candidate

# Make the project's local packages importable.
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))

os.chdir(PROJECT_DIR)

# Install requirements
from funcs.requirements_utils import install_missing
# Assuming requirements.txt is within the PROJECT_DIR
installed = install_missing(PROJECT_DIR / "requirements.txt", quiet=False)
print("Installed:", installed)

Installed: []


In [None]:
# Load the project configuration
from funcs.config_loader import load_config, validate_config

# config.json is looked up in the project root; the loaded result is passed
# through validate_config before being used as cfg.
CONFIG_PATH = os.path.join(PROJECT_DIR, "config.json")
cfg_loaded = load_config(CONFIG_PATH)
cfg = validate_config(cfg_loaded)
cfg

{'dataset': '/content/drive/MyDrive/pdm/input/dataset.xlsx',
 'output_path': '/content/drive/MyDrive/pdm/output',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other']}

In [None]:
# Create the run directory and logger, and prepare the per-run config
# (the pipeline itself is started in the next cell)
from funcs.paths import make_run_dir
from funcs.logger_setup import setup_logger
from funcs.preprocessing import run
from pathlib import Path

run_dir = make_run_dir(cfg["output_path"])
logger = setup_logger(run_dir / "log.txt")

# IMPORTANT: the pipeline writes its outputs into cfg["output_path"], so for
# this run a copy of the config is made with output_path pointing at run_dir.
# The original cfg is left unmodified.
# TODO (author's note): resolve later
cfg_run = {**cfg, "output_path": str(run_dir)}

[17:24:25 01-25-26] INFO: Logging initialized at: /content/drive/MyDrive/pdm/output/17-24_JAN-25/log.txt


In [None]:
# Run the preprocessing pipeline with the per-run config, then pick the
# output locations it reports under the 'paths' key.
p = run(cfg_run)
run_paths = p['paths']
train = run_paths['train_csv']
test = run_paths['test_csv']
train_map = run_paths['train_map']
test_map = run_paths['test_map']

[17:24:34 01-25-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[17:24:34 01-25-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[17:24:37 01-25-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[17:24:37 01-25-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[17:24:37 01-25-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[17:24:37 01-25-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[17:25:00 01-25-26] INFO: Saved item standardization map: /content/drive/MyDrive/pdm/output/17-24_JAN-25/merge_map_train.json
[17:25:01 01-25-26] INFO: Saved item standardization map