# PDM Preprocessing

In [1]:
# Attach Google Drive to the Colab runtime; the dataset and output folders
# used by this notebook's config live under /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True asks Colab to mount again even if Drive is already attached.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [2]:
!git clone https://github.com/soroushdty/pdm.git

Cloning into 'pdm'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 34 (delta 6), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (34/34), 19.62 KiB | 669.00 KiB/s, done.
Resolving deltas: 100% (6/6), done.


In [3]:
# Locate the cloned project, add it to PYTHONPATH, make it the working
# directory, then install any requirements that are missing.
import os
import sys
from pathlib import Path

PROJECT_NAME = "pdm"

# This cell ends by chdir-ing into the project, so on a re-run the working
# directory may already BE the project folder. Blindly appending PROJECT_NAME
# again would produce ".../pdm/pdm" and make os.chdir fail. Prefer the
# original location (cwd/PROJECT_NAME) and fall back to the cwd itself only
# when that does not exist and the cwd is already named PROJECT_NAME.
_project_candidate = (Path.cwd() / PROJECT_NAME).resolve()
if _project_candidate.is_dir():
    PROJECT_DIR = _project_candidate
elif Path.cwd().resolve().name == PROJECT_NAME:
    PROJECT_DIR = Path.cwd().resolve()
else:
    # Fail before touching sys.path / cwd, with a message that says what to do.
    raise FileNotFoundError(
        f"Project folder not found at {_project_candidate}; run the clone cell first."
    )

# Put the project first on sys.path so `funcs.*` resolves to this checkout;
# the membership test keeps re-runs from adding duplicate entries.
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))

os.chdir(PROJECT_DIR)

# Install requirements
# (imported here, after sys.path is set up, because `funcs` lives in the project)
from funcs.requirements_utils import install_missing
# Assuming requirements.txt is within the PROJECT_DIR
# NOTE(review): install_missing presumably installs only packages that are not
# yet available and returns what it installed -- confirm in funcs/requirements_utils.py.
installed = install_missing(PROJECT_DIR / "requirements.txt", quiet=False)
print("Installed:", installed)

Installed: []


In [4]:
# Read the project's config.json and validate it before anything uses it.
from funcs.config_loader import load_config, validate_config

# Kept as a plain string path; PROJECT_DIR is the pathlib.Path set up earlier.
CONFIG_PATH = str(PROJECT_DIR / "config.json")

raw_cfg = load_config(CONFIG_PATH)
cfg = validate_config(raw_cfg)

# Display the validated configuration as the cell output.
cfg

{'dataset': '/content/drive/MyDrive/pdm/input/dataset.xlsx',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'output_path': '/content/drive/MyDrive/pdm/output/1_dedup',
 'duplicates': '/content/drive/MyDrive/pdm/configs/duplicates.xlsx',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other']}

In [5]:
# Set up a per-run output directory and a log file inside it, then execute
# the pipeline against a copy of the config that points at that directory.
from pathlib import Path

from funcs.logger_setup import setup_logger
from funcs.paths import make_run_dir
from funcs.pipeline import run_pipeline

run_dir = make_run_dir(cfg["output_path"])
logger = setup_logger(run_dir / "log.txt")

# IMPORTANT: the pipeline writes its outputs into cfg["output_path"].
# Build a shallow copy whose output_path is this run's directory, so the
# original `cfg` is left untouched for any later use.
cfg_run = {**cfg, "output_path": str(run_dir)}

result = run_pipeline(cfg_run)

# Display the pipeline's summary as the cell output.
result

[16:37:20 01-25-26] INFO: Logging initialized at: /content/drive/MyDrive/pdm/output/1_dedup/16-37_JAN-25/log.txt
[16:37:23 01-25-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[16:37:23 01-25-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[16:37:25 01-25-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[16:37:25 01-25-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[16:37:25 01-25-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[16:37:26 01-25-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[16:37:52 01-25-26] INFO: Saved item standardization map: /content/dr

{'merged_counts': {'train': 879, 'test': 120},
 'paths': {'output_dir': '/content/drive/MyDrive/pdm/output/1_dedup/16-37_JAN-25',
  'train_map': '/content/drive/MyDrive/pdm/output/1_dedup/16-37_JAN-25/merge_map_train.json',
  'test_map': '/content/drive/MyDrive/pdm/output/1_dedup/16-37_JAN-25/merge_map_test.json',
  'train_csv': '/content/drive/MyDrive/pdm/output/1_dedup/16-37_JAN-25/train.csv',
  'test_csv': '/content/drive/MyDrive/pdm/output/1_dedup/16-37_JAN-25/test.csv'},
 'shapes': {'train_raw': (1758, 14),
  'test_raw': (240, 14),
  'train_final': (868, 13),
  'test_final': (89, 13)}}