<a href="https://colab.research.google.com/github/soroushdty/pdm/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDM Preprocessing

In [2]:
# Mount Google Drive at /content/drive; later cells read the project code,
# config and dataset from paths under /content/drive/MyDrive.
from google.colab import drive
# NOTE(review): force_remount=True presumably makes a re-run of this cell mount
# again instead of reusing an existing mount — confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
!git clone https://github.com/soroushdty/pdm.git

Cloning into 'pdm'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 28 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (28/28), 16.95 KiB | 5.65 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [None]:
# Make the project package importable, then install any missing requirements.
import sys
from pathlib import Path

PROJECT_DIR = Path("/content/drive/MyDrive/pdm/New")
# NOTE(review): the package is imported from this Drive folder, not from the
# ./pdm checkout created by the git clone cell — confirm that is intended.

# Fail early with an actionable message: without this check a missing folder
# (Drive not mounted, wrong path) only shows up as a ModuleNotFoundError on the
# pdm_preprocess import below.
if not PROJECT_DIR.is_dir():
    raise FileNotFoundError(
        f"Project folder not found: {PROJECT_DIR}. "
        "Mount Google Drive first and check that PROJECT_DIR is correct."
    )

# Prepend PROJECT_DIR itself (pdm_preprocess is expected to sit directly inside
# it); the membership test keeps re-runs from adding duplicate entries.
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))

# This import must stay below the sys.path update above.
from pdm_preprocess.requirements_utils import install_missing

# Requirements are read from PROJECT_DIR / "requirements.txt".
installed = install_missing(PROJECT_DIR / "requirements.txt", quiet=False)
print("Installed:", installed)

Installed: []


In [None]:
# Load the preprocessing config file and pass it through validate_config.
from pdm_preprocess.config_loader import load_config, validate_config

# Location of the JSON config on Drive — change this to use a different config.
CONFIG_PATH = "/content/drive/MyDrive/pdm/configs/1_preprocessing.json"

raw_cfg = load_config(CONFIG_PATH)
cfg = validate_config(raw_cfg)
cfg  # last expression of the cell, so the resulting config is displayed

{'dataset': '/content/drive/MyDrive/pdm/input/dataset.xlsx',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'output_path': '/content/drive/MyDrive/pdm/output/1_dedup',
 'duplicates': '/content/drive/MyDrive/pdm/configs/duplicates.xlsx',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other']}

In [None]:
# Set up a per-run output directory and a logger, then execute the pipeline.
from pathlib import Path

from pdm_preprocess.paths import make_run_dir
from pdm_preprocess.logger_setup import setup_logger
from pdm_preprocess.pipeline import run_pipeline

# The run directory is created from the configured output path; the log file
# for this run is placed inside it.
run_dir = make_run_dir(cfg["output_path"])
logger = setup_logger(run_dir / "log.txt")

# IMPORTANT: the pipeline writes its outputs into cfg["output_path"], so it is
# given a copy of the config whose output_path points at this run's directory.
# Building a new dict means the assignment does not touch cfg itself.
cfg_run = {**cfg, "output_path": str(run_dir)}

result = run_pipeline(cfg_run)
result

[16:42:23 01-24-26] INFO: Logging initialized at: /content/drive/MyDrive/pdm/output/1_dedup/16-42_JAN-24/log.txt
[16:42:24 01-24-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[16:42:24 01-24-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[16:42:25 01-24-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[16:42:25 01-24-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[16:42:25 01-24-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[16:42:26 01-24-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[16:42:40 01-24-26] INFO: Saved item standardization map: /content/dr

{'merged_counts': {'train': 879, 'test': 120},
 'paths': {'output_dir': '/content/drive/MyDrive/pdm/output/1_dedup/16-42_JAN-24',
  'train_map': '/content/drive/MyDrive/pdm/output/1_dedup/16-42_JAN-24/merge_map_train.json',
  'test_map': '/content/drive/MyDrive/pdm/output/1_dedup/16-42_JAN-24/merge_map_test.json',
  'train_csv': '/content/drive/MyDrive/pdm/output/1_dedup/16-42_JAN-24/train.csv',
  'test_csv': '/content/drive/MyDrive/pdm/output/1_dedup/16-42_JAN-24/test.csv'},
 'shapes': {'train_raw': (1758, 14),
  'test_raw': (240, 14),
  'train_final': (868, 13),
  'test_final': (89, 13)}}