#prequisites

In [1]:
# Set PROJECT_DIR
import os
from pathlib import Path
print("Current Working Directory:", Path.cwd())

!if [ -d "pdm" ]; then rm -rf pdm; fi
!git clone https://github.com/soroushdty/pdm.git

PROJECT_DIR = Path.cwd() / "pdm"
os.chdir(PROJECT_DIR)
print("PROJECT_DIR: ", Path.cwd())

Current Working Directory: /content
Cloning into 'pdm'...
remote: Enumerating objects: 646, done.[K
remote: Counting objects: 100% (172/172), done.[K
remote: Compressing objects: 100% (167/167), done.[K
remote: Total 646 (delta 97), reused 3 (delta 3), pack-reused 474 (from 1)[K
Receiving objects: 100% (646/646), 1.31 MiB | 6.76 MiB/s, done.
Resolving deltas: 100% (384/384), done.
PROJECT_DIR:  /content/pdm


In [2]:
# Install Requirements
from funcs.requirements_utils import install_missing
installed = install_missing(Path.cwd() / "requirements.txt", quiet=False)
print("Installed:", installed)

Installed: ['imbalanced-learn', 'iterative-stratification']


In [3]:
# Import
import numpy as np
import pandas as pd
from funcs.config_loader import load_config
from funcs.logger_setup import setup_logger
from funcs.preprocessing import run as preprocessing
from funcs.mean_pooling import mean_pooling
from funcs.sanitize_col_name import sanitize_col_name
from funcs. compute_embeddings import compute_embeddings
from funcs.train_ensemble_pipeline import train_ensemble_pipeline
from funcs.get_dataset_from_npz import get_dataset_from_npz



In [4]:
# Load config
CONFIG_PATH = Path.cwd() / "config.json"
cfg = load_config(CONFIG_PATH)
cfg

{'DIR_INPUT': 'input',
 'DIR_OUTPUT': 'output',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other'],
 'other_class': ['Disabilities',
  'Infectious diseases',
  'Genetics',
  'Sexual and reproductive health',
  'Violence',
  'Social determinants of health'],
 'llms': ['sentence-transformers/all-MiniLM-L6-v2',
  'sentence-transformers/all-mpnet-base-v2',
  'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
  'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
  'sentence-transformers/biomed-mpnet-base',
  'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
  'bert-base-uncased',
  'roberta-base',
  'distilbert-base-uncased',
  'nreimers/MiniLM-L6-H384-uncased',
  'emilyalsentzer/Bio_ClinicalBERT',
  'monologg/biobert_v1.1_pubmed',
  'bionlp/

In [5]:
# Start Logging
DIR_LOG = Path.cwd() / cfg["DIR_OUTPUT"] / "log.txt"
logger = setup_logger(DIR_LOG)

[06:54:14 02-02-26] INFO: Logging initialized at: /content/pdm/output/log.txt


# Preprocessing

In [6]:
p = preprocessing(cfg)
print('\n', p)

[06:55:23 02-02-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[06:55:23 02-02-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[06:55:24 02-02-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[06:55:24 02-02-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[06:55:24 02-02-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[06:55:24 02-02-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[06:55:38 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge_map_train.json
[06:55:39 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge

In [12]:
train_csv = pd.read_csv(PROJECT_DIR / cfg ['DIR_OUTPUT'] / 'train.csv')
test_csv = pd.read_csv(PROJECT_DIR / cfg ['DIR_OUTPUT'] / 'test.csv')
train_map = PROJECT_DIR / cfg ['DIR_OUTPUT'] / 'merge_map_train.json'
test_map = PROJECT_DIR / cfg ['DIR_OUTPUT'] / 'merge_map_test.json'
print('Shape of train.csv', train_csv.shape)
print('Shape of test.csv', test_csv.shape)
print('Path of train_map:', train_map)
print('Path of test_map:', test_map)

Shape of train.csv (868, 13)
Shape of test.csv (89, 13)
Path of train_map: /content/pdm/output/merge_map_train.json
Path of test_map: /content/pdm/output/merge_map_test.json


# y npz files generation

In [None]:
cfg # WHERE IS CIMILARITY MTEHOD?

col_included = [c for c in cfg['classes'] if c not in cfg['other_class']]

z_train = train_csv.loc[:, col_included]
y_train = z_train.reset_index().to_numpy()
y_train_file_name = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_train.npz'
np.savez_compressed(y_train_file_name, **{'0': y_train})

z_test = test_csv.loc[:, col_included]
y_test = z_test.reset_index().to_numpy()
y_test_file_name = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_test.npz'
np.savez_compressed(y_test_file_name, **{'0': y_test})

# embedding

In [7]:
y_train_path = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_train.npz'
y_test_path = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_test.npz'
y_train = np.load(y_train_path)
y_test = np.load(y_test_path)

In [25]:
llm_example = cfg['llms'][0]
BATCH_SIZE = cfg['BATCH_SIZE']
print(llm_example)
print(BATCH_SIZE)

sentence-transformers/all-MiniLM-L6-v2
128


In [24]:
items_train = train_csv['Item']
items_test = test_csv['Item']
items = pd.concat([items_train, items_test]).drop_duplicates().sort_values()
print(items.shape)
embedding = compute_embeddings(llm_example, items, BATCH_SIZE)
print(embedding.shape)

(651,)
Computing embeddings for sentence-transformers/all-MiniLM-L6-v2...
Attempting to load sentence-transformers/all-MiniLM-L6-v2 as SentenceTransformer...
[03:06:12 02-02-26] INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Success with SentenceTransformer.
(651, 384)


In [31]:
sanitized = sanitize_col_name(llm_example)
print('LLM: ', sanitized)
embedding_file_name = Path.cwd() / cfg['DIR_OUTPUT'] / f'embdedding-{sanitized}-item.npz'
item_embedding_dict = {item: embedding[i] for i, item in enumerate(items.to_list())}
np.savez_compressed(embedding_file_name, **item_embedding_dict)
print('Saved to: ', embedding_file_name)

LLM:  sentence_transformers__all_MiniLM_L6_v2
Saved to:  /content/pdm/output/embdedding-sentence_transformers__all_MiniLM_L6_v2-item.npz


# base model

In [9]:
x_path = Path.cwd() / cfg["DIR_OUTPUT"] /'embdedding-sentence_transformers__all_MiniLM_L6_v2-item.npz'
y_train_path = Path.cwd() / cfg["DIR_OUTPUT"] /'y_train.npz'
y_test_path = Path.cwd() / cfg["DIR_OUTPUT"] /'y_test.npz'
train_csv_path = Path.cwd() / cfg["DIR_OUTPUT"] / 'train.csv'
test_csv_path = Path.cwd() / cfg["DIR_OUTPUT"] / 'test.csv'
train_map = Path.cwd() / cfg["DIR_OUTPUT"] /'merge_map_train.json'
test_map = Path.cwd() / cfg["DIR_OUTPUT"] /'merge_map_test.json'

print(x_path)
print(y_train_path)
print(y_test_path)
print(train_map)
print(test_map)
print(train_csv_path)
print(test_csv_path)

/content/pdm/output/embdedding-sentence_transformers__all_MiniLM_L6_v2-item.npz
/content/pdm/output/y_train.npz
/content/pdm/output/y_test.npz
/content/pdm/output/merge_map_train.json
/content/pdm/output/merge_map_test.json
/content/pdm/output/train.csv
/content/pdm/output/test.csv


In [10]:
x = np.load(x_path)
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

print("len(x.keys): ", len(x.keys()))
print("Shape of train_df:", train_df.shape)
print("Shape of test_df:", test_df.shape)

len(x.keys):  651
Shape of train_df: (868, 13)
Shape of test_df: (89, 13)


In [11]:
y_train = np.load(y_train_path)['0']
y_test = np.load(y_test_path)['0']
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

X_train, X_test = get_dataset_from_npz(x, y_train, y_test, train_map, test_map)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of y_train: (868, 5)
Shape of y_test: (89, 5)
Shape of X_train: (868, 384)
Shape of X_test: (89, 384)


In [12]:
import logging, traceback
def main():
    try:
        train_ensemble_pipeline(X_train, y_train, X_test, y_test, cfg)
        print("Pipeline executed successfully.")
    except Exception as e:
        logging.info(f"CRITICAL ERROR: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

[06:32:57 02-02-26] INFO: Starting Training on X_train: (868, 384)...


  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explai

[06:33:11 02-02-26] INFO: CRITICAL ERROR: shape mismatch: value array of shape (174,5) could not be broadcast to indexing result of shape (174,4)


Traceback (most recent call last):
  File "/tmp/ipython-input-2116884173.py", line 4, in main
    train_ensemble_pipeline(X_train, y_train, X_test, y_test, cfg)
  File "/content/pdm/funcs/train_ensemble_pipeline.py", line 103, in train_ensemble_pipeline
    oof_preds[val_ix] = val_probs_cal
    ~~~~~~~~~^^^^^^^^
ValueError: shape mismatch: value array of shape (174,5) could not be broadcast to indexing result of shape (174,4)
