# Prerequisites

## load repo

In [1]:
import os
from pathlib import Path

# Workspace root; the working directory is switched here before anything else runs.
PROJECT_DIR = '/content'
os.chdir(PROJECT_DIR)
print(f"Current Working Directory: {Path.cwd()}")

Current Working Directory: /content


In [2]:
!if [ -d "pdm" ]; then rm -rf pdm; fi

In [3]:
!git clone https://github.com/soroushdty/pdm.git
REPO_DIR = Path.cwd() / "pdm"
print(REPO_DIR)
os.chdir(REPO_DIR)
print("Current Working Directory:", Path.cwd())

Cloning into 'pdm'...
remote: Enumerating objects: 459, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (153/153), done.[K
remote: Total 459 (delta 97), reused 3 (delta 3), pack-reused 303 (from 2)[K
Receiving objects: 100% (459/459), 1.25 MiB | 4.38 MiB/s, done.
Resolving deltas: 100% (264/264), done.
/content/pdm
Current Working Directory: /content/pdm


## Import Prerequisites

In [4]:
from funcs.requirements_utils import install_missing
# install_missing is a helper from the cloned repo; presumably it installs the packages
# listed in requirements.txt that are not yet available and returns what it installed
# (the cell output shows a list) -- verify against funcs/requirements_utils.py.
installed = install_missing(Path.cwd().joinpath("requirements.txt"), quiet=False)
print(f"Installed: {installed}")

Installed: []


In [5]:
import numpy as np
import pandas as pd
from funcs.config_loader import load_config
from funcs.logger_setup import setup_logger
from funcs.preprocessing import run as preprocessing
from funcs.mean_pooling import mean_pooling
from funcs.sanitize_col_name import sanitize_col_name
from funcs. compute_embeddings import compute_embeddings



## load config

In [6]:
# Read the project settings from config.json in the current working directory.
CONFIG_PATH = Path.cwd().joinpath("config.json")
cfg = load_config(CONFIG_PATH)
cfg  # bare last expression: echoes the loaded settings as the cell output

{'DIR_INPUT': 'input',
 'DIR_OUTPUT': 'output',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other'],
 'other_class': ['Disabilities',
  'Infectious diseases',
  'Genetics',
  'Sexual and reproductive health',
  'Violence',
  'Social determinants of health'],
 'llms': ['sentence-transformers/all-MiniLM-L6-v2',
  'sentence-transformers/all-mpnet-base-v2',
  'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
  'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
  'sentence-transformers/biomed-mpnet-base',
  'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
  'bert-base-uncased',
  'roberta-base',
  'distilbert-base-uncased',
  'nreimers/MiniLM-L6-H384-uncased',
  'emilyalsentzer/Bio_ClinicalBERT',
  'monologg/biobert_v1.1_pubmed',
  'bionlp/

## start log

In [7]:
# NOTE: despite its name, DIR_LOG is the path of the log *file* (log.txt) inside the
# configured output directory, not a directory.
DIR_LOG = Path.cwd().joinpath(cfg["DIR_OUTPUT"], "log.txt")
logger = setup_logger(DIR_LOG)
print(f"Logger path: {DIR_LOG}")

[03:30:11 02-02-26] INFO: Logging initialized at: /content/pdm/output/log.txt
Logger path: /content/pdm/output/log.txt


# Preprocessing (don't run)

In [None]:
# Run the repo's preprocessing step with the loaded config and show what it returns.
# The next cell reads the generated file locations from p['paths'].
p = preprocessing(cfg)
print(f'\n {p}')

[01:25:39 02-02-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[01:25:39 02-02-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[01:25:41 02-02-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[01:25:41 02-02-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[01:25:41 02-02-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[01:25:41 02-02-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[01:26:06 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge_map_train.json
[01:26:07 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge

In [None]:
# Unpack the artefact locations reported by preprocessing: the two CSV splits and the
# two item-standardization maps.
train, test, train_map, test_map = (
    p['paths'][key] for key in ('train_csv', 'test_csv', 'train_map', 'test_map')
)
print(train, test, train_map, test_map, sep='\n')

/content/pdm/output/train.csv
/content/pdm/output/test.csv
/content/pdm/output/merge_map_train.json
/content/pdm/output/merge_map_test.json


# y .npz file generation (don't run)

In [None]:

# Load the preprocessed splits from the configured output directory rather than a
# hard-coded absolute path, matching how the other cells build their output paths.
# With the notebook's setup (cwd = /content/pdm, DIR_OUTPUT = 'output') this resolves
# to the same files as before.
train_csv = pd.read_csv(Path.cwd() / cfg['DIR_OUTPUT'] / 'train.csv')
test_csv = pd.read_csv(Path.cwd() / cfg['DIR_OUTPUT'] / 'test.csv')

In [None]:
# TODO: the similarity method is not defined in cfg -- decide where it should be configured.

# Label columns: every configured class that is not listed under 'other_class'.
col_included = [c for c in cfg['classes'] if c not in cfg['other_class']]


def _save_label_matrix(labels, split, keep_index=True):
    """Convert a label frame to a NumPy matrix and store it as ``y_<split>.npz``.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame already restricted to the label columns.
    split : str
        Split name used in the output file name (``'train'`` or ``'test'``).
    keep_index : bool, default True
        When True (the notebook's existing behaviour) the frame's index is
        prepended as the first column via ``reset_index()``, so the matrix has
        one more column than ``labels``. Pass False to store only the label
        columns.
        NOTE(review): confirm the index column is really wanted inside ``y``.

    Returns
    -------
    tuple
        ``(matrix, file_path)``: the saved array and the path it was written to.
    """
    frame = labels.reset_index() if keep_index else labels
    matrix = frame.to_numpy()
    file_path = Path.cwd() / cfg['DIR_OUTPUT'] / f'y_{split}.npz'
    # The array is stored under the key '0'; readers must index the archive with it.
    np.savez_compressed(file_path, **{'0': matrix})
    return matrix, file_path


z_train = train_csv.loc[:, col_included]
y_train, y_train_file_name = _save_label_matrix(z_train, 'train')

z_test = test_csv.loc[:, col_included]
y_test, y_test_file_name = _save_label_matrix(z_test, 'test')

# Embedding (don't run)

In [7]:

# Reload the preprocessed splits from the configured output directory instead of a
# hard-coded absolute path (same files under the notebook's setup).
train_csv = pd.read_csv(Path.cwd() / cfg['DIR_OUTPUT'] / 'train.csv')
test_csv = pd.read_csv(Path.cwd() / cfg['DIR_OUTPUT'] / 'test.csv')

y_train_path = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_train.npz'
y_test_path = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_test.npz'
# Each archive holds a single array under the key '0'. Extract the array and close
# the file handle, so y_train / y_test are plain arrays (as in the base-model cell)
# rather than open NpzFile objects.
with np.load(y_train_path) as _archive:
    y_train = _archive['0']
with np.load(y_test_path) as _archive:
    y_test = _archive['0']

DEBUG:2026-02-02 09:37:21,089:jax._src.path:41: etils.epath found. Using etils.epath for file I/O.


[02:37:23 02-02-26] INFO: TensorFlow version 2.19.0 available.
[02:37:23 02-02-26] INFO: JAX version 0.7.2 available.


In [25]:
# Try the pipeline on the first configured model, using the configured batch size.
BATCH_SIZE = cfg['BATCH_SIZE']
llm_example = cfg['llms'][0]
print(llm_example, BATCH_SIZE, sep='\n')

sentence-transformers/all-MiniLM-L6-v2
128


In [24]:
# Take the item column name from the config instead of hard-coding 'Item'.
# Assumes the preprocessed CSVs keep the configured column name (currently 'Item') --
# confirm against funcs/preprocessing.py if the config value ever changes.
item_col = cfg['item_col']
items_train = train_csv[item_col]
items_test = test_csv[item_col]

# Unique items across both splits, sorted, with a fresh 0..n-1 index.
items = (
    pd.concat([items_train, items_test])
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
print(items.shape)

embedding = compute_embeddings(llm_example, items, BATCH_SIZE)
# The save cell pairs items[i] with embedding[i], so the row counts must agree.
assert len(embedding) == len(items), (
    f"compute_embeddings returned {len(embedding)} rows for {len(items)} items"
)
print(embedding.shape)

(651,)
Computing embeddings for sentence-transformers/all-MiniLM-L6-v2...
Attempting to load sentence-transformers/all-MiniLM-L6-v2 as SentenceTransformer...
[03:06:12 02-02-26] INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Success with SentenceTransformer.
(651, 384)


In [31]:
# Build a file-name-safe version of the model id (see the printed value below).
sanitized = sanitize_col_name(llm_example)
print(f'LLM:  {sanitized}')

# NOTE: the 'embdedding' spelling is load-bearing -- the base-model cell opens the file
# under exactly this name, so it must not be corrected in only one place.
embedding_file_name = Path.cwd().joinpath(
    cfg['DIR_OUTPUT'], f'embdedding-{sanitized}-item.npz'
)

# One archive entry per item: key = item text, value = that item's embedding row.
# NOTE(review): items are passed as keyword names, so an item literally called 'file'
# would collide with savez_compressed's own first parameter -- confirm this cannot occur.
item_embedding_dict = {
    name: embedding[pos] for pos, name in enumerate(items.to_list())
}
np.savez_compressed(embedding_file_name, **item_embedding_dict)
print(f'Saved to:  {embedding_file_name}')

LLM:  sentence_transformers__all_MiniLM_L6_v2
Saved to:  /content/pdm/output/embdedding-sentence_transformers__all_MiniLM_L6_v2-item.npz


# base model

In [12]:
import numpy as np
# Item embeddings written by the embedding section (file name includes its
# 'embdedding' spelling). The archive is kept open because later cells use `x`
# as a mapping (e.g. x.keys()).
x_path = '/content/pdm/output/embdedding-sentence_transformers__all_MiniLM_L6_v2-item.npz'
x = np.load(x_path)

y_train_path = '/content/pdm/output/y_train.npz'
y_test_path = '/content/pdm/output/y_test.npz'
# Each label archive stores one array under the key '0'; read it inside a context
# manager so the underlying file handle is closed instead of left dangling.
with np.load(y_train_path) as _archive:
    y_train = _archive['0']
with np.load(y_test_path) as _archive:
    y_test = _archive['0']

# Locations of the item-standardization maps saved by preprocessing (kept as strings).
train_map = '/content/pdm/output/merge_map_train.json'
test_map = '/content/pdm/output/merge_map_test.json'

In [11]:
# Sanity check: number of entries in the embedding archive, then the label-matrix shapes.
print(len(x.keys()), y_train.shape, y_test.shape, sep='\n')

651
(868, 5)
(89, 5)


In [13]:
# NOTE(review): this import currently fails with
# "ModuleNotFoundError: No module named 'funcs.get_dataset_from_npz'" (see the cell
# output), i.e. the helper is not importable from the cloned repo at this point.
# TODO: add/push funcs/get_dataset_from_npz.py (or fix the module name) before running.
from funcs.get_dataset_from_npz import get_dataset_from_npz
# Presumably assembles the train/test datasets from the item embeddings, the label
# matrices and the two merge-map paths -- verify once the helper exists.
get_dataset_from_npz(x, y_train, y_test, train_map, test_map)

ModuleNotFoundError: No module named 'funcs.get_dataset_from_npz'