# Prerequisites

## load repo

In [1]:
import os
from pathlib import Path

# Workspace root (presumably the Colab runtime directory — adjust when running
# elsewhere). Switching into it first gives later relative paths a known base.
PROJECT_DIR  = '/content'
os.chdir(PROJECT_DIR)
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content


In [2]:
!if [ -d "pdm" ]; then rm -rf pdm; fi

In [3]:
!git clone https://github.com/soroushdty/pdm.git
REPO_DIR = Path.cwd() / "pdm"
print(REPO_DIR)
os.chdir(REPO_DIR)
print("Current Working Directory:", Path.cwd())

Cloning into 'pdm'...
remote: Enumerating objects: 328, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 328 (delta 11), reused 3 (delta 3), pack-reused 302 (from 2)[K
Receiving objects: 100% (328/328), 178.30 KiB | 1.43 MiB/s, done.
Resolving deltas: 100% (177/177), done.
/content/pdm
Current Working Directory: /content/pdm


## Install Prerequisites

In [4]:
from funcs.requirements_utils import install_missing
# requirements.txt lives at the repo root (the current working directory).
# install_missing returns what it installed; presumably only packages that were
# absent — confirm against funcs/requirements_utils.
requirements_file = Path.cwd().joinpath("requirements.txt")
installed = install_missing(requirements_file, quiet=False)
print("Installed:", installed)

Installed: []


## load config

In [5]:
# Load config
from funcs.config_loader import load_config
# config.json sits at the repo root, which is the current working directory.
CONFIG_PATH = Path.cwd().joinpath("config.json")
cfg = load_config(CONFIG_PATH)
cfg  # bare last expression: rendered as the cell's output for inspection

{'DIR_INPUT': 'input',
 'DIR_OUTPUT': 'output',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other'],
 'other_class': ['Disabilities',
  'Infectious diseases',
  'Genetics',
  'Sexual and reproductive health',
  'Violence',
  'Social determinants of health'],
 'llms': ['sentence-transformers/all-MiniLM-L6-v2',
  'sentence-transformers/all-mpnet-base-v2',
  'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
  'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
  'sentence-transformers/biomed-mpnet-base',
  'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
  'bert-base-uncased',
  'roberta-base',
  'distilbert-base-uncased',
  'nreimers/MiniLM-L6-H384-uncased',
  'emilyalsentzer/Bio_ClinicalBERT',
  'monologg/biobert_v1.1_pubmed',
  'bionlp/

## start log

In [6]:
from funcs.logger_setup import setup_logger
from funcs.preprocessing import run as preprocessing


# NOTE: despite its name, DIR_LOG is the path of the log *file* inside the
# configured output directory, not a directory.
DIR_LOG = Path.cwd().joinpath(cfg["DIR_OUTPUT"], "log.txt")
logger = setup_logger(DIR_LOG)
print("Logger path:", DIR_LOG)

[01:25:34 02-02-26] INFO: Logging initialized at: /content/pdm/output/log.txt
Logger path: /content/pdm/output/log.txt


# Preprocessing (don't run)

In [7]:
# Run the preprocessing pipeline driven by cfg. Judging by the captured log it
# takes roughly half a minute and saves item standardization maps into the
# output directory; the returned dict exposes the produced file locations under
# p['paths'] (read in the next cell).
p = preprocessing(cfg)
print('\n', p)

[01:25:39 02-02-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[01:25:39 02-02-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[01:25:41 02-02-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[01:25:41 02-02-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[01:25:41 02-02-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[01:25:41 02-02-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[01:26:06 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge_map_train.json
[01:26:07 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge

In [8]:
# Pull the artifact locations out of the preprocessing result. At this point
# `train` / `test` are file paths, not DataFrames.
paths = p['paths']
train, test = paths['train_csv'], paths['test_csv']
train_map, test_map = paths['train_map'], paths['test_map']
for artifact in (train, test, train_map, test_map):
    print(artifact)

/content/pdm/output/train.csv
/content/pdm/output/test.csv
/content/pdm/output/merge_map_train.json
/content/pdm/output/merge_map_test.json


# y npz file generation (don't run)

In [9]:
import numpy as np
import pandas as pd
# Resolve the preprocessed CSVs from the configured output directory instead of
# a hard-coded absolute path, matching how the other cells build their paths.
# NOTE: this rebinds `train` / `test` from the path values assigned in the
# previous cell to DataFrames.
train = pd.read_csv(Path.cwd() / cfg['DIR_OUTPUT'] / 'train.csv')
test = pd.read_csv(Path.cwd() / cfg['DIR_OUTPUT'] / 'test.csv')

In [11]:
# TODO: where is the similarity method? (open question from inspecting cfg)

# Target columns: every configured class that is not listed in 'other_class'
# (presumably those are folded into "Other" — confirm).
excluded = set(cfg['other_class'])
col_included = [c for c in cfg['classes'] if c not in excluded]

output_dir = Path.cwd() / cfg['DIR_OUTPUT']

# NOTE(review): reset_index() without drop=True turns the old index into a
# leading 'index' column, so each saved array has one more column than
# col_included. Confirm this is intended before consuming the files.
z_train = train[col_included]
y_train = z_train.reset_index().to_numpy()
y_train_file_name = output_dir / 'y_train.npz'
# The archive holds a single array stored under the key '0'.
np.savez_compressed(y_train_file_name, **{'0': y_train})

z_test = test[col_included]
y_test = z_test.reset_index().to_numpy()
y_test_file_name = output_dir / 'y_test.npz'
np.savez_compressed(y_test_file_name, **{'0': y_test})

# embedding

In [13]:
import numpy as np
import pandas as pd

# Locate the target archives written by the generation section.
y_train_path = Path.cwd().joinpath(cfg['DIR_OUTPUT'], 'y_train.npz')
y_test_path = Path.cwd().joinpath(cfg['DIR_OUTPUT'], 'y_test.npz')
# NOTE: np.load on an .npz file returns a lazy NpzFile archive, not an ndarray;
# the generation cell stores each array under the key '0'.
y_train, y_test = np.load(y_train_path), np.load(y_test_path)

In [23]:
from funcs.mean_pooling import mean_pooling
from funcs.sanitize_col_name import sanitize_col_name
from funcs. compute_embeddings import compute_embeddings

In [29]:
# Pick one model and one item to smoke-test the embedding helper.
llm_example = cfg['llms'][0]
# Use the item column name declared in the config rather than a hard-coded
# 'Item', and select the first row by position so the lookup does not depend on
# the DataFrame's index labels.
item_example = train[cfg['item_col']].iloc[0]
BATCH_SIZE = cfg['BATCH_SIZE']
print(llm_example)
print(item_example)
print(BATCH_SIZE)

sentence-transformers/all-MiniLM-L6-v2
53-Year-Old
128


In [24]:
# Smoke-test the embedding helper on one model and one item.
# NOTE(review): item_example is a single string, not a list of texts — confirm
# that compute_embeddings accepts a bare string as its second argument.
embedding_example = compute_embeddings(llm_example, item_example, BATCH_SIZE)
print(embedding_example)