# Prerequisites

## load repo

In [None]:
import os
from pathlib import Path

# Root directory the notebook works from; the repository is cloned beneath it.
PROJECT_DIR = '/content'
os.chdir(PROJECT_DIR)
print("Current Working Directory:", Path.cwd())

Current Working Directory: /content


In [None]:
# Delete any existing "pdm" directory so the clone in the next cell starts clean.
!if [ -d "pdm" ]; then rm -rf pdm; fi

In [None]:
# Clone the project into the current working directory (creates ./pdm).
!git clone https://github.com/soroushdty/pdm.git
REPO_DIR = Path.cwd() / "pdm"
print(REPO_DIR)
# Later cells import the repo's `funcs` package and build paths from Path.cwd(),
# so the checkout is made the working directory.
os.chdir(REPO_DIR)
print("Current Working Directory:", Path.cwd())

Cloning into 'pdm'...
remote: Enumerating objects: 301, done.[K
remote: Counting objects: 100% (141/141), done.[K
remote: Compressing objects: 100% (138/138), done.[K
remote: Total 301 (delta 82), reused 2 (delta 2), pack-reused 160 (from 1)[K
Receiving objects: 100% (301/301), 166.25 KiB | 1.87 MiB/s, done.
Resolving deltas: 100% (163/163), done.
/content/pdm
Current Working Directory: /content/pdm


## Install Prerequisites

In [None]:
from funcs.requirements_utils import install_missing

# Check requirements.txt in the working directory via the repo helper and
# report whatever it returns (an empty list in the recorded run).
installed = install_missing(Path.cwd().joinpath("requirements.txt"), quiet=False)
print("Installed:", installed)

Installed: []


## load config

In [None]:
# Read the project configuration from config.json in the working directory.
from funcs.config_loader import load_config

CONFIG_PATH = Path.cwd().joinpath("config.json")
cfg = load_config(CONFIG_PATH)
# Keep `cfg` as the final expression so the notebook displays it.
cfg

{'DIR_INPUT': 'input',
 'DIR_OUTPUT': 'output',
 'patient_col': 'Patient',
 'physician_col': 'Physician',
 'item_col': 'Item',
 'classes': ['Behavioral health',
  'Diagnoses',
  'Disabilities',
  'Infectious diseases',
  'Genetics',
  'Medications',
  'Sexual and reproductive health',
  'Social determinants of health',
  'Violence',
  'Other'],
 'other_class': ['Disabilities',
  'Infectious diseases',
  'Genetics',
  'Sexual and reproductive health',
  'Violence',
  'Social determinants of health'],
 'llms': ['sentence-transformers/all-MiniLM-L6-v2',
  'sentence-transformers/all-mpnet-base-v2',
  'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
  'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
  'sentence-transformers/biomed-mpnet-base',
  'cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
  'bert-base-uncased',
  'roberta-base',
  'distilbert-base-uncased',
  'nreimers/MiniLM-L6-H384-uncased',
  'emilyalsentzer/Bio_ClinicalBERT',
  'monologg/biobert_v1.1_pubmed',
  'bionlp/

## start log

In [None]:
from funcs.logger_setup import setup_logger
from funcs.preprocessing import run as preprocessing

# The log file lives inside the configured output directory.
DIR_LOG = Path.cwd().joinpath(cfg["DIR_OUTPUT"], "log.txt")
logger = setup_logger(DIR_LOG)
print("Logger path:", DIR_LOG)

[00:33:30 02-02-26] INFO: Logging initialized at: /content/pdm/output/log.txt
Logger path: /content/pdm/output/log.txt


# Preprocessing

In [None]:
# Run the preprocessing step with the loaded config; the result is indexed as
# p['paths'][...] in the next cell to locate the files it produced.
p = preprocessing(cfg)
print('\n', p)

[00:34:04 02-02-26] INFO: [train] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[00:34:04 02-02-26] INFO: [train] Unique (Patient, Item) pairs found: 879
[00:34:07 02-02-26] INFO: [train] Rows before: 1758; rows after: 879; pairs merged: 879
[00:34:07 02-02-26] INFO: [test] Class columns to average: ['Behavioral health', 'Diagnoses', 'Disabilities', 'Infectious diseases', 'Genetics', 'Medications', 'Sexual and reproductive health', 'Social determinants of health', 'Violence', 'Other']
[00:34:07 02-02-26] INFO: [test] Unique (Patient, Item) pairs found: 120
[00:34:07 02-02-26] INFO: [test] Rows before: 240; rows after: 120; pairs merged: 120
[00:34:32 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge_map_train.json
[00:34:33 02-02-26] INFO: Saved item standardization map: /content/pdm/output/merge

In [None]:
# Pull the four output locations reported by preprocessing and echo them,
# one per line, in the order train CSV, test CSV, train map, test map.
train, test, train_map, test_map = (
    p['paths'][key]
    for key in ('train_csv', 'test_csv', 'train_map', 'test_map')
)
print(train, test, train_map, test_map, sep='\n')

/content/pdm/output/train.csv
/content/pdm/output/test.csv
/content/pdm/output/merge_map_train.json
/content/pdm/output/merge_map_test.json


# Generate y label files (.npz)

In [None]:
import numpy as np
import pandas as pd

# Load the preprocessed splits; from here on `train` and `test` are DataFrames
# (the previous cell bound these names to path values).
train, test = map(
    pd.read_csv,
    ('/content/pdm/output/train.csv', '/content/pdm/output/test.csv'),
)

In [None]:
# TODO: where is the similarity method configured in cfg?

# Label columns: every configured class that is not listed in cfg['other_class'].
col_included = [c for c in cfg['classes'] if c not in cfg['other_class']]

# NOTE(review): reset_index() without drop=True prepends the row index as the
# first column of each saved array — confirm downstream readers expect that.

# Train labels. The original read from an undefined name `z` here, which fails
# on a fresh kernel; the selection made on the line above is what must be exported.
z_train = train.loc[:, col_included]
y_train = z_train.reset_index().to_numpy()
y_train_file_name = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_train.npz'
np.savez_compressed(y_train_file_name, **{'0': y_train})

# Test labels, built from the test selection (not from the train one).
z_test = test.loc[:, col_included]
y_test = z_test.reset_index().to_numpy()
y_test_file_name = Path.cwd() / cfg['DIR_OUTPUT'] / 'y_test.npz'
np.savez_compressed(y_test_file_name, **{'0': y_test})

# embedding

In [None]:
import numpy as np
import pandas as pd
train = pd.read_csv('/content/pdm/output/train.csv')
test = pd.read_csv('/content/pdm/output/test.csv')