# Process the original dataset

## Load the original data

In [81]:
import pandas as pd

In [82]:
# Read the triage CSV and keep only the columns used below, discarding every
# row that has a missing value in any of them.
# NOTE(review): the path is absolute and machine-specific.
triage_csv_path = '/data/triage/Altered_Level_of_Consciousness.csv'
select_column = ['DISPOSITION', 'Heart.Rate', 'SystolicBP', 'DistolicBP', 'TriageNote']
datas = (
    pd.read_csv(triage_csv_path)
    .loc[:, select_column]
    .dropna(axis='index', how='any')
)

## Create a Hugging Face `datasets.Dataset` from the data

In [83]:
import datasets

In [None]:
# Output column name -> DataFrame column it is read from. The note column is
# exposed as 'text', and 'DistolicBP' is stored under 'DiastolicBP'.
column_sources = {
    'text': 'TriageNote',
    'DISPOSITION': 'DISPOSITION',
    'Heart.Rate': 'Heart.Rate',
    'SystolicBP': 'SystolicBP',
    'DiastolicBP': 'DistolicBP',
}
dataset = datasets.Dataset.from_dict(
    {target_name: list(datas[source_name]) for target_name, source_name in column_sources.items()}
)

dataset

# process new dataset

## process DISPOSITION

In [None]:
def _label_disposition(example):
    """Replace the DISPOSITION code with a word: "D" -> "discharge", anything else -> "admittance"."""
    if example['DISPOSITION'] == "D":
        return {'DISPOSITION': "discharge"}
    return {'DISPOSITION': "admittance"}


# Not idempotent: running this cell a second time turns every row into
# "admittance", because no value equals "D" any more.
dataset = dataset.map(_label_disposition, num_proc=32)

dataset

In [None]:
# Lower-case the label column name.
dataset = dataset.rename_column(
    original_column_name='DISPOSITION',
    new_column_name='disposition',
)

dataset

## process Heart.Rate, SystolicBP and DiastolicBP

In [None]:
# Work on a copy of the current schema so it can be edited before casting.
current_features = dataset.features
new_features = current_features.copy()

new_features

In [88]:
# Declare the three vital-sign columns as 32-bit integers in the new schema.
for vital_column in ("Heart.Rate", "SystolicBP", "DiastolicBP"):
    new_features[vital_column] = datasets.Value("int32")

In [None]:
# Convert the dataset to the edited schema.
dataset = dataset.cast(features=new_features)

dataset

In [None]:
# Switch the vital-sign columns to snake_case names; process_text and the
# candidate-list step refer to them by these names.
vital_sign_renames = {
    'Heart.Rate': 'heart_rate',
    "SystolicBP": "systolic_blood_pressure",
    "DiastolicBP": "diastolic_blood_pressure",
}
dataset = dataset.rename_columns(column_mapping=vital_sign_renames)

dataset

## process text

In [None]:
# Show the note text of one row before it is rewritten.
sample_row = dataset[3]
sample_row["text"]

In [None]:
import re

def process_text(example):
    """Normalise the note text and prefix it with one sentence per attribute.

    Args:
        example: a mapping with a string under 'text' and values under
            'disposition', 'heart_rate', 'systolic_blood_pressure' and
            'diastolic_blood_pressure'.

    Returns:
        ``{"text": new_text}`` where ``new_text`` is four sentences of the form
        "The <attribute name with spaces> is <value>." followed by the cleaned
        note. Cleaning trims the note, collapses each whitespace run to one
        space and appends "." when the note does not already end with one.
        A note that is empty after trimming contributes nothing (previously
        this raised IndexError on ``text[-1]``).
    """
    # Trim first, then squeeze internal whitespace (newlines, tabs, runs of spaces).
    note = re.sub(r'\s+', ' ', example['text'].strip())
    # Guard on emptiness: indexing the last character of "" would raise.
    if note and not note.endswith("."):
        note += "."

    sentences = [
        f"The {attribute_name.replace('_', ' ')} is {example[attribute_name]}."
        for attribute_name in ("disposition", 'heart_rate', 'systolic_blood_pressure', 'diastolic_blood_pressure')
    ]
    if note:
        sentences.append(note)
    return {"text": " ".join(sentences)}

# Rewrite every row's 'text' with process_text, using 32 worker processes.
dataset = dataset.map(function=process_text, num_proc=32)

dataset

## create candidate list

In [None]:
import random
from collections import Counter

# Number of entries in each sampled candidate list (target included).
candidate_num = 10

# Fix the global RNG so the sampled candidates are repeatable.
seed = 2023
random.seed(seed)

def create_candidate(example):
    """Build the candidate list for the attribute currently being processed.

    Reads three module-level names set by the surrounding cell: ``attribute``
    (column being processed), ``attribute_counter`` (mapping whose keys are the
    distinct values of that column) and ``candidate_num`` (target list length).

    Args:
        example: a dataset row; ``example[attribute]`` is the true value.

    Returns:
        ``{f"{attribute}_candidate_list": candidate_list}``. For "disposition"
        the list is the two labels with the row's label first. For any other
        attribute it is the true value followed by distinct other values drawn
        with ``random.sample``. At most ``candidate_num - 1`` are drawn, fewer
        when the column has fewer other distinct values (previously that case
        raised ValueError from ``random.sample``).
    """
    target = example[attribute]
    if attribute == "disposition":
        # Binary label: the row's own label comes first.
        if target == "discharge":
            candidate_list = ["discharge", "admittance"]
        else:
            candidate_list = ["admittance", "discharge"]
    else:
        # Every distinct value except the true one, in the counter's key order.
        distractor_pool = [value for value in attribute_counter if value != target]
        # Never ask random.sample for more items than the pool holds.
        distractor_count = min(candidate_num - 1, len(distractor_pool))
        candidate_list = [target] + random.sample(distractor_pool, distractor_count)

    return {f"{attribute}_candidate_list": candidate_list}


attribute_list = [
    'disposition',
    'heart_rate',
    'systolic_blood_pressure',
    'diastolic_blood_pressure',
]

# create_candidate reads the globals `attribute` and `attribute_counter`, so
# both must keep these exact names and be rebound before each map call.
for attribute in attribute_list:
    column_values = dataset[attribute]
    attribute_counter = Counter(column_values)
    dataset = dataset.map(create_candidate, num_proc=32)

dataset

In [94]:
# Persist the processed dataset; the embedding commands below read this path.
dataset.save_to_disk(dataset_path="./tmp/triage")

                                                                                               

## get embedding & save

### sup-simcse-bert-base-uncased

In [None]:
# NOTE(review): "your_output_dir" is a placeholder, and 4668 presumably equals
# the row count of ./tmp/triage; confirm both before running.
# The final line has no trailing backslash: nothing follows it to continue.
!torchrun --nproc_per_node=8 ../embedding/sup-simcse-bert-base-uncased.py \
    --input_dataset "./tmp/triage" \
    --output_dataset "your_output_dir" \
    --train_size 0 \
    --valid_size 0 \
    --test_size 4668

### e5-large-v2

In [None]:
# NOTE(review): "your_output_dir" is a placeholder, and 4668 presumably equals
# the row count of ./tmp/triage; confirm both before running.
# The final line has no trailing backslash: nothing follows it to continue.
!torchrun --nproc_per_node=8 ../embedding/e5-large-v2.py \
    --input_dataset "./tmp/triage" \
    --output_dataset "your_output_dir" \
    --train_size 0 \
    --valid_size 0 \
    --test_size 4668

### bge-large-en

In [None]:
# NOTE(review): "your_output_dir" is a placeholder, and 4668 presumably equals
# the row count of ./tmp/triage; confirm both before running.
# The final line has no trailing backslash: nothing follows it to continue.
!torchrun --nproc_per_node=8 ../embedding/bge-large-en.py \
    --input_dataset "./tmp/triage" \
    --output_dataset "your_output_dir" \
    --train_size 0 \
    --valid_size 0 \
    --test_size 4668