In [12]:
from pathlib import Path
from datasets import load_dataset
import pandas as pd

# Define paths using pathlib for easier directory creation
DATA_PATH = Path("../data")
OMI_PATH_processed = DATA_PATH / "processed" / "omi-health"
OMI_PATH_raw = DATA_PATH / "raw" / "omi-health"



OMI_PATH_processed.mkdir(parents=True, exist_ok=True)

In [13]:
# Load the dataset
ds_omi_health = load_dataset("omi-health/medical-dialogue-to-soap-summary")
ds_adesouza = load_dataset("adesouza1/soap_notes")

# Save the dataset to disk
ds_omi_health.save_to_disk(OMI_PATH_raw)

Saving the dataset (1/1 shards): 100%|██████████| 9250/9250 [00:00<00:00, 56816.82 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 500/500 [00:00<00:00, 46082.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 250/250 [00:00<00:00, 26471.84 examples/s]


In [14]:
if 'train' in ds_omi_health:
    train_df_omi = ds_omi_health['train'].to_pandas()

if 'validation' in ds_omi_health:
    val_df_omi = ds_omi_health['validation'].to_pandas()

if 'test' in ds_omi_health:
    test_df_omi = ds_omi_health['test'].to_pandas()

# Check the first few rows of the dataset
train_df_omi.head()

Unnamed: 0,dialogue,soap,prompt,messages,messages_nosystem
0,"Doctor: Hello, how can I help you today?\nPati...",S: The patient's mother reports that her 13-ye...,Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper..."
1,"Doctor: Hello, what brings you in today?\nPati...","S: The patient, a 21-month-old male, presented...",Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper..."
2,"Doctor: Hello, how can I help you today?\nPati...","S: Patient reports experiencing fatigue, night...",Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper..."
3,"Doctor: Hello, Patient D. How are you feeling ...","S: Patient D, a 60-year-old African American m...",Create a medical SOAP summary of this dialogue.,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper..."
4,"Doctor: Hello, I see that you have a history o...","S: The patient, a married woman with a 7-year ...",Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper..."


In [15]:
train_df_omi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9250 entries, 0 to 9249
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   dialogue           9250 non-null   object
 1   soap               9250 non-null   object
 2   prompt             9250 non-null   object
 3   messages           9250 non-null   object
 4   messages_nosystem  9250 non-null   object
dtypes: object(5)
memory usage: 361.5+ KB


In [16]:
# Let's check one example
train_df_omi.iloc[0]['dialogue']

"Doctor: Hello, how can I help you today?\nPatient: My son has been having some issues with speech and development. He's 13 years old now.\nDoctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?\nPatient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.\nDoctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. \n(After the tests)\nDoctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?\nPatient: Yes, he has all of those features. His hands are also broad and short. And his feet have mild syndactyly of the second and third toe, 

In [17]:
train_df_omi.iloc[0]['soap']

"S: The patient's mother reports that her 13-year-old son has mild to moderate speech and developmental delays and has been diagnosed with attention deficit disorder. She denies any issues with muscle tone or hypotonia. The patient also exhibits certain physical characteristics, including retrognathia, mild hypertelorism, an elongated philtrum, thin upper lip, broad and short hands, mild syndactyly of the second and third toes, and a sandal gap in both feet.\nO: An MRI of the brain showed no structural anomalies. Whole Exome Sequencing (WES) revealed a de novo frameshift variant Chr1(GRCh37):g.244217335del, NM_205768.2(ZBTB18):c.259del(p.(Leu87Cysfs*21)), indicating a premature termination codon located more than 400 codons upstream of the canonical termination codon.\nA: The primary diagnosis is a genetic disorder associated with the identified frameshift mutation, which likely contributes to the patient's speech and developmental delays and attention deficit disorder. The physical ch

In [18]:
# Separate S,O, A, P from soap into different columns
def split_soap(soap):
    # Split the SOAP note into its components
    components = soap.split('\n')
    soap_dict = {}
    for component in components:
        if ':' in component:
            key, value = component.split(':', 1)
            soap_dict[key.strip()] = value.strip()
    return soap_dict

# Apply the function to the dataframe
soap_df = train_df_omi['soap'].apply(split_soap).apply(pd.Series)

# Concatenate the original dataframe with the new soap_df
train_df_omi = pd.concat([train_df_omi, soap_df], axis=1)

# Drop the original soap column
#train_df_omi = train_df_omi.drop(columns=['soap'])

# Rename the columns
train_df_omi = train_df_omi.rename(columns={
    'S': 'subjective',
    'O': 'objective',
    'A': 'assessment',
    'P': 'plan'
})

# Check the first few rows of the updated dataframe
train_df_omi.head()

Unnamed: 0,dialogue,soap,prompt,messages,messages_nosystem,subjective,objective,assessment,plan,- Serum parathormone (PTH),...,- Slit lamp examination of the right eye,- Scheimpflug densitometry,- Ears,- Nose,- Oral,- Neck,- Additional findings on the following day,- Laboratory results,- Transvaginal ultrasound showing diffuse fibromatosis with two uterine masses,- Abdominal CT indicated increased uterine volume with two masses
0,"Doctor: Hello, how can I help you today?\nPati...",S: The patient's mother reports that her 13-ye...,Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper...",The patient's mother reports that her 13-year-...,An MRI of the brain showed no structural anoma...,The primary diagnosis is a genetic disorder as...,The management plan includes regular follow-up...,,...,,,,,,,,,,
1,"Doctor: Hello, what brings you in today?\nPati...","S: The patient, a 21-month-old male, presented...",Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper...","The patient, a 21-month-old male, presented wi...",Hip ultrasound showed no joint effusion. Spine...,Primary diagnosis is Spondylodiscitis with ass...,Initiated broad-spectrum intravenous therapy w...,,...,,,,,,,,,,
2,"Doctor: Hello, how can I help you today?\nPati...","S: Patient reports experiencing fatigue, night...",Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper...","Patient reports experiencing fatigue, night sw...","Vital signs normal. BMI 37.2 kg/m2, weight 263...",The patient presents with symptoms suggestive ...,Continue current medications. Schedule follow-...,,...,,,,,,,,,,
3,"Doctor: Hello, Patient D. How are you feeling ...","S: Patient D, a 60-year-old African American m...",Create a medical SOAP summary of this dialogue.,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper...","Patient D, a 60-year-old African American male...",Patient is currently asymptomatic. No physical...,Patient D is at an increased risk for prostate...,Plan to have a detailed conversation about PSA...,,...,,,,,,,,,,
4,"Doctor: Hello, I see that you have a history o...","S: The patient, a married woman with a 7-year ...",Create a Medical SOAP note summary from the di...,"[{'role': 'system', 'content': 'You are an exp...","[{'role': 'user', 'content': 'You are an exper...","The patient, a married woman with a 7-year his...",Physical examination confirmed hirsutism and m...,The primary diagnosis is Polycystic Ovarian Sy...,The management plan includes proceeding with i...,,...,,,,,,,,,,


In [19]:
# Only keep the columns we need
train_df_omi = train_df_omi[['dialogue', 'subjective', 'objective', 'assessment', 'plan']]

# Save the updated dataframe to disk
train_df_omi.to_csv(OMI_PATH_processed / 'train.csv', index=False)