In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
diagnoses_labels = pd.read_csv('../MIMIC_3_data/D_ICD_DIAGNOSES.csv.gz', compression='gzip')

print(diagnoses_labels.head())

   ROW_ID ICD9_CODE               SHORT_TITLE  \
0     174     01166     TB pneumonia-oth test   
1     175     01170    TB pneumothorax-unspec   
2     176     01171   TB pneumothorax-no exam   
3     177     01172  TB pneumothorx-exam unkn   
4     178     01173  TB pneumothorax-micro dx   

                                          LONG_TITLE  
0  Tuberculous pneumonia [any form], tubercle bac...  
1              Tuberculous pneumothorax, unspecified  
2  Tuberculous pneumothorax, bacteriological or h...  
3  Tuberculous pneumothorax, bacteriological or h...  
4  Tuberculous pneumothorax, tubercle bacilli fou...  


In [4]:
clinical_notes = pd.read_csv('../MIMIC_3_data/NOTEEVENTS.csv.gz', compression='gzip')

print(clinical_notes.head())

  clinical_notes = pd.read_csv('../MIMIC_3_data/NOTEEVENTS.csv.gz', compression='gzip')


   ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE CHARTTIME STORETIME  \
0     174       22532  167853.0  2151-08-04       NaN       NaN   
1     175       13702  107527.0  2118-06-14       NaN       NaN   
2     176       13702  167118.0  2119-05-25       NaN       NaN   
3     177       13702  196489.0  2124-08-18       NaN       NaN   
4     178       26880  135453.0  2162-03-25       NaN       NaN   

            CATEGORY DESCRIPTION  CGID  ISERROR  \
0  Discharge summary      Report   NaN      NaN   
1  Discharge summary      Report   NaN      NaN   
2  Discharge summary      Report   NaN      NaN   
3  Discharge summary      Report   NaN      NaN   
4  Discharge summary      Report   NaN      NaN   

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  
1  Admission Date:  [**2118-6-2**]       Discharg...  
2  Admission Date:  [**2119-5-4**]              D...  
3  Admission Date:  [**2124-7-21**]              ...  
4  Admission Da

In [5]:
# filter for discharge summaries
discharge_summaries = clinical_notes[clinical_notes['CATEGORY'] == 'Discharge summary']
print(discharge_summaries.head())


   ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE CHARTTIME STORETIME  \
0     174       22532  167853.0  2151-08-04       NaN       NaN   
1     175       13702  107527.0  2118-06-14       NaN       NaN   
2     176       13702  167118.0  2119-05-25       NaN       NaN   
3     177       13702  196489.0  2124-08-18       NaN       NaN   
4     178       26880  135453.0  2162-03-25       NaN       NaN   

            CATEGORY DESCRIPTION  CGID  ISERROR  \
0  Discharge summary      Report   NaN      NaN   
1  Discharge summary      Report   NaN      NaN   
2  Discharge summary      Report   NaN      NaN   
3  Discharge summary      Report   NaN      NaN   
4  Discharge summary      Report   NaN      NaN   

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  
1  Admission Date:  [**2118-6-2**]       Discharg...  
2  Admission Date:  [**2119-5-4**]              D...  
3  Admission Date:  [**2124-7-21**]              ...  
4  Admission Da

In [6]:

diagnoses_icd9 = pd.read_csv('../MIMIC_3_data/DIAGNOSES_ICD.csv.gz', compression='gzip')
print(diagnoses_icd9.head())

   ROW_ID  SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE
0    1297         109   172335      1.0     40301
1    1298         109   172335      2.0       486
2    1299         109   172335      3.0     58281
3    1300         109   172335      4.0      5855
4    1301         109   172335      5.0      4254


In [7]:
services_data = pd.read_csv('../MIMIC_3_data/SERVICES.csv.gz', compression='gzip')
print(services_data.head())

# procedure codes
procedure_codes = pd.read_csv('../MIMIC_3_data/PROCEDURES_ICD.csv.gz', compression='gzip')
print(procedure_codes.head())

   ROW_ID  SUBJECT_ID  HADM_ID         TRANSFERTIME PREV_SERVICE CURR_SERVICE
0     758         471   135879  2122-07-22 14:07:27        TSURG          MED
1     759         471   135879  2122-07-26 18:31:49          MED        TSURG
2     760         472   173064  2172-09-28 19:22:15          NaN         CMED
3     761         473   129194  2201-01-09 20:16:45          NaN           NB
4     762         474   194246  2181-03-23 08:24:41          NaN           NB
   ROW_ID  SUBJECT_ID  HADM_ID  SEQ_NUM  ICD9_CODE
0     944       62641   154460        3       3404
1     945        2592   130856        1       9671
2     946        2592   130856        2       3893
3     947       55357   119355        1       9672
4     948       55357   119355        2        331


In [8]:
# merge diagnoses with labels to get full disease names
diagnoses_merged = diagnoses_icd9.merge(diagnoses_labels, left_on='ICD9_CODE', right_on='ICD9_CODE', how='left')

print(diagnoses_merged.head())

# length of unique ICD-9 codes
unique_icd9_codes = diagnoses_merged['ICD9_CODE'].nunique()
print(f'Number of unique ICD-9 codes: {unique_icd9_codes}')


   ROW_ID_x  SUBJECT_ID  HADM_ID  SEQ_NUM ICD9_CODE  ROW_ID_y  \
0      1297         109   172335      1.0     40301    4312.0   
1      1298         109   172335      2.0       486    5528.0   
2      1299         109   172335      3.0     58281    5892.0   
3      1300         109   172335      4.0      5855    5913.0   
4      1301         109   172335      5.0      4254    4435.0   

                SHORT_TITLE                                         LONG_TITLE  
0    Mal hyp kid w cr kid V  Hypertensive chronic kidney disease, malignant...  
1   Pneumonia, organism NOS                    Pneumonia, organism unspecified  
2  Chr nephritis in oth dis  Chronic glomerulonephritis in diseases classif...  
3  Chron kidney dis stage V                    Chronic kidney disease, Stage V  
4   Prim cardiomyopathy NEC                     Other primary cardiomyopathies  
Number of unique ICD-9 codes: 6984


In [9]:
# create organ name mapping based on ICD-9 code prefixes

# list all the organs and based on that go through the ICD-9 codes and create mapping for each organ the list of 
# ICD codes that correspond to that organ

organs =  {
    'heart',
    'lungs',
    'liver',
    'kidney',
    'brain',
    'stomach',
    'intestines',
    'pancreas',
    'bladder',
    'skin',
    'bones',
    'muscles',
    'blood vessels',
    'nerves',
    'eyes',
    'ears',
    'throat',
    'reproductive organs'
}

# loop through the diagnoses_merged and create a mapping of ICD-9 code prefixes to organ names
# organd mapping should have keys as organs and values as list of ICD-9 code prefixes
organ_mapping = {organ: [] for organ in organs}

for index, row in diagnoses_merged.iterrows():
    # make safe string conversions to avoid errors from NaN or non-string values
    icd_code = str(row.get('ICD9_CODE', ''))
    long_title = row.get('LONG_TITLE', '')
    if pd.isna(long_title):
        long_title = ''
    long_title = str(long_title).lower()

    for organ in organs:
        if organ in long_title:
            prefix = icd_code
            if prefix not in organ_mapping[organ]:
                organ_mapping[organ].append(prefix)
            
# print the organ mapping
for organ, prefixes in organ_mapping.items():
    print(f'{organ}: {prefixes}')




nerves: ['9518', '9579', '9570', '2251', '1920', '9578', '3529', '9558']
kidney: ['40301', '5855', '28521', '5845', '5849', '5854', '40390', '1890', '5859', '40391', '5853', '5939', '58089', 'E8791', 'V1052', 'V4573', '5920', '40492', '99681', '40311', '5829', '86612', '59381', '58389', 'V420', '7533', '40310', '86602', '5932', '75312', '5852', '40300', '86600', '5839', '44581', '59389', '1980', '75310', '5819', '5848', '2230', '23691', '75319', '5846', '5809', '40491', '40493', '86610', '75313', '40403', '86601', '40401', '86611', '58189', '66932', '86603', 'E8702', '7944', 'V1869', '58289', '40413', '5909', '5847', '5851', '40490', 'V1651', '86613', '75317', '40400', '66934', 'E8742', '40411', '5931', '5890', '75314']
bladder: ['57410', '5964', 'V1051', '5960', '57420', '8670', '57400', '57421', '57490', '57491', '57471', '5759', '57460', '5758', '34461', '1882', '8671', '1560', '5755', '59654', '57401', '5967', '57461', '1888', 'V435', '57481', '57470', '5753', '7535', '1880', '7536

In [10]:
# use notes for context and predict organ based on ICD-9 codes
# for each discharge summary, get the corresponding ICD-9 codes from diagnoses_merged
# then use the organ mapping to predict which organs are involved based on the ICD-9 codes

# what is HADM_ID in discharge_summaries
print(discharge_summaries['HADM_ID'].head())

discharge_summaries['ICD9_CODES'] = discharge_summaries['HADM_ID'].map(
    lambda hadm_id: diagnoses_merged[diagnoses_merged['HADM_ID'] == hadm_id]['ICD9_CODE'].tolist()
)
print(discharge_summaries[['HADM_ID', 'ICD9_CODES']].head())

# predict organs based on ICD-9 codes
def predict_organs(icd9_codes):
    involved_organs = set()
    for code in icd9_codes:
        for organ, prefixes in organ_mapping.items():
            if code in prefixes:
                involved_organs.add(organ)
    return list(involved_organs)

discharge_summaries['PREDICTED_ORGANS'] = discharge_summaries['ICD9_CODES'].apply(predict_organs)
print(discharge_summaries[['HADM_ID', 'PREDICTED_ORGANS']].head())

0    167853.0
1    107527.0
2    167118.0
3    196489.0
4    135453.0
Name: HADM_ID, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  discharge_summaries['ICD9_CODES'] = discharge_summaries['HADM_ID'].map(


    HADM_ID                                         ICD9_CODES
0  167853.0  [01193, 4254, 42731, 2639, 2762, 5070, 5119, 2...
1  107527.0         [5191, 49121, 51881, 486, 2761, 2449, 311]
2  167118.0               [5191, 5185, 496, 2762, 45340, 5533]
3  196489.0  [51884, 5849, 34830, 49121, 2760, 4160, 3594, ...
4  135453.0  [80506, 5070, 42823, 2930, 4538, E882, 4280, 4...
    HADM_ID PREDICTED_ORGANS
0  167853.0               []
1  107527.0               []
2  167118.0               []
3  196489.0         [kidney]
4  135453.0   [bones, heart]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  discharge_summaries['PREDICTED_ORGANS'] = discharge_summaries['ICD9_CODES'].apply(predict_organs)


In [11]:
# Create training data for LLM
# Format: Input (question/prompt) -> Output (answer with diseases)

import numpy as np
from sklearn.model_selection import train_test_split

# Step 1: Create dataset with organ-disease pairs
training_data = []

def create_training_examples():
    for _, row in discharge_summaries.iterrows():
        organs = row['PREDICTED_ORGANS']
        icd_codes = row['ICD9_CODES']
        clinical_note = row.get('TEXT', '')[:500]  # Use first 500 characters of the note as context
        
        # Skip if no organs or ICD codes
        if not organs or not icd_codes:
            continue
        
        # Get disease names for the ICD codes
        diseases = diagnoses_merged[diagnoses_merged['ICD9_CODE'].isin(icd_codes)]['LONG_TITLE'].unique()
        
        # filter out NaN (floats) and empty strings, convert to clean strings
        disease_list = [str(d).strip() for d in diseases if pd.notna(d) and str(d).strip() != '']
        if len(disease_list) == 0:
            continue
        
        # Create training examples for each organ
        for organ in organs:
            # Format 1: Question-Answer format
            question = f"What diseases can affect the {organ}?"
            answer = ", ".join(disease_list[:10])  # Limit to top 10 diseases
            
            training_data.append({
                'input': question,
                'output': answer,
                'organ': organ,
                'icd_codes': icd_codes,
                'hadm_id': row['HADM_ID']
            })
            
            # Format 2: Instruction format (for instruction-tuned models)
            instruction = f"List common diseases that affect the {organ} based on clinical data."
            training_data.append({
                'input': instruction,
                'output': answer,
                'organ': organ,
                'icd_codes': icd_codes,
                'hadm_id': row['HADM_ID']
            })

            # Format 3: Contextual format (for models that use context)
            context_prompt = f"Given the clinical note: '{clinical_note}...', what diseases might affect the {organ}?"
            training_data.append({
                'input': context_prompt,
                'output': answer,
                'organ': organ,
                'icd_codes': icd_codes,
                'hadm_id': row['HADM_ID']
            })

# if csv already exists, skip creation and set variables from csv
import os
if not os.path.exists('train_organ_diseases.csv') or not os.path.exists('val_organ_diseases.csv'):
    print("Creating training examples...")
    create_training_examples()
    print(f"Total training examples created: {len(training_data)}")
    # Convert to DataFrame
    training_df = pd.DataFrame(training_data)
    print(f"Total training examples: {len(training_df)}")
    print(training_df.head())

    # Step 2: Split into train and validation sets (80-20 split)
    train_df, val_df = train_test_split(training_df, test_size=0.2, random_state=42, stratify=training_df['organ'])

    print(f"\nTraining set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")

    # Step 3: Save to files for LLM training
    train_df.to_csv('train_organ_diseases.csv', index=False)
    val_df.to_csv('val_organ_diseases.csv', index=False)

else :
    print("Training examples CSV files already exist. Skipping creation.")
    train_df = pd.read_csv('train_organ_diseases.csv')
    val_df = pd.read_csv('val_organ_diseases.csv')


# Step 4: Create JSONL format (common for LLM fine-tuning)

# If json already exists, skip creation
if os.path.exists('train_organ_diseases.jsonl') and os.path.exists('val_organ_diseases.jsonl'):
    print("JSONL files already exist. Skipping creation.")
else:
    print("Creating JSONL files...")
    import json
    def create_jsonl(df, filename):
        with open(filename, 'w') as f:
            for _, row in df.iterrows():
                json_obj = {
                    'prompt': row['input'],
                    'completion': row['output'],
                    'metadata': {
                        'organ': row['organ'],
                        'hadm_id': row['hadm_id']
                    }
                }
                f.write(json.dumps(json_obj) + '\n')

    create_jsonl(train_df, 'train_organ_diseases.jsonl')
    create_jsonl(val_df, 'val_organ_diseases.jsonl')
    print("\nTraining data saved to:")
    print("- train_organ_diseases.csv / .jsonl")
    print("- val_organ_diseases.csv / .jsonl")

Training examples CSV files already exist. Skipping creation.
JSONL files already exist. Skipping creation.
