In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split

In [11]:

# Load the dataset into a Pandas DataFrame
df_filtered = pd.read_csv("C:/Users/vrl2k/Desktop/MS/Spring 2023/Deep Learning/Project/Proj_mimic_out/labeled_records.csv")
df_icd = pd.read_csv("C:/Users/vrl2k/Desktop/MS/Spring 2023/Deep Learning/Project/MIMIC db/D_ICD_DIAGNOSES.csv")

In [12]:
# concatenate 'short' column based on 'HADM' and 'SEQM'
df_short = df_filtered.groupby(['SUBJECT_ID', 'HADM_ID', (df_filtered['SEQ_NUM']-1)//10])['SHORT_TITLE'].apply(lambda x: ', '.join(x)).reset_index()
df_short.rename(columns={'SHORT_TITLE': 'SHORT_CONCAT'}, inplace=True)

# concatenate 'long' column based on 'HADM' and 'SEQM'
df_long = df_filtered.groupby(['SUBJECT_ID', 'HADM_ID', (df_filtered['SEQ_NUM']-1)//10])['LONG_TITLE'].apply(lambda x: ', '.join(x)).reset_index()
df_long.rename(columns={'LONG_TITLE': 'LONG_CONCAT'}, inplace=True)


# merge the two dataframes based on 'HADM' and SEQM_group
df_concatenated = pd.merge(df_short, df_long, on=['SUBJECT_ID','HADM_ID', 'SEQ_NUM'])
df_concatenated.nunique()

SUBJECT_ID      39760
HADM_ID         52074
SEQ_NUM             4
SHORT_CONCAT    78106
LONG_CONCAT     78156
dtype: int64

In [13]:
#check for ICD-9 code for keywords 'Heart failure'
# Define the keywords to search for
keywords = ['heart failure']

# Create a new column with 1 if any of the keywords are found in the ICD9 code title, 0 otherwise
df_concatenated['hf_all'] = df_concatenated['LONG_CONCAT'].apply(lambda x: 1 if any(keyword in x.lower() for keyword in keywords) else 0)

#check for ICD-9 code for keywords 'Heart failure'
# Define the keywords to search for
keywords = ['without heart failure']

# Create a new column with 1 if any of the keywords are found in the ICD9 code title, 0 otherwise
df_concatenated['hf_no'] = df_concatenated['LONG_CONCAT'].apply(lambda x: 1 if any(keyword in x.lower() for keyword in keywords) else 0)

df_concatenated['HF'] = df_concatenated['hf_all'] - df_concatenated['hf_no']

#check for ICD-9 code for keywords 'Heart failure'
# Define the keywords to search for
keywords = ['Diabetes','diabetes']

# Create a new column with 1 if any of the keywords are found in the ICD9 code title, 0 otherwise
df_concatenated['Diabetes'] = df_concatenated['LONG_CONCAT'].apply(lambda x: 1 if any(keyword in x.lower() for keyword in keywords) else 0)

df_concatenated = df_concatenated.drop(['hf_all','hf_no'], axis=1)

df_concatenated.to_csv('C:/Users/vrl2k/Desktop/MS/Spring 2023/Deep Learning/Project/Proj_mimic_out/patient_wICD_concat.csv', index=False)

In [14]:
# Define the number of splits
num_splits = 5

df_short = df_concatenated.drop(['LONG_CONCAT'], axis=1)

# Calculate the split size
split_size = int(np.ceil(len(df_short) / num_splits))

# Split the dataframe
df_split_short = [df_short[i:i+split_size] for i in range(0, len(df_short), split_size)]

In [15]:
for i, df in enumerate(df_split_short):
    filename = f"C:/Users/vrl2k/Desktop/MS/Spring 2023/Deep Learning/Project/Proj_mimic_out/patient_split_short_{i+1}.csv" # create filename for each dataframe
    df.to_csv(filename, index=False) # write dataframe to CSV file

In [16]:
hf_short_terms = df_icd.loc[df_icd['LONG_TITLE'].str.contains('heart failure'), 'SHORT_TITLE'].tolist()

hf_short_terms_wcomma = [item + ',' for item in hf_short_terms]

In [17]:
#df_split_short_mask = df_split_short_mask.reset_index(drop=True)
# Define the masking probabilities
delete_prob = 0.8
unchanged_prob = 0.1
replace_prob = 0.1

def mask_short_concat(row):
    # Check if any hf_term is present in the string
    if any(term in row for term in hf_short_terms_wcomma):
        # Apply the masking strategy
        prob = random.uniform(0, 1)
        if prob <= 0.8:
            # Delete the hf_term from the string
            for term in hf_short_terms_wcomma:
                row = row.replace(term, '')
        elif prob <= 0.9:
            # Do nothing
            pass
        else:
            # Replace the hf_term with 'Asthmatic'
            for term in hf_short_terms_wcomma:
                row = row.replace(term, 'Asthmatic, ')
    return row

# Loop through the first 4 dataframes in the list
for i in range(4):
    # Loop through the 'SHORT_CONCAT' column
    df_split_short_mask[i]['SHORT_CONCAT'] = df_split_short_mask[i]['SHORT_CONCAT'].apply(mask_short_concat)

        

In [18]:
for i, df in enumerate(df_split_short_mask):
    filename = f"C:/Users/vrl2k/Desktop/MS/Spring 2023/Deep Learning/Project/Proj_mimic_out/patient_split_shortmask_{i+1}.csv" # create filename for each dataframe
    df.to_csv(filename, index=False) # write dataframe to CSV file

In [19]:
df_short['SHORT_CONCAT'].to_csv('C:/Users/vrl2k/Desktop/MS/Spring 2023/Deep Learning/Project/Proj_mimic_out/patient_shortconcat.csv', index=False)

In [20]:
# Creating model files without any of the terms related to 'heart failure' and 'Diabetes' (diseases being trained for)
all_short_terms = df_icd.loc[df_icd['LONG_TITLE'].str.contains('heart failure|diabetes|Diabetes'), 'SHORT_TITLE'].tolist()
#all_short_terms_wcomma = [item + ',' for item in all_short_terms]

In [21]:
df_split_short_removed = df_split_short
def remove_short_concat(row):
    # Check if any all_short_term is present in the string
    for term in all_short_terms:
        if term in row or term + ',' in row:
            row = row.replace(term, '').replace(term + ',', '')
    
    return row

# Loop through the first 4 dataframes in the list
for i in range(4):
    # Loop through the 'SHORT_CONCAT' column
    df_split_short_removed[i]['SHORT_CONCAT'] = df_split_short_removed[i]['SHORT_CONCAT'].apply(remove_short_concat)
    
for i, df in enumerate(df_split_short_removed):
    filename = f"C:/Users/vrl2k/Desktop/MS/Spring 2023/Deep Learning/Project/Proj_mimic_out/patient_split_shortremoved_{i+1}.csv" # create filename for each dataframe
    df.to_csv(filename, index=False) # write dataframe to CSV file

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
