# MultiTask Random Forest, SVM and LSTM

In [None]:
import os
import numpy as np
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,MinMaxScaler, Normalizer,OneHotEncoder



# model selection and metrics
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score,cross_validate
from scipy.stats import randint
from sklearn.metrics import classification_report,mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay

In [None]:
# Load the three source tables (patient, past history, diagnosis).
# Each handle is scoped by a ``with`` block so it is closed as soon as pandas
# has finished parsing it, and every file is opened exactly once.
path = r'patient.csv'
with open(path) as file:
    df_patient = pd.read_csv(file)

path1 = r"pastHistory.csv"
with open(path1) as file1:
    df_pasthis = pd.read_csv(file1)

path2 = r"diagnosis.csv"
with open(path2) as file2:
    df_diag = pd.read_csv(file2)

In [None]:
df_patient.uniquepid.value_counts()

021-239346    26
021-120471    26
006-181132    24
006-100497    24
015-76326     24
              ..
015-78659      1
015-91151      1
015-54973      1
015-50909      1
015-95573      1
Name: uniquepid, Length: 139367, dtype: int64

In [None]:
df_patient.columns

Index(['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age',
       'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx',
       'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',
       'uniquepid'],
      dtype='object')

In [None]:
# Boolean mask of patient rows with no APACHE admission diagnosis, and the
# matching subset of the patient table.
nan_indices = df_patient['apacheadmissiondx'].isna()
df_adm = df_patient[nan_indices]

In [None]:
# Positions of the '|'-separated diagnosis levels to keep.
indexes_to_extract = [0, 1, 2]

# Split every diagnosis string on '|' and keep the selected positions as a list.
diagnosis_parts = df_diag['diagnosisstring'].str.split('|')
extracted_diag = diagnosis_parts.apply(
    lambda parts: [parts[position] for position in indexes_to_extract]
)

In [None]:
# Expand the per-row lists into columns Value_1..Value_n and attach the stay id.
value_columns = [f'Value_{i+1}' for i in indexes_to_extract]
extracted_diag = pd.DataFrame(extracted_diag.tolist(), columns=value_columns)
extracted_diag['patientunitstayid'] = df_diag['patientunitstayid']

In [None]:
extracted_diag[:50]
extracted_diag.patientunitstayid.value_counts()

3086603    4247
2747640    3951
3097437    3402
3064466    3372
3035079    3370
           ... 
2894229       1
2894245       1
2894279       1
2894549       1
2894580       1
Name: patientunitstayid, Length: 173109, dtype: int64

In [None]:
# path3 =r'treatment.csv'
# file3 = open(path3)
# df_treat = pd.read_csv(file3)

In [None]:
# df_treat

In [None]:
# df_treat.patientunitstayid.value_counts()

In [None]:
# Keep only the text that follows 'Organ Systems/' in each past-history path
# (piece 1 of the split).
# df_pasthis1 is a Series, so nothing is assigned under a 'pasthistory' label:
# item assignment on a Series would append one extra element holding the whole
# Series rather than create a column.
df_pasthis1 = df_pasthis['pasthistorypath'].str.split('Organ Systems/', expand=True)[1]

In [None]:
df_pasthis1

0                            Cardiovascular (R)/Valve disease/AS
1              Renal  (R)/Renal Failure/renal failure- not cu...
2              Cardiovascular (R)/Congestive Heart Failure/CH...
3                                                           None
4              Cardiovascular (R)/Hypertension Requiring Trea...
                                     ...                        
1149176                                                     None
1149177                          Pulmonary/COPD/COPD  - moderate
1149178        Cardiovascular (R)/Arrhythmias/atrial fibrilla...
1149179                                                     None
pasthistory    0                        Cardiovascular (R)/Va...
Name: 1, Length: 1149181, dtype: object

In [None]:
# Split the remaining path on '/' into one column per level, then give the
# first six positional columns short names.
level_names = {0: 'v1', 1: 'v2', 2: 'v3', 3: 'v4', 4: 'v5', 5: 'v6'}
df_pasthis1 = df_pasthis1.str.split('/', expand=True).rename(columns=level_names)

In [None]:
df_pasthis1['patientunitstayid'] = df_pasthis['patientunitstayid'].astype('int64')

In [None]:
extracted_pasthis = df_pasthis['pasthistorypath'].str.split('/', expand=True)

In [None]:
# Name the ten '/'-separated path levels History_Value_1..History_Value_10 and
# attach the stay id. The ten explicit names must match the number of columns
# the split produced, otherwise the DataFrame constructor raises.
indexes_to_extract = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
history_columns = [f'History_Value_{i+1}' for i in indexes_to_extract]
history_rows = extracted_pasthis.values.tolist()
extracted_pasthis = pd.DataFrame(history_rows, columns=history_columns)
extracted_pasthis['patientunitstayid'] = df_pasthis['patientunitstayid']

In [None]:
extracted_pasthis.patientunitstayid.value_counts()

2747640    1449
3097437    1122
3064036     944
3036894     805
3114528     776
           ... 
3352519       1
3352526       1
3352531       1
3352538       1
3352563       1
Name: patientunitstayid, Length: 176258, dtype: int64

In [None]:
# Outer-join the patient table with the diagnosis and past-history tables on
# the unit-stay id, folding left to right.
# NOTE(review): both right-hand tables hold many rows per patientunitstayid
# (see their value_counts), so each stay is repeated once per
# diagnosis x past-history combination; the merged frame reported by this
# notebook has ~132M rows. Aggregating each table per stay before joining
# would avoid that blow-up - confirm before changing.
dfs = [df_patient, extracted_diag, extracted_pasthis]
df_patientdiagnose = dfs[0]
for right in dfs[1:]:
    df_patientdiagnose = pd.merge(
        df_patientdiagnose, right, on=['patientunitstayid'], how='outer'
    )

In [None]:
df_patientdiagnose.shape

(131970108, 42)

In [None]:
# Step 1: rows whose unitvisitnumber is 1
is_first_visit = df_patientdiagnose['unitvisitnumber'].eq(1)
first_visits = df_patientdiagnose[is_first_visit]

# Step 2: of those, keep only the rows with an "Expired" unit discharge status
is_expired = first_visits['unitdischargestatus'].eq('Expired')
expired_after_first_visit = first_visits[is_expired]


In [None]:
expired_after_first_visit

Unnamed: 0,patientunitstayid,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,...,History_Value_1,History_Value_2,History_Value_3,History_Value_4,History_Value_5,History_Value_6,History_Value_7,History_Value_8,History_Value_9,History_Value_10
0,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,notes,Progress Notes,Past History,Organ Systems,Cardiovascular (R),Valve disease,AS,,,
1,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,notes,Progress Notes,Past History,Organ Systems,Renal (R),Renal Failure,renal failure- not currently dialyzed,,,
2,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,notes,Progress Notes,Past History,Organ Systems,Cardiovascular (R),Congestive Heart Failure,CHF - class II,,,
3,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,notes,Progress Notes,Past History,Past History Obtain Options,Performed,,,,,
4,141168,128919,Female,70,Caucasian,59,91,"Rhythm disturbance (atrial, supraventricular)",152.4,15:54:00,...,notes,Progress Notes,Past History,Organ Systems,Cardiovascular (R),Hypertension Requiring Treatment,hypertension requiring treatment,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131969673,3353226,2743075,Female,79,African American,458,1109,"Effusions, pleural",121.9,17:09:00,...,notes,Progress Notes,Past History,Organ Systems,Endocrine (R),Hypothyroidism,hypothyroidism,,,
131969674,3353226,2743075,Female,79,African American,458,1109,"Effusions, pleural",121.9,17:09:00,...,notes,Progress Notes,Past History,Organ Systems,Cardiovascular (R),Hypertension Requiring Treatment,hypertension requiring treatment,,,
131969675,3353226,2743075,Female,79,African American,458,1109,"Effusions, pleural",121.9,17:09:00,...,notes,Progress Notes,Past History,Organ Systems,Renal (R),Renal Failure,renal failure - hemodialysis,,,
131969676,3353226,2743075,Female,79,African American,458,1109,"Effusions, pleural",121.9,17:09:00,...,notes,Progress Notes,Past History,Organ Systems,Endocrine (R),Insulin Dependent Diabetes,insulin dependent diabetes,,,


In [None]:
# expired_after_first_visit.to_csv('expired_after_first_visit.csv')

In [None]:
expired_after_first_visit.shape

(20926604, 42)

In [None]:
# Step 1: unit-stay ids of the rows selected as "expired on first visit"
expired_after_first_visit_ids = expired_after_first_visit['patientunitstayid']

# Step 2: keep only the merged rows whose unit-stay id is NOT among those ids
is_expired_stay = df_patientdiagnose['patientunitstayid'].isin(expired_after_first_visit_ids)
filtered_df = df_patientdiagnose[~is_expired_stay]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,patientunitstayid,patienthealthsystemstayid,gender,age,ethnicity,hospitalid,wardid,apacheadmissiondx,admissionheight,hospitaladmittime24,...,History_Value_1,History_Value_2,History_Value_3,History_Value_4,History_Value_5,History_Value_6,History_Value_7,History_Value_8,History_Value_9,History_Value_10
192,141178,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,,,,,,,,,,
193,141179,128927,Female,52,Caucasian,60,83,,162.6,08:56:00,...,,,,,,,,,,
194,141194,128941,Male,68,Caucasian,73,92,"Sepsis, renal/UTI (including bladder)",180.3,18:18:40,...,notes,Progress Notes,Past History,Past History Obtain Options,Performed,,,,,
195,141194,128941,Male,68,Caucasian,73,92,"Sepsis, renal/UTI (including bladder)",180.3,18:18:40,...,notes,Progress Notes,Past History,Past History Obtain Options,Performed,,,,,
196,141194,128941,Male,68,Caucasian,73,92,"Sepsis, renal/UTI (including bladder)",180.3,18:18:40,...,notes,Progress Notes,Past History,Organ Systems,Endocrine (R),Non-Insulin Dependent Diabetes,medication dependent,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131970103,3353254,2743102,Male,81,Caucasian,459,1108,"Bleeding, lower GI",185.4,07:43:00,...,notes,Progress Notes,Past History,Organ Systems,Cardiovascular (R),Arrhythmias,atrial fibrillation - chronic,,,
131970104,3353254,2743102,Male,81,Caucasian,459,1108,"Bleeding, lower GI",185.4,07:43:00,...,notes,Progress Notes,Past History,Past History Obtain Options,Performed,,,,,
131970105,3353254,2743102,Male,81,Caucasian,459,1108,"Bleeding, lower GI",185.4,07:43:00,...,notes,Progress Notes,Past History,Organ Systems,Pulmonary,COPD,COPD - moderate,,,
131970106,3353254,2743102,Male,81,Caucasian,459,1108,"Bleeding, lower GI",185.4,07:43:00,...,notes,Progress Notes,Past History,Organ Systems,Cardiovascular (R),Arrhythmias,atrial fibrillation - chronic,,,


In [None]:
df_patientdiagnose.unitvisitnumber.value_counts()

1     108610962
2      13367317
3       6924000
4       1367113
5        817152
6        502324
8        355876
12        14169
7          9156
9           794
10          684
16          169
18          150
15           91
11           70
13           39
14           36
17            6
Name: unitvisitnumber, dtype: int64

In [None]:
# value_to_check = 3353226

# filtered_df2 = df_patientdiagnose[df_patientdiagnose['History_Value_5'] == value_to_check]

In [None]:
# filtered_df2.unitdischargestatus

In [None]:
# notfount_diagnose = df_patientdiagnose.loc[df_patientdiagnose.Value_1.isnull()]
# notfount_pasthis = df_patientdiagnose.loc[df_patientdiagnose.History_Value_5.isnull()]

In [None]:
# notfount_diagnose.uniquepid.value_counts()

In [None]:
# notfount_pasthis.uniquepid.value_counts()

In [None]:
# notfount_type = df_patientdiagnose.loc[df_patientdiagnose.unitstaytype.isnull()]


In [None]:
# value_to_check = 590180

# filtered_df = notfount[notfount['patienthealthsystemstayid'] == value_to_check]

In [None]:
# df_patientdiagnose.uniquepid.value_counts()

In [None]:
df_patientdiagnose = filtered_df

In [None]:
df_patientdiagnose.shape

(111043504, 42)

In [None]:
df_patientdiagnose.Value_1.isna().sum()

35329

In [None]:
df_patientdiagnose = df_patientdiagnose.dropna(subset=['Value_1'])

In [None]:
df_patientdiagnose.shape

(111008175, 42)

In [None]:
df_patientdiagnose.History_Value_5.isna().sum()

23033

In [None]:
df_patientdiagnose = df_patientdiagnose.dropna(subset=['History_Value_5'])

In [None]:
df_patientdiagnose= df_patientdiagnose.dropna(subset = ['apacheadmissiondx'])

In [None]:
df_patientdiagnose.shape

(110941527, 42)

In [None]:
df_patientdiagnose = df_patientdiagnose.drop_duplicates()

In [None]:
df_patientdiagnose.shape

(2528667, 42)

In [None]:
df_patientdiagnose.columns

Index(['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age',
       'ethnicity', 'hospitalid', 'wardid', 'apacheadmissiondx',
       'admissionheight', 'hospitaladmittime24', 'hospitaladmitoffset',
       'hospitaladmitsource', 'hospitaldischargeyear',
       'hospitaldischargetime24', 'hospitaldischargeoffset',
       'hospitaldischargelocation', 'hospitaldischargestatus', 'unittype',
       'unitadmittime24', 'unitadmitsource', 'unitvisitnumber', 'unitstaytype',
       'admissionweight', 'dischargeweight', 'unitdischargetime24',
       'unitdischargeoffset', 'unitdischargelocation', 'unitdischargestatus',
       'uniquepid', 'Value_1', 'Value_2', 'Value_3', 'History_Value_1',
       'History_Value_2', 'History_Value_3', 'History_Value_4',
       'History_Value_5', 'History_Value_6', 'History_Value_7',
       'History_Value_8', 'History_Value_9', 'History_Value_10'],
      dtype='object')

In [None]:
# Columns removed before feature construction: discharge/admission locations and
# clock times, hospital/ward ids, the third diagnosis level, and every
# past-history path level except History_Value_5 and History_Value_6.
dropped_columns = [
    'unitdischargelocation', 'hospitaldischargestatus',
    'hospitaldischargelocation', 'hospitaladmitsource',
    'hospitaladmittime24', 'hospitaldischargetime24', 'unitadmittime24',
    'unitdischargetime24', 'hospitalid', 'wardid', 'hospitaldischargeyear',
    'Value_3', 'History_Value_10',
    'History_Value_9', 'History_Value_8', 'History_Value_7', 'History_Value_4',
    'History_Value_3', 'History_Value_2', 'History_Value_1',
]
df = df_patientdiagnose.drop(columns=dropped_columns)

In [None]:
df = df.drop(columns =['History_Value_6'])

In [None]:
# df = df_patientdiagnose.drop(columns =['unitdischargelocation','hospitaldischargestatus',
#                                        'hospitaldischargelocation','hospitaladmitsource','unitadmitsource',
#                                        'patienthealthsystemstayid','uniquepid',
#                                        'hospitaladmittime24','hospitaldischargetime24','unitadmittime24',
#                                       'unitdischargetime24','hospitalid','wardid','hospitaldischargeyear','Value_3','History_Value_10',
#                                       'History_Value_9','History_Value_8','History_Value_7','History_Value_4',
#                                        'History_Value_3','History_Value_2','History_Value_1','unitdischargestatus'])

In [None]:
# df.History_Value_6.value_counts()

In [None]:
# Distinct category labels, in order of first appearance, for the two diagnosis
# levels and the past-history level that are one-hot encoded below.
main_disease_values = df['Value_1'].unique()
sub_disease_values = df['Value_2'].unique()
main_past_values = df['History_Value_5'].unique()
# sub_past_values =df.History_Value_6.unique()

In [None]:
# Add one 0/1 indicator column per top-level diagnosis category.
# The comparison is vectorised (one pass per category) instead of calling a
# Python lambda for every row; the result is cast to int64 so the columns keep
# the integer 0/1 encoding.
for disease in main_disease_values:
    column_name = f"Diagnose_{disease}"
    df[column_name] = (df['Value_1'] == disease).astype('int64')


In [None]:
df.shape

(2528667, 38)

In [None]:
# Build one 0/1 indicator column per second-level diagnosis category.
# All columns are collected first and the frame is constructed once; inserting
# them one at a time into an empty DataFrame fragments it and triggers pandas'
# PerformanceWarning. The comparison is vectorised rather than a per-row lambda.
# NOTE(review): these columns share the "Diagnose_" prefix with the top-level
# indicators, so a label present at both levels would yield duplicate column
# names after the concat below - confirm the label sets are disjoint.
sub_indicators = {}
for disease in sub_disease_values:
    column_name = f"Diagnose_{disease}"
    sub_indicators[column_name] = (df['Value_2'] == disease).astype('int64')
df2 = pd.DataFrame(sub_indicators)

df2

  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)
  df2[column_name] = df['Value_2'].apply(lambda x: 1 if x == disease else 0)

Unnamed: 0,"Diagnose_skin, bone and joint infections",Diagnose_respiratory failure,Diagnose_glucose metabolism,Diagnose_altered mental status / pain,Diagnose_seizures,Diagnose_chest pain / ASHD,Diagnose_coagulation disorders,Diagnose_GI infections,Diagnose_shock / hypotension,Diagnose_arrhythmias,...,Diagnose_sensory organ,Diagnose_connective tissue,Diagnose_oral cavity disease,Diagnose_eye surgery,Diagnose_trauma - systemic effects,Diagnose_signs/symptoms/ill-defined conditions,Diagnose_metabolic disorder,Diagnose_thymus,Diagnose_breast disorder,Diagnose_inflammatory
202,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
204,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
206,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131970103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131970104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131970105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131970106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df = pd.concat([df, df2], axis=1)

In [None]:
df.shape

(2528667, 169)

In [None]:
# Build one 0/1 indicator column per past-history category (History_Value_5).
# Columns are collected first and the frame is built once, using a vectorised
# comparison instead of a per-row lambda.
past_indicators = {}
for disease in main_past_values:
    column_name = f"Past_{disease}"
    past_indicators[column_name] = (df['History_Value_5'] == disease).astype('int64')
df2 = pd.DataFrame(past_indicators)
df2

Unnamed: 0,Past_Performed,Past_Cardiovascular (R),Past_Endocrine (R),Past_Renal (R),Past_Gastrointestinal (R),Past_Pulmonary,Past_Infectious Disease (R),Past_Not Obtainable,Past_No Health Problems,Past_Hematology,Past_Neurologic,Past_Rheumatic,Past_Not Performed
202,1,0,0,0,0,0,0,0,0,0,0,0,0
203,0,1,0,0,0,0,0,0,0,0,0,0,0
204,0,1,0,0,0,0,0,0,0,0,0,0,0
205,0,1,0,0,0,0,0,0,0,0,0,0,0
206,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
131970103,0,1,0,0,0,0,0,0,0,0,0,0,0
131970104,1,0,0,0,0,0,0,0,0,0,0,0,0
131970105,0,0,0,0,0,1,0,0,0,0,0,0,0
131970106,0,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df = pd.concat([df, df2], axis=1)

In [None]:
df.shape

(2528667, 182)

In [None]:
# df2 = pd.DataFrame()
# for disease in sub_past_values:
#     column_name = f"past_{disease}"
#     df2[column_name] = df['History_Value_6'].apply(lambda x: 1 if x == disease else 0)

# df2

In [None]:
# df = pd.concat([df, df2], axis=1)

In [None]:
# df.shape

In [None]:
df.columns

Index(['patientunitstayid', 'patienthealthsystemstayid', 'gender', 'age',
       'ethnicity', 'apacheadmissiondx', 'admissionheight',
       'hospitaladmitoffset', 'hospitaldischargeoffset', 'unittype',
       ...
       'Past_Renal  (R)', 'Past_Gastrointestinal (R)', 'Past_Pulmonary',
       'Past_Infectious Disease (R)', 'Past_Not Obtainable',
       'Past_No Health Problems', 'Past_Hematology', 'Past_Neurologic',
       'Past_Rheumatic', 'Past_Not Performed'],
      dtype='object', length=182)

In [None]:
#pd.set_option('display.max_rows',None)
print(df.isna().sum())

patientunitstayid                0
patienthealthsystemstayid        0
gender                         521
age                            201
ethnicity                    19987
                             ...  
Past_No Health Problems          0
Past_Hematology                  0
Past_Neurologic                  0
Past_Rheumatic                   0
Past_Not Performed               0
Length: 182, dtype: int64


In [None]:
# Share of missing entries per column, in percent, and the columns above 20 %.
missing_values_percentage = df.isna().mean() * 100
exceeds_threshold = missing_values_percentage > 20
columns_with_more_than_20_percent_missing = missing_values_percentage[exceeds_threshold]
columns_with_more_than_20_percent_missing

dischargeweight    43.161081
dtype: float64

In [None]:
df.unitdischargeoffset.max()

729176

In [None]:
# Keep rows whose unit discharge offset lies in [0, 300000], both ends included.
offset_in_range = df['unitdischargeoffset'].between(0, 300000)
df = df[offset_in_range]

In [None]:
# Collapse to one row per unit stay by taking the column-wise maximum; for the
# 0/1 indicator columns this marks a category as present if any row of the stay
# had it.
per_stay = df.groupby('patientunitstayid')
df_cleaned = per_stay.max().reset_index()
df = df_cleaned

In [None]:
df=df_cleaned

In [None]:
# df.History_Value_6

In [None]:
df.uniquepid.value_counts()

021-239346    26
021-120471    26
015-76326     24
011-1993      22
027-37185     20
              ..
012-59488      1
012-61585      1
012-69649      1
012-24779      1
012-13612      1
Name: uniquepid, Length: 125213, dtype: int64

In [None]:
# Remove the stay id, the raw category columns that were one-hot encoded above,
# and two further columns not used from here on.
columns_no_longer_needed = [
    "patientunitstayid", 'Value_1', 'Value_2', 'History_Value_5',
    'dischargeweight', 'unitstaytype',
]
df = df.drop(columns=columns_no_longer_needed)

In [None]:
# Missing categorical values become an explicit 'unknown' category.
for column in ('ethnicity', 'gender', 'unitadmitsource'):
    df[column] = df[column].fillna('unknown')

# age is stored as text; remove any '>' characters and surrounding whitespace,
# then convert to a number so the mean imputation below works on a real numeric
# column instead of relying on implicit string-to-float coercion.
# regex=False makes the literal replacement explicit across pandas versions;
# errors='coerce' turns any remaining non-numeric text into NaN, which the
# imputer then fills.
age_text = df['age'].str.replace('>', '', regex=False).str.strip()
df['age'] = pd.to_numeric(age_text, errors='coerce')
#df = df.dropna(subset=['age'])
# df = df.dropna(subset=['apacheadmissiondx'])
# df = df.dropna(subset=['admissionheight'])
# df = df.dropna(subset=['admissionweight'])

In [None]:
from sklearn.impute import SimpleImputer

# Numeric columns whose missing entries are replaced by the column mean
columns_to_impute = ['admissionheight', 'admissionweight', 'age']

# Mean-strategy imputer, fitted on and applied to those columns in one step
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df[columns_to_impute])

# Write the imputed values back, then round them to one decimal place
df[columns_to_impute] = imputed_values
df[columns_to_impute] = df[columns_to_impute].round(1)



In [None]:
# df_patient_1 = df_cleaned.set_index('patientunitstayid')
# df_patient_2 = df.set_index('patientunitstayid')

# count = 0
# found = 0
# not_found_ids = []
# # Iterate through the indices of one DataFrame
# for patient_id in df_patient_2.index:
#     if patient_id in df_patient_1.index:
#         found += 1
#     else:
#         count += 1
#         not_found_ids.append(patient_id)

# print("Found:", found)
# print("Not found:", count)
# df_not_found = pd.DataFrame({'patientunitstayid': not_found_ids})

In [None]:
nan = df.isna().sum()

In [None]:
print(nan[:50])

patienthealthsystemstayid                          0
gender                                             0
age                                                0
ethnicity                                          0
apacheadmissiondx                                  0
admissionheight                                    0
hospitaladmitoffset                                0
hospitaldischargeoffset                            0
unittype                                           0
unitadmitsource                                    0
unitvisitnumber                                    0
admissionweight                                    0
unitdischargeoffset                                0
unitdischargestatus                               15
uniquepid                                          0
Diagnose_infectious diseases                       0
Diagnose_pulmonary                                 0
Diagnose_endocrine                                 0
Diagnose_neurologic                           

In [None]:
# find = 141194
# specific_rows = df_diag[df_diag['patientunitstayid'] == find]
# specific_rows

In [None]:
# df = df.drop(columns=["patientunitstayid",'Value_1','Value_2','History_Value_5','History_Value_6','dischargeweight'], axis=1)

In [None]:
# df['ethnicity']=df['ethnicity'].fillna('unknown')
# df['gender'] = df['gender'].fillna('unknown')

In [None]:
# df['age'] = df['age'].str.replace('>', '')
# df['age'] = df['age'].str.strip()

In [None]:
# df['gender'] = df['gender'].fillna('unknown')

In [None]:
# find = 141229
# specific_rows = df[df['patientunitstayid'] == find]
# specific_rows.age.astype(int)

In [None]:
def fillna(column):
    """Return *column* with its missing entries replaced by its mode.

    Parameters
    ----------
    column : pandas.Series
        The column to fill. It is not modified.

    Returns
    -------
    pandas.Series
        ``column`` itself when it has no missing entries, or when it has no
        non-missing entries to take a mode from; otherwise a new Series in which
        every missing entry holds the most frequent non-missing value (the first
        one returned by ``Series.mode`` when several values tie).
    """
    if not column.isna().any():
        return column

    modes = column.dropna().mode()
    if modes.empty:
        # Entirely missing column: there is no mode, so leave it as it is
        # instead of failing on ``iloc[0]``.
        return column

    # ``Series.fillna`` builds a new Series, so the caller's data is untouched.
    return column.fillna(modes.iloc[0])

df = df.apply(fillna, axis=0)

In [None]:
# df = df.drop_duplicates()

In [None]:
# df = df[df['unitdischargeoffset'] >= 0]

In [None]:
df.columns[:30]

Index(['patienthealthsystemstayid', 'gender', 'age', 'ethnicity',
       'apacheadmissiondx', 'admissionheight', 'hospitaladmitoffset',
       'hospitaldischargeoffset', 'unittype', 'unitadmitsource',
       'unitvisitnumber', 'admissionweight', 'unitdischargeoffset',
       'unitdischargestatus', 'uniquepid', 'Diagnose_infectious diseases',
       'Diagnose_pulmonary', 'Diagnose_endocrine', 'Diagnose_neurologic',
       'Diagnose_cardiovascular', 'Diagnose_hematology',
       'Diagnose_gastrointestinal', 'Diagnose_renal', 'Diagnose_oncology',
       'Diagnose_burns/trauma', 'Diagnose_toxicology', 'Diagnose_transplant',
       'Diagnose_surgery', 'Diagnose_general',
       'Diagnose_obstetrics/gynecology'],
      dtype='object')

In [None]:
# df.to_csv('cleaned_df.csv')

In [None]:
df.shape

(161950, 176)

In [None]:
df.isna().sum()

patienthealthsystemstayid    0
gender                       0
age                          0
ethnicity                    0
apacheadmissiondx            0
                            ..
Past_No Health Problems      0
Past_Hematology              0
Past_Neurologic              0
Past_Rheumatic               0
Past_Not Performed           0
Length: 176, dtype: int64

In [None]:
df = df.reset_index(drop=True)
df.unitvisitnumber

0         1
1         1
2         1
3         1
4         1
         ..
161945    1
161946    1
161947    1
161948    1
161949    1
Name: unitvisitnumber, Length: 161950, dtype: int64

In [None]:
# df_visit_more_than_one = df[df['unitvisitnumber']>1]

In [None]:
# df_visit_more_than_one.unitstaytype.value_counts()

In [None]:
# df_1visit = df[df['unitvisitnumber']==1]

In [None]:
# users_one_visit = df_1visit[~df_1visit['uniquepid'].isin(df_visit_more_than_one['uniquepid'])]

In [None]:
# users_one_visit

In [None]:
# users_first_visit = df_1visit[df_1visit['uniquepid'].isin(df_visit_more_than_one['uniquepid'])]

In [None]:
# users_first_visit

In [None]:
# sampled_df = users_one_visit.sample(n=20000, random_state=0)

In [None]:
# sampled_df.unitstaytype.value_counts()

In [None]:
# df_readmission = pd.DataFrame()
# df_readmission = df_visit_more_than_one

In [None]:
# df_readmission.loc[df_readmission.unitstaytype == 'stepdown/other'] = 'readmit'
# df_readmission.unitstaytype.value_counts()

In [None]:
# def replace_values(x):
#     if x == 'admit' or x == 'transfer' or x == 'stepdown/other':
#         return 'readmit'
#     elif x == 'readmit':
#         return 'readmit'
#     else:
#         return x
# df_readmission['unitstaytype'] = df_readmission['unitstaytype'].apply(replace_values)

In [None]:
# df_readmission.unitstaytype.value_counts()

In [None]:
# def replace_values2(x):
#     if  x == 'stepdown/other':
#         return 'admit'
#     elif x == 'admit':
#         return 'admit'
#     else:
#         return x
# sampled_df['unitstaytype'] = sampled_df['unitstaytype'].apply(replace_values2)

In [None]:
# def replace_values3(x):
#     if x == 'readmit' or x == 'transfer' or x == 'stepdown/other':
#         return 'admit'
#     elif x == 'admit':
#         return 'admit'
#     else:
#         return x
# users_first_visit['unitstaytype'] = users_first_visit['unitstaytype'].apply(replace_values3)

In [None]:
# users_first_visit.unitstaytype.value_counts()

In [None]:
# df_new = pd.concat([df_readmission, sampled_df], axis=0)

In [None]:
# df_new=pd.concat([df_new, users_first_visit], axis=0)

In [None]:
# df_reg = df

In [None]:
df.columns[:30]

Index(['patienthealthsystemstayid', 'gender', 'age', 'ethnicity',
       'apacheadmissiondx', 'admissionheight', 'hospitaladmitoffset',
       'hospitaldischargeoffset', 'unittype', 'unitadmitsource',
       'unitvisitnumber', 'admissionweight', 'unitdischargeoffset',
       'unitdischargestatus', 'uniquepid', 'Diagnose_infectious diseases',
       'Diagnose_pulmonary', 'Diagnose_endocrine', 'Diagnose_neurologic',
       'Diagnose_cardiovascular', 'Diagnose_hematology',
       'Diagnose_gastrointestinal', 'Diagnose_renal', 'Diagnose_oncology',
       'Diagnose_burns/trauma', 'Diagnose_toxicology', 'Diagnose_transplant',
       'Diagnose_surgery', 'Diagnose_general',
       'Diagnose_obstetrics/gynecology'],
      dtype='object')

In [None]:
# For each patient id and each hospital stay, keep the row with the smallest
# (most negative) hospital admit offset.

def get_longest_negative_offset(df):
    """Return one row per (uniquepid, patienthealthsystemstayid) pair.

    Only rows with ``hospitaladmitoffset <= 0`` are considered. Within each
    pair the row with the smallest ``hospitaladmitoffset`` is kept as a whole,
    so every value in a returned row comes from the same input row
    (``GroupBy.first`` would instead take the first non-null value of each
    column separately).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``uniquepid``, ``patienthealthsystemstayid``
        and ``hospitaladmitoffset``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted by the two key columns, with those key
        columns first and a fresh 0..n-1 index. Rows with a missing key are
        excluded.
    """
    keys = ['uniquepid', 'patienthealthsystemstayid']

    # Non-positive admit offsets only; rows lacking a key are excluded, as a
    # groupby on those keys would exclude them.
    candidates = df[df['hospitaladmitoffset'] <= 0].dropna(subset=keys)

    # Ascending sort puts the most negative offset first within each pair.
    ordered = candidates.sort_values(by=keys + ['hospitaladmitoffset'])
    earliest = ordered.drop_duplicates(subset=keys, keep='first')

    # Key columns lead, followed by the rest in their existing order.
    remaining = [name for name in earliest.columns if name not in keys]
    return earliest[keys + remaining].reset_index(drop=True)


# Apply the function
result_df = get_longest_negative_offset(df)

In [None]:
result_df.unitvisitnumber.value_counts()

1     128354
2      14846
3       3080
4        784
5        273
6         99
7         43
8         18
9          9
10         4
11         2
13         2
12         2
16         1
18         1
Name: unitvisitnumber, dtype: int64

In [None]:
result_df.patienthealthsystemstayid.value_counts()

2731781    1
193705     1
178200     1
141169     1
183274     1
          ..
137216     1
188445     1
155908     1
177030     1
152389     1
Name: patienthealthsystemstayid, Length: 147518, dtype: int64

In [None]:
def replace_values(x):
    """Binarise a visit number: 1 maps to 0, anything greater than 1 maps to 1.

    A value that is neither equal to 1 nor greater than 1 is returned unchanged.
    """
    if x > 1:
        return 1
    return 0 if x == 1 else x
result_df['unitvisitnumber'] = result_df['unitvisitnumber'].apply(replace_values)

In [None]:
# Row count per label of the binarised unitvisitnumber column
class_counts = result_df['unitvisitnumber'].value_counts()

# Assuming there are exactly two classes
class1_count, class2_count = class_counts.iloc[0], class_counts.iloc[1]

# Gap between the two counts relative to their mean, as a percentage
mean_count = (class1_count + class2_count) / 2
percentage_difference = abs(class1_count - class2_count) / mean_count * 100

print(f'Percentage Difference: {percentage_difference:.2f}%')

Percentage Difference: 148.04%


In [None]:
# df and df_reg both refer to the same object as result_df (no copy is made).
df = df_reg = result_df
# Persist the cleaned table; the index is written as the first CSV column.
result_df.to_csv('cleaned_df.csv')

In [None]:
# Reload the cleaned table from disk; the handle is closed as soon as pandas
# has parsed the file.
path = r'cleaned_df.csv'
with open(path) as file:
    df = pd.read_csv(file)
df

Unnamed: 0.1,Unnamed: 0,uniquepid,patienthealthsystemstayid,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitoffset,hospitaldischargeoffset,...,Past_Renal (R),Past_Gastrointestinal (R),Past_Pulmonary,Past_Infectious Disease (R),Past_Not Obtainable,Past_No Health Problems,Past_Hematology,Past_Neurologic,Past_Rheumatic,Past_Not Performed
0,0,002-10009,193705,Female,76.0,Caucasian,"GI perforation/rupture, surgery for",160.0,-179,8183,...,0,0,0,0,0,0,1,0,0,0
1,1,002-10018,178200,Female,29.0,Caucasian,"Cardiovascular medical, other",162.6,-4854,18918,...,0,0,0,0,0,1,0,0,0,0
2,2,002-10034,141169,Female,23.0,Caucasian,"GI medical, other",162.6,-3331,4172,...,0,0,0,0,0,0,0,0,0,0
3,3,002-10050,183274,Female,67.0,Caucasian,Aortic valve replacement (isolated),160.0,-67,9697,...,0,0,0,0,0,0,0,0,0,0
4,4,002-10050,190893,Female,68.0,Caucasian,"Complications of previous open-heart surgery, ...",162.6,-2140,7772,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147513,147513,035-9957,2741786,Male,74.0,Caucasian,Head only trauma,182.9,-370,4973,...,0,0,0,0,0,0,0,0,0,0
147514,147514,035-9959,2731423,Male,44.0,Caucasian,"Infarction, acute myocardial (MI)",185.4,0,3873,...,0,0,0,0,0,0,0,0,0,0
147515,147515,035-996,2736458,Male,55.0,African American,Rhythm disturbance (conduction defect),190.5,-73,11397,...,1,0,0,0,0,0,0,0,0,0
147516,147516,035-9966,2742533,Male,60.0,African American,"Apnea-sleep; surgery for (i.e., UPPP - uvulopa...",170.1,-261,2945,...,0,0,1,0,0,0,0,0,0,0


In [None]:
# Drop the index column written by to_csv and the two identifier columns.
identifier_columns = ['Unnamed: 0', 'uniquepid', 'patienthealthsystemstayid']
df = df.drop(columns=identifier_columns)
df

Unnamed: 0,gender,age,ethnicity,apacheadmissiondx,admissionheight,hospitaladmitoffset,hospitaldischargeoffset,unittype,unitadmitsource,unitvisitnumber,...,Past_Renal (R),Past_Gastrointestinal (R),Past_Pulmonary,Past_Infectious Disease (R),Past_Not Obtainable,Past_No Health Problems,Past_Hematology,Past_Neurologic,Past_Rheumatic,Past_Not Performed
0,Female,76.0,Caucasian,"GI perforation/rupture, surgery for",160.0,-179,8183,Med-Surg ICU,Operating Room,0,...,0,0,0,0,0,0,1,0,0,0
1,Female,29.0,Caucasian,"Cardiovascular medical, other",162.6,-4854,18918,Med-Surg ICU,Floor,0,...,0,0,0,0,0,1,0,0,0,0
2,Female,23.0,Caucasian,"GI medical, other",162.6,-3331,4172,Med-Surg ICU,Floor,0,...,0,0,0,0,0,0,0,0,0,0
3,Female,67.0,Caucasian,Aortic valve replacement (isolated),160.0,-67,9697,Med-Surg ICU,Operating Room,0,...,0,0,0,0,0,0,0,0,0,0
4,Female,68.0,Caucasian,"Complications of previous open-heart surgery, ...",162.6,-2140,7772,Med-Surg ICU,Operating Room,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147513,Male,74.0,Caucasian,Head only trauma,182.9,-370,4973,Cardiac ICU,Emergency Department,0,...,0,0,0,0,0,0,0,0,0,0
147514,Male,44.0,Caucasian,"Infarction, acute myocardial (MI)",185.4,0,3873,Cardiac ICU,Direct Admit,0,...,0,0,0,0,0,0,0,0,0,0
147515,Male,55.0,African American,Rhythm disturbance (conduction defect),190.5,-73,11397,Cardiac ICU,Emergency Department,0,...,1,0,0,0,0,0,0,0,0,0
147516,Male,60.0,African American,"Apnea-sleep; surgery for (i.e., UPPP - uvulopa...",170.1,-261,2945,MICU,Operating Room,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
df.to_csv('llm.csv')