In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import roc_auc_score, f1_score
from datetime import datetime

In [2]:
# read the prescription table

df = '/mimic-iv-2.2'
def read_pre_table(df):
    pre = pd.read_csv('hosp/prescriptions.csv.gz')
    pre = pre[['subject_id','starttime','stoptime','drug']]
    return pre

pre = read_pre_table(df)
pre.head()

  pre = pd.read_csv('hosp/prescriptions.csv.gz')


Unnamed: 0,subject_id,starttime,stoptime,drug
0,10000032,2180-05-07 01:00:00,2180-05-07 22:00:00,Acetaminophen
1,10000032,2180-05-07 00:00:00,2180-05-07 22:00:00,Sodium Chloride 0.9% Flush
2,10000032,2180-05-08 08:00:00,2180-05-07 22:00:00,Furosemide
3,10000032,2180-05-07 01:00:00,2180-05-07 22:00:00,Raltegravir
4,10000032,2180-05-07 00:00:00,2180-05-07 22:00:00,Heparin


In [3]:
# read the fsnotime table

def read_fsnotime_table(df):
    fsno = pd.read_csv('fsnotime.csv.gz')

    return fsno

fsno = read_fsnotime_table(df)
fsno.head()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,recode,race,gender,anchor_age,blood,circulatory,congenital,...,misc,muscular,neoplasms,nervous,pregnancy,prenatal,respiratory,skin,systolic,diastolic
0,0,10274866,23488422,995,HISPANIC/LATINO,M,65,0,1,0,...,0,0,0,0,0,0,0,0,104,72
1,1,10274866,23488422,995,HISPANIC/LATINO,M,65,0,1,0,...,0,0,0,0,0,0,0,0,110,66
2,2,10274866,23488422,995,HISPANIC/LATINO,M,65,0,1,0,...,0,0,0,0,0,0,0,0,110,80
3,3,10274866,23488422,995,HISPANIC/LATINO,M,65,0,1,0,...,0,0,0,0,0,0,0,0,90,70
4,4,10274866,23488422,995,HISPANIC/LATINO,M,65,0,1,0,...,0,0,0,0,0,0,0,0,120,82


In [4]:
fsno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166309 entries, 0 to 166308
Data columns (total 26 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0     166309 non-null  int64 
 1   subject_id     166309 non-null  int64 
 2   hadm_id        166309 non-null  int64 
 3   recode         166309 non-null  int64 
 4   race           166309 non-null  object
 5   gender         166309 non-null  object
 6   anchor_age     166309 non-null  int64 
 7   blood          166309 non-null  int64 
 8   circulatory    166309 non-null  int64 
 9   congenital     166309 non-null  int64 
 10  digestive      166309 non-null  int64 
 11  endocrine      166309 non-null  int64 
 12  genitourinary  166309 non-null  int64 
 13  infectious     166309 non-null  int64 
 14  injury         166309 non-null  int64 
 15  mental         166309 non-null  int64 
 16  misc           166309 non-null  int64 
 17  muscular       166309 non-null  int64 
 18  neop

In [5]:
time = pre[['subject_id','starttime','stoptime']]
time.head()
time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15416708 entries, 0 to 15416707
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   subject_id  int64 
 1   starttime   object
 2   stoptime    object
dtypes: int64(1), object(2)
memory usage: 352.9+ MB


In [6]:
time.dropna(how='any')
time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15416708 entries, 0 to 15416707
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   subject_id  int64 
 1   starttime   object
 2   stoptime    object
dtypes: int64(1), object(2)
memory usage: 352.9+ MB


In [7]:
# convert strings to datetime objects using strptime() method

time['stoptime'] = pd.to_datetime(time["stoptime"])
time['starttime'] = pd.to_datetime(time["starttime"])

mask = time["stoptime"] >=time["starttime"]

time[mask]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time['stoptime'] = pd.to_datetime(time["stoptime"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  time['starttime'] = pd.to_datetime(time["starttime"])


Unnamed: 0,subject_id,starttime,stoptime
0,10000032,2180-05-07 01:00:00,2180-05-07 22:00:00
1,10000032,2180-05-07 00:00:00,2180-05-07 22:00:00
3,10000032,2180-05-07 01:00:00,2180-05-07 22:00:00
4,10000032,2180-05-07 00:00:00,2180-05-07 22:00:00
5,10000032,2180-05-07 01:00:00,2180-05-07 22:00:00
...,...,...,...
15416703,19999987,2145-11-03 00:00:00,2145-11-03 18:00:00
15416704,19999987,2145-11-04 10:00:00,2145-11-11 17:00:00
15416705,19999987,2145-11-10 10:00:00,2145-11-11 17:00:00
15416706,19999987,2145-11-09 10:00:00,2145-11-09 16:00:00


In [8]:
time = time[mask]
time_f = time[['subject_id','starttime']]
time_f = time_f.head(1000000)
time_f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 1041915
Data columns (total 2 columns):
 #   Column      Non-Null Count    Dtype         
---  ------      --------------    -----         
 0   subject_id  1000000 non-null  int64         
 1   starttime   1000000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1)
memory usage: 22.9 MB


In [9]:
# add starttime to the fsnotime

fs_time = pd.merge(fsno,time_f, how='inner', on='subject_id')
fs_time.head()
fs_time.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3003001 entries, 0 to 3003000
Data columns (total 27 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Unnamed: 0     int64         
 1   subject_id     int64         
 2   hadm_id        int64         
 3   recode         int64         
 4   race           object        
 5   gender         object        
 6   anchor_age     int64         
 7   blood          int64         
 8   circulatory    int64         
 9   congenital     int64         
 10  digestive      int64         
 11  endocrine      int64         
 12  genitourinary  int64         
 13  infectious     int64         
 14  injury         int64         
 15  mental         int64         
 16  misc           int64         
 17  muscular       int64         
 18  neoplasms      int64         
 19  nervous        int64         
 20  pregnancy      int64         
 21  prenatal       int64         
 22  respiratory    int64         
 23  skin   

In [41]:
# save this version of the dataframe to a csv. 

fs_time.to_csv('fstime.csv.gz')

In [10]:
# read the ffonotime table

def read_ffonotime_table(df):
    ffono = pd.read_csv('ffonotime.csv.gz')
    ffono = ffono.head(1000000)
    return ffono

ffono = read_ffonotime_table(df)
ffono.head()


Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,recode,race,gender,anchor_age,blood,circulatory,congenital,...,misc,muscular,neoplasms,nervous,pregnancy,prenatal,respiratory,skin,systolic,diastolic
0,0,10000032,22841357,276,WHITE,F,52,0,0,0,...,0,0,0,0,0,0,1,0,110,65
1,1,10000032,22841357,276,WHITE,F,52,0,0,0,...,0,0,0,0,0,0,1,0,106,60
2,2,10000032,22841357,276,WHITE,F,52,0,0,0,...,0,0,0,0,0,0,1,0,121,77
3,3,10000032,22841357,276,WHITE,F,52,0,0,0,...,0,0,0,0,0,0,1,0,100,60
4,4,10000032,22841357,276,WHITE,F,52,0,0,0,...,0,0,0,0,0,0,1,0,102,60


In [11]:
ffono = ffono.head(100000)
time_ff = time_f.head(100000)
ffono.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 26 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0     100000 non-null  int64 
 1   subject_id     100000 non-null  int64 
 2   hadm_id        100000 non-null  int64 
 3   recode         100000 non-null  int64 
 4   race           100000 non-null  object
 5   gender         100000 non-null  object
 6   anchor_age     100000 non-null  int64 
 7   blood          100000 non-null  int64 
 8   circulatory    100000 non-null  int64 
 9   congenital     100000 non-null  int64 
 10  digestive      100000 non-null  int64 
 11  endocrine      100000 non-null  int64 
 12  genitourinary  100000 non-null  int64 
 13  infectious     100000 non-null  int64 
 14  injury         100000 non-null  int64 
 15  mental         100000 non-null  int64 
 16  misc           100000 non-null  int64 
 17  muscular       100000 non-null  int64 
 18  neopl

In [12]:
ffono = ffono.drop('Unnamed: 0', axis=1)

In [13]:
# add starttime to the ffonotime

ffo_time = pd.merge(ffono,time_ff, how='inner', on='subject_id')

ffo_time.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20992361 entries, 0 to 20992360
Data columns (total 26 columns):
 #   Column         Dtype         
---  ------         -----         
 0   subject_id     int64         
 1   hadm_id        int64         
 2   recode         int64         
 3   race           object        
 4   gender         object        
 5   anchor_age     int64         
 6   blood          int64         
 7   circulatory    int64         
 8   congenital     int64         
 9   digestive      int64         
 10  endocrine      int64         
 11  genitourinary  int64         
 12  infectious     int64         
 13  injury         int64         
 14  mental         int64         
 15  misc           int64         
 16  muscular       int64         
 17  neoplasms      int64         
 18  nervous        int64         
 19  pregnancy      int64         
 20  prenatal       int64         
 21  respiratory    int64         
 22  skin           int64         
 23  systo

In [14]:
ffo_time.head()

Unnamed: 0,subject_id,hadm_id,recode,race,gender,anchor_age,blood,circulatory,congenital,digestive,...,muscular,neoplasms,nervous,pregnancy,prenatal,respiratory,skin,systolic,diastolic,starttime
0,10000032,22841357,276,WHITE,F,52,0,0,0,0,...,0,0,0,0,0,1,0,110,65,2180-05-07 01:00:00
1,10000032,22841357,276,WHITE,F,52,0,0,0,0,...,0,0,0,0,0,1,0,110,65,2180-05-07 00:00:00
2,10000032,22841357,276,WHITE,F,52,0,0,0,0,...,0,0,0,0,0,1,0,110,65,2180-05-07 01:00:00
3,10000032,22841357,276,WHITE,F,52,0,0,0,0,...,0,0,0,0,0,1,0,110,65,2180-05-07 00:00:00
4,10000032,22841357,276,WHITE,F,52,0,0,0,0,...,0,0,0,0,0,1,0,110,65,2180-05-07 01:00:00


In [25]:
# save this version of the dataframe to a csv. 

ffo_time.to_csv('ffotime.csv.gz')