In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
import time
from configobj import ConfigObj

In [2]:
# Create a posgresql database connection using settings from config file 

# Create a database connection using settings from config file
config='../db/config.ini'
# connection info
conn_info = dict()
if os.path.isfile(config):
    config = ConfigObj(config)
    conn_info["sqluser"] = config['username']
    conn_info["sqlpass"] = config['password']
    conn_info["sqlhost"] = config['host']
    conn_info["sqlport"] = config['port']
    conn_info["dbname"] = config['dbname']
    conn_info["schema_name"] = config['schema_name']
else:
    conn_info["sqluser"] = 'postgres'
    conn_info["sqlpass"] = '243'
    conn_info["sqlhost"] = 'localhost'
    conn_info["sqlport"] = 5432
    conn_info["dbname"] = 'eicu'
    conn_info["schema_name"] = 'public,eicu'
    
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '243':
    # try connecting without password, i.e. peer or OS authentication
    try:
        if (conn_info["sqlhost"] == 'localhost') & (conn_info["sqlport"]=='5432'):
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                    user=conn_info["sqluser"])            
        else:
            con = psycopg2.connect(dbname=conn_info["dbname"],
                                    host=conn_info["sqlhost"],
                                    port=conn_info["sqlport"],
                                    user=conn_info["sqluser"])
    except:
        conn_info["sqlpass"] = getpass.getpass('Password: ')
    
        con = psycopg2.connect(dbname=conn_info["dbname"],
                                host=conn_info["sqlhost"],
                                port=conn_info["sqlport"],
                                user=conn_info["sqluser"],
                                password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'

Database: eicu
Username: postgres
Password: ········


In [4]:
#import patient data, if not exsit allpatientdata.csv file than please excute the patientdata.ipynb file first
df=pd.read_csv(os.getcwd()+'\\allpatientfile.csv')

In [5]:
def main():
    ftName_0 = ['platelets x 1000', 'WBC x 1000', 'Hgb', 'chloride', 'glucose', 'potassium', 'calcium', 'BUN', 
                'creatinine', 'sodium', 'bedside glucose','magnesium', 'bicarbonate', 'total bilirubin', 'alkaline phos.',
                'paCO2', 'paO2', 'HCO3', 'troponin - I', 'albumin', 'pH', 'total protein',
                'phosphate', 'direct bilirubin', 'PT - INR', 'PTT', 'lactate', 'fibrinogen', 'CPK-MB INDEX', 'CPK',
                'CPK-MB', 'HDL', 'LDL', 'total cholesterol', 'PTT ratio', 'TSH', 'ammonia',
                'amylase', 'lipase', 'T4', 'Vitamin B12', 'Fe', 'TIBC', 'ionized calcium', 'Ferritin',
                'cortisol', 'free T4', 'T3', 'uric acid', 'serum osmolality', 'BNP', 'troponin - T',
                'CRP', 'Fe/TIBC Ratio', 'LDH', 'transferrin', 'prealbumin', 'CRP-hs']
    ftName = ['patientunitstayid']
    for i in range(0,len(ftName_0)):
        for j in range(0,30):
            ftName.append('%s D%02d'%(ftName_0[i], j+1))
    df_L = pd.DataFrame(columns= ftName)
    th = 0
    for i in range(0,len(df)):
        print("%d/%d: %s"%(i,len(df),df['patientunitstayid'][i]),end='\r')
        df_L = Lab(df['patientunitstayid'][i], df['Firstday'][i], df['LoS'][i],th,df_L, ftName_0)
        th=th+1
    df_L.to_csv(os.getcwd()+'\\lab.csv',index=False)

In [6]:
#可調整輸出的方式
#adjust the extract data into output format
def Featrue_all_value_with_day(ft_name, day, df_lab):
    temp_1 = [-0.001]
    for j in range(0,len(df_lab.index)):
        if df_lab['labname'][j] == ft_name:  
                if df_lab['date'][j] == day: 
                    if df_lab['labname'][j] == 'CRP':
                        if df_lab['labmeasurenameinterface'][j] == 'mg/L':
                            df_lab['labresult'][j] = df_lab['labresult'][j]/10
                            if df_lab['labresult'][j] < 0.3:
                                df_lab['labresult'][j] = 0.3
                    temp_1.append(df_lab['labresult'][j])
    if len(temp_1) != 1:
        temp_1.pop(0)
    #print(temp_1)
    return np.max(temp_1)

In [9]:
def Lab(patientunitstayid, Firstday, lenofstay, th, df_L,ftName_0):
    query = query_schema + """
    select *
    from lab
    where patientunitstayid = '{}'
    order by labresultoffset
    """.format(int(patientunitstayid))
    df_lab = pd.read_sql_query(query, con)
    df_lab = df_lab[['patientunitstayid','labresultoffset','labname','labresult','labmeasurenameinterface']]

    #calculate event time by days
    date = np.zeros((len(df_lab.index),), dtype=int)
    df_lab['date'] = date
    for i in range(0,len(df_lab.index)):
        temp = df_lab['labresultoffset'][i]

        if temp < Firstday:
            df_lab['date'][i] = 1
        elif temp >= Firstday:
            temp = temp - Firstday
            df_lab['date'][i] = temp // (24*60) + 2

    if df_lab.size == 0:
        space_row = [patientunitstayid]
        for i in range(len(ftName_0)):
            for j in range(0,30):
                if j >= int(lenofstay):
                    space_row.append(-0.001)
                else:
                    space_row.append(-0.001)
        df_L = df_L.append(pd.Series(space_row, index=df_L.columns), ignore_index=True)
    else:
        space_row = [patientunitstayid]
        for i in range(len(ftName_0)):
            for j in range(0,30):
                if j >= int(lenofstay):
                    space_row.append(-0.001)
                else:
                    space_row.append(0)
        df_L = df_L.append(pd.Series(space_row, index=df_L.columns), ignore_index=True)
        #將計算好的值放入表格中
        #put the calculated value into sheet and call "Feature_all_value_with_day" function to turning it into specific format
        for i in range(0,len(ftName_0)):
            for j in range(0,30):
                if j >= int(lenofstay):
                    break
                else:
                    str_temp = '%s D%02d'%(ftName_0[i],j+1)
                    df_L[str_temp][th] = Featrue_all_value_with_day(ftName_0[i],j+1,df_lab)
    return df_L

In [10]:
if __name__ is '__main__':
    main()

0/2676: 251510.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


10/2676: 276269.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


1289/2676: 1336774.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


2675/2676: 3353226.0

In [11]:
#mask
df=pd.read_csv(os.getcwd()+'\\lab.csv')
df= df.mask(df==-0.001)
df= df.mask(df==0)
df.mask(df==' ').to_csv(os.getcwd()+'\\lab_mask.csv',index=False)

In [13]:
#filtering the outliner
df=pd.read_csv(os.getcwd()+'\\lab_mask.csv')
for i in range(0,len(df.index)):
    for j in range(1,30):
        if df['platelets x 1000 D%02d'%j][i] > 1000:
            df['platelets x 1000 D%02d'%j][i]=np.nan
            
        if df['WBC x 1000 D%02d'%j][i] < 0.5:
            df['WBC x 1000 D%02d'%j][i] = 0.5
            
        if df['WBC x 1000 D%02d'%j][i] > 100:
            df['WBC x 1000 D%02d'%j][i]=np.nan
            
        if df['glucose D%02d'%j][i] > 1000:
            df['glucose D%02d'%j][i] =np.nan
            
        if df['potassium D%02d'%j][i] > 10:
            df['potassium D%02d'%j][i]=np.nan
            
        if df['BUN D%02d'%j][i] > 200:
            df['BUN D%02d'%j][i]=np.nan

        if df['alkaline phos. D%02d'%j][i] > 1000:
            df['alkaline phos. D%02d'%j][i]=np.nan
  
 
        if df['albumin D%02d'%j][i] > 6:
            df['albumin D%02d'%j][i]=np.nan

        if df['CPK D%02d'%j][i] > 100000:
            df['CPK D%02d'%j][i]=np.nan

        if df['amylase D%02d'%j][i] > 5000:
            df['amylase D%02d'%j][i]=np.nan

        if df['lipase D%02d'%j][i] > 5000:
            df['lipase D%02d'%j][i]=np.nan
           
        if df['CRP D%02d'%j][i] > 50:
            df['CRP D%02d'%j][i] = 50
    
        if df['T4 D%02d'%j][i] > 100:
            df['T4 D%02d'%j][i]=np.nan

        if df['Vitamin B12 D%02d'%j][i] > 2000:
            df['Vitamin B12 D%02d'%j][i]=np.nan

        if df['Ferritin D%02d'%j][i] > 10000:
            df['Ferritin D%02d'%j][i]=np.nan

        if df['cortisol D%02d'%j][i] > 100:
            df['cortisol D%02d'%j][i]=np.nan

        if df['serum osmolality D%02d'%j][i] > 500:
            df['serum osmolality D%02d'%j][i]=np.nan

        if df['BNP D%02d'%j][i] > 35000:
            df['BNP D%02d'%j][i]=np.nan

        if df['Fe/TIBC Ratio D%02d'%j][i] > 100:
            df['Fe/TIBC Ratio D%02d'%j][i]=np.nan

        if df['LDH D%02d'%j][i] > 10000:
            df['LDH D%02d'%j][i]=np.nan

        if df['transferrin D%02d'%j][i] > 300:
            df['transferrin D%02d'%j][i]=np.nan

        if df['prealbumin D%02d'%j][i] > 50:
            df['prealbumin D%02d'%j][i]=np.nan

        if df['Hgb D%02d'%j][i] > 30:
            df['Hgb D%02d'%j][i]=np.nan

        if df['calcium D%02d'%j][i] > 30:
            df['calcium D%02d'%j][i]=np.nan

        if df['creatinine D%02d'%j][i] > 40:
            df['creatinine D%02d'%j][i]=np.nan

        if df['magnesium D%02d'%j][i] > 15:
            df['magnesium D%02d'%j][i]=np.nan

        if df['pH D%02d'%j][i] > 20:
            df['pH D%02d'%j][i]=np.nan

        if df['phosphate D%02d'%j][i] > 40:
            df['phosphate D%02d'%j][i]=np.nan 

        if df['CPK-MB INDEX D%02d'%j][i] > 100:
            df['CPK-MB INDEX D%02d'%j][i]=np.nan

        if df['TIBC D%02d'%j][i] > 1000:
            df['TIBC D%02d'%j][i]=np.nan

        if df['ionized calcium D%02d'%j][i] > 10:
            df['ionized calcium D%02d'%j][i]=np.nan

        if df['troponin - T D%02d'%j][i] > 20:
            df['troponin - T D%02d'%j][i]=np.nan

df.to_csv(os.getcwd()+'\\lab_mask.csv',index=False)
#print(len(pid))

In [16]:
#complement

df = pd.read_csv(os.getcwd()+'\\lab_mask.csv')
ftName_0 = ['platelets x 1000', 'WBC x 1000', 'Hgb', 'chloride', 'glucose', 'potassium', 'calcium', 'BUN', 
                'creatinine', 'sodium', 'bedside glucose','magnesium', 'bicarbonate', 'total bilirubin', 'alkaline phos.',
                'paCO2', 'paO2', 'HCO3', 'troponin - I',  'albumin', 'pH', 'total protein',
                'phosphate', 'direct bilirubin', 'PT - INR', 'PTT', 'lactate', 'fibrinogen', 'CPK-MB INDEX', 'CPK',
                'CPK-MB', 'HDL', 'LDL', 'total cholesterol',  'PTT ratio', 'TSH', 'ammonia',
                'amylase', 'lipase', 'T4', 'Vitamin B12', 'Fe', 'TIBC', 'ionized calcium', 'Ferritin',
                'cortisol', 'free T4', 'T3', 'uric acid', 'serum osmolality', 'BNP', 'troponin - T',
                'CRP', 'Fe/TIBC Ratio', 'LDH', 'transferrin', 'prealbumin', 'CRP-hs']
for i in range(0,len(df)):
    print('%d/%d:%s'%(i,len(df),df['patientunitstayid'][i]),end='\r')
    for j in ftName_0:
        current=0
        flag=0
        for day in range(1,31):
            if(not pd.isna(df['%s D%02d'%(j,day)][i]) and flag==0):
                flag=1#find first value of a month
                current=df['%s D%02d'%(j,day)][i]
                #print(current,flag,day)
                break
        if(flag==1):
            #print(123)
            for missing in range(1,day):
                df['%s D%02d'%(j,missing)][i]=current
            flag=2#if there is a null value than put the value of previous day into it
        #print(df['%s D%02d'%('Nasal cannula O2',1)][i])
        if(flag==2):
            for missing in range(2,31):
                if(pd.isna(df['%s D%02d'%(j,missing)][i]) and not pd.isna(df['%s D%02d'%(j,missing-1)][i])):
                    df['%s D%02d'%(j,missing)][i]=df['%s D%02d'%(j,missing-1)][i]


df.to_csv('lab_com.csv',index=False)
                

2675/2676:3353226.0