In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import numpy as np
import pandas as pd
import math
import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from datetime import datetime
from costum_utils import future_checker

## 1. Data import

Let us start by importing the data obtained by quering eICU database

In [3]:
id_patients = pd.read_csv('./eICU/id_patients.csv',sep=',',index_col=0)
lab_results = pd.read_csv('./eICU/lab.csv',sep=',',index_col=0)
transfusion =  pd.read_csv('./eICU/transfusion.csv',sep=',',index_col=0)
patient_info =  pd.read_csv('./eICU/patient_info.csv',sep=',',index_col=0)
vital_periodic =  pd.read_csv('./eICU/vital_periodic.csv',sep=',')
vital_aperiodic =  pd.read_csv('./eICU/vital_aperiodic.csv',sep=',')
crystalloid = pd.read_csv('./eICU/crystalloids.csv',sep=',')

We only keep up to the first 24 hours of the icu stay

In [4]:
vital_aperiodic=vital_aperiodic[vital_aperiodic.time<=24]
vital_periodic=vital_periodic[vital_periodic.time<=24]
crystalloid = crystalloid[crystalloid.time<=24]

We also remove readmissions

In [5]:
patient_info = patient_info[patient_info.unitstaytype!='readmit']
patient_info = patient_info[['gender','age']]

Qualcosa sui cristalloidi che non capisco

In [6]:
crystalloid = crystalloid.drop('cellpath',axis = 1)
crystalloid.columns= ['patientunitstayid','time','crystalloid']

Collapse multiple transfusions within an hour into a unique observation

In [7]:
transfusion.groupby(transfusion.index).sum()['amount'].value_counts()/transfusion.groupby(transfusion.index).sum().shape[0]

0.00       0.644117
700.00     0.028753
350.00     0.021256
600.00     0.017111
1050.00    0.012436
             ...   
1487.00    0.000088
1138.75    0.000088
911.50     0.000088
1389.00    0.000088
642.30     0.000088
Name: amount, Length: 1202, dtype: float64

## 2. Data merging

In [8]:
lab_e_transfusion=lab_results.merge(transfusion,left_on=[lab_results.index,'time'],right_on=[transfusion.index,'time'],how = 'outer').set_index('key_0')

In [9]:
vital_periodic=vital_periodic.groupby(['patientunitstayid','time'],as_index=False).mean().drop(['systemicsystolic','systemicdiastolic','systemicmean'],axis = 1)

In [10]:
lab_e_transfusion_e_periodic = lab_e_transfusion.merge(vital_periodic,left_on=[lab_e_transfusion.index,'time'],right_on=['patientunitstayid','time'],how = 'outer').set_index('patientunitstayid')

In [11]:
vital_aperiodic=vital_aperiodic.groupby(['patientunitstayid','time'],as_index=False).mean()

In [12]:
crystalloid=crystalloid.groupby(['patientunitstayid','time'],as_index=False).sum()

In [13]:
lab_e_transfusion_e_periodic_e_aperiodic = lab_e_transfusion_e_periodic.merge(vital_aperiodic,left_on=[lab_e_transfusion_e_periodic.index,'time'],right_on=['patientunitstayid','time'],how = 'outer').set_index('patientunitstayid')

In [14]:
lab_e_transfusion_e_periodic_e_aperiodic = lab_e_transfusion_e_periodic_e_aperiodic.merge(crystalloid,left_on=[lab_e_transfusion_e_periodic_e_aperiodic.index,'time'],right_on=['patientunitstayid','time'],how = 'outer').set_index('patientunitstayid')

In [15]:
tabel_final = lab_e_transfusion_e_periodic_e_aperiodic.join(patient_info,how = 'right')

## 3. Creating X and y

In [16]:
dataset = tabel_final.copy()

### 3.1 Rename columns as MIMIC

In [17]:
dataset=dataset.rename(columns={
    'albumin':'ALBUMIN',
    'creatinine':'CREATININE',
    'glucose':'GLUCOSE',
    'bicarbonate':'BICARBONATE',
    'hematocrit':'HEMATOCRIT',
    'hemoglobin':'HEMOGLOBIN',
    'lactate':'LACTATE',
    'potassium':'POTASSIUM',
    'ptt':'PTT',
    'wbc':'WBC',
    'platelets':'PLATELET',
    'amount':'AmountTransfused',
    'temperature':'TempC',
    'heartrate':'HEARTRATE',
    'respiration':'RespRate',
    'noninvasivesystolic':'SysBP',
    'noninvasivediastolic':'DiasBP',
    'noninvasivemean':'MeanBP',
    'age':'admission_age',
    'crystalloid':'crystalloid_bolus'
    
    }          
)

### 3.2 Handiling ICU preadmission infos

Let us define a dataframe which will contain all the information about 12 hours prior ICU admission

In [18]:
pre_icu = dataset[ (dataset['time']>=-12) & (dataset['time']<=-1)]

For each variable we define a specific extraction criteria

In [19]:
#define a new dataframe only containing the index of the patients
pre_x = pd.DataFrame(index=pre_icu.index.unique())

#feature extraction criteria
features_list = [
    ('ALBUMIN',np.nanmean),
    ('BUN',np.nanmedian),
    ('CREATININE', np.nanmax),
    ('GLUCOSE',np.nanmean),
    ('BICARBONATE', np.nanmedian),
    ('HEMATOCRIT', np.nanmin),
    ('HEMOGLOBIN', np.nanmin),
    ('INR',np.nanmean),
    ('LACTATE',np.nanmean),
    ('PLATELET', np.nanmin),
    ('POTASSIUM', np.nanmax),
    ('PTT', np.nanmax),
    ('WBC', np.nanmean),
    ('AmountTransfused', np.nansum),
    ('TempC', np.nanmin),
    ('HEARTRATE', np.nanmax),
    ('RespRate', np.nanmax),
    ('SysBP', np.nanmin),
    ('DiasBP',np.nanmin),
    ('MeanBP', np.nanmean),
    ('crystalloid_bolus', np.nansum),
    ('gender', np.nanmin),
    ('admission_age', np.nanmin),   
]

#save features names
feature_names = [x[0] for x in features_list]

#process features column by column after grouping for patients
to_concat = []
grouped = pre_icu.groupby(pre_icu.index)

for feature, function in features_list:
    to_concat.append(grouped[[feature]].apply(function))
 
#add obtained feature to previously defined dataframe
pre_x = pd.concat([pre_x] + to_concat, axis=1, join='inner')
pre_x.columns = feature_names

#add the time value
pre_x['time'] = [-1]*pre_x.shape[0]

  return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
  overwrite_input=overwrite_input)
  return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)


Initial information imputation

### 3.3 ICU training data

We start by subsetting data in the time interval of interest: 0-3 hours

In [20]:
final = dataset[dataset['time']<=3]
final = final[final['time']>=0]

The let us merge the informations prior ICU admission obtained in the last step

In [21]:
final_x = pd.concat([pre_x,final],sort = True)

#reorder columns and rows
final_x = final_x[pre_x.columns]
final_x.sort_values(by=['patientunitstayid','time'])

Unnamed: 0_level_0,ALBUMIN,BUN,CREATININE,GLUCOSE,BICARBONATE,HEMATOCRIT,HEMOGLOBIN,INR,LACTATE,PLATELET,...,TempC,HEARTRATE,RespRate,SysBP,DiasBP,MeanBP,crystalloid_bolus,gender,admission_age,time
patientunitstayid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
141266,3.6,29.0,1.90,410.4,29.0,33.4,9.5,1.10,4.5,283.0,...,,,,,,,100.0,Male,73,-1.0
141266,,,,319.0,,,,,,,...,,96.333333,28.000000,,,,,Male,73,0.0
141266,,,,215.0,,,,,,,...,,93.583333,21.500000,101.000000,63.00,78.000000,,Male,73,1.0
141266,,,,,,,,,,,...,37.400000,93.083333,19.083333,94.000000,54.50,68.500000,,Male,73,2.0
141266,,,,,,,,,,,...,37.508333,87.250000,14.166667,86.000000,57.00,67.000000,220.0,Male,73,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353254,3.9,42.0,2.38,181.0,24.0,33.0,10.9,1.03,1.3,268.0,...,,,,,,,0.0,Male,81,-1.0
3353254,,,,,,,,,,,...,,75.750000,21.250000,128.500000,66.50,89.500000,0.0,Male,81,0.0
3353254,,,,,,,,,,,...,,78.500000,22.750000,96.833333,58.00,71.666667,,Male,81,1.0
3353254,,,,,,,,,,,...,,75.416667,22.833333,130.000000,76.25,97.000000,,Male,81,2.0


Let us now recode sex and age in numeric values and remove patients of unkonw gender

In [22]:
#gender
final_x = final_x[final_x.gender != 'Unknown']
final_x.replace('Male',0,inplace=True)
final_x.replace('Female',1,inplace=True)
final_x.gender = final_x.gender.astype('float')

#age
final_x.replace('> 89',90,inplace=True)
final_x.admission_age = final_x.admission_age.astype('float')

In [23]:
new = final_x.groupby(by=['patientunitstayid','time']).mean().reset_index()

In [24]:
new.set_index('patientunitstayid',inplace = True)

Now we need to uniform the data format and add missing values records where needed. The command below adds an empty line for the hours in which we don't have any kind of information

In [25]:
time = [-1,0,1,2,3]
for i in new.index.unique():
    try:
        if new.loc[i].time.shape[0] !=5:
            missing = [x for x in time if x not in new.loc[i].time.values]
            df = pd.DataFrame(missing, columns = ['time'],index = [i]*len(missing))
            new = pd.concat([new,df])
    except:
        missing = [x for x in time if x not in new.loc[i].time.reshape(-1,1)]
        df = pd.DataFrame(missing, columns = ['time'],index = [i]*len(missing))
        new = pd.concat([new,df])
        
#since we added new timeslots we need to reorder the dataset
new['temp_axis'] = new.index

new.sort_values(by=['temp_axis','time'],inplace=True)
new.drop('temp_axis',axis=1,inplace=True)

Since we adess empyt lines we need to fix static variables such as gender and age

In [26]:
for i in new.index.unique(): 
    new.loc[i,['gender']] =np.nanmean(new.loc[i,['gender']])
    
for i in new.index.unique(): 
    new.loc[i,['admission_age']] =np.nanmean(new.loc[i,['admission_age']])

  
  """


We drop patients without age and gender

In [27]:
to_drop = []

for i in new.index.unique():
    if(sum(new.loc[i].gender.isna().values)!=0): to_drop.append(i)
        
missing_age = []

for i in new.index.unique():
    if(sum(new.loc[i].admission_age.isna().values)!=0): to_drop.append(i)
        
new.drop(to_drop,inplace=True)

In [28]:
new.time.value_counts().sum()

52590

### 3.4 Missing imputation

We start by imputing the hour prior to ICU entering

In [29]:
#LABS - compute values to impute
pre_imputation=[
('ALBUMIN',np.nanmedian),
('BUN',np.nanmedian),
('CREATININE',np.nanmedian),
('GLUCOSE',np.nanmedian),
('BICARBONATE',np.nanmedian),
('HEMATOCRIT',np.nanmedian),
('HEMOGLOBIN',np.nanmedian),
('INR',np.nanmedian),
('LACTATE',np.nanmedian),
('PLATELET',np.nanmedian),
('POTASSIUM',np.nanmedian),
('PTT',np.nanmedian),
('WBC',np.nanmedian)
]

#save them in a dictionary
imputation_value_dict = {}

for i,j in pre_imputation:
    imputation_value_dict[i] = j(new[new['time']==-1][i].values)

#impute values
for j in tqdm.tqdm(new.index.unique()):
    
    for i,_ in pre_imputation:
        
        if(math.isnan(new.loc[(new.index==j) & (new.time==-1),i].values)):
            new.loc[(new.index==j) & (new.time==-1),i]=imputation_value_dict[i]

100%|██████████| 10518/10518 [01:43<00:00, 101.26it/s]


As a first step we have to impute vitals values for the -1 hour. We do that by using the median of the values at 0.

In [30]:
#VITALS - compute the values to impute
periodic_pre_impute=[
('TempC',np.nanmedian),
('HEARTRATE',np.nanmedian),
('RespRate',np.nanmedian),
('SysBP',np.nanmedian),
('DiasBP',np.nanmedian),
('MeanBP',np.nanmedian)
]

imputation_value_dict = {}

#compute the values to impute from next hour
for i,j in periodic_pre_impute:
    imputation_value_dict[i] = j(new[new['time']==0][i].values)

    
#impute values    
for j in tqdm.tqdm(new.index.unique()):
    
    for i,_ in periodic_pre_impute:
        
        if(math.isnan(new.loc[(new.index==j) & (new.time==-1),i].values)):
            new.loc[(new.index==j) & (new.time==-1),i]=imputation_value_dict[i]
  

100%|██████████| 10518/10518 [01:23<00:00, 126.24it/s]


Let us first impute the values that are not meant to be forward filled, namely AmountTransfused and crystalloid_bolus

In [31]:
new['crystalloid_bolus'].fillna(0,inplace=True)
new['AmountTransfused'].fillna(0,inplace=True)

Let us now dynamically impute the remaining values

In [32]:
for i in tqdm.tqdm(new.index.unique()):
    new.loc[i]=new.loc[i].fillna(method='ffill')

100%|██████████| 10518/10518 [00:05<00:00, 1885.79it/s]


In [33]:
new.columns

Index(['time', 'ALBUMIN', 'BUN', 'CREATININE', 'GLUCOSE', 'BICARBONATE',
       'HEMATOCRIT', 'HEMOGLOBIN', 'INR', 'LACTATE', 'PLATELET', 'POTASSIUM',
       'PTT', 'WBC', 'AmountTransfused', 'TempC', 'HEARTRATE', 'RespRate',
       'SysBP', 'DiasBP', 'MeanBP', 'crystalloid_bolus', 'gender',
       'admission_age'],
      dtype='object')

In [34]:
#raw export
new.to_csv('raw_x_0_3_eICU.csv')

### 3.5 Feature collapsing

Let us now collapse the time series featrues in one unique observations according to the criteria chosen below

In [35]:
collapsed = pd.DataFrame(index=new.index.unique())

features_list = [
    #('gender', np.nanmin),
    ('ALBUMIN',np.nanmean),
    ('BUN', np.nanmax),
    ('CREATININE', np.nanmax),
    ('GLUCOSE',np.nanmean),
    ('BICARBONATE', np.nanmin),
    ('HEMATOCRIT', np.nanmin),
    ('HEMOGLOBIN', np.nanmin),
    ('INR',np.nanmean),
    ('LACTATE',np.nanmean),
    ('PLATELET', np.nanmin),
    ('POTASSIUM', np.nanmax),
    ('PTT', np.nanmax),
    ('WBC', np.nanmean),
    ('AmountTransfused', np.nansum),
    ('TempC', np.nanmin),
    ('HEARTRATE', np.nanmax),
    ('RespRate', np.nanmax),
    ('SysBP', np.nanmin),
    ('DiasBP',np.nanmean),
    ('MeanBP', np.nanmean),
    ('crystalloid_bolus',np.nansum),
    ('gender',np.nanmin),
    ('admission_age', np.nanmin),
]


to_concat = []
grouped = new.groupby(new.index)
for feature, function in features_list:
    to_concat.append(grouped[[feature]].apply(function))
  
collapsed = pd.concat([collapsed] + to_concat, axis=1, join='inner')

col_names =[x[0] for x in features_list]
collapsed.columns = col_names

In [36]:
collapsed

Unnamed: 0,ALBUMIN,BUN,CREATININE,GLUCOSE,BICARBONATE,HEMATOCRIT,HEMOGLOBIN,INR,LACTATE,PLATELET,...,AmountTransfused,TempC,HEARTRATE,RespRate,SysBP,DiasBP,MeanBP,crystalloid_bolus,gender,admission_age
141266,3.60,29.0,1.90,274.880000,29.0,33.4,9.5,1.100,4.50,283.0,...,0.00,36.533333,96.333333,28.000000,86.000000,59.700000,73.766667,320.00,0.0,73.0
141631,3.40,30.0,1.23,131.800000,29.0,40.6,14.1,1.200,2.50,212.0,...,0.00,36.533333,119.000000,19.200000,115.750000,95.600000,117.133333,0.00,0.0,45.0
141945,3.00,54.0,3.86,130.000000,34.0,20.9,6.5,1.100,2.50,191.0,...,0.00,36.533333,89.600000,23.666667,92.750000,55.850000,71.333333,0.00,1.0,72.0
142096,2.50,33.0,0.87,101.000000,22.0,28.7,9.3,1.600,2.50,103.0,...,0.00,36.533333,110.500000,24.200000,106.000000,68.700000,86.016667,0.00,0.0,54.0
142478,3.00,38.0,1.20,113.400000,29.0,23.9,8.0,1.100,2.50,362.0,...,197.00,36.533333,89.600000,31.571429,115.750000,70.566667,89.300000,600.00,0.0,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353038,4.26,14.0,0.66,107.200000,21.0,31.0,10.3,1.100,1.48,318.0,...,0.00,36.533333,91.750000,29.916667,115.750000,74.900000,89.266667,0.00,1.0,27.0
3353065,3.10,76.0,1.62,127.000000,25.0,11.0,3.6,1.200,2.50,423.0,...,833.33,36.533333,92.166667,33.272727,99.000000,57.850000,78.683333,0.00,0.0,62.0
3353116,2.70,68.0,1.72,192.166667,19.0,26.5,8.6,1.146,2.02,210.0,...,0.00,36.533333,113.500000,25.000000,79.714286,54.244048,68.241667,559.49,0.0,85.0
3353140,3.70,17.0,1.04,139.000000,31.0,35.0,11.6,1.040,2.50,238.0,...,0.00,36.533333,89.600000,27.666667,115.750000,62.600000,87.333333,0.00,1.0,90.0


### 3.6 Feature engineering

We now add additional features in order to improve the performance of the classifier and capture temporal patterns. Namely we compute the inercept and slope of a lienar fit on the time series available. Before doing so we need to recode the informations about age and gender in order to exclude them in the subsequent analysis


In [37]:
new['gender'] = new['gender'].astype('int')
new['admission_age'] = new['admission_age'].astype('int')

Compute now the new features

In [38]:
#trend features
trend_features = pd.DataFrame(index=new.index.unique())

import warnings
with warnings.catch_warnings():
    
    warnings.simplefilter("ignore")
    
    exclusion_list = ['time','AmountTransfused','crystalloid_bolus']
    
    #select the columns of which we should make feature engineering
    for col in new.select_dtypes('float').columns.tolist():
        
        print(col)
        list_fit=[]
        
        if(col not in exclusion_list):
        
            #for each patient compute intercept and slope
            for ind in new.index.unique():

                    temp=new.loc[ind]
                    

                    value = np.polyfit(temp[col],temp.time,1)
                    list_fit.append(value)


            #make the labels
            label_slope = 'slope_'+col
            label_int = 'intercept_'+col

            #add the features
            trend_features[label_int] = [x[0] for x in list_fit]
            trend_features[label_slope] = [x[1] for x in list_fit]
            
        else:
            
            pass

time
ALBUMIN
BUN
CREATININE
GLUCOSE
BICARBONATE
HEMATOCRIT
HEMOGLOBIN
INR
LACTATE
PLATELET
POTASSIUM
PTT
WBC
AmountTransfused
TempC
HEARTRATE
RespRate
SysBP
DiasBP
MeanBP
crystalloid_bolus


### 3.7 Final feature merging

In [39]:
to_export = pd.concat([collapsed] + [trend_features] , axis=1, join='inner')

In [40]:
to_export.columns

Index(['ALBUMIN', 'BUN', 'CREATININE', 'GLUCOSE', 'BICARBONATE', 'HEMATOCRIT',
       'HEMOGLOBIN', 'INR', 'LACTATE', 'PLATELET', 'POTASSIUM', 'PTT', 'WBC',
       'AmountTransfused', 'TempC', 'HEARTRATE', 'RespRate', 'SysBP', 'DiasBP',
       'MeanBP', 'crystalloid_bolus', 'gender', 'admission_age',
       'intercept_ALBUMIN', 'slope_ALBUMIN', 'intercept_BUN', 'slope_BUN',
       'intercept_CREATININE', 'slope_CREATININE', 'intercept_GLUCOSE',
       'slope_GLUCOSE', 'intercept_BICARBONATE', 'slope_BICARBONATE',
       'intercept_HEMATOCRIT', 'slope_HEMATOCRIT', 'intercept_HEMOGLOBIN',
       'slope_HEMOGLOBIN', 'intercept_INR', 'slope_INR', 'intercept_LACTATE',
       'slope_LACTATE', 'intercept_PLATELET', 'slope_PLATELET',
       'intercept_POTASSIUM', 'slope_POTASSIUM', 'intercept_PTT', 'slope_PTT',
       'intercept_WBC', 'slope_WBC', 'intercept_TempC', 'slope_TempC',
       'intercept_HEARTRATE', 'slope_HEARTRATE', 'intercept_RespRate',
       'slope_RespRate', 'intercept_SysBP',

### 3.8 Making the y's

In [41]:
to_drop = future_checker(dataset,4)

In [42]:
y = dataset[dataset.time>=4]

In [43]:
y.columns

Index(['ALBUMIN', 'BUN', 'CREATININE', 'GLUCOSE', 'BICARBONATE', 'HEMATOCRIT',
       'HEMOGLOBIN', 'INR', 'LACTATE', 'PLATELET', 'POTASSIUM', 'PTT', 'WBC',
       'time', 'AmountTransfused', 'TempC', 'HEARTRATE', 'RespRate', 'SysBP',
       'DiasBP', 'MeanBP', 'crystalloid_bolus', 'gender', 'admission_age'],
      dtype='object')

Before starting we have to check wheter, for some patients, there could be no data after the 3rd hour of ICU. Those patients are to be removed both from Xs and ys

Remove those indexes

In [44]:
X= to_export.drop(to_drop)

Create than the y by looking who has a positive amount of blood transfusion

In [45]:
y['temp_index'] = y.index
y=y[['temp_index','AmountTransfused']].groupby(['temp_index']).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
temp_index = y.index.copy()
y = [int(x) for x in (y>0).values]
y = pd.Series(y)
y.index = temp_index
y.name ='outcome'

### 3.9 Final checks and export

In [47]:
X = X.merge(y,left_index=True,right_index=True)

In [48]:
y = X['outcome']
X = X.drop('outcome',axis=1)

In [49]:
X.shape

(10392, 61)

In [50]:
y.shape

(10392,)

In [51]:
X.to_csv('x_0_3_eICU.csv')
y.to_csv('y_4_24_eICU.csv')

## High transfusion dataset

In [52]:
transfused_idx = y[y==1].index

In [53]:
#extract only transfused patients from the whole dataset
y_transfused = dataset.loc[transfused_idx].copy()

#subset the whole dataset to the columns of intrest
y_transfused = y_transfused[['time','AmountTransfused']]

#training set identic to the previous, need to extract different y's. Start by subsetting for time
y_transfused = y_transfused[y_transfused.time >=4]

#creat a temporary feature to use in groupby to aggregate measures
y_transfused['temp_index'] = y_transfused.index
y_transfused = y_transfused.drop('time',axis=1)
y_transfused = y_transfused.groupby('temp_index').sum()

In [54]:
#save indexes
temp_index = y_transfused.index

#trasform booleans in integers an create the series containing the outcomes
y_transfused= [int(x[0]) for x in (y_transfused>500).values]
y_transfused = pd.Series(y_transfused)
y_transfused.index = temp_index
y_transfused.name ='outcome'

In [55]:
#extract the relative X's from the previous dataset
X_transfused = X.loc[temp_index]

In [56]:
X_transfused.head()

Unnamed: 0_level_0,ALBUMIN,BUN,CREATININE,GLUCOSE,BICARBONATE,HEMATOCRIT,HEMOGLOBIN,INR,LACTATE,PLATELET,...,intercept_HEARTRATE,slope_HEARTRATE,intercept_RespRate,slope_RespRate,intercept_SysBP,slope_SysBP,intercept_DiasBP,slope_DiasBP,intercept_MeanBP,slope_MeanBP
temp_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
142096,2.5,33.0,0.87,101.0,22.0,28.7,9.3,1.6,2.5,103.0,...,-0.008539,1.857146,-0.279816,6.110364,-0.035507,5.391987,0.015944,-0.095344,-0.039243,4.375512
142478,3.0,38.0,1.2,113.4,29.0,23.9,8.0,1.1,2.5,362.0,...,-0.146635,12.343312,-0.090335,3.062036,0.406662,-47.955267,0.086455,-5.100865,0.137598,-11.287488
142667,1.5,28.0,1.1,97.0,30.0,24.6,7.6,1.8,2.0,68.0,...,0.055926,-3.99609,-0.068721,1.984537,-0.050492,5.647801,-0.035945,2.696621,-0.047762,3.996254
142689,2.12,38.0,1.2,178.8,23.0,19.9,6.4,1.4,2.5,142.0,...,0.055797,-5.772024,0.524139,-10.761682,-0.106438,12.574108,0.169817,-9.37075,-0.043808,4.23014
143450,3.2,43.0,1.52,206.0,26.0,25.4,8.8,1.1,2.5,300.0,...,0.04893,-3.738238,-0.054104,2.146069,0.12157,-13.894409,0.206495,-14.080984,0.178417,-15.152671


In [57]:
y_transfused.head()

temp_index
142096    0
142478    1
142667    1
142689    1
143450    1
Name: outcome, dtype: int64

In [58]:
X_transfused.to_csv('Regression_x_0_3_eICU.csv')

In [59]:
X_transfused.to_csv('Regression_y_4_24_eICU.csv')