In [4]:
import os
from os import listdir
import io
import re
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle
from pylab import rcParams
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [5]:
# load train data and interpolate
train_data_df=pd.read_csv('../csv_data/train_patient.csv')
train_data_df = train_data_df.interpolate(method='linear').ffill().bfill() 
train_data_df.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_id,time
0,80.0,100.0,36.5,121.0,58.0,41.0,13.5,34.0,1.0,25.0,...,160.0,77.27,1,0.0,1.0,-69.14,3,0,0,0
1,76.0,100.0,36.25,113.25,61.0,41.5,12.0,34.0,1.0,25.0,...,161.8,77.27,1,0.0,1.0,-69.14,4,0,0,1
2,80.0,100.0,36.25,132.75,71.5,46.25,12.0,34.0,-1.0,25.0,...,163.6,77.27,1,0.0,1.0,-69.14,5,0,0,2
3,78.0,100.0,36.1,103.5,58.0,43.0,12.0,34.0,-3.0,25.0,...,165.4,77.27,1,0.0,1.0,-69.14,6,0,0,3
4,74.0,100.0,36.0,128.75,69.5,44.5,12.5,34.0,-3.0,25.0,...,167.2,77.27,1,0.0,1.0,-69.14,7,0,0,4


In [6]:
# sort by time length
time_length_series = train_data_df.groupby(['Patient_id'])['SepsisLabel'].count()
time_length_series=time_length_series.sort_values() # sort
time_length_df=time_length_series[:-1].reset_index() # remove total count
time_length_df.columns = ['Patient_id', 'time_length']
time_length_df.head()

Unnamed: 0,Patient_id,time_length
0,23382,8
1,22552,8
2,15577,8
3,29904,8
4,22554,8


In [7]:
# length count
time_length_count = time_length_df.groupby(['time_length'])['Patient_id'].count().reset_index()
time_length_count.columns=['time_length','count']
time_length_count.head()

Unnamed: 0,time_length,count
0,8,235
1,9,173
2,10,160
3,11,168
4,12,199


### create batch data
batch size: 256

In [8]:
batches=[]
labels=[]
for length in time_length_count['time_length']:
    P_id_series=time_length_df[time_length_df['time_length']==length]['Patient_id'].tolist()
    batch=[]
    label=[]
    n=0
    for j in P_id_series:
        temp = train_data_df[train_data_df['Patient_id']==j]
        if temp['SepsisLabel'].sum()==0:
            label.append([1,0])
        else:
            label.append([0,1])
        temp=temp.iloc[:-1,:-7].values.tolist() # remove onset data and other features
        batch.append(temp)
        n+=1
        if n%256==0:
            batches.append(batch)
            labels.append(label)
            batch=[]
            label=[]
    batches.append(batch)
    labels.append(label)

In [9]:
len(batches)

342

In [10]:
# train
pickle.dump(batches, open( '../pkl_data/batches_data_train' + '.seqs', 'wb'), -1) 

In [11]:
pickle.dump(labels, open( '../pkl_data/batches_label_train' + '.seqs', 'wb'), -1) 

In [12]:
# test 25*256=6400
batches_test=[]
labels_test=[]
for i in range(100,300):
    if(len(batches[i])==256):
        batches_test.append(batches[i])
        labels_test.append(labels[i])

In [13]:
# test
pickle.dump(batches_test, open( '../pkl_data/batches_data_test' + '.seqs', 'wb'), -1) 
pickle.dump(labels_test, open( '../pkl_data/batches_label_test' + '.seqs', 'wb'), -1) 

In [14]:
path_string = '../pkl_data/batches_label_test.seqs'
with open(path_string,'rb') as f:
    a = pickle.load(f)

In [15]:
a 

[[[1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [1, 0],
  [0, 1],
  [1, 0],
