In [None]:
# Create the full dataset with target labels

In [None]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from torch.nn.utils.rnn import pack_padded_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import numpy as np
from torch.optim import lr_scheduler
from torch.autograd import Variable 
import copy
import random
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import AutoModel, AutoTokenizer, BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Display (as the cell output) whether PyTorch reports an available CUDA device.
torch.cuda.is_available()

In [None]:
# Pin the torch (CPU + CUDA), NumPy and stdlib random generators so reruns are repeatable,
# and ask cuDNN for deterministic kernels.
torch.backends.cudnn.deterministic = True
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    _seed_fn(666)
# NOTE(review): the stdlib generator is seeded with 42 while the others use 666 — confirm intended.
random.seed(42)

In [None]:
# data exploration

In [None]:
pat = pd.read_csv('../physionet.org/files/mimiciii/1.4/PATIENTS.csv')

In [None]:
pat.head()

In [None]:
pat['SUBJECT_ID'].nunique()

In [None]:
pat.shape

In [None]:
stays = pd.read_csv('../physionet.org/files/mimiciii/1.4/ICUSTAYS.csv')

In [None]:
stays.head()

In [None]:
stays['INTIME'] = pd.to_datetime(stays['INTIME'])

In [None]:
stays.sort_values(by = ['SUBJECT_ID', 'OUTTIME'], inplace=True)

In [None]:
stays['stay_count'] = stays.groupby(['SUBJECT_ID']).cumcount()+1

In [None]:
stays.head()

In [None]:
# difference between intime and previous outtime 

In [None]:
stays['diff_stays'] = stays.groupby('SUBJECT_ID')['INTIME'].diff()

In [None]:
stays.head()

In [None]:
plt.figure(figsize=(14,8))
stays[~stays['diff_stays'].isna()]['diff_stays'].astype('timedelta64[D]').plot.hist(bins=40)
plt.xlim(-500, 500)

In [None]:
stays[~stays['diff_stays'].isna()]['diff_stays'].sort_values()

In [None]:
stays[~stays['diff_stays'].isna()]['diff_stays'].astype('timedelta64[D]')

In [None]:
stays.shape

In [None]:
stays['SUBJECT_ID'].nunique()

In [None]:
grouped_pat = pd.DataFrame(stays.groupby('SUBJECT_ID')['ICUSTAY_ID'].nunique())

In [None]:
grouped_pat[grouped_pat['ICUSTAY_ID'] >1]

In [None]:
multi_pat = grouped_pat[grouped_pat['ICUSTAY_ID'] >1].index

In [None]:
stays.head()

In [None]:
stays['s_count_multi_pat'] = stays.apply(lambda x: x['stay_count'] if x['SUBJECT_ID'] in multi_pat else np.NaN, axis=1)

In [None]:
stays.head()

In [None]:
stays[stays['SUBJECT_ID'].apply(lambda x: x in multi_pat)]

In [None]:
stays.shape

In [None]:
stays.head()

In [None]:
id_max = stays.groupby('SUBJECT_ID')['s_count_multi_pat'].idxmax()

In [None]:
max_index = [int(x) for x in list(id_max) if ~np.isnan(x)]

In [None]:
not_max_index = set(stays.index) - set(max_index)

In [None]:
not_max_df = stays.loc[not_max_index]

In [None]:
not_max_null_index = not_max_df[~not_max_df['s_count_multi_pat'].isna()].index

In [None]:
not_max_null_index

In [None]:
stays['read_binary']= [1 if x in not_max_null_index else 0 for x in stays.index]

In [None]:
stays.head(10)

In [None]:
# redo calculation for time lapse

In [None]:
# output minus next input

In [None]:
stays['next_input'] = stays['INTIME'].shift(-1)

In [None]:
stays['stay_diff'] = stays['next_input'].astype('M8[ns]') - stays['OUTTIME'].astype('M8[ns]') 

In [None]:
stays['rel_stay_diff'] =  stays.apply(lambda x: x['stay_diff'] if x['read_binary'] ==1  
                                     and x['stay_diff'] <= pd.Timedelta(90,'D') else 0, axis=1)

In [None]:
stays['read_binary_90day'] =  stays.apply(lambda x: 1 if x['read_binary'] ==1  
                                     and x['stay_diff'] <= pd.Timedelta(90,'D') else 0, axis=1)

In [None]:
stays.head(20)

In [None]:
stays.tail(20)

In [None]:
hadm = stays['HADM_ID'].value_counts()

In [None]:
stays[stays['HADM_ID']==135101]

In [None]:
final_cohort = stays[['SUBJECT_ID','ICUSTAY_ID', 'HADM_ID','rel_stay_diff','read_binary_90day']]

In [None]:
final_cohort['read_binary_90day'].value_counts()

In [None]:
8725/52807

In [None]:
final_cohort.to_csv('final_binary_assignment.csv', index=False)

In [None]:
final_cohort.head()

In [None]:
# Join to noteevents

In [None]:
final_cohort = pd.read_csv('final_binary_assignment.csv')

In [None]:
final_cohort.head()

In [None]:
final_cohort.merge(pd.read_csv('../physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv', 
                               usecols=['HADM_ID','TEXT']), 
                   on='HADM_ID').to_csv('final_df.csv', index=False)

In [None]:
notes = pd.read_csv('../physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv', usecols=['HADM_ID','TEXT'], nrows=20)

In [None]:
notes.head()

In [None]:
df = pd.read_csv('final_concat_df.csv', nrows=2000)

In [None]:
# df = df.groupby(['SUBJECT_ID','ICUSTAY_ID','HADM_ID', 'rel_stay_diff','read_binary_90day']).agg(
#     lambda x: '\n '.join(x)).reset_index()

In [None]:
pd.read_csv('final_df.csv').groupby(['SUBJECT_ID','ICUSTAY_ID','HADM_ID', 'rel_stay_diff','read_binary_90day']).agg(
    lambda x: '\n '.join(x)).reset_index().to_csv('final_concat_df.csv', index=False)

In [None]:
# make data loader with full data

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('final_concat_df.csv', nrows=1000)

In [None]:
df.head()

In [None]:
df['TEXT'].apply(lambda x: len(x)).plot.hist(bins=40, figsize=(10,10))

In [None]:
df['TEXT'].apply(lambda x: len(x)).describe([.75,.8,.9])

In [None]:
max(df['TEXT'].apply(lambda x: len(x)))

In [None]:
1.072159e+05

In [None]:
df['TEXT'].apply(lambda x: x[:107215]  if len(x) > 107215 else x)

In [None]:
import pickle

In [None]:
with open('input_ids_nrows_1000.pk', 'rb') as pickle_file:
    input_ids_test = pickle.load(pickle_file)

In [None]:
input_ids_test