In [7]:
# Directory that holds the raw input file and every generated artifact.
DATA_PATH = "./resource"

# Tab-separated source file read by the cells below.
input_file = DATA_PATH + '/S1_File.txt'

# Output locations for the pickled vocabulary, documents and labels.
vocab_pkl = DATA_PATH + '/vocab.pkl'
data_pkl = DATA_PATH + '/data.pkl'
# Labels need their own file: pointing this at data.pkl would make a label
# dump overwrite the pickled documents.
label_pkl = DATA_PATH + '/label.pkl'


In [21]:
import pickle
import numpy as np

def save_pkl(path, obj):
  """Pickle `obj` into the file at `path` and print a confirmation line.

  The file is opened in binary write mode, so an existing file at `path`
  is replaced. Returns None.
  """
  with open(path, mode='wb') as handle:
    handle.write(pickle.dumps(obj))
    # Reported while the file is still open, before the handle is closed.
    print(" [*] save %s" % path)

In [24]:
import numpy as np
import pandas as pd

# Diagnosis groups that occur this many times or fewer are dropped as rare.
rare_word = 100

df = pd.read_csv(input_file, sep='\t', header=0)

# Number of rows per diagnosis group.
hist = df.groupby('DX_GROUP_DESCRIPTION').size().reset_index(name='SIZE')

# Number of diagnosis groups that share each occurrence count.
count = hist.groupby('SIZE').size().reset_index(name='COUNT')

# filter: keep only groups seen strictly more than `rare_word` times
hist = hist.loc[hist['SIZE'] > rare_word]

# Vocabulary ordered by ascending frequency; the fresh positional index
# (0, 1, 2, ...) becomes the word id.
vocab = hist.sort_values(by='SIZE').reset_index()['DX_GROUP_DESCRIPTION']

# wdict maps id -> word; word_to_index is its inverse.
wdict = vocab.to_dict()
word_to_index = dict(zip(wdict.values(), wdict.keys()))
save_pkl(vocab_pkl, word_to_index)

 [*] save ./resource/vocab.pkl


In [26]:
def convert_format(word_to_index, events):
    """Turn the flat input file into per-patient documents and labels.

    Reads the tab-separated file at the module-level `input_file`. Rows are
    expected to be ordered by PID, then DAY_ID: a change of PID starts a new
    document and a change of DAY_ID within a patient starts a new sentence.

    Args:
        word_to_index: mapping from a DX_GROUP_DESCRIPTION value to its
            integer id. Rows whose value is not in the mapping contribute no
            word, but still open a sentence/label for their day.
        events: lookup structure forwarded unchanged to `tag`.

    Returns:
        (docs, labels) where docs[i][j] is the list of word ids for the j-th
        day of the i-th patient and labels[i][j] is the value `tag` returned
        for that patient/day. Both are empty lists when the file contains no
        data rows.

    Raises:
        KeyError: if the header lacks PID, DAY_ID or DX_GROUP_DESCRIPTION
            (raised as soon as the header has been read).
        IndexError: if a non-blank row has fewer fields than those columns
            require.
    """
    docs = []
    labels = []

    with open(input_file, mode='r') as f:
        # Strip only the line terminator. A bare strip() would also eat
        # leading/trailing tabs, so a row with an empty first or last field
        # would lose a column and shift every other field.
        header = f.readline().rstrip('\r\n').split('\t')
        pos = {name: idx for idx, name in enumerate(header)}
        pid_col = pos['PID']
        day_col = pos['DAY_ID']
        word_col = pos['DX_GROUP_DESCRIPTION']

        pid = None      # patient currently being accumulated
        day_id = None   # day currently being accumulated
        doc = []        # sentences of the current patient
        sent = []       # word ids of the current day
        label = []      # per-day labels of the current patient

        for raw in f:
            line = raw.rstrip('\r\n')
            if not line.strip():
                # Blank line (typically at end of file): not a data row.
                continue

            tokens = line.split('\t')
            c_pid = tokens[pid_col]
            c_day_id = tokens[day_col]

            if pid is None:
                # First data row: open the first patient and day.
                pid, day_id = c_pid, c_day_id
                label.append(tag(events, pid, day_id))
            elif c_pid != pid:
                # New patient: close the running sentence and document.
                doc.append(sent)
                docs.append(doc)
                labels.append(label)
                pid, day_id = c_pid, c_day_id
                doc, sent = [], []
                label = [tag(events, pid, day_id)]
            elif c_day_id != day_id:
                # Same patient, new day: close the running sentence.
                doc.append(sent)
                sent = []
                day_id = c_day_id
                label.append(tag(events, pid, day_id))

            word = tokens[word_col]
            # Words outside the vocabulary are skipped on purpose.
            if word in word_to_index:
                sent.append(word_to_index[word])

        # Close the last patient, if the file had any data rows at all.
        if pid is not None:
            doc.append(sent)
            docs.append(doc)
            labels.append(label)

    return docs, labels

def extract_events():
    """Build a per-patient table of 'INPATIENT HOSPITAL' days.

    Reads the tab-separated file at the module-level `input_file`, keeps the
    rows whose SERVICE_LOCATION equals 'INPATIENT HOSPITAL', and counts rows
    per (PID, DAY_ID, SERVICE_LOCATION).

    Returns:
        DataFrame indexed by PID with columns DAY_ID, SERVICE_LOCATION and
        COUNT, sorted by PID and then DAY_ID in ascending order.
    """
    # extract event "INPATIENT HOSPITAL"
    target_event = 'INPATIENT HOSPITAL'
    group_keys = ['PID', 'DAY_ID', 'SERVICE_LOCATION']

    records = pd.read_csv(input_file, sep='\t', header=0)
    inpatient = records[records['SERVICE_LOCATION'] == target_event]

    per_day = inpatient.groupby(group_keys).size().to_frame('COUNT').reset_index()
    ordered = per_day.sort_values(by=['PID', 'DAY_ID'], ascending=True)

    return ordered.set_index('PID')


def tag(events, pid, day_id):
    """Return 1 when `tag_logic` reports a truthy result for this patient/day, else 0."""
    hit = tag_logic(events, pid, day_id)
    return int(bool(hit))


def tag_logic(events, pid, day_id):
    """Tell whether patient `pid` has an event in the 30 days starting at `day_id`.

    Args:
        events: pandas object indexed by patient id whose rows expose a
            DAY_ID field.
        pid: patient id; converted with int() before the lookup.
        day_id: first day of the window; converted with int().

    Returns:
        A truthy value when some event day d satisfies
        day_id <= d < day_id + 30, a falsy value otherwise. False when the
        patient id is not present in the index.
    """
    try:
        history = events.loc[int(pid)]

        # test whether have events within 30 days
        first_day = int(day_id)
        in_window = (first_day <= history.DAY_ID) & (history.DAY_ID < first_day + 30)

        # A patient with a single event row comes back as a Series, in which
        # case the comparison above is already the scalar answer.
        if isinstance(history, pd.Series):
            return in_window

        # Several event rows: the answer is whether any of them matched.
        return len(history.loc[in_window]) > 0
    except KeyError:
        # the label is not in the [index]
        return False


In [27]:
# Table of inpatient-hospital days, used to label each patient/day.
events = extract_events()

# Convert the flat file into per-patient documents plus per-day labels,
# then persist both.
docs, labels = convert_format(word_to_index=word_to_index, events=events)
save_pkl(path='./resource/data.pkl', obj=docs)
save_pkl(path='./resource/label.pkl', obj=labels)


 [*] save ./resource/data.pkl
 [*] save ./resource/label.pkl
