In [95]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [96]:
import numpy as np
import pickle
import itertools
import pandas as pd

In [97]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [98]:
from nltk.corpus import stopwords
breakers = {'with', 'without', 'comma', 'and', 'or', 'by', 'in', 'due', 'to', 'of', 'causing'}
stop_words= (set(stopwords.words("english")) | {'nan', 'unspecified', 'stated', 'other', 'state'})

In [99]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

In [100]:
df_train = np.asarray(pd.read_csv("data/diabetes/diab_train.csv"))
df_val = np.asarray(pd.read_csv("data/diabetes/diab_validation.csv"))
df_test = np.asarray(pd.read_csv("data/diabetes/diab_test.csv"))

In [101]:
X = np.hstack([df_train[:,:-4], df_train[:,-3:]])
X_val = np.hstack([df_val[:,:-4], df_val[:,-3:]])
X_test = np.hstack([df_test[:,:-4], df_test[:,-3:]])
Y = df_train[:,-4]
Y_val = df_val[:,-4]
Y_test = df_test[:,-4]

In [102]:
X_all = np.vstack([X, X_val, X_test])

In [103]:
def sanitize(doc, breaks=True):
    return [lem.lemmatize(w) for w in tokenizer.tokenize(str(doc).replace(",", " comma").lower())\
                     if w not in stop_words or (breaks and w in breakers)]

In [104]:
def onehot(keys, tokens_docs, truncate=100):
    # convert list of of token-lists to one flat list of tokens
    # and then create a dictionary that maps word to id of word,
    all_tokens = itertools.chain.from_iterable(tokens_docs)
    word_to_id = {token: idx for idx, token in enumerate(set(all_tokens))}

    # convert token lists to token-id lists
    token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_docs]

    maxl = 0
    maxi = 0
    for t in token_ids:
        maxl = max(maxl, len(t))
        for w in t:
            maxi = max(maxi, w)

    mapping = {}
    for i,t in enumerate(token_ids):
        enc = np.zeros((min(maxl, truncate), maxi+1))
        for j in range(min(len(t), truncate)):
            enc[j][t[j]] += 1
        mapping[keys[i]] = enc
    return mapping

In [105]:
def encode(index, mapping, force_string=False):
    Xp = X_all[:,index]
    
    if force_string:
        Xp = Xp.astype(str)

    return np.asarray([mapping[x] for x in Xp])

In [106]:
def identity_map(data):
    return {x:x for x in set(data)}

In [107]:
encodings = []

Race

In [108]:
print(set(X_all[:,1]))

{'Hispanic', 'AfricanAmerican', '?', 'Other', 'Caucasian', 'Asian'}


In [109]:
race = {'Other': 1, 'Asian':2, 'Caucasian':3, 'Hispanic':4, 'AfricanAmerican':5, '?':0}

In [110]:
encodings.append(race)

Gender

In [111]:
print(set(X_all[:,2]))

{'Female', 'Male'}


In [112]:
gender = {'Male': 0, 'Female': 1}

In [113]:
encodings.append(gender)

Age

In [114]:
print(set(X_all[:,3]))

{'[20-30)', '[30-40)', '[90-100)', '[50-60)', '[40-50)', '[60-70)', '[80-90)', '[10-20)', '[70-80)', '[0-10)'}


In [115]:
age = {'[0-10)': 0, '[80-90)': 8, '[60-70)': 6, '[50-60)': 5,\
          '[30-40)': 3, '[20-30)':2, '[90-100)':9, '[40-50)':4, '[70-80)': 7, '[10-20)': 10}

In [116]:
encodings.append(age)

Weight

In [117]:
print(set(X_all[:,4]))

{'[150-175)', '[125-150)', '?', '[0-25)', '[50-75)', '[25-50)', '[100-125)', '[75-100)'}


In [118]:
weight = {'[25-50)': 25, '[150-175)': 150, '[75-100)': 75,\
          '[0-25)': 0, '[125-150)': 125, '[50-75)': 50, '[100-125)':100, '?': 73}

In [119]:
encodings.append(weight)

Admission Type ID

In [120]:
print(set(X_all[:,5]))

{nan, 'Not Mapped', 'Not Available', 'Elective', 'Urgent', 'Newborn', 'Emergency'}


In [121]:
admission_type = {'nan': 0, 'Newborn': 2, 'Emergency':4,\
                  'Not Mapped': 0, 'Not Available': 0, 'Urgent': 3, 'Elective': 1}

In [122]:
X_all[:,5] = X_all[:,5].astype(str)

In [123]:
encodings.append(admission_type)

Discharge Disposition ID

In [124]:
print(len(set(X_all[:,6])), 'Items')
print(sorted(list(set(X_all[:,6].astype(str)))))

22 Items
['Admitted as an inpatient to this hospital', 'Discharged to home', 'Discharged/transferred to ICF', 'Discharged/transferred to SNF', 'Discharged/transferred to a federal health care facility.', 'Discharged/transferred to a long term care hospital.', 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare', 'Discharged/transferred to another  type of inpatient care institution', 'Discharged/transferred to another rehab fac including rehab units of a hospital.', 'Discharged/transferred to another short term hospital', 'Discharged/transferred to home under care of Home IV provider', 'Discharged/transferred to home with home health service', 'Discharged/transferred within this institution to Medicare approved swing bed', 'Discharged/transferred/referred another institution for outpatient services', 'Discharged/transferred/referred to a psychiatric hospital of a psychiatric distinct part unit of a hospital', 'Discharged/transferred/r

In [125]:
discharge_disposition = sorted([sanitize(s, breaks=False) for s in set(X_all[:,6].astype(str))])
discharge_disposition_onehot = onehot([s for s in set(X_all[:,6].astype(str))], discharge_disposition, truncate=4)

Admission Source ID

In [126]:
print(sorted(list(set(X_all[:,7].astype(str)))))

['Clinic Referral', 'Court/Law Enforcement', 'Emergency Room', 'HMO Referral', 'Not Available', 'Not Mapped', 'Physician Referral', 'Transfer from a Skilled Nursing Facility (SNF)', 'Transfer from a hospital', 'Transfer from another health care facility', 'nan']


In [127]:
admission_src_id = {'Clinic Referral':np.asarray([1,0,0,0,0,0,0,0,0]), \
                    'Court/Law Enforcement':np.asarray([0,1,0,0,0,0,0,0,0]), \
                    'Emergency Room':np.asarray([0,0,1,0,0,0,0,0,0]), \
                    'HMO Referral':np.asarray([0,0,0,1,0,0,0,0,0]), \
                    'Not Available':np.asarray([0,0,0,0,1,0,0,0,0]), \
                    'Not Mapped':np.asarray([0,0,0,0,1,0,0,0,0]), \
                    'nan':np.asarray([0,0,0,0,1,0,0,0,0]), \
                    'Physician Referral':np.asarray([0,0,0,0,0,1,0,0,0]), \
                    'Transfer from a Skilled Nursing Facility (SNF)':np.asarray([0,0,0,0,0,0,1,0,0]), \
                    'Transfer from a hospital':np.asarray([0,0,0,0,0,0,0,1,0]), \
                    'Transfer from another health care facility':np.asarray([0,0,0,0,0,0,0,0,1])}

Time in Hospital

In [128]:
print(set(X_all[:,8]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}


In [129]:
encodings.append(identity_map(X_all[:,8]))

Payer Code

In [130]:
print(set(X_all[:,9]))

{'CM', '?', 'UN', 'CH', 'SI', 'MC', 'PO', 'DM', 'WC', 'SP', 'OG', 'HM', 'CP', 'MD', 'OT', 'BC'}


In [131]:
payer_code = {k:np.eye(len(set(X_all[:,9])))[i] for i,k in enumerate(set(X_all[:,9]))}

Medical Speciality

In [132]:
print(sorted([s for s in set(X_all[:,10])]))

['?', 'Anesthesiology-Pediatric', 'Cardiology', 'Cardiology-Pediatric', 'Emergency/Trauma', 'Endocrinology', 'Family/GeneralPractice', 'Gastroenterology', 'Gynecology', 'Hematology', 'Hematology/Oncology', 'Hospitalist', 'InfectiousDiseases', 'InternalMedicine', 'Nephrology', 'Neurology', 'Obsterics&Gynecology-GynecologicOnco', 'Obstetrics', 'ObstetricsandGynecology', 'Oncology', 'Ophthalmology', 'Orthopedics', 'Orthopedics-Reconstructive', 'Osteopath', 'Otolaryngology', 'OutreachServices', 'Pathology', 'Pediatrics', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine', 'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-Pulmonology', 'PhysicalMedicineandRehabilitation', 'PhysicianNotFound', 'Podiatry', 'Psychiatry', 'Psychology', 'Pulmonology', 'Radiologist', 'Radiology', 'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Neuro', 'Surgery-Pediatric', 'Surgery-Plastic', 'Surgery-Plasticw

In [133]:
medical_speciality = sorted([sanitize(s, breaks=False) for s in set(X_all[:,10])])
medical_speciality_onehot = onehot([s for s in set(X_all[:,10])], medical_speciality)

Number of Lab Procedures

In [134]:
print(set(X_all[:,11]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 101, 103, 104, 106, 107, 108, 109, 113, 114, 120}


In [135]:
encodings.append(identity_map(X_all[:,11]))

Number of Procedures

In [136]:
print(set(X_all[:,12]))

{0, 1, 2, 3, 4, 5, 6}


In [137]:
encodings.append(identity_map(X_all[:,12]))

Number of Medications

In [138]:
print(set(X_all[:,13]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 67, 74, 75, 81}


In [139]:
encodings.append(identity_map(X_all[:,13]))

Number Outpatient

In [140]:
print(set(X_all[:,14]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 27, 36}


In [141]:
encodings.append(identity_map(X_all[:,14]))

Number Emergency

In [142]:
print(set(X_all[:,15]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 42, 13}


In [143]:
encodings.append(identity_map(X_all[:,15]))

Number Inpatient

In [144]:
print(set(X_all[:,16]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}


In [145]:
encodings.append(identity_map(X_all[:,16]))

Diagnoses

In [146]:
print(set(X_all[:,17])|set(X_all[:,18])|set(X_all[:,19]))

{'153', '356', '572', 'V16', '288', '303', 'E941', '183', '552', '555', '253', '350', '727', '720', '866', '358', '968', '135', '38', '250.01', '41', '532', '345', '921', 'V09', '654', '847', '250.23', 'E935', '588', '513', 'V12', '759', '920', '298', '557', '846', '716', '276', '621', '368', '608', '374', '510', '967', '282', 'E930', '281', 'V64', '706', '265', '250.43', '580', '278', '414', '214', '823', 'E819', '996', '791', '669', '250.42', '250.31', '110', '813', '623', '250.5', '223', '173', '451', '317', 'E815', '966', '193', '262', '300', '410', '486', '210', '661', '646', '372', '287', '402', 'E816', '573', '999', '225', '361', '164', '244', '53', '340', '250.82', '752', '611', '972', '199', '474', '873', '514', '112', '620', '626', '360', '?', '156', 'V53', '862', '251', '600', '868', '491', 'E947', 'V17', '250.52', 'E937', '442', '642', '381', '655', 'E880', '516', 'V66', '432', '154', '786', '644', '355', '435', '250.12', 'E849', '906', '814', '599', '277', '88', '9', '781'

Number of Diagnoses

In [147]:
print(set(X_all[:,20]))

{1, 2, 3, 4, 5, 6, 7, 8, 9}


In [148]:
encodings.append(identity_map(X_all[:,20]))

Max Glucose Serum

In [149]:
print(set(X_all[:,21]))

{'>200', 'Norm', '>300', 'None'}


In [150]:
max_glu_serum = {'Norm':0, '>300':3, 'None':1, '>200':2}

In [151]:
encodings.append(max_glu_serum)

A1Cresult

In [152]:
print(set(X_all[:,22]))

{'>8', 'Norm', '>7', 'None'}


In [153]:
a1c_res = {'>7': 2, 'Norm': 0, '>8': 3, 'None': 1}

In [154]:
encodings.append(a1c_res)

Metformin - Metformin Pioglitazone

In [155]:
print(set(X_all[:,23]))

{'Up', 'Down', 'Steady', 'No'}


In [156]:
drug_change = {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3}

In [157]:
encodings += [drug_change] * 23

Change

In [158]:
print(set(X_all[:,46]))

{'Ch', 'No'}


In [159]:
change = {'No': 0, 'Ch': 1}

In [160]:
encodings.append(change)

Diabetes Medication

In [161]:
print(set(X_all[:,47]))

{'Yes', 'No'}


In [162]:
diabetes_med = {'No': 0, 'Yes': 1}

In [163]:
encodings.append(diabetes_med)

Diagnoses Description

In [164]:
print(set(X_all[:,48]))

{'Seminal vesiculitis', 'Follicular cyst of ovary', nan, 'Acute laryngopharyngitis', 'Lipoma of skin and subcutaneous tissue of face', 'Malignant neoplasm of vagina', 'Closed fracture of unspecified part of femur', 'Unstable lie, unspecified as to episode of care or not applicable', 'Inflammatory disease of breast', 'Central nervous system malformation in fetus, unspecified as to episode of care or not applicable', 'First-degree perineal laceration, unspecified as to episode of care or not applicable', 'Delirium due to conditions classified elsewhere', 'Mechanical complication of unspecified cardiac device, implant, and graft', 'Disseminated malignant neoplasm without specification of site', 'Malignant neoplasm of trachea', 'Anomalies of skull and face bones', 'Cervical spondylosis without myelopathy', 'Paroxysmal supraventricular tachycardia', 'Intussusception', 'Malignant essential hypertension', 'Adenovirus infection in conditions classified elsewhere and of unspecified site', 'Beni

In [165]:
diagnoses = sorted([sanitize(s) for s in set(X_all[:,48])|set(X_all[:,49])|set(X_all[:,50])])
diagnoses_onehot = onehot([str(s) for s in set(X_all[:,48])|set(X_all[:,49])|set(X_all[:,50])], diagnoses, truncate=10)

In [166]:
X_all_enc = np.zeros((len(X_all), 40))

In [167]:
len(encodings)

40

In [168]:
for i in range(5):
    print(i, encodings[i], X[:,1+i][:5])
    X_all_enc[:,i] = encode(1+i, encodings[i])

0 {'Other': 1, 'Asian': 2, 'Caucasian': 3, 'Hispanic': 4, 'AfricanAmerican': 5, '?': 0} ['AfricanAmerican' 'Caucasian' 'Caucasian' 'AfricanAmerican' 'Caucasian']
1 {'Male': 0, 'Female': 1} ['Male' 'Female' 'Female' 'Female' 'Male']
2 {'[0-10)': 0, '[80-90)': 8, '[60-70)': 6, '[50-60)': 5, '[30-40)': 3, '[20-30)': 2, '[90-100)': 9, '[40-50)': 4, '[70-80)': 7, '[10-20)': 10} ['[60-70)' '[70-80)' '[80-90)' '[50-60)' '[80-90)']
3 {'[25-50)': 25, '[150-175)': 150, '[75-100)': 75, '[0-25)': 0, '[125-150)': 125, '[50-75)': 50, '[100-125)': 100, '?': 73} ['?' '?' '?' '?' '?']
4 {'nan': 0, 'Newborn': 2, 'Emergency': 4, 'Not Mapped': 0, 'Not Available': 0, 'Urgent': 3, 'Elective': 1} ['Emergency' 'Elective' 'Urgent' 'Emergency' 'Elective']


In [169]:
print(5, encodings[5], X[:,8][:5])
X_all_enc[:,5] = encode(8, encodings[5])

5 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [4 1 2 4 13]


In [170]:
for i in range(6):
    print(6+i, encodings[6+i], X[:,11+i][:5])
    X_all_enc[:,6+i] = encode(11+i, encodings[6+i])

6 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 101: 101, 103: 103, 104: 104, 106: 106, 107: 107, 108: 108, 109: 109, 113: 113, 114: 114, 120: 120} [40 24 59 60 54]
7 {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6} [4 1 0 1 6]
8 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 

In [171]:
for i in range(28):
    print(12+i, encodings[12+i], X[:,20+i][:5])
    X_all_enc[:,12+i] = encode(20+i, encodings[12+i])

12 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9} [9 3 8 5 6]
13 {'Norm': 0, '>300': 3, 'None': 1, '>200': 2} ['None' 'None' 'None' 'None' 'None']
14 {'>7': 2, 'Norm': 0, '>8': 3, 'None': 1} ['None' 'None' 'None' 'None' 'None']
15 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'Steady' 'No' 'No' 'No']
16 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
17 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
18 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
19 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
20 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
21 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
22 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'Steady' 'No' 'No' 'Down']
23 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
24 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'Steady' 'No' 'No' 'No']
25 {'Down': 1, 'No':

In [172]:
X_all_enc[0]

array([ 5.,  0.,  6., 73.,  4.,  4., 40.,  4., 10.,  0.,  0.,  0.,  9.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.])

In [173]:
X_all_enc = np.hstack([X_all_enc, encode(7, admission_src_id, force_string=True), encode(9, payer_code)])

In [174]:
X_all_enc[:2]

array([[ 5.,  0.,  6., 73.,  4.,  4., 40.,  4., 10.,  0.,  0.,  0.,  9.,
         1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 3.,  1.,  7., 73.,  1.,  1., 24.,  1.,  5.,  0.,  0.,  0.,  3.,
         1.,  1.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  2.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
         1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [175]:
from sklearn.preprocessing import Normalizer
X_all_enc = Normalizer().fit_transform(X_all_enc)

In [176]:
X_enc = X_all_enc[:len(X)]
X_val_enc = X_all_enc[len(X):len(X)+len(X_val)]
X_test_enc = X_all_enc[len(X)+len(X_val):]

In [177]:
print(X_enc.shape, X_val_enc.shape, X_test_enc.shape, Y.shape)

(6000, 65) (2000, 65) (2000, 65) (6000,)


In [178]:
np.savetxt("data/diabetes/X_enc.csv", X_enc, delimiter=",")
np.savetxt("data/diabetes/X_val_enc.csv", X_val_enc, delimiter=",")
np.savetxt("data/diabetes/X_test_enc.csv", X_test_enc, delimiter=",")
np.savetxt("data/diabetes/Y.csv", Y, delimiter=",")
np.savetxt("data/diabetes/Y_val.csv", Y_val, delimiter=",")
np.savetxt("data/diabetes/Y_test.csv", Y_test, delimiter=",")

In [179]:
X_discharge_disposition = encode(6, discharge_disposition_onehot, force_string=True)

In [180]:
X_discharge_disposition[:len(X)].shape

(6000, 4, 44)

In [181]:
pickle.dump(X_discharge_disposition[:len(X)], open("data/diabetes/X_discharge_disposition_enc.csv", "wb"))
pickle.dump(X_discharge_disposition[len(X):len(X)+len(X_val)], open("data/diabetes/X_val_discharge_disposition_enc.csv", "wb"))
pickle.dump(X_discharge_disposition[len(X)+len(X_val):], open("data/diabetes/X_test_discharge_disposition_enc.csv", "wb"))

In [182]:
X_medical_speciality = encode(10, medical_speciality_onehot)

In [183]:
pickle.dump(X_medical_speciality[:len(X)], open("data/diabetes/X_medical_speciality_enc.csv", "wb"))
pickle.dump(X_medical_speciality[len(X):len(X)+len(X_val)], open("data/diabetes/X_val_medical_speciality_enc.csv", "wb"))
pickle.dump(X_medical_speciality[len(X)+len(X_val):], open("data/diabetes/X_test_medical_speciality_enc.csv", "wb"))

In [184]:
X_diagnoses1_onehot = encode(48, diagnoses_onehot, force_string=True).reshape((-1, 10, 1, 1058))

In [185]:
X_diagnoses2_onehot = encode(49, diagnoses_onehot, force_string=True).reshape((-1, 10, 1, 1058))

In [186]:
X_diagnoses3_onehot = encode(50, diagnoses_onehot, force_string=True).reshape((-1, 10, 1, 1058))

In [187]:
X_diagnoses = np.concatenate([X_diagnoses1_onehot, X_diagnoses2_onehot, X_diagnoses3_onehot], axis=2)

In [188]:
pickle.dump(X_diagnoses[:len(X)], open("data/diabetes/X_diagnoses_enc.csv", "wb"))
pickle.dump(X_diagnoses[len(X):len(X)+len(X_val)], open("data/diabetes/X_val_diagnoses_enc.csv", "wb"))
pickle.dump(X_diagnoses[len(X)+len(X_val):], open("data/diabetes/X_test_diagnoses_enc.csv", "wb"))