In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
import numpy as np
import pickle
import itertools
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [4]:
from nltk.corpus import stopwords
breakers = {'with', 'without', 'comma', 'and', 'or', 'by', 'in', 'due', 'to', 'of', 'causing'}
stop_words= (set(stopwords.words("english")) | {'nan', 'unspecified', 'stated', 'other', 'state'})

In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

In [6]:
df_train = np.asarray(pd.read_csv("data/diabetes/diab_train.csv"))
df_val = np.asarray(pd.read_csv("data/diabetes/diab_validation.csv"))
df_test = np.asarray(pd.read_csv("data/diabetes/diab_test.csv"))

In [7]:
X = np.hstack([df_train[:,:-4], df_train[:,-3:]])
X_val = np.hstack([df_val[:,:-4], df_val[:,-3:]])
X_test = np.hstack([df_test[:,:-4], df_test[:,-3:]])
Y = df_train[:,-4]
Y_val = df_val[:,-4]
Y_test = df_test[:,-4]

In [8]:
X_all = np.vstack([X, X_val, X_test])

In [9]:
def sanitize(doc, breaks=True):
    return ' '.join([lem.lemmatize(w) for w in tokenizer.tokenize(str(doc).replace(",", " comma").lower())\
                     if w not in stop_words or (breaks and w in breakers)])

In [10]:
def onehot(keys, tokens_docs, truncate=100):
    # convert list of of token-lists to one flat list of tokens
    # and then create a dictionary that maps word to id of word,
    all_tokens = itertools.chain.from_iterable(tokens_docs)
    word_to_id = {token: idx for idx, token in enumerate(set(all_tokens))}

    # convert token lists to token-id lists
    token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_docs]

    maxl = 0
    maxi = 0
    for t in token_ids:
        maxl = max(maxl, len(t))
        for w in t:
            maxi = max(maxi, w)

    mapping = {}
    for i,t in enumerate(token_ids):
        enc = np.zeros((min(maxl, truncate), maxi+1))
        for j in range(min(len(t), truncate)):
            enc[j][t[j]] += 1
        mapping[keys[i]] = enc
    return mapping

In [11]:
def encode(index, mapping, force_string=False):
    Xp = X_all[:,index]
    
    if force_string:
        Xp = Xp.astype(str)

    return np.asarray([mapping[x] for x in Xp])

In [12]:
def identity_map(data):
    return {x:x for x in set(data)}

In [13]:
encodings = []

Race

In [14]:
print(set(X_all[:,1]))

{'Asian', 'Hispanic', '?', 'Other', 'Caucasian', 'AfricanAmerican'}


In [15]:
race = {'Other': 1, 'Asian':2, 'Caucasian':3, 'Hispanic':4, 'AfricanAmerican':5, '?':0}

In [16]:
encodings.append(race)

Gender

In [17]:
print(set(X_all[:,2]))

{'Female', 'Male'}


In [18]:
gender = {'Male': 0, 'Female': 1}

In [19]:
encodings.append(gender)

Age

In [20]:
print(set(X_all[:,3]))

{'[10-20)', '[0-10)', '[40-50)', '[30-40)', '[20-30)', '[80-90)', '[70-80)', '[90-100)', '[50-60)', '[60-70)'}


In [21]:
age = {'[0-10)': 0, '[80-90)': 8, '[60-70)': 6, '[50-60)': 5,\
          '[30-40)': 3, '[20-30)':2, '[90-100)':9, '[40-50)':4, '[70-80)': 7, '[10-20)': 10}

In [22]:
encodings.append(age)

Weight

In [23]:
print(set(X_all[:,4]))

{'[0-25)', '[150-175)', '[25-50)', '[50-75)', '[100-125)', '?', '[75-100)', '[125-150)'}


In [24]:
weight = {'[25-50)': 25, '[150-175)': 150, '[75-100)': 75,\
          '[0-25)': 0, '[125-150)': 125, '[50-75)': 50, '[100-125)':100, '?': 73}

In [25]:
encodings.append(weight)

Admission Type ID

In [26]:
print(set(X_all[:,5]))

{nan, 'Emergency', 'Not Mapped', 'Elective', 'Newborn', 'Urgent', 'Not Available'}


In [27]:
admission_type = {'nan': 0, 'Newborn': 2, 'Emergency':4,\
                  'Not Mapped': 0, 'Not Available': 0, 'Urgent': 3, 'Elective': 1}

In [28]:
X_all[:,5] = X_all[:,5].astype(str)

In [29]:
encodings.append(admission_type)

Discharge Disposition ID

In [30]:
print(len(set(X_all[:,6])), 'Items')
print(sorted(list(set(X_all[:,6].astype(str)))))

22 Items
['Admitted as an inpatient to this hospital', 'Discharged to home', 'Discharged/transferred to ICF', 'Discharged/transferred to SNF', 'Discharged/transferred to a federal health care facility.', 'Discharged/transferred to a long term care hospital.', 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare', 'Discharged/transferred to another  type of inpatient care institution', 'Discharged/transferred to another rehab fac including rehab units of a hospital.', 'Discharged/transferred to another short term hospital', 'Discharged/transferred to home under care of Home IV provider', 'Discharged/transferred to home with home health service', 'Discharged/transferred within this institution to Medicare approved swing bed', 'Discharged/transferred/referred another institution for outpatient services', 'Discharged/transferred/referred to a psychiatric hospital of a psychiatric distinct part unit of a hospital', 'Discharged/transferred/r

In [31]:
discharge_disposition = sorted([sanitize(s, breaks=False) for s in set(X_all[:,6].astype(str))])
discharge_disposition_onehot = onehot([s for s in set(X_all[:,6].astype(str))], discharge_disposition, truncate=4)

Admission Source ID

In [32]:
print(sorted(list(set(X_all[:,7].astype(str)))))

['Clinic Referral', 'Court/Law Enforcement', 'Emergency Room', 'HMO Referral', 'Not Available', 'Not Mapped', 'Physician Referral', 'Transfer from a Skilled Nursing Facility (SNF)', 'Transfer from a hospital', 'Transfer from another health care facility', 'nan']


In [33]:
admission_src_id = {'Clinic Referral':np.asarray([1,0,0,0,0,0,0]), \
                    'Court/Law Enforcement':np.asarray([0,1,0,0,0,0,0]), \
                    'Emergency Room':np.asarray([0,0,1,0,0,0,0]), \
                    'HMO Referral':np.asarray([0,0,0,1,0,0,0]), \
                    'Not Available':np.asarray([0,0,0,0,1,0,0]), \
                    'Not Mapped':np.asarray([0,0,0,0,1,0,0]), \
                    'nan':np.asarray([0,0,0,0,1,0,0]), \
                    'Physician Referral':np.asarray([0,0,0,0,0,1,0]), \
                    'Transfer from a Skilled Nursing Facility (SNF)':np.asarray([0,0,0,0,0,0,1]), \
                    'Transfer from a hospital':np.asarray([0,0,0,0,0,0,1]), \
                    'Transfer from another health care facility':np.asarray([0,0,0,0,0,0,1])}

Time in Hospital

In [34]:
print(set(X_all[:,8]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}


In [35]:
encodings.append(identity_map(X_all[:,8]))

Payer Code

In [36]:
print(set(X_all[:,9]))

{'PO', 'SI', 'SP', 'MC', 'BC', 'OG', 'MD', 'CP', '?', 'HM', 'CM', 'UN', 'OT', 'WC', 'DM', 'CH'}


In [37]:
payer_code = {k:np.eye(len(set(X_all[:,9])))[i] for i,k in enumerate(set(X_all[:,9]))}

Medical Speciality

In [180]:
print(len(X_all[:,10][X_all[:,10] == '?'])/len(X_all[:,10]))

0.41


In [38]:
print(sorted([s for s in set(X_all[:,10])]))

['?', 'Anesthesiology-Pediatric', 'Cardiology', 'Cardiology-Pediatric', 'Emergency/Trauma', 'Endocrinology', 'Family/GeneralPractice', 'Gastroenterology', 'Gynecology', 'Hematology', 'Hematology/Oncology', 'Hospitalist', 'InfectiousDiseases', 'InternalMedicine', 'Nephrology', 'Neurology', 'Obsterics&Gynecology-GynecologicOnco', 'Obstetrics', 'ObstetricsandGynecology', 'Oncology', 'Ophthalmology', 'Orthopedics', 'Orthopedics-Reconstructive', 'Osteopath', 'Otolaryngology', 'OutreachServices', 'Pathology', 'Pediatrics', 'Pediatrics-CriticalCare', 'Pediatrics-EmergencyMedicine', 'Pediatrics-Endocrinology', 'Pediatrics-Hematology-Oncology', 'Pediatrics-Pulmonology', 'PhysicalMedicineandRehabilitation', 'PhysicianNotFound', 'Podiatry', 'Psychiatry', 'Psychology', 'Pulmonology', 'Radiologist', 'Radiology', 'Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Neuro', 'Surgery-Pediatric', 'Surgery-Plastic', 'Surgery-Plasticw

In [39]:
medical_speciality = sorted([sanitize(s, breaks=False) for s in set(X_all[:,10])])
medical_speciality_onehot = onehot([s for s in set(X_all[:,10])], medical_speciality)

Number of Lab Procedures

In [40]:
print(set(X_all[:,11]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 101, 103, 104, 106, 107, 108, 109, 113, 114, 120}


In [41]:
encodings.append(identity_map(X_all[:,11]))

Number of Procedures

In [42]:
print(set(X_all[:,12]))

{0, 1, 2, 3, 4, 5, 6}


In [43]:
encodings.append(identity_map(X_all[:,12]))

Number of Medications

In [44]:
print(set(X_all[:,13]))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 67, 74, 75, 81}


In [45]:
encodings.append(identity_map(X_all[:,13]))

Number Outpatient

In [46]:
print(set(X_all[:,14]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 27, 36}


In [187]:
n_outpatient = {0:0, 1:1, 2:2, 3:2, 4:2, 5:2, 6:2, 7:2, 8:2, 9:2, 10:2, 11:2,
                12:2, 13:2, 14:2, 15:2, 16:2, 17:2, 18:2, 19:2, 21:2, 27:2, 36:2}

In [47]:
encodings.append(n_outpatient)

Number Emergency

In [185]:
print(set(X_all[:,15]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 42, 13}


In [186]:
n_emergency = {0: 0, 1: 1, 2:2, 3:2, 4:2, 5:2, 6:2, 7:2, 8:2, 9:2, 42:2, 13:2}

In [49]:
encodings.append(n_emergency)

Number Inpatient

In [50]:
print(set(X_all[:,16]))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}


In [188]:
n_inpatient = {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:6, 8:6, 9:6, 10:6}

In [51]:
encodings.append(n_inpatient)

Diagnoses<br>
Since one major task of this project was to embed the diagnoses using NLP, we will deliberately not make use of these already embedded representations of the diagnoses.

Number of Diagnoses

In [53]:
print(set(X_all[:,20]))

{1, 2, 3, 4, 5, 6, 7, 8, 9}


In [None]:
n_diagnoses = {1:0, 2:0, 3:0, 4:1, 5:2, 6:3, 7:4, 8:5, 9:6}

In [54]:
encodings.append(n_diagnoses)

Max Glucose Serum

In [55]:
print(set(X_all[:,21]))

{'>300', 'None', '>200', 'Norm'}


In [189]:
max_glu_serum = {'Norm':0, '>300':2, 'None':1, '>200':0}

In [57]:
encodings.append(max_glu_serum)

A1Cresult

In [58]:
print(set(X_all[:,22]))

{'>7', 'None', '>8', 'Norm'}


In [59]:
a1c_res = {'>7': 1, 'Norm': 1, '>8': 1, 'None': 0}

In [60]:
encodings.append(a1c_res)

Metformin - Metformin Pioglitazone

In [61]:
print(set(X_all[:,23]))

{'Steady', 'Up', 'No', 'Down'}


In [62]:
drug_change = {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3}

In [63]:
encodings += [drug_change] * 23

Change

In [64]:
print(set(X_all[:,46]))

{'No', 'Ch'}


In [65]:
change = {'No': 0, 'Ch': 1}

In [66]:
encodings.append(change)

Diabetes Medication

In [67]:
print(set(X_all[:,47]))

{'No', 'Yes'}


In [68]:
diabetes_med = {'No': 0, 'Yes': 1}

In [69]:
encodings.append(diabetes_med)

Diagnoses Description

In [70]:
print(set(X_all[:,48]))

{nan, 'Diseases of tricuspid valve', 'Diabetes mellitus without mention of complication, type I [juvenile type], uncontrolled', 'Closed fracture of upper end of forearm, unspecified', 'Urethral stricture due to unspecified infection', 'Intracerebral hemorrhage', 'Basilar artery syndrome', 'Depressive disorder, not elsewhere classified', 'Diabetes with unspecified complication, type II or unspecified type, uncontrolled', 'Corns and callosities', 'Peritoneal adhesions (postoperative) (postinfection)', 'Adjustment disorder with depressed mood', 'Malignant neoplasm of head of pancreas', 'Diabetes with other specified manifestations, type II or unspecified type, not stated as uncontrolled', 'Diabetes with other coma, type II or unspecified type, uncontrolled', 'Poisoning by adrenal cortical steroids', 'Undersocialized conduct disorder, aggressive type, unspecified', 'Abnormal involuntary movements', 'Poisoning by oxazolidine derivatives', 'Occlusion and stenosis of basilar artery without me

In [157]:
diagnoses = sorted([sanitize(s) for s in set(X_all[:,48])|set(X_all[:,49])|set(X_all[:,50])])
#diagnoses_onehot = onehot([str(s) for s in set(X_all[:,48])|set(X_all[:,49])|set(X_all[:,50])], diagnoses, truncate=10)

In [158]:
diagnoses[250]

'diabetes with specified manifestation comma type ii or type comma uncontrolled'

In [159]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [160]:
cvec = CountVectorizer(ngram_range=(1,3))

In [161]:
occmat = cvec.fit_transform(diagnoses).toarray()

In [162]:
lda = LatentDirichletAllocation(n_components=10)

In [163]:
lda = lda.fit(occmat)

In [164]:
from sklearn.decomposition import PCA

In [165]:
pca = PCA(n_components=10)

In [166]:
pca = pca.fit(occmat)

In [110]:
pca.transform(cvec.transform([diagnoses[250]]).toarray())

array([[ 2.14397699, -2.16139624, -1.03667735, -0.45697702,  0.04197052]])

In [111]:
pca.transform(cvec.transform([diagnoses[251]]).toarray())

array([[ 1.92756661, -2.11056592, -1.12475125, -0.17033708,  0.41657061]])

In [112]:
pca.transform(cvec.transform([diagnoses[50]]).toarray())

array([[ 0.04423506, -0.52042471,  0.34910216,  0.27582229, -0.1376241 ]])

In [84]:
X_all_enc = np.zeros((len(X_all), 40))

In [85]:
len(encodings)

40

In [86]:
for i in range(5):
    print(i, encodings[i], X[:,1+i][:5])
    X_all_enc[:,i] = encode(1+i, encodings[i])

0 {'Other': 1, 'Asian': 2, 'Caucasian': 3, 'Hispanic': 4, 'AfricanAmerican': 5, '?': 0} ['AfricanAmerican' 'Caucasian' 'Caucasian' 'AfricanAmerican' 'Caucasian']
1 {'Male': 0, 'Female': 1} ['Male' 'Female' 'Female' 'Female' 'Male']
2 {'[0-10)': 0, '[80-90)': 8, '[60-70)': 6, '[50-60)': 5, '[30-40)': 3, '[20-30)': 2, '[90-100)': 9, '[40-50)': 4, '[70-80)': 7, '[10-20)': 10} ['[60-70)' '[70-80)' '[80-90)' '[50-60)' '[80-90)']
3 {'[25-50)': 25, '[150-175)': 150, '[75-100)': 75, '[0-25)': 0, '[125-150)': 125, '[50-75)': 50, '[100-125)': 100, '?': 73} ['?' '?' '?' '?' '?']
4 {'nan': 0, 'Newborn': 2, 'Emergency': 4, 'Not Mapped': 0, 'Not Available': 0, 'Urgent': 3, 'Elective': 1} ['Emergency' 'Elective' 'Urgent' 'Emergency' 'Elective']


In [87]:
print(5, encodings[5], X[:,8][:5])
X_all_enc[:,5] = encode(8, encodings[5])

5 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [4 1 2 4 13]


In [88]:
for i in range(6):
    print(6+i, encodings[6+i], X[:,11+i][:5])
    X_all_enc[:,6+i] = encode(11+i, encodings[6+i])

6 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 101: 101, 103: 103, 104: 104, 106: 106, 107: 107, 108: 108, 109: 109, 113: 113, 114: 114, 120: 120} [40 24 59 60 54]
7 {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6} [4 1 0 1 6]
8 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 

In [89]:
for i in range(28):
    print(12+i, encodings[12+i], X[:,20+i][:5])
    X_all_enc[:,12+i] = encode(20+i, encodings[12+i])

12 {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9} [9 3 8 5 6]
13 {'Norm': 0, '>300': 3, 'None': 1, '>200': 2} ['None' 'None' 'None' 'None' 'None']
14 {'>7': 2, 'Norm': 0, '>8': 3, 'None': 1} ['None' 'None' 'None' 'None' 'None']
15 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'Steady' 'No' 'No' 'No']
16 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
17 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
18 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
19 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
20 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
21 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
22 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'Steady' 'No' 'No' 'Down']
23 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'No' 'No' 'No' 'No']
24 {'Down': 1, 'No': 0, 'Steady': 2, 'Up': 3} ['No' 'Steady' 'No' 'No' 'No']
25 {'Down': 1, 'No':

In [90]:
X_all_enc[0]

array([ 5.,  0.,  6., 73.,  4.,  4., 40.,  4., 10.,  0.,  0.,  0.,  9.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.])

In [91]:
X_all_enc = np.hstack([X_all_enc, encode(7, admission_src_id, force_string=True), encode(9, payer_code)])

In [92]:
X_all_enc[:2]

array([[ 5.,  0.,  6., 73.,  4.,  4., 40.,  4., 10.,  0.,  0.,  0.,  9.,
         1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 3.,  1.,  7., 73.,  1.,  1., 24.,  1.,  5.,  0.,  0.,  0.,  3.,
         1.,  1.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  2.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
         1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [93]:
from sklearn.preprocessing import Normalizer
X_all_enc = Normalizer().fit_transform(X_all_enc)

In [94]:
X_enc = X_all_enc[:len(X)]
X_val_enc = X_all_enc[len(X):len(X)+len(X_val)]
X_test_enc = X_all_enc[len(X)+len(X_val):]

In [95]:
print(X_enc.shape, X_val_enc.shape, X_test_enc.shape, Y.shape)

(6000, 65) (2000, 65) (2000, 65) (6000,)


In [96]:
np.savetxt("data/diabetes/X_enc.csv", X_enc, delimiter=",")
np.savetxt("data/diabetes/X_val_enc.csv", X_val_enc, delimiter=",")
np.savetxt("data/diabetes/X_test_enc.csv", X_test_enc, delimiter=",")
np.savetxt("data/diabetes/Y.csv", Y, delimiter=",")
np.savetxt("data/diabetes/Y_val.csv", Y_val, delimiter=",")
np.savetxt("data/diabetes/Y_test.csv", Y_test, delimiter=",")

In [150]:
discharge_disposition = [sanitize(str(d), breaks=True) for d in set(X_all[:,6])]
cvec_dis = CountVectorizer()
occmat_dis = cvec_dis.fit_transform(discharge_disposition).toarray()
lda_dis = LatentDirichletAllocation(n_components=3)
lda_dis = lda_dis.fit(occmat_dis)

In [151]:
X_discharge_disposition = np.asarray([lda_dis.transform(cvec_dis.transform([sanitize(d)]).toarray()) 
                                      for d in X_all[:,6]]).reshape(10000, -1)
X_discharge_disposition = Normalizer().fit_transform(X_discharge_disposition)

In [152]:
pickle.dump(X_discharge_disposition[:len(X)], open("data/diabetes/X_discharge_disposition_enc.csv", "wb"))
pickle.dump(X_discharge_disposition[len(X):len(X)+len(X_val)], open("data/diabetes/X_val_discharge_disposition_enc.csv", "wb"))
pickle.dump(X_discharge_disposition[len(X)+len(X_val):], open("data/diabetes/X_test_discharge_disposition_enc.csv", "wb"))

In [173]:
X_diagnoses1_pca = np.asarray([pca.transform(cvec.transform([sanitize(d)]).toarray()) for d in X_all[:,48]])\
    .reshape(10000, -1)
X_diagnoses1_lda = np.asarray([lda.transform(cvec.transform([sanitize(d)]).toarray()) for d in X_all[:,48]])\
    .reshape(10000, -1)

In [175]:
X_diagnoses2_pca = np.asarray([pca.transform(cvec.transform([sanitize(d)]).toarray()) for d in X_all[:,49]])\
    .reshape(10000, -1)
X_diagnoses2_lda = np.asarray([lda.transform(cvec.transform([sanitize(d)]).toarray()) for d in X_all[:,49]])\
    .reshape(10000, -1)

In [176]:
X_diagnoses3_pca = np.asarray([pca.transform(cvec.transform([sanitize(d)]).toarray()) for d in X_all[:,50]])\
    .reshape(10000, -1)
X_diagnoses3_lda = np.asarray([lda.transform(cvec.transform([sanitize(d)]).toarray()) for d in X_all[:,50]])\
    .reshape(10000, -1)

In [177]:
X_diagnoses_pca = np.concatenate([X_diagnoses1_pca, X_diagnoses2_pca, X_diagnoses3_pca], axis=1)
X_diagnoses_lda = np.concatenate([X_diagnoses1_lda, X_diagnoses2_lda, X_diagnoses3_lda], axis=1)
X_diagnoses_pca = Normalizer().fit_transform(X_diagnoses_pca)
X_diagnoses_lda = Normalizer().fit_transform(X_diagnoses_lda)

In [178]:
pickle.dump(X_diagnoses_pca[:len(X)], open("data/diabetes/X_diagnoses_pca.csv", "wb"))
pickle.dump(X_diagnoses_pca[len(X):len(X)+len(X_val)], open("data/diabetes/X_val_diagnoses_pca.csv", "wb"))
pickle.dump(X_diagnoses_pca[len(X)+len(X_val):], open("data/diabetes/X_test_diagnoses_pca.csv", "wb"))
pickle.dump(X_diagnoses_lda[:len(X)], open("data/diabetes/X_diagnoses_lda.csv", "wb"))
pickle.dump(X_diagnoses_lda[len(X):len(X)+len(X_val)], open("data/diabetes/X_val_diagnoses_lda.csv", "wb"))
pickle.dump(X_diagnoses_lda[len(X)+len(X_val):], open("data/diabetes/X_test_diagnoses_lda.csv", "wb"))