In [100]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
print(gensim.__version__)

4.2.0


In [101]:
# STEP 1: load Note datasets
df = pd.read_csv('data/filtered.csv')

In [102]:
df.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,expire_flag,text
0,291,125726,275109,0,Briefly this is a 72 year old woman with prior...
1,291,125726,275109,0,Briefly this is a 72 year old woman with prior...
2,422,117029,299666,0,"Pt is a 57 y/p male with h/o afib, LE edema, w..."
3,671,126769,246119,0,"Pt admitted from ED at 0530, upon which he was..."
4,698,171990,229997,0,89 year old patient admitted from the EW with ...


In [103]:
messages=df.filter(items=['expire_flag','text' ])
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,0,Briefly this is a 72 year old woman with prior...
1,0,Briefly this is a 72 year old woman with prior...
2,0,"Pt is a 57 y/p male with h/o afib, LE edema, w..."
3,0,"Pt admitted from ED at 0530, upon which he was..."
4,0,89 year old patient admitted from the EW with ...


In [104]:
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [105]:
messages.head()

Unnamed: 0,label,text,text_clean
0,0,Briefly this is a 72 year old woman with prior...,"[briefly, this, is, year, old, woman, with, pr..."
1,0,Briefly this is a 72 year old woman with prior...,"[briefly, this, is, year, old, woman, with, pr..."
2,0,"Pt is a 57 y/p male with h/o afib, LE edema, w...","[pt, is, male, with, afib, le, edema, who, pre..."
3,0,"Pt admitted from ED at 0530, upon which he was...","[pt, admitted, from, ed, at, upon, which, he, ..."
4,0,89 year old patient admitted from the EW with ...,"[year, old, patient, admitted, from, the, ew, ..."


In [106]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

X_train

124      [pt, with, known, cad, now, with, recurrent, a...
12996    [age, over, yo, with, cad, cabg, in, and, hosp...
4305     [patient, is, admitted, from, outside, hospita...
373      [yo, ma, presented, to, hospital, location, un...
684      [see, fhp, for, full, pmh, pt, is, yr, old, wo...
11919    [hpi, yom, acute, onset, sided, weakness, flac...
2970     [the, patient, was, discharged, yesterday, fro...
432      [with, history, of, cerebral, palsy, bilateral...
2942     [yo, motorcycle, crash, vs, pole, no, helmet, ...
8991     [yo, female, with, history, of, colon, ca, pre...
12084    [cc, abdominal, pain, at, ventral, hernia, hpi...
9589     [this, yr, old, cantonese, speaking, woman, wa...
1865     [pt, admitted, after, having, sudden, onset, o...
5775     [year, old, male, with, past, alzheimer, and, ...
9051     [pt, yo, male, with, pmh, dm, insulin, imi, wi...
7490     [pt, found, down, by, neighbor, brought, to, n...
9017     [aortic, dissection, assessment, weaned, off, .

In [107]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=300,window=5,
                                   min_count=2,sg = 1)

In [130]:
w2v_model.wv.most_similar('managing') 
print(len(X_train))


11268


In [145]:
words = set(w2v_model.wv.index_to_key)






In [146]:
print(words)

{'intepreter', 'vhpi', 'ssent', 'end', 'oddly', 'marked', 'exc', 'mix', 'redrawn', 'bms', 'writhing', 'protocal', 'mrcp', 'ecmo', 'spinous', 'thump', 'tranfer', 'colloid', 'vvs', 'breasts', 'hydrocephalus', 'abodmen', 'dextrose', 'drsgs', 'dates', 'myasthenia', 'bolemia', 'tremuolous', 'throat', 'slng', 'testicular', 'cirrohsis', 'span', 'om', 'undertreated', 'discontinue', 'benadryl', 'needed', 'terms', 'ined', 'angiographic', 'lyme', 'hepc', 'meconium', 'rheumatoid', 'keratitis', 'opthamologist', 'eruption', 'virl', 'stents', 'three', 'hiatal', 'patter', 'nrbm', 'papyracea', 'samonella', 'clindymacin', 'disorder', 'harvest', 'igm', 'improved', 'morni', 'interventions', 'reprots', 'schistocytes', 'city', 'paraplegic', 'mandibulectomy', 'transphenodial', 'multiform', 'succus', 'inr', 'sinuses', 'zofram', 'remaind', 'dysarthric', 'diminish', 'musk', 'podus', 'jumps', 'posture', 'phosphenytoin', 'lainectomy', 'subtherapeutic', 'extremeties', 'upright', 'step', 'epigastrum', 'oopherectomy

In [147]:
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train],dtype=object)
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test],dtype=object)

In [148]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(300, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(300, dtype=float))

In [149]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

319 300
325 300
95 300
64 300
122 300
289 300
131 300
134 300
163 300
49 300
251 300
243 300
162 300
66 300
137 300
356 300
87 300
278 300
270 300
257 300
169 300
104 300
103 300
193 300
500 300
11 300
150 300
261 300
127 300
342 300
287 300
240 300
282 300
402 300
72 300
180 300
225 300
85 300
118 300
160 300
254 300
238 300
416 300
215 300
221 300
1 300
63 300
293 300
154 300
82 300
147 300
395 300
235 300
499 300
133 300
158 300
175 300
18 300
202 300
150 300
368 300
639 300
153 300
344 300
46 300
165 300
54 300
320 300
238 300
252 300
107 300
125 300
178 300
462 300
160 300
496 300
139 300
174 300
130 300
40 300
278 300
9 300
147 300
8 300
82 300
233 300
284 300
382 300
295 300
127 300
35 300
197 300
27 300
179 300
250 300
269 300
280 300
278 300
132 300
79 300
502 300
152 300
147 300
14 300
348 300
79 300
435 300
387 300
173 300
116 300
295 300
124 300
151 300
71 300
38 300
239 300
358 300
390 300
157 300
156 300
253 300
212 300
196 300
164 300
261 300
229 300
267 300
309 300
153 

314 300
81 300
147 300
98 300
168 300
112 300
133 300
209 300
48 300
11 300
3 300
108 300
213 300
739 300
98 300
234 300
67 300
160 300
181 300
182 300
17 300
145 300
249 300
549 300
109 300
391 300
358 300
580 300
123 300
267 300
354 300
353 300
295 300
312 300
107 300
111 300
463 300
393 300
37 300
152 300
115 300
62 300
127 300
94 300
177 300
67 300
488 300
313 300
200 300
230 300
67 300
202 300
75 300
54 300
216 300
251 300
184 300
422 300
92 300
173 300
396 300
321 300
211 300
386 300
374 300
123 300
215 300
67 300
215 300
223 300
208 300
150 300
215 300
455 300
385 300
224 300
325 300
111 300
207 300
182 300
438 300
261 300
82 300
73 300
70 300
271 300
393 300
19 300
162 300
83 300
508 300
393 300
207 300
73 300
244 300
473 300
367 300
72 300
209 300
183 300
210 300
143 300
149 300
82 300
197 300
254 300
333 300
164 300
217 300
158 300
443 300
68 300
105 300
110 300
103 300
83 300
345 300
161 300
7 300
131 300
14 300
27 300
213 300
207 300
166 300
333 300
67 300
177 300
7 300
188

259 300
168 300
356 300
182 300
81 300
181 300
110 300
562 300
295 300
187 300
77 300
69 300
191 300
510 300
179 300
420 300
82 300
314 300
177 300
72 300
340 300
245 300
162 300
226 300
217 300
194 300
177 300
147 300
298 300
83 300
244 300
195 300
80 300
10 300
179 300
157 300
196 300
285 300
325 300
369 300
236 300
432 300
259 300
156 300
60 300
156 300
281 300
258 300
108 300
197 300
505 300
84 300
195 300
243 300
33 300
7 300
161 300
241 300
245 300
116 300
66 300
116 300
190 300
156 300
241 300
116 300
202 300
312 300
153 300
332 300
101 300
142 300
215 300
123 300
131 300
324 300
338 300
299 300
307 300
34 300
262 300
175 300
403 300
297 300
288 300
356 300
131 300
229 300
136 300
63 300
313 300
155 300
169 300
310 300
244 300
224 300
276 300
133 300
155 300
222 300
15 300
641 300
260 300
85 300
322 300
75 300
73 300
362 300
220 300
196 300
429 300
22 300
147 300
173 300
278 300
190 300
269 300
228 300
412 300
192 300
217 300
180 300
101 300
276 300
127 300
109 300
263 300
189 3

207 300
178 300
158 300
141 300
221 300
108 300
741 300
39 300
204 300
136 300
282 300
116 300
193 300
306 300
325 300
155 300
57 300
618 300
199 300
100 300
97 300
94 300
328 300
267 300
297 300
301 300
174 300
385 300
364 300
71 300
182 300
110 300
180 300
223 300
364 300
289 300
228 300
117 300
330 300
83 300
9 300
206 300
31 300
236 300
50 300
272 300
224 300
26 300
253 300
112 300
112 300
260 300
330 300
222 300
230 300
305 300
142 300
460 300
415 300
166 300
242 300
84 300
150 300
136 300
373 300
7 300
33 300
210 300
118 300
76 300
343 300
170 300
457 300
119 300
204 300
91 300
386 300
157 300
43 300
290 300
184 300
57 300
208 300
119 300
561 300
273 300
256 300
357 300
87 300
128 300
172 300
27 300
340 300
244 300
68 300
353 300
240 300
237 300
173 300
65 300
234 300
285 300
151 300
154 300
109 300
108 300
152 300
271 300
374 300
158 300
167 300
105 300
193 300
119 300
391 300
99 300
294 300
233 300
160 300
324 300
267 300
442 300
260 300
6 300
154 300
93 300
113 300
339 300
54 

234 300
273 300
88 300
388 300
468 300
222 300
276 300
240 300
208 300
242 300
193 300
502 300
90 300
130 300
133 300
111 300
131 300
293 300
291 300
21 300
151 300
123 300
487 300
197 300
102 300
55 300
372 300
162 300
119 300
295 300
237 300
153 300
199 300
172 300
273 300
261 300
167 300
232 300
509 300
33 300
281 300
197 300
259 300
183 300
545 300
341 300
246 300
112 300
219 300
280 300
176 300
145 300
139 300
161 300
196 300
223 300
71 300
162 300
177 300
25 300
103 300
9 300
192 300
198 300
118 300
434 300
194 300
615 300
147 300
253 300
97 300
12 300
307 300
367 300
261 300
60 300
158 300
336 300
238 300
335 300
229 300
104 300
12 300
285 300
98 300
251 300
213 300
289 300
193 300
60 300
48 300
210 300
262 300
124 300
310 300
337 300
350 300
506 300
379 300
134 300
199 300
347 300
270 300
81 300
85 300
157 300
165 300
135 300
11 300
208 300
165 300
248 300
165 300
125 300
556 300
243 300
346 300
192 300
156 300
150 300
345 300
223 300
197 300
376 300
333 300
431 300
154 300
102

100 300
364 300
85 300
224 300
122 300
277 300
420 300
87 300
10 300
93 300
227 300
174 300
224 300
58 300
101 300
296 300
35 300
294 300
218 300
520 300
204 300
208 300
100 300
468 300
292 300
54 300
654 300
29 300
340 300
80 300
224 300
544 300
337 300
318 300
53 300
370 300
627 300
76 300
125 300
277 300
178 300
11 300
100 300
103 300
186 300
220 300
339 300
163 300
82 300
267 300
141 300
372 300
318 300
28 300
678 300
363 300
65 300
210 300
182 300
18 300
114 300
362 300
246 300
443 300
38 300
204 300
11 300
244 300
386 300
377 300
144 300
121 300
121 300
102 300
322 300
59 300
265 300
96 300
165 300
139 300
425 300
25 300
158 300
438 300
200 300
180 300
283 300
139 300
292 300
110 300
229 300
223 300
199 300
385 300
274 300
287 300
155 300
587 300
66 300
50 300
85 300
8 300
164 300
372 300
196 300
454 300
137 300
375 300
539 300
19 300
309 300
231 300
160 300
170 300
235 300
398 300
35 300
253 300
209 300
175 300
117 300
111 300
109 300
148 300
131 300
276 300
431 300
184 300
247 

129 300
219 300
163 300
105 300
89 300
158 300
130 300
220 300
23 300
41 300
117 300
507 300
233 300
159 300
167 300
12 300
105 300
313 300
307 300
101 300
116 300
381 300
121 300
378 300
154 300
109 300
118 300
127 300
287 300
15 300
175 300
342 300
19 300
68 300
342 300
14 300
290 300
77 300
311 300
188 300
20 300
17 300
134 300
137 300
229 300
295 300
68 300
225 300
335 300
130 300
203 300
190 300
18 300
122 300
322 300
180 300
139 300
64 300
227 300
245 300
248 300
184 300
117 300
147 300
268 300
176 300
165 300
52 300
208 300
266 300
312 300
204 300
444 300
291 300
104 300
70 300
137 300
252 300
164 300
310 300
249 300
416 300
130 300
217 300
47 300
318 300
94 300
197 300
359 300
342 300
53 300
229 300
182 300
65 300
129 300
27 300
404 300
120 300
89 300
11 300
349 300
115 300
155 300
602 300
173 300
196 300
164 300
290 300
85 300
262 300
403 300
14 300
86 300
91 300
104 300
66 300
161 300
70 300
288 300
125 300
147 300
358 300
164 300
642 300
76 300
288 300
353 300
76 300
262 300

206 300
182 300
6 300
323 300
318 300
75 300
361 300
15 300
106 300
273 300
547 300
77 300
231 300
460 300
387 300
339 300
169 300
45 300
192 300
407 300
143 300
197 300
377 300
119 300
86 300
113 300
259 300
288 300
192 300
155 300
128 300
192 300
379 300
211 300
85 300
34 300
238 300
282 300
102 300
111 300
401 300
275 300
63 300
266 300
121 300
816 300
169 300
130 300
185 300
366 300
531 300
200 300
116 300
401 300
227 300
132 300
243 300
164 300
420 300
428 300
358 300
468 300
16 300
394 300
170 300
219 300
358 300
279 300
74 300
320 300
148 300
69 300
142 300
280 300
297 300
124 300
174 300
506 300
184 300
87 300
264 300
201 300
230 300
273 300
71 300
228 300
75 300
270 300
98 300
131 300
329 300
200 300
271 300
247 300
184 300
23 300
54 300
161 300
233 300
207 300
240 300
88 300
121 300
83 300
8 300
211 300
343 300
11 300
244 300
313 300
199 300
150 300
316 300
57 300
328 300
460 300
338 300
96 300
82 300
145 300
129 300
186 300
396 300
364 300
138 300
195 300
249 300
149 300
62 

358 300
340 300
145 300
15 300
98 300
367 300
471 300
271 300
48 300
242 300
151 300
240 300
135 300
10 300
154 300
76 300
254 300
363 300
163 300
67 300
309 300
11 300
680 300
8 300
59 300
93 300
46 300
298 300
271 300
217 300
158 300
58 300
230 300
205 300
435 300
102 300
121 300
171 300
109 300
10 300
264 300
78 300
233 300
76 300
172 300
85 300
182 300
213 300
118 300
420 300
238 300
211 300
231 300
70 300
210 300
275 300
97 300
239 300
342 300
283 300
212 300
170 300
315 300
329 300
106 300
279 300
330 300
171 300
211 300
270 300
234 300
432 300
181 300
93 300
41 300
109 300
232 300
259 300
91 300
295 300
205 300
82 300
41 300
407 300
150 300
179 300
119 300
210 300
161 300
36 300
239 300
236 300
479 300
206 300
9 300
156 300
204 300
190 300
126 300
404 300
275 300
337 300
387 300
14 300
164 300
366 300
264 300
69 300
18 300
56 300
358 300
469 300
178 300
137 300
324 300
169 300
75 300
94 300
164 300
15 300
151 300
131 300
138 300
104 300
293 300
102 300
148 300
387 300
205 300
22

In [150]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()


In [151]:
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_store_unique_indices = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.

In [153]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)


In [154]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.904 / Recall: 0.252 / Accuracy: 0.928
