# Changes made by the team
- `glove` was not available, so `mittens` had to be installed instead
  - `from mittens import GloVe as glove`
- The calls to access the `Word2vec` and `FastText` models needed to be updated
  - `w2vec[...]` changed to `w2vec.wv[...]` and `fasttext[...]` changed to `fasttext.wv[...]`
- In Python 3, `map` returns a lazy iterator rather than a list, so it can no longer be passed directly to `np.asarray`
  - So, `t = np.asarray(map(mean, zip(*avg)))` changed to `t = np.asarray(list(map(mean, zip(*avg))))`
- Accessing `FastText` elements raised an `IndexError`
  - These calls are now wrapped in a `try` block
- Added in two new embedding techniques `ClinicalBERT` and `BlueBERT`
- Variables saved into `new_ner_XXXX_limited_dict.pkl` were not being calculated correctly.
  - Only the keys common to all of the dictionaries should be saved, so every dictionary should have the same length before saving.
- Changes to newlines / spacings etc.

In [1]:
!pip install transformers




In [2]:

import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
from mittens import GloVe as glove
from tqdm import tqdm

import torch
import collections
import gc 

from transformers import AutoTokenizer, AutoModel

import warnings
warnings.filterwarnings('ignore')

## Specify what to Run

In [3]:
# Switches selecting which embedding sections of this notebook are executed.
# Each flag guards the `if run_...:` cell of the same name further down.
run_word2vec     = True
run_fastText     = True
run_combined     = True
run_blueBERT     = False
run_clinicalBERT = False

In [4]:
# Table of notes with their extracted entities; the cells below read its
# `SUBJECT_ID` and `ner` fields.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
new_notes = pd.read_pickle("data/ner_df.p") # med7

# The combined section looks vectors up in BOTH models, so each model must also
# be loaded when only `run_combined` is enabled (previously that configuration
# failed with a NameError on `w2vec` / `fasttext`).
if run_word2vec or run_combined: w2vec = Word2Vec.load("embeddings/word2vec.model")
if run_fastText or run_combined: fasttext = FastText.load("embeddings/fasttext.model")

In [5]:
# Gather the index labels of all rows whose `ner` field is empty, then remove
# those rows from `new_notes` in place.
null_index_list = [row.Index for row in new_notes.itertuples() if len(row.ner) == 0]
new_notes.drop(null_index_list, inplace=True)

In [6]:
# Group the extracted entities by patient: SUBJECT_ID -> one flat list holding
# the entity records of all of that patient's rows, in row order.
med7_ner_data = {}

for ii in new_notes.itertuples():

    p_id = ii.SUBJECT_ID

    # `ner` is a two-level nesting (groups of entity records); flatten it.
    # The row tuple already carries `ner`, so no second `.loc` lookup is needed
    # (the old lookup was wrapped in a bare `except` that hid any failure and
    # returned a differently shaped object when index labels repeat).
    new_temp = [k for j in ii.ner for k in j]

    # Create the patient's list on first sight, then append this row's entities.
    med7_ner_data.setdefault(p_id, []).extend(new_temp)

In [7]:
# Persist the per-patient entity dictionary built above.
pd.to_pickle(med7_ner_data, "data/new_ner_word_dict.pkl")

In [8]:
def mean(a):
    """Return the arithmetic mean of the items in the sized collection ``a``.

    Raises ZeroDivisionError when ``a`` is empty.
    """
    total = sum(a)
    count = len(a)
    return total / count

In [9]:
# Parallel lists consumed together via zip() by every embedding section below:
# the dictionaries to embed and a label for each one.
data_types = [med7_ner_data]
data_names = ["new_ner"]

## Running Word2Vec

In [10]:
if run_word2vec:
    print("w2vec starting..")
    for data, names in zip(data_types, data_names):

        # Patient id -> list of Word2Vec vectors, one per entity that could be
        # embedded. Patients with no embeddable entity are left out.
        new_word2vec = {}
        for k, v in tqdm(data.items()):

            patient_temp = []
            for i in v:
                try:
                    # Fast path: the whole entity text is a vocabulary entry.
                    patient_temp.append(w2vec.wv[i[0]])
                    continue
                except KeyError:
                    # Out of vocabulary; fall back to averaging its words below.
                    # Only KeyError is caught so that other failures (and
                    # Ctrl-C) are no longer silently swallowed.
                    pass

                words = i[0].split(" ")
                if len(words) <= 1:
                    # A single unknown token has nothing to fall back on.
                    continue

                word_vectors = []
                for each_word in words:
                    try:
                        word_vectors.append(w2vec.wv[each_word])
                    except KeyError:
                        pass  # skip words the model does not know
                if not word_vectors:
                    continue

                # Element-wise mean of the known words' vectors.
                patient_temp.append(np.mean(word_vectors, axis=0))

            if len(patient_temp) == 0:
                continue
            new_word2vec[k] = patient_temp

    print("w2vec finished")

w2vec starting..


100%|███████████████████████████████████| 22446/22446 [00:12<00:00, 1746.59it/s]

w2vec finished





## Running FastText

In [11]:
if run_fastText:
    print("fasttext starting..")
    for data, names in zip(data_types, data_names):

        # Patient id -> list of FastText vectors, one per entity that could be
        # embedded. Patients with no embeddable entity are left out.
        new_fasttextvec = {}

        for k, v in tqdm(data.items()):

            patient_temp = []

            for i in v:
                try:
                    patient_temp.append(fasttext.wv[i[0]])
                except (KeyError, IndexError):
                    # Entity the model cannot embed: skip it. The notebook's
                    # change notes report IndexError from these lookups;
                    # KeyError is included on the assumption that other gensim
                    # versions report a missing key that way. A bare `except`
                    # would also swallow Ctrl-C and genuine bugs.
                    pass
            if len(patient_temp) == 0:
                continue
            new_fasttextvec[k] = patient_temp

    print("fasttext finished")

fasttext starting..


100%|██████████████████████████████████| 22446/22446 [00:01<00:00, 13036.48it/s]

fasttext finished





## Running Combined - Word2Vec & FastText

In [12]:
if run_combined:
    print("combined starting..")

    # Placeholder vectors for entities a model cannot embed. Their length comes
    # from the models instead of a hard-coded 100, and they are float32 so the
    # concatenated vectors keep one dtype (the old `[0] * 100` integer lists
    # changed the dtype of the result whenever a placeholder was used).
    w2vec_zero = np.zeros(w2vec.wv.vector_size, dtype=np.float32)
    fasttext_zero = np.zeros(fasttext.wv.vector_size, dtype=np.float32)

    for data, names in zip(data_types, data_names):

        # Patient id -> list of concatenated [FastText | Word2Vec] vectors,
        # one per entity (every entity gets a vector, possibly zero-filled).
        new_concatvec = {}

        for k, v in tqdm(data.items()):

            patient_temp = []
            for i in v:
                # --- Word2Vec half ---
                try:
                    w2vec_temp = w2vec.wv[i[0]]
                except KeyError:
                    # Whole entity text is out of vocabulary: average the
                    # vectors of whichever of its words are known.
                    word_vectors = []
                    for each_word in i[0].split(" "):
                        try:
                            word_vectors.append(w2vec.wv[each_word])
                        except KeyError:
                            pass  # skip words the model does not know
                    if word_vectors:
                        w2vec_temp = np.mean(word_vectors, axis=0)
                    else:
                        # Single unknown token, or no known word at all.
                        w2vec_temp = w2vec_zero

                # --- FastText half ---
                try:
                    fasttemp = fasttext.wv[i[0]]
                except (KeyError, IndexError):
                    # The notebook's change notes report IndexError from these
                    # lookups; KeyError is included on the assumption that
                    # other gensim versions report a missing key that way.
                    fasttemp = fasttext_zero

                # FastText first, then Word2Vec (same order as before).
                appended = np.concatenate([fasttemp, w2vec_temp])
                patient_temp.append(appended)

            if len(patient_temp) == 0:
                continue
            new_concatvec[k] = patient_temp

    print("combined finished")



combined starting..


100%|███████████████████████████████████| 22446/22446 [00:06<00:00, 3315.09it/s]

combined finished





## Running BlueBERT

In [13]:
if run_blueBERT:

    tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12")
    model = AutoModel.from_pretrained("bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12")

    # Maximum number of entity texts sent through the model at once. It was
    # declared before but never applied, so a patient with many entities was
    # encoded as one arbitrarily large batch.
    batch_size = 128

    print("BlueBERT starting..")
    for data, names in zip(data_types, data_names):

        # Patient id -> array with one row per entity (the [CLS] embedding).
        new_bluebert = {}

        for k, v in tqdm(data.items()):
            input_texts = [i[0] for i in v]

            # Skip patients without entities BEFORE tokenizing; the former
            # emptiness check only ran after the model call.
            if len(input_texts) == 0:
                continue

            embeddings = []
            for start in range(0, len(input_texts), batch_size):
                encoded_inputs = tokenizer(input_texts[start:start + batch_size],
                                           add_special_tokens=True,
                                           return_tensors='pt',
                                           padding=True,
                                           truncation=True)
                with torch.no_grad():
                    output = model(**encoded_inputs).last_hidden_state

                # Position 0 of each sequence is the [CLS] token.
                embeddings.extend(output[:, 0, :].tolist())

            # Store as a NumPy array, as the final dictionary did before.
            new_bluebert[k] = np.array(embeddings)

    print("BlueBERT finished")

## Running ClinicalBERT

In [14]:
if run_clinicalBERT:

    tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
    model = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

    # Maximum number of entity texts sent through the model at once. It was
    # declared before but never applied, so a patient with many entities was
    # encoded as one arbitrarily large batch.
    batch_size = 128

    print("ClinicalBERT starting..")
    for data, names in zip(data_types, data_names):

        # Patient id -> array with one row per entity (the [CLS] embedding).
        new_clinicalbert = {}

        for k, v in tqdm(data.items()):
            input_texts = [i[0] for i in v]

            # Skip patients without entities BEFORE tokenizing; the former
            # emptiness check only ran after the model call.
            if len(input_texts) == 0:
                continue

            embeddings = []
            for start in range(0, len(input_texts), batch_size):
                encoded_inputs = tokenizer(input_texts[start:start + batch_size],
                                           add_special_tokens=True,
                                           return_tensors='pt',
                                           padding=True,
                                           truncation=True)
                with torch.no_grad():
                    output = model(**encoded_inputs).last_hidden_state

                # Position 0 of each sequence is the [CLS] token.
                embeddings.extend(output[:, 0, :].tolist())

            # Store as a NumPy array, as the final dictionary did before.
            new_clinicalbert[k] = np.array(embeddings)

    print("ClinicalBERT finished")

## Save the results

In [15]:
# Write each embedding dictionary to disk, but only for the sections that were
# enabled (and therefore computed) in this run.
if run_word2vec:     pd.to_pickle(new_word2vec,     "data/new_ner_word2vec_dict.pkl")
if run_fastText:     pd.to_pickle(new_fasttextvec,  "data/new_ner_fasttext_dict.pkl")
if run_combined:     pd.to_pickle(new_concatvec,    "data/new_ner_combined_dict.pkl")
if run_blueBERT:     pd.to_pickle(new_bluebert,     "data/new_ner_bluebert_dict.pkl")
if run_clinicalBERT: pd.to_pickle(new_clinicalbert, "data/new_ner_clinicalbert_dict.pkl")

In [16]:
# Read the Word2Vec, FastText and combined dictionaries back from disk.
# NOTE(review): these reads are unconditional, so the three pickle files must
# already exist even when the matching run_* flag is False - confirm intended.
new_word2vec     = pd.read_pickle("data/new_ner_word2vec_dict.pkl")
print("new_word2vec loaded")
new_fasttextvec  = pd.read_pickle("data/new_ner_fasttext_dict.pkl")
print("new_fasttextvec loaded")
new_concatvec    = pd.read_pickle("data/new_ner_combined_dict.pkl")
print("new_concatvec loaded")
# The BlueBERT dictionary is not reloaded here (left disabled):
# new_bluebertvec  = pd.read_pickle("data/new_ner_bluebert_dict.pkl")
# print("new_bluebert loaded")

new_word2vec loaded
new_fasttextvec loaded
new_concatvec loaded


In [17]:
if run_word2vec and run_fastText and run_combined:

    # Key sets of the three dictionaries, and the keys present in all of them.
    new_fasttextvec_keys  = set(new_fasttextvec)
    new_word2vec_keys     = set(new_word2vec)
    new_concatvec_keys    = set(new_concatvec)
    intersection_keys     = new_fasttextvec_keys & new_word2vec_keys & new_concatvec_keys

    sizes_before = (len(new_word2vec), len(new_fasttextvec), len(new_concatvec))
    print("Lengths before: {}, {}, {}".format(*sizes_before))

    # Remove, in place, every key that is not shared by all three dictionaries.
    for vectors, key_set in ((new_fasttextvec, new_fasttextvec_keys),
                             (new_word2vec,    new_word2vec_keys),
                             (new_concatvec,   new_concatvec_keys)):
        for i in key_set - intersection_keys:
            del vectors[i]

    sizes_after = (len(new_word2vec), len(new_fasttextvec), len(new_concatvec))
    print("Lengths after:  {}, {}, {}".format(*sizes_after))

    

Lengths before: 22203, 22025, 22446
Lengths after:  22025, 22025, 22025


In [18]:
# Write the three dictionaries under the "*_limited_dict.pkl" names.
# NOTE(review): this cell is not guarded by the run_* flags, so if the
# key-intersection step above was skipped the dictionaries are saved
# unfiltered - confirm that is acceptable.
pd.to_pickle(new_word2vec,    "data/new_ner_word2vec_limited_dict.pkl")
print("new_word2vec saved")
pd.to_pickle(new_fasttextvec, "data/new_ner_fasttext_limited_dict.pkl")
print("new_fasttextvec saved")
pd.to_pickle(new_concatvec,   "data/new_ner_combined_limited_dict.pkl")
print("new_concatvec saved")

print("done with writing the files")

new_word2vec saved
new_fasttextvec saved
new_concatvec saved
done with writing the files
