# Word2Vec Embeddings

In [2]:
import numpy as np
import pandas as pd
import csv
import pickle
import time
import math
import collections
from tqdm import tqdm

## Step 1: Read in Data

In [3]:
dirPath = '/home/ubuntu/Biomed-Data-Science-NLP-Project/Data/'
patientData_filepath = dirPath + 'B220_SAA_v1.csv'
CCSR_filepath = dirPath + 'ICD_to_CCSR_20201_1.csv'

In [4]:
def read_csv_to_dict(file_path: str, key: int, value: int):
    ret_dict = {}
    with open(file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        for row in data:
            ret_dict[row[key][1:-1]] = row[value][1:-1] # "'icd_code'"
    print("Reading {} complete!".format(file_path))
    return ret_dict

In [5]:
icd_to_ccsr_code = read_csv_to_dict(CCSR_filepath, key=0, value=6) # 'icd_code' -> 'CCSR_code'
ccsr_codes = list(np.unique(list(icd_to_ccsr_code.values())))
ccsr_codes.remove('CCSR CATEGORY 1')

ccsr_code_to_index = collections.defaultdict(int)
for i, ccsr_code in enumerate(ccsr_codes):
    ccsr_code_to_index[ccsr_code] = i
    
print("{} CCSR categories".format(len(ccsr_code_to_index.keys())))

Reading /home/ubuntu/Biomed-Data-Science-NLP-Project/Data/ICD_to_CCSR_20201_1.csv complete!
519 CCSR categories


In [6]:
s1 = time.time()
icd_codes_table = pd.read_csv(patientData_filepath, usecols=range(16,41))
s2 = time.time()
print("Read data in {} minutes".format((s2-s1)/60))
corpus = [[elem for elem in row if type(elem) == str] for row in icd_codes_table.values.tolist()]
# corpus = [[icd1, icd2, icd3], [icd1, icd2],...]
s3 = time.time()
print("Corpus complete in {} minutes".format((s3-s1)/60))

Read data in 1.8089871327082316 minutes
Corpus complete in 5.469060230255127 minutes


## Strep 2: Define hyperparameters

In [7]:
settings = {
    'window_size': 2, # num of context words to left or right
    'epochs': 1, # each epoch loops through all training samples
    'learning_rate': 0.01,
    'icd_to_ccsr': icd_to_ccsr_code,
}

 ## Step 3: Create Word2Vec model

Implementation comes from: https://towardsdatascience.com/an-implementation-guide-to-word2vec-using-numpy-and-google-sheets-13445eebd281

In [8]:
class word2vec():
    def __init__(self, embed_size):
        self.n = embed_size # size of embedding
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']
        self.icd_to_ccsr = settings['icd_to_ccsr']
        
    def createIndexes(self, corpus):
        # Find unique word counts using dictonary
        word_counts = collections.defaultdict(int)
        invalid_codes = []
        for i, sentence in tqdm(enumerate(corpus)):
            # Ignore sentences with 1 token
            if len(sentence) == 1: continue
            for ICD_code in sentence:
                try:
                    ccsr_code = self.icd_to_ccsr[ICD_code]
                    word_counts[ccsr_code] += 1
                except:
                    invalid_codes.append(ICD_code)
#             if i % 10000000 == 0:
#                 print("Processed {} million visits".format(i/1000000))
        print("Processed {} million visits".format(len(corpus)/1000000))
        print("Found {} invalid ICD codes".format(len(invalid_codes)))
        # Save invalid codes
        self.invalid_codes = invalid_codes
        # How many unique words in vocab?
        self.v_count = len(word_counts.keys())
        # Generate Lookup Dictionaries (vocab)
        self.words_list = list(word_counts.keys())
        # Generate word:index
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        # Generate index:word
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))

    def run(self, settings, corpus, w1, w2):
        self.createIndexes(corpus)
        
        # Initialize weight matrices
        self.w1 = w1#np.random.uniform(-1, 1, (self.v_count, self.n)) # (502, 100)
        self.w2 = w2#np.random.uniform(-1, 1, (self.n, self.v_count)) # (100, 502)
        
        # Cycle through each epoch
        for i in tqdm(range(self.epochs)):
            # Intialise loss to 0
            self.loss = 0
            
            # Cycle through each sentence in corpus
            for s, inSentence in enumerate(corpus):
                sentence = self.sentenceToCCSR(inSentence) # ICD -> CCSR
                sent_len = len(sentence)
                # Ignore patient vists with 1 ICD code because there is no context
                if sent_len == 1: continue
                # Cycle through each word in sentence
                for i, word in enumerate(sentence):
                    # Convert target word to one-hot
                    w_target = self.word2onehot(sentence[i])
                    # Cycle through context window
                    w_context = []
                    # Note: window_size 2 will have range of 5 values
                    for j in range(i - self.window, i + self.window+1):
                        # Criteria for context word 
                        # 1. Target word cannot be context word (j != i)
                        # 2. Index must be greater or equal than 0 (j >= 0) - if not list index out of range
                        # 3. Index must be less or equal than length of sentence (j <= sent_len-1) - if not list index out of range 
                        if j != i and j <= sent_len-1 and j >= 0:
                            # Append the one-hot representation of word to w_context
                            w_context.append(self.word2onehot(sentence[j]))
                    
                    # Forward pass - Pass in vector for target word (w_t) to get:
                    # 1. predicted y using softmax (y_pred) 2. matrix of hidden layer (h) 3. output layer before softmax (u)
                    y_pred, h, u = self.forward_pass(w_target)

                    # Calculate error
                    # 1. For a target word, calculate difference between y_pred and each of the context words
                    # 2. Sum up the differences using np.sum to give us the error for this particular target word
                    err = np.sum([np.subtract(y_pred, word) for word in w_context], axis=0)

                    # Backpropagation
                    # We use SGD to backpropagate errors - calculate loss on the output layer 
                    self.backprop(err, h, w_target)

                    # Calculate loss
                    # There are 2 parts to the loss function
                    # Part 1: -ve sum of all the output +
                    # Part 2: length of context words * log of sum for all elements (exponential-ed) in the output layer before softmax (u)
                    # Note: word.index(1) returns the index in the context word vector with value 1
                    # Note: u[word.index(1)] returns the value of the output layer before softmax
                    self.loss += -np.sum([u[word.index(1)] for word in w_context]) + len(w_context) * np.log(np.sum(np.exp(u)))
                if s % 100000 == 0:
                    print("Processed {} million patient vists".format(s/1000000))
            print('Epoch: {}, Loss: {}'.format(i, self.loss))

            
    def word2onehot(self, word):
        # word_vec - initialise a blank vector
        word_vec = [0 for i in range(0, self.v_count)] # Alternative np.zeros(self.v_count)
        # Get ID of word from word_index
        word_index = self.word_index[word]
        # Change value from 0 to 1 according to ID of the word
        word_vec[word_index] = 1
        return word_vec
    
    def sentenceToCCSR(self, sentence):
        ccsrSentence = []
        for ICD_code in sentence:
            try:
                ccsr_code = self.icd_to_ccsr[ICD_code]
                ccsrSentence.append(ccsr_code)
            except:
                pass
        return ccsrSentence
    
    def forward_pass(self, x): # x (502, )
        # x is one-hot vector for target word, shape
        # Run through first matrix (w1) to get hidden layer
        h = np.dot(self.w1.T, x) # self.w1.T (100, 502) x (502,) -> (100,)
        # Dot product hidden layer with second matrix (w2)
        u = np.dot(self.w2.T, h) # self.w2.T (502, 100) x (100,) -> (502,)
        # Run u through softmax to force each element to range of [0, 1]
        y_c = self.softmax(u) # (502)
        return y_c, h, u
    
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)
    
    def backprop(self, e, h, x):
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
        # Column vector EI represents row-wise sum of prediction errors across each context word for the current center word
        # Going backwards, we need to take derivative of E with respect of w2
        # h - shape 10x1, e - shape 9x1, dl_dw2 - shape 10x9
        dl_dw2 = np.outer(h, e)
        # x - shape 1x8, w2 - 5x8, e.T - 8x1
        # x - 1x8, np.dot() - 5x1, dl_dw1 - 8x5
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        # Update weights
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    # Get vector from word
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w
    
    # Input vector, returns nearest word(s)
    def vec_sim(self, word, top_n):
        v_w1 = self.word_vec(word)
        word_sim = {}

        for i in range(self.v_count):
            # Find the similary score for each word in vocab
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)

## Train Word2Vec model

In [11]:
w2v_100 = word2vec(embed_size=100)

In [14]:
v_count = 502
embed_size=100
w1 = np.random.uniform(-1, 1, (v_count, embed_size)) # (502, 100)
w2 = np.random.uniform(-1, 1, (embed_size, v_count)) # (100, 502)
w2v_100.run(settings, corpus, w1, w2)

27977932it [01:06, 421329.55it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Processed 27.977932 million visits
Found 26 invalid ICD codes
Processed 0.0 million patient vists
Processed 0.1 million patient vists
Processed 0.2 million patient vists
Processed 0.3 million patient vists
Processed 0.6 million patient vists
Processed 0.7 million patient vists
Processed 0.8 million patient vists
Processed 0.9 million patient vists
Processed 1.0 million patient vists
Processed 1.1 million patient vists
Processed 1.2 million patient vists
Processed 1.3 million patient vists
Processed 1.4 million patient vists
Processed 1.5 million patient vists
Processed 1.7 million patient vists
Processed 1.8 million patient vists
Processed 1.9 million patient vists
Processed 2.0 million patient vists
Processed 2.3 million patient vists
Processed 2.5 million patient vists
Processed 2.6 million patient vists
Processed 2.7 million patient vists
Processed 2.8 million patient vists
Processed 2.9 million patient vists
Processed 3.0 million patient vists
Processed 3.3 million patient vists
Pr

Processed 26.3 million patient vists
Processed 26.4 million patient vists
Processed 26.6 million patient vists
Processed 26.7 million patient vists
Processed 26.8 million patient vists
Processed 26.9 million patient vists
Processed 27.0 million patient vists
Processed 27.1 million patient vists
Processed 27.2 million patient vists
Processed 27.3 million patient vists
Processed 27.4 million patient vists
Processed 27.6 million patient vists
Processed 27.8 million patient vists
Processed 27.9 million patient vists


100%|██████████| 1/1 [42:58:45<00:00, 154725.86s/it]

Epoch: 1, Loss: 2255990956.124729





In [15]:
import pickle

In [39]:
with open('saved_embedding.pkl', 'wb') as handle:
    pickle.dump([w2v_100.w1,w2v_100.w2, corpus], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
with open('saved_embedding.pkl', 'rb') as handle:
    a,b,c = pickle.load(handle)

In [41]:
assert np.all(a==w2v_100.w1)
assert np.all(b==w2v_100.w2)
assert np.all(c==corpus)

In [None]:
#     def train(self, training_data):
#         # Initialize weight matrices
#         self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
#         self.w2 = np.random.uniform(-1, 1, (self.v_count, self.n))
#         print(self.w1.shape)
#         print(self.w2.shape)
        
#         # Cycle through each epoch
#         for i in range(self.epochs):
#             # Intialise loss to 0
#             self.loss = 0

#             # Cycle through each training sample
#             # w_t = vector for target word, w_c = vectors for context words
#             for w_t, w_c in training_data:
#                 # Forward pass - Pass in vector for target word (w_t) to get:
#                 # 1. predicted y using softmax (y_pred) 2. matrix of hidden layer (h) 3. output layer before softmax (u)
#                 y_pred, h, u = self.forward_pass(w_t)
                
#                 # Calculate error
#                 # 1. For a target word, calculate difference between y_pred and each of the context words
#                 # 2. Sum up the differences using np.sum to give us the error for this particular target word
#                 err = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

#                 # Backpropagation
#                 # We use SGD to backpropagate errors - calculate loss on the output layer 
#                 self.backprop(err, h, w_t)

#                 # Calculate loss
#                 # There are 2 parts to the loss function
#                 # Part 1: -ve sum of all the output +
#                 # Part 2: length of context words * log of sum for all elements (exponential-ed) in the output layer before softmax (u)
#                 # Note: word.index(1) returns the index in the context word vector with value 1
#                 # Note: u[word.index(1)] returns the value of the output layer before softmax
#                 self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
#             print('Epoch:', i, "Loss:", self.loss)