# Word embeddings on CORD-19 dataset using word2vec

This notebook covers a Python-based solution to building a word representation model on the CORD-19 dataset (as of March 20, 2020) using gensim's implementation of word2vec.

The notebook is divided into 2 sections:

Part 1 consists of training a word2vec model (with skip-gram) from scratch on the dataset and saving to disk.

Part 2 consists of loading the most recently trained word2vec model from disk and running a few semantic tasks, such as finding most similar & dissimilar words to a user entered token. A scatter plot is also generated to help visualize the word embeddings using t-SNE and PCA. 

## Part 1: train a word2vec model on CORD-19 dataset, and save to disk

In [1]:
import os
import json
import pandas as pd
from pprint import pprint
import gensim
import logging
from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#### Set directory paths on your local

In [3]:
#update path where CORD-19 JSON documents are located
CORD19_dir = '/../CORD-19/CORD-19 comm_use_subset/'

#update path where word2vec model will be saved
saved_model_dir = '/../word2vec models'

#### Extract text content from 'body_text' section of each document and pre-process using gensim

In [None]:
#Get list of all CORD-19 JSON documents
filenames = os.listdir(CORD19_dir)
print('Number of documents retrieved: ', len(filenames))

In [None]:
#Open document files and append as JSON objects into a list
all_files = []
for filename in tqdm(filenames):
    if filename.endswith('.json'):
        file_ = open(CORD19_dir + filename, 'r')
        json_file = json.load(file_)
        all_files.append(json_file)
        file_.close()

In [None]:
#Probe the structure of each JSON list element
#samplefile = all_files[0]
#print('Python data type: ', type(samplefile))
#print("Dictionary keys: ", samplefile.keys())

#Probe what the body_text dictionary looks like
#print('body_text type: ', type(samplefile['body_text']))
#print('body_text_length: ', len(samplefile['body_text']))
#print('body_text keys: ', samplefile['body_text'][0].keys())

#print("body_text contents: ")
#pprint (samplefile['body_text'][:2], depth=10)

In [None]:
#extract sections and associated text from list of paragraphs located within body_text of a document
def getBodyText(samplefile):
    texts = [(di['section'], di['text']) for di in samplefile['body_text']]
    
    texts_di = {di['section']: ""  for di in samplefile['body_text']}
    for section, text in texts:
        texts_di[section] += text

    body = ""
    for section, text in texts_di.items():  
        body += section
        body += " "
        body += text
        body += " "

    return body

In [None]:
#extract full body content of all CORD-19 documents and preprocess using Gensim
body_text_list = []
for file_ in tqdm(all_files):
    body_text = getBodyText(file_)
    body_text_list.append(gensim.utils.simple_preprocess(body_text))
    
print ('Number of COVID-19 documents processed: ', len(body_text_list))

#### Now train a word2vec model (using skip-gram) on the body_text content of all documents and save model to disk.
Change hyper-parmeters as needed.

In [None]:
#train the word2vec model using skip-gram
model = gensim.models.Word2Vec (body_text_list, size=200, window=10, min_count=2, workers=10, compute_loss=True, sg=1, hs=1)
print ('Word2Vec training started ...')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model.train(body_text_list,total_examples=len(body_text_list),epochs=10, compute_loss=True)

In [None]:
#inspect some model diagnostics of the newly trained word2vec model
viewModelDiagnostics(model)

In [2]:
def viewModelDiagnostics(model):
    print('Number of documents in corpus: ', model.corpus_count)
    print ('Size of corpus: ', model.corpus_total_words, ' total words')
    print ('Size of vocab: ', len(model.wv.vocab), ' tokens')
    print('Training time: ', model.total_train_time, ' seconds')
    print ('Training loss:', model.get_latest_training_loss())
    print ('Number of epochs:', model.epochs)
    print ('Size of vector:', model.vector_size)
    print('Type of model: skip gram = ',model.sg)

In [None]:
#save the word2vec model to disk for future use
import tempfile
with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False, dir=saved_model_dir) as tmp:
    temp_filepath = tmp.name
    model.save(temp_filepath)
    print('model saved to: ',temp_filepath)

## Part 2: load a pre-trained word2vec model from disk, perform semantic word tasks (similar & dissimilar words), and view scatter plot of these tasks

In [4]:
#retreive latest word2vec model filename, ignore any other filetypes in the dir
from pathlib import Path

def getLatestWord2VecFileName():
    files = sorted(Path(saved_model_dir).iterdir(),key=os.path.getmtime, reverse=True)
    for index, _ in enumerate (files):
        if files[index].name.find('gensim-model-') != -1:
             return(files[index].as_posix())

In [5]:
#load the latest word2vec model
from gensim.models import Word2Vec
latestWord2VecModel = getLatestWord2VecFileName()
saved_model = Word2Vec.load(latestWord2VecModel)
print('Loaded this word2vec model: ',latestWord2VecModel)

Loaded this word2vec model:  /Users/shanerai/Desktop/Python Exercises/Covid Dataset/CORD-19/word2vec models/gensim-model-u_p3ez5g


In [6]:
#print some diagnostics of loaded word2vec model
viewModelDiagnostics(saved_model)

Number of documents in corpus:  9118
Size of corpus:  38430241  total words
Size of vocab:  120072  tokens
Training time:  4663.222483659994  seconds
Training loss: 134217728.0
Number of epochs: 10
Size of vector: 200
Type of model: skip gram =  1


In [None]:
#Perform some syntactic/semantic word tasks with the trained vectors to evaluate model performance
#function finds and prints most similar tokens/words to a given token/word

def getSimilarWords(list_test_words, top_N_words, saved_model):
    word_vectors = saved_model.wv
    for testword in list_test_words:
        try:
            result = word_vectors.most_similar(positive=[testword], topn=top_N_words)
            print('Words similar to', testword)
            for word, score in result:
                print('{}: {:.4f}'.format(word, score))
        except KeyError:    
            print('Exception: ', testword, 'is not in vocabulary!')
        print('\n')

#### Edit/add list of tokens below as needed, and uncomment and run cell to view most similar tokens to each word

In [None]:
#list of test tokens to retrieve 10 words most similar to each token
#list_test_words = ['immunosuppressive','vaccines','antibodies','incubation','covid','epidemiologic','pulmonary','mers','leukocyte', 'mutation', 'transmission', 'hepatitis','influenza','arboviruses','diabetes']
#top_N_words = 10
#getSimilarWords(list_test_words, top_N_words, saved_model)

In [20]:
#Draw a simple scatter plot to visualize most similar tokens to a target token
#uses PCA to reduce dimension from 100 to 10, and t-SNE to plot the tokens in 2-D space
#Output:
#blue dot represents the target token/word as input by user
#green dots represent most similar tokens to target token
#red dots represent least similar tokens to target token

def plotTSNEScatter(saved_model, word, list_words):
    
    word_labels = [word]
  
    colors_list = ['blue']
    
    arr = np.empty((0,200), dtype='float32')
    
    
    arr = np.append(arr, np.array([saved_model.wv.__getitem__(word)]), axis=0)
    #print('Shape of arr:', arr.shape, 'Size:', arr.size, 'Dimensions:', arr.ndim)    


    close_words = saved_model.wv.most_similar(word)

    for word_score in close_words:
        word_labels.append(word_score[0])
        word_vector = saved_model.wv.__getitem__(word_score[0])
        arr = np.append(arr, np.array([word_vector]), axis=0)
        colors_list.append('green')


    for word_ in list_words:
        word_labels.append(word_)
        word_vector = saved_model.wv.__getitem__(word_)
        arr = np.append(arr, np.array([word_vector]), axis=0)
        colors_list.append('red')


    #using PCA to reduce dimensions from 200 to 10 (number of similar words is 10)
    reduction = PCA(n_components=10).fit_transform(arr)

    #Find t-SNE coordinates for 2 dimension space
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=10, n_iter=3000, learning_rate=300,verbose=1).fit_transform(reduction)
    #print('Size of Y: ', Y[0][0])

    x_coords = Y[:,0]
    y_coords = Y[:, 1]

    #display scatter plot
    plt.figure(figsize=(15,10))
    plt.scatter(x_coords, y_coords, c=colors_list)

    #get labels for each point
    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x,y), xytext=(5,0), textcoords='offset points')

    #Set the x and y limits of each of the 2 axes based on 
    plt.xlim(x_coords.min()-10, x_coords.max()+7.)
    plt.ylim(y_coords.min()-10, y_coords.max()+7.)
    plt.title('t-SNE scatter plot of 10 most similar & dissimilar words to {}'.format(word))
    plt.show()


### Interactive widget: enter a word in the texbox to view plot of 10 most similar & dissimilar words

In [37]:
from ipywidgets import widgets
from IPython.display import display, clear_output

label = widgets.HBox([widgets.Label(value='Type a word below and hit enter/return to view plot of 10 most similar & dissimilar words:')])
display(label)

text = widgets.Text(placeholder='e.g. epidemic', disabled=False)
display(text)

output = widgets.Output(layout={'border': '1px solid black'})
display(output)

def handle_submit(widget):
    with output:
        clear_output()
        if text.value == '':
            print('No word was entered. Enter a word in above textbox.')
        else:
            try:
                plotTSNEScatter(saved_model, text.value,[word[0] for word in saved_model.wv.most_similar(negative=[text.value])])
            except KeyError:
                print(text.value, 'does not exist in the vocabulary. Try another word.')
    
text.on_submit(handle_submit)

HBox(children=(Label(value='Type a word below and hit enter/return to view plot of 10 most similar & dissimila…

Text(value='', placeholder='e.g. epidemic')

Output(layout=Layout(border='1px solid black'))