In [None]:
!pip install langdetect
!pip install semantic-text-similarity

#Libraries needed
import pandas as pd 
import glob
import json
import re 
import numpy as np
import copy 
import torch 
import matplotlib.pyplot as plt
from langdetect import detect
from semantic_text_similarity.models import ClinicalBertSimilarity
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

Collecting langdetect
  Downloading langdetect-1.0.8.tar.gz (981 kB)
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.8-py3-none-any.whl size=993197 sha256=5a20f0c7f07cafd464096f2ac989fcf6da686d310f0611f6efcc7716b0076d4e
  Stored in directory: c:\users\krishna\appdata\local\pip\cache\wheels\59\f6\9d\85068904dba861c0b9af74e286265a08da438748ee5ae56067
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.8


In [None]:
#Load the raw dataset from disk, then keep only its first 1000 rows
print("Loading the dataframe.")
df_covid = pd.read_csv('covidData.csv')
print("Dataframe loaded.", end="\n\n")
df_covid = df_covid.iloc[:1000]

In [None]:
#Discard every article whose body_word_count is below the threshold
min_word_count = 1000
print("Removing all articles with fewer than ", min_word_count, " words.", sep="")
#Rows with a missing word count compare as False here, so they are kept
isTooShort = df_covid['body_word_count'].lt(min_word_count)
indexNames = df_covid.index[isTooShort.to_numpy()]
df_covid = df_covid.drop(indexNames).reset_index(drop=True)
print("Articles cleaned.", end="\n\n")

In [None]:
#Remove all non-English articles
#Imported here so this cell carries everything the hardened loop needs
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

#langdetect is non-deterministic unless seeded; a fixed seed keeps the
#set of retained articles identical between runs
DetectorFactory.seed = 0

print("Removing all non-English articles")
indexNames = []  #row POSITIONS (not index labels) of the articles to discard
articleCount = len(df_covid)
for index, text in enumerate(df_covid['body_text']):
    print(f'Processing index: {index} of {articleCount}', end='\r')
    try:
        #A body that is not a string (e.g. NaN for an empty CSV field) cannot be
        #passed to detect(), so it is treated as non-English
        language = detect(text) if isinstance(text, str) else None
    except LangDetectException:
        #Raised when the text contains no features langdetect can work with
        language = None
    if language != 'en':
        indexNames.append(index)

#Select by position so the result is right even when the index is not 0..n-1
#(the original mixed positional lookups with a label-based drop)
droppedPositions = set(indexNames)
keptPositions = [position for position in range(articleCount) if position not in droppedPositions]
df_covid = df_covid.iloc[keptPositions].reset_index(drop=True)
print("All non-English articles removed. Total article count is now: "+str(len(df_covid)))
print()

In [None]:
#Save the cleaned dataset 
print("Saving the dataframe.")
#index=False: the index was just reset to 0..n-1 and carries no information;
#writing it would add a spurious "Unnamed: 0" column each time the file is re-read
df_covid.to_csv('covidDataCleaned.csv', index=False)
print("Dataframe saved.")

In [None]:
#Reload the cleaned dataset from the working directory, keeping the first 5000 rows
print("Loading the dataframe.")
df_covid = pd.read_csv('covidDataCleaned.csv')
print("Dataframe loaded.", end="\n\n")
df_covid = df_covid.iloc[:5000]

In [None]:
#Reload the cleaned dataset from the ../input/cord19cleaneddata directory,
#keeping the first 5000 rows (this rebinds df_covid)
print("Loading the dataframe.")
df_covid = pd.read_csv('../input/cord19cleaneddata/covidDataCleaned.csv')
print("Dataframe loaded.", end="\n\n")
df_covid = df_covid.iloc[:5000]

In [None]:
#Train & save the word2vec model 
print("Training word2vec.")
#`sentences` is not assigned anywhere earlier in the notebook, so on a fresh
#kernel this cell failed with a NameError. Build the corpus here: one list of
#lower-cased alphanumeric tokens per sentence of every article body.
#NOTE(review): the tokenization originally used is not shown in the notebook -
#confirm this scheme (lower-casing, dropping punctuation) matches what was intended.
#NOTE(review): sent_tokenize/word_tokenize need NLTK's "punkt" data - confirm it is installed.
sentences = [
    [token.lower() for token in word_tokenize(sentence) if token.isalnum()]
    for body in df_covid['body_text'].dropna().astype(str)
    for sentence in sent_tokenize(body)
]
#size: embedding dimensionality; window: context width; min_count: drop rarer words;
#workers: number of training threads
model = Word2Vec(sentences, size=100, window=5, min_count=10, workers=4)
print("Word count:", len(list(model.wv.vocab)))
model.save("word2vec.model")
print("Finished training and saving word2vec.")

In [None]:
#Restore the word2vec model that was saved to "word2vec.model"
print("Loading the pre-trained word2vec model.")
model = Word2Vec.load("word2vec.model")
print("Model loaded.", end="\n\n")

In [None]:
#From: https://methodmatters.github.io/using-word2vec-to-analyze-word/
#Define the function to compute the dimensionality reduction and then produce the biplot  
def tsne_plot(model, words):
    """Reduce the embeddings of the selected words to 2-D with t-SNE and plot them.

    Parameters
    ----------
    model : object exposing ``model.wv.vocab`` (iterable of words) and
        ``model.wv[word]`` (embedding lookup), e.g. a trained Word2Vec model.
    words : iterable of str
        Words to display. Words missing from the vocabulary are skipped.

    Returns
    -------
    None. The scatter plot is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If none of ``words`` is present in the model vocabulary.
    """
    #Set lookup instead of scanning the list once per vocabulary word
    wanted = set(words)
    labels = []
    tokens = []

    print("Getting embeddings.")
    for word in model.wv.vocab:
        if word in wanted:
            #Look vectors up through .wv, as the rest of the notebook does
            #(indexing the model object directly is deprecated in gensim)
            tokens.append(model.wv[word])
            labels.append(word)
    if not tokens:
        #Fail with a clear message rather than an obscure error inside t-SNE
        raise ValueError("None of the requested words are in the model vocabulary.")
    print("Embeddings extracted.")
    print()

    print("Performing dimensionality reduction with t-sne.")
    tsne_model = TSNE(perplexity=5, n_components=2, init='pca', n_iter=2500, verbose=0)
    new_values = tsne_model.fit_transform(tokens)
    print("Dimensionality reduction complete.")
    print()

    plt.figure(figsize=(8, 8))
    #One scatter call per point so each word gets its own colour from the cycle;
    #every label is in `words` by construction, so no further filtering is needed
    for label, value in zip(labels, new_values):
        plt.scatter(value[0], value[1])
        plt.annotate(label, xy=(value[0], value[1]), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()

In [None]:
#List of words to visualize in the plot
#'healthcase' corrected to 'healthcare' (the misspelling was silently skipped by
#tsne_plot when absent from the vocabulary); the duplicate 'case' entry was removed
words = ['china', 'italy', 'taiwan', 'india', 'japan', 'france', 
         'spain', 'canada', 'infection', 'disease', 'pathogen', 
         'organism', 'bacteria', 'virus', 'covid19', 'coronavirus', 
         'healthcare', 'doctor', 'nurse', 'specialist', 'hospital', 
         'novel', 'human', 'sars', 'covid', 'wuhan', 'case', 
         'background', 'dynamic', 'pneumonia', 'outbreak', 'pandemic', 
         'syndrome', 'contact', 'wash', 'hands', 'cough', 
         'respiratory', 'fear', 'spike', 'curve', 
         'transmission', 'seasonal', 'genome', 'dna', 'testing', 
         'asymptomatic', 'global', 'spread', 'diagnosis']
  
#Call the function on our dataset  
tsne_plot(model, words)


In [None]:
#Query word, and how many of its most similar words to list
word, similarCount = 'facemask', 3

In [None]:
#Get and print the results 
#An out-of-vocabulary query would make most_similar raise a KeyError, so report
#that case explicitly instead
if word in model.wv:
    results = model.wv.most_similar(positive=word, topn=similarCount)
else:
    results = []
    print("'"+word+"' is not in the word2vec vocabulary.")
print("Input word:", word)
print("Top "+str(similarCount)+" similar words are:")
#Use dedicated loop names: the original reused `word`, which replaced the query
#string with a (word, score) tuple and broke any re-run of this cell
for rank, (similarWord, score) in enumerate(results, start=1):
    print(str(rank)+". "+similarWord+" --- Score: "+str(score))

In [None]:
#The pair of words whose embeddings will be compared
word1, word2 = 'china', 'wuhan'

#Look up the embedding vector of each word in the model
embedding1, embedding2 = (model.wv[token] for token in (word1, word2))

In [None]:
#Cosine similarity = dot product divided by the product of the two vector norms
dotProduct = np.sum(embedding1*embedding2)
norm1 = np.sqrt(np.sum(np.square(embedding1)))
norm2 = np.sqrt(np.sum(np.square(embedding2)))
cosineSimilarity = dotProduct / (norm1*norm2)
print("Word1: ", word1, " --- Word2: ", word2, sep="")
print("Cosine similarity: ", str(cosineSimilarity), sep="")

In [None]:
#Set the GPU device
#`device` is the CUDA device index handed to torch.cuda.set_device, which makes
#that device the current one for subsequent CUDA work.
#NOTE(review): this call needs a CUDA-capable machine - confirm before running on CPU-only hosts
device = 0
torch.cuda.set_device(device)

In [None]:
#Reload the cleaned dataset from the ../input/cord19cleaneddata directory,
#this time keeping only the first 500 rows
print("Loading the dataframe.")
df_covid = pd.read_csv('../input/cord19cleaneddata/covidDataCleaned.csv')
print("Dataframe loaded.", end="\n\n")
df_covid = df_covid.iloc[:500]

In [None]:
#Variable to store the batch size
#Passed as batch_size when the ClinicalBertSimilarity model is constructed further down
batchSize = 500 

In [None]:
#Instantiate the clinical BERT similarity model on the GPU.
#Note: this rebinds `model`, which until now referred to the word2vec model.
print("Loading BERT semantic similarity model.")
model = ClinicalBertSimilarity(batch_size=batchSize, device='cuda')
print("Model loaded.", end="\n\n")

In [None]:
#The primary questions that attempt to be answered  
#Every entry (active or commented out) ends with a comma: without it, Python
#joins adjacent string literals, so enabling another question would silently
#merge it into the previous one instead of adding a new list element.
primaryQuestions = [
    "What is known about transmission, incubation, and environmental stability of coronavirus",
    #"What do we know about coronavirus risk factors",
    #"What do we know about coronavirus genetics, origin, and evolution",
    #"What do we know about vaccines and therapeutics for coronavirus",
    #"What has been published about coronavirus medical care",
    #"What do we know about non-pharmaceutical interventions for coronavirus",
    #"What do we know about diagnostics and surveillance of coronavirus",
    #"In what ways does geography affects virality",
    #"What has been published about ethical and social science considerations regarding coronavirus",
    #"What has been published about information sharing and inter-sectoral collaboration",
]