In [1]:
#load library
import os
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import datetime
import time
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import nltk

In [2]:
# Location of the CORD-19 metadata table. Set the CORD19_METADATA_CSV environment
# variable to point at your own copy; the fallback is the path this notebook
# was originally written with.
metadata_path = os.environ.get(
    "CORD19_METADATA_CSV",
    "/Users/patsnap/Desktop/Neo4J_and_other_codes/Coronavirus_19/CORD-19-research-challenge/metadata.csv",
)
meta = pd.read_csv(metadata_path)
print(meta.shape)

(51078, 18)


In [3]:
### first filter by meta file: keep only the papers whose publish year is 2020
# Parse the publication date once, then derive year and month from the parsed column.
meta["publish_time"] = pd.to_datetime(meta["publish_time"])
meta["publish_year"] = meta["publish_time"].dt.year
meta["publish_month"] = meta["publish_time"].dt.month

# Boolean mask selects the rows published in calendar year 2020.
meta = meta.loc[meta["publish_year"].eq(2020)]
print(len(meta), " papers are available after 2020 Jan 1.")

5330  papers are available after 2020 Jan 1.


In [4]:
# Count how many papers come with an abstract.
#
# `index` collects the row *positions* (not labels) of papers that have an
# abstract, so it can be passed directly to `meta.iloc`; `count` is the number
# of papers without one.
#
# Missing abstracts are detected with `notna()` on the "abstract" column
# (column position 8) instead of testing `type(value) == float` row by row:
# the type test silently stops matching when pandas hands back NumPy floats,
# and a per-row `iloc` loop is needlessly slow.
has_abstract = meta["abstract"].notna().to_numpy()
index = np.flatnonzero(has_abstract).tolist()
count = int((~has_abstract).sum())

print(len(index), " papers have abstract available.")

3947  papers have abstract available.


In [5]:
## pull the abstracts of the selected rows into their own data frame (the input corpus)
# Column position 8 is the abstract; the original row labels are discarded so
# that the documents are numbered 0..n-1.
documents = meta.iloc[index, 8].reset_index(drop=True).to_frame()

## keep the running document number as an explicit "index" column
documents["index"] = documents.index.values
documents.head(3)

Unnamed: 0,abstract,index
0,Diabetes mellitus and hypertension are recogni...,0
1,We detected bovine kobuvirus (BKV) in calves w...,1
2,We examined nasal swabs and serum samples acqu...,2


# Data Processing

In [6]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(400)
# English Snowball stemmer instance; lemmatize_stemming() reads this global.
stemmer = SnowballStemmer("english")

In [7]:
##lemmatize and stemming

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet, and the
    resulting lemma is then stemmed with the module-level Snowball ``stemmer``.

    Parameters
    ----------
    text : str
        One token.

    Returns
    -------
    The value returned by ``stemmer.stem`` for the lemma.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Turn raw text into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stop words, or that are three characters or shorter, are dropped;
    every remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list
        The normalized tokens, in their original order.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim.parsing.preprocessing.STOPWORDS
    ]

In [8]:
## sanity-check the preprocessing step on a single abstract

document_num = 1000  # arbitrary abstract picked for inspection
# Look the abstract up through the explicit "index" column and take the first match.
doc_sample = documents.loc[documents["index"] == document_num, "abstract"].iloc[0]

print("Original document: ")
words = doc_sample.split(' ')  # naive whitespace split, only for display
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['Abstract', 'Context', 'The', 'COVID-19', 'pandemic', 'created', 'a', 'rapid', 'and', 'unprecedented', 'shift', 'in', 'our', 'medical', 'system.', 'Medical', 'providers,', 'teams,', 'and', 'organizations', 'have', 'needed', 'to', 'shift', 'their', 'visits', 'away', 'from', 'face-to-face', 'visits', 'and', 'toward', 'telehealth', '(both', 'by', 'phone', 'and', 'through', 'video).', 'Palliative', 'care', 'teams', 'who', 'practice', 'in', 'the', 'community', 'setting', 'are', 'faced', 'with', 'a', 'difficult', 'task:', 'How', 'do', 'we', 'actively', 'triage', 'the', 'most', 'urgent', 'visits', 'while', 'keeping', 'our', 'vulnerable', 'patients', 'safe', 'from', 'the', 'pandemic?', 'Measures', 'The', 'following', 'are', 'recommendations', 'created', 'by', 'the', 'Palo', 'Alto', 'Medical', 'Foundation', 'Palliative', 'Care', 'and', 'Support', 'Services', 'team', 'to', 'help', 'triage', 'and', 'coordinate', 'for', 'timely,', 'safe,', 'and', 'effective', 'palliative', 'ca

In [9]:
## run the preprocessing pipeline over every abstract
processed_docs = documents['abstract'].apply(preprocess)
processed_docs.head()

0    [diabet, mellitus, hypertens, recogn, risk, fa...
1    [detect, bovin, kobuvirus, calv, diarrhea, uni...
2    [examin, nasal, swab, serum, sampl, acquir, dr...
3    [influenza, virus, potenti, caus, respiratori,...
4    [cetuximab, improv, surviv, patient, metastat,...
Name: abstract, dtype: object

In [10]:
## build the gensim dictionary from the preprocessed documents
dictionary = gensim.corpora.Dictionary(processed_docs)

## peek at the first six (id, token) entries
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 5:
        break

0 arabia
1 associ
2 clinic
3 close
4 condit
5 coronavirus


In [11]:
## prune the vocabulary in place: drop very rare and very common tokens
## (thresholds: no_below=15 documents, no_above=10% of the documents)
dictionary.filter_extremes(no_below=15, no_above=0.1)

## encode every document as a bag-of-words vector using the pruned dictionary
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [13]:
## check the bow_corpus
# Bag-of-words vector of the sample abstract at position `document_num`
# (set to 1000 in the preprocessing example cell).
bow_doc_1000 = bow_corpus[document_num]

# Optional preview, left disabled: prints the id, the token and the count of
# every entry of the sample vector.
# for i in range(len(bow_doc_1000)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_1000[i][0], dictionary[bow_doc_1000[i][0]], bow_doc_1000[i][1]))

# TF_IDF 

In [14]:
from pprint import pprint

# fit the TF-IDF model on the bag-of-words corpus, then apply it to that same corpus
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# preview: pretty-print the weighted vector of the first document only
for weighted_doc in corpus_tfidf:
    pprint(weighted_doc)
    break

[(0, 0.35010075989000167),
 (1, 0.18684054739847109),
 (2, 0.19267453198919854),
 (3, 0.25956174363610257),
 (4, 0.2671209268243568),
 (5, 0.22469568206143628),
 (6, 0.2917165598339723),
 (7, 0.27834751937482777),
 (8, 0.1943534792734007),
 (9, 0.2077270875590725),
 (10, 0.18336543350016205),
 (11, 0.17845533155687388),
 (12, 0.24360370781665938),
 (13, 0.2720805833579731),
 (14, 0.34508200985963433),
 (15, 0.23401165060982926)]


# LDA

In [15]:
# Train a 5-topic LDA model on the bag-of-words corpus and log wall-clock
# start/finish times.
#
# random_state is pinned explicitly so that the model's initialisation does not
# depend on how much of NumPy's global random stream earlier cells have already
# consumed (e.g. when this cell is re-run on its own). Multi-worker training
# may still show small run-to-run differences.
now = datetime.datetime.now()
print ("start model building at ",now.strftime("%Y-%m-%d %H:%M:%S"))
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=5,
    id2word=dictionary,
    passes=50,
    workers=4,
    random_state=400,
)
now = datetime.datetime.now()
print ('Model training finished at ',now.strftime("%Y-%m-%d %H:%M:%S"))

start model building at  2020-04-15 19:28:38
Model training finished at  2020-04-15 19:30:48


In [16]:
## list every topic of the bag-of-words model together with its key words
for topic_id, top_words in lda_model.print_topics(num_topics=-1):
    print("Topic: {} \nWords: {}".format(topic_id, top_words))
    print("\n")

Topic: 0 
Words: 0.012*"citi" + 0.010*"hubei" + 0.009*"travel" + 0.009*"march" + 0.008*"quarantin" + 0.008*"contact" + 0.008*"intervent" + 0.008*"individu" + 0.007*"reproduct" + 0.007*"social"


Topic: 1 
Words: 0.014*"vaccin" + 0.012*"sequenc" + 0.011*"genom" + 0.010*"drug" + 0.010*"host" + 0.010*"immun" + 0.009*"bind" + 0.009*"target" + 0.009*"express" + 0.008*"structur"


Topic: 2 
Words: 0.010*"research" + 0.007*"healthcar" + 0.006*"communiti" + 0.005*"recommend" + 0.005*"work" + 0.005*"practic" + 0.005*"articl" + 0.005*"chines" + 0.005*"nation" + 0.005*"support"


Topic: 3 
Words: 0.011*"sampl" + 0.010*"outcom" + 0.008*"children" + 0.008*"mortal" + 0.008*"critic" + 0.008*"fever" + 0.008*"blood" + 0.007*"negat" + 0.007*"swab" + 0.007*"admiss"


Topic: 4 
Words: 0.020*"imag" + 0.018*"chest" + 0.017*"lung" + 0.014*"lesion" + 0.013*"score" + 0.012*"featur" + 0.010*"fever" + 0.010*"grind" + 0.010*"famili" + 0.009*"anxieti"




Based on the key words selected above, we can roughly summarize the five major topics as below:

immunology
Hubei, social, individual quarantine
healthcare, recommendation
genomic sequence
symptoms (fever, chest image) + admission

# TF-IDF + LDA

In [18]:
# Train a second 5-topic LDA model, this time on the TF-IDF weighted corpus,
# and log wall-clock start/finish times.
#
# random_state is pinned explicitly so that the model's initialisation does not
# depend on how much of NumPy's global random stream earlier cells have already
# consumed (e.g. when this cell is re-run on its own). Multi-worker training
# may still show small run-to-run differences.
now = datetime.datetime.now()
print ("start model building at ",now.strftime("%Y-%m-%d %H:%M:%S"))
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=5,
    id2word=dictionary,
    passes=50,
    workers=4,
    random_state=400,
)
now = datetime.datetime.now()
print ('Model training finished at ',now.strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
## list every topic of the TF-IDF model together with its key words
for topic_id, top_words in lda_model_tfidf.print_topics(num_topics=-1):
    print("Topic: {} Word: {}".format(topic_id, top_words))
    print("\n")

Based on the keywords above, we can summarize the five topics as:

healthcare and research,
disease co-morbidities,
Drug and genomic sequencing, biomedical
Disease spread,
Fever, chest image, symptoms

# Apply model to get all abstracts' topic

In [None]:
# Build one table per model: one row per abstract, one column per topic, each
# cell holding the topic probability returned by get_document_topics(). Topics
# that the model does not report for a document keep probability 0.
#
# The probabilities are written into pre-allocated float arrays and each frame
# is created once at the end; growing a DataFrame one row at a time re-allocates
# on every insert and leaves the columns with a non-numeric dtype.
topic_columns = ["topic1", "topic2", "topic3", "topic4", "topic5"]
lda_probs = np.zeros((len(bow_corpus), len(topic_columns)))
lda_tfidf_probs = np.zeros_like(lda_probs)

for i, (bow_doc, tfidf_doc) in enumerate(zip(bow_corpus, corpus_tfidf)):
    if i % 500 == 0:
        print(i)

    for topic_id, probability in lda_model.get_document_topics(bow_doc):
        lda_probs[i, topic_id] = probability

    # lda_model_tfidf was trained on TF-IDF weights, so it is queried with the
    # TF-IDF vector of the document rather than with its raw word counts.
    for topic_id, probability in lda_model_tfidf.get_document_topics(tfidf_doc):
        lda_tfidf_probs[i, topic_id] = probability

documents_lda_topics = pd.DataFrame(lda_probs, columns=topic_columns)
documents_lda_tfidf_topics = pd.DataFrame(lda_tfidf_probs, columns=topic_columns)

print("Data processing finished")

In [None]:
## pick the final topic for each abstract based on max-probability
# Both tables get the same treatment: make sure the five probability columns
# are float64, then store the name of the column with the highest probability
# in "final_topic".
for topic_frame in (documents_lda_topics, documents_lda_tfidf_topics):
    probability_columns = list(topic_frame.columns[:5])
    # Assign through the column labels so the columns are replaced by the
    # converted ones. Writing through `.iloc[:, i] = ...` can set the values
    # into the existing column and leave its dtype unchanged, in which case
    # idxmax would be asked to compare non-numeric columns.
    topic_frame[probability_columns] = topic_frame[probability_columns].astype("float64")
    topic_frame["final_topic"] = topic_frame[probability_columns].idxmax(axis=1)

In [None]:
##preview the dataframe for both models
# Only the last expression of a cell is rendered automatically, so a bare
# `.head(3)` in the middle of the cell is silently discarded. Each preview is
# therefore shown explicitly with display(), which the Jupyter/IPython kernel
# makes available without an import.
print("LDA + bow_corpus: topic probability:")
display(documents_lda_topics.head(3))
print("LDA + TF-IDF_corpus: topic probability:")
display(documents_lda_tfidf_topics.head(3))

# Topic Modelling

In [None]:
# Project the five topic-probability columns of every abstract onto three
# principal components.
topic_probabilities = documents_lda_topics.iloc[:, :5]
pca = PCA(n_components=3)
pca_result = pca.fit_transform(topic_probabilities)

In [None]:
## with 3 components, variance explained
# Displays the fitted PCA's explained-variance ratio, one value per component
# (three here, since the PCA was created with n_components=3).
pca.explained_variance_ratio_

In [None]:
##collect the PCA projection in a data frame, one column per component
pca_df = pd.DataFrame(
    {
        "pca-one": pca_result[:, 0],
        "pca-two": pca_result[:, 1],
        "pca-three": pca_result[:, 2],
    }
)

# "topic" holds a colour name per abstract, derived from its final topic
# (column position 5 of documents_lda_topics).
topic_colors = {"topic1": "red", "topic2": "blue", "topic3": "green", "topic4": "yellow", "topic5": "black"}
pca_df["topic"] = documents_lda_topics.iloc[:, 5].replace(topic_colors)

In [None]:
# 2-D PCA projection, coloured by each abstract's most probable topic.
#
# `hue` carries the topic label and `palette` maps each label to its colour.
# Passing colour names as the hue values makes seaborn treat them as ordinary
# categories: the legend then lists "red", "blue", ... and the colours drawn
# are taken from the default palette, not from those names.
topic_palette = {"topic1": "red", "topic2": "blue", "topic3": "green", "topic4": "yellow", "topic5": "black"}
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="pca-one",
    y="pca-two",
    hue=documents_lda_topics.iloc[:, 5],
    hue_order=list(topic_palette),
    palette=topic_palette,
    data=pca_df,
    legend="full",
    alpha=0.3,
)

In [None]:
# 3-D PCA projection, one point per abstract, coloured by its final topic.
#
# The 3-D axes are created with add_subplot(projection='3d'); passing keyword
# arguments to Figure.gca() is no longer supported by current Matplotlib.
# `cmap` is omitted because `c` already holds explicit colour names, for which
# a colormap is not used.
point_colors = documents_lda_topics.iloc[:, 5].replace({"topic1": "red", "topic2": "blue", "topic3": "green", "topic4": "yellow", "topic5": "black"})
fig = plt.figure(figsize=(16,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=pca_df["pca-one"], ys=pca_df["pca-two"], zs=pca_df["pca-three"], c=point_colors)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

In [None]:
##first run TSNE
# Embed the five topic probabilities of every abstract into two dimensions and
# report how long the fit took. `time` is already imported in the notebook's
# import cell, so it is not imported again here.
time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(documents_lda_topics.iloc[:, :5])
print("t-SNE done! Time elapsed: {:.1f} seconds".format(time.time() - time_start))

In [None]:
##collect the two t-SNE coordinates in a data frame
tsne_df = pd.DataFrame(
    {
        "tsne-2d-one": tsne_results[:, 0],
        "tsne-2d-two": tsne_results[:, 1],
    }
)

In [None]:
# t-SNE embedding, coloured by each abstract's most probable topic.
#
# `hue` carries the topic label and `palette` maps each label to its colour.
# Passing colour names as the hue values makes seaborn treat them as ordinary
# categories: the legend then lists "red", "blue", ... and the colours drawn
# are taken from the default palette, not from those names.
topic_palette = {"topic1": "red", "topic2": "blue", "topic3": "green", "topic4": "yellow", "topic5": "black"}
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one",
    y="tsne-2d-two",
    hue=documents_lda_topics.iloc[:, 5],
    hue_order=list(topic_palette),
    palette=topic_palette,
    data=tsne_df,
    legend="full",
    alpha=0.3,
)