<a href="https://colab.research.google.com/github/saipramodkudapa/cord19-search-engine/blob/main/bda_cord_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kaggle

In [None]:
ls -a

In [None]:
import getpass
import json
import os

# SECURITY: credentials must never be stored in the notebook. The API key that
# used to be hardcoded in this cell has to be treated as leaked and should be
# revoked in the Kaggle account settings.
# Credentials are taken from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables, with an interactive prompt as fallback.
token = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    "key": os.environ.get("KAGGLE_KEY") or getpass.getpass("Kaggle API key: "),
}

# open(..., 'w') does not create missing parent directories, so create it first.
kaggle_dir = '/content/.kaggle'
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json_path, 'w') as file:
    json.dump(token, file)
os.chmod(kaggle_json_path, 0o600)  # key file readable/writable by the owner only

In [None]:
cd content/

In [None]:
!cp /content/.kaggle/kaggle.json ./.kaggle/kaggle.json

In [None]:
!kaggle config set -n path -v{/content}

In [None]:
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d allen-institute-for-ai/CORD-19-research-challenge -p /content

In [None]:
!unzip \*.zip

In [None]:
pip install pyspark

In [None]:
# Stop a SparkContext left over from an earlier run, if there is one.
# On a fresh kernel `sc` is not defined yet, and a bare sc.stop() would raise
# NameError and abort "Restart & Run All".
try:
    sc.stop()
except NameError:
    pass

In [None]:
########### TEAM MEMBERS ###########
#* Aparna Dutt
#* Pramod Sai Kudapa
#* Anil Rayala
#* Prajwal Chandra

########### BRIEF DESCRIPTION ########
## (i)	First we are downloading CORD-19 dataset from kaggle
## (ii)	From the article jsons we are fetching paper_id and body_text
## (iii)	Generate feature vectors using tf-idf scores
## (iv)	Perform Dimensionality Reduction using PCA
## (v)	Perform K-means Clustering (Tensorflow)
## (vi)	Perform Topic Modeling (LDA) to find important keywords for each cluster

########## LIST OF ALGORITHMS AND SOFTWARE STACK ##########

## Spark and TensorFlow form the data pipeline; scikit-learn is used for PCA and LDA
## Concepts/algorithms used: TF-IDF, PCA, K-means clustering and LDA topic modeling

import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd
import json
import glob
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_sm
import string
from pyspark.sql.types import Row
from pyspark.ml.feature import IDF
from pyspark.ml.feature import CountVectorizer as sparkCountVectorizer
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.linalg import DenseVector
import numpy as np
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pickle

## Creating SparkContext and SQLContext
## getOrCreate reuses an already running context; constructing SparkContext(...)
## directly raises "Cannot run multiple SparkContexts at once" when this cell is re-run.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local").setAppName("cord_19"))
sqlContext = SQLContext(sc)

## Helper function to fetch only the desired columns in a json file (article id and text of the article)
def fetch_data(file_path):
    """Load one article JSON and return its id together with its full body text.

    Args:
        file_path: Path to a JSON file containing a ``paper_id`` key and a
            ``body_text`` list whose entries each have a ``text`` key.

    Returns:
        A ``(paper_id, body)`` tuple; ``body`` is the ``text`` of every
        ``body_text`` entry joined with single spaces.

    Raises:
        KeyError: If ``paper_id``, ``body_text`` or an entry's ``text`` is missing.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
    with open(file_path, encoding='utf-8') as file:
        content = json.load(file)
    body = [entry['text'] for entry in content['body_text']]
    return (content['paper_id'], ' '.join(body))
	
########### Preprocessing the data	############


# ASCII punctuation as one string. Membership tests against it are substring
# tests, so an empty token also counts as "punctuation" and is filtered out.
punctuations = string.punctuation
stopwords = list(STOP_WORDS)        # English stop words shipped with spaCy

# Extra stop_words which frequently appear in medical articles
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
                'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
                'al.', 'Elsevier', 'PMC', 'CZI', 'www'
                ]
# Appending the extra stop words to the English stop words.
# spacy_tokenizer lower-cases tokens before the stop-word lookup, so entries are
# stored lower-cased; otherwise 'Elsevier', 'PMC' and 'CZI' could never match.
for w in custom_stop_words:
    w = w.lower()
    if w not in stopwords:
        stopwords.append(w)

# Parser for parsing the text in the article (tagger and NER components disabled)
parser = en_core_sci_sm.load(disable=["tagger", "ner"])
parser.max_length = 3000000         # raise the parser's maximum accepted text length

  
## Helper function to tokenize the full text in an article
def spacy_tokenizer(text):
    """Turn article text into a list of normalized tokens.

    The text is run through the module-level ``parser``. Each token is replaced
    by its lower-cased, stripped lemma, except tokens whose lemma is the
    ``-PRON-`` placeholder, which keep their lower-cased surface form. Tokens
    found in ``stopwords`` or in ``punctuations`` are dropped.

    Args:
        text: Full text of one article.

    Returns:
        List of the remaining token strings, in document order.
    """
    lemmas = []
    for token in parser(text):
        if token.lemma_ == "-PRON-":
            lemmas.append(token.lower_)
        else:
            lemmas.append(token.lemma_.lower().strip())
    return [lemma for lemma in lemmas if not (lemma in stopwords or lemma in punctuations)]


########******* Fetching all the json files (these are medical articles in our dataset)	#############*********

root_path = '/content/document_parses'                                ## Defining base path
## Fetching all the json paths in our dataset (around 85k).
## glob returns paths in arbitrary order; sorting makes the sample below reproducible.
all_json_paths = sorted(glob.glob(f'{root_path}/**/*.json', recursive=True))


## Spark Code

SAMPLE_SIZE = 100                                                     ## Number of articles used for this experiment
sample_jsons = all_json_paths[:SAMPLE_SIZE]
json_file_paths = sc.parallelize(sample_jsons)                        ## RDD of the sampled json paths
papers = json_file_paths.map(fetch_data)                              ## (paper_id, body text) per path
processed_papers = papers.map(lambda t: (t[0], spacy_tokenizer(t[1])))  ## (paper_id, token list) per paper


## Helper function to convert RDD to PYSPARK DataFrame
def row_conversion(tup):
    """Map a positional record onto the DataFrame column names.

    Args:
        tup: Sequence whose first element is the paper id and whose second
            element is the body text (here: the token list).

    Returns:
        Dict with the keys ``paper_id`` and ``body_text`` for the positions
        present in ``tup``.

    Raises:
        IndexError: If ``tup`` has more than two elements.
    """
    column_names = ('paper_id', 'body_text')
    return {column_names[pos]: tup[pos] for pos in range(len(tup))}

## Converting RDD to DataFrame

## Each (paper_id, tokens) tuple becomes a Row with the columns paper_id / body_text
df = processed_papers.map(lambda record: Row(**row_conversion(record))).toDF()
#df.printSchema()

## Featurizing processed text into TF-IDF vectors
## Step 1: fit Spark's CountVectorizer on the token lists in body_text
cv = sparkCountVectorizer(inputCol = 'body_text',outputCol = 'tf_vector')
cv_model = cv.fit(df)
tf_df = cv_model.transform(df)						## New column tf_vector with respective term-frequency vectors

## Step 2: rescale the TF vectors into TF-IDF vectors
idf = IDF(inputCol='tf_vector',outputCol='tfidf_vector')
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(tf_df)					## New column tfidf_vector with respective TF-IDF vectors

## Helper function to convert sparse vector to dense vector
def sparse_to_dense(v):
    """Return the entries of a vector as a plain Python list of floats.

    Args:
        v: Vector accepted by ``DenseVector`` (here: the TF-IDF vector of one paper).

    Returns:
        List with one ``float`` per vector component.
    """
    return [float(component) for component in DenseVector(v)]

## Converting back to RDD
## (paper_id, dense TF-IDF list) pairs
papers_rdd = tfidf_df.select('paper_id', 'tfidf_vector').rdd.map(lambda t: (t['paper_id'], sparse_to_dense(t['tfidf_vector'])))

## Bring at most 100 papers to the driver as {paper_id: tfidf list}.
## The key order of this dict defines the row order of tfidf_matrix below.
tfidf_dict = dict(papers_rdd.take(100))				## Considering only 100 papers as sample

sc.stop()		## Stopping SparkContext. Not using it any further in this cell

tfidf_matrix = np.array(list(tfidf_dict.values()))		## Converting to numpy matrix (100, ~23k)

## Applying Dimensionality Reduction using PCA
## Reducing dimensions to preserve 95% variance in the original data

pca = PCA(n_components = 0.95)
transformed_input = pca.fit_transform(tfidf_matrix)		## Dimensions reduced from ~23k to 55

## TENSORFLOW TO PERFORM K-MEANS CLUSTERING (k = 5)

## Helper function to convert matrix to tensor
def train_function():
    """Input function for the K-means estimator.

    Returns:
        The module-level ``transformed_input`` matrix converted to a float32
        tensor and wrapped so that it is served for a single epoch.
    """
    points = tf.convert_to_tensor(transformed_input, dtype=tf.float32)
    return tf.compat.v1.train.limit_epochs(points, num_epochs=1)

## Number of observations and dimensions (not read again in this cell)
instances, features = transformed_input.shape
## K-means estimator with k = 5
kmeans = tf.compat.v1.estimator.experimental.KMeans(num_clusters=5)

# Training
generations = 10				## Number of train() calls; each call sees the data for one epoch
old_centroids = None			## Only assigned below, never compared against
for generation_no in range(generations):
	kmeans.train(train_function)						## One more training pass on the same estimator
	updated_centroids = kmeans.cluster_centers()			## Cluster centers after this pass
	old_centroids = updated_centroids					## Remember the latest centers

## Assigning documents to the respective cluster

## cluster_labels[i] is the cluster index predicted for row i of transformed_input
cluster_labels = list(kmeans.predict_cluster_index(train_function))
for idx, each_vector in enumerate(transformed_input):
	cluster_idx = cluster_labels[idx]
	document_centroid = updated_centroids[cluster_idx]	## Center of the cluster this document was assigned to
	print('Document:', each_vector, 'belongs to ', cluster_idx, ' cluster centered at', document_centroid)

#### PERFORMING LDA ####

num_of_clusters = 5

## One sklearn CountVectorizer per cluster, so every cluster gets its own vocabulary
count_vectors = [CountVectorizer(stop_words='english', lowercase=True) for _ in range(num_of_clusters)]

## Raw body text per paper_id, re-read from the sampled json files.
## The previous code looked the text up in index_docID_dict, which is not
## defined at this point of the cell (NameError on a fresh kernel).
paper_text_by_id = dict(fetch_data(path) for path in sample_jsons)

## We need to collect all documents belonging to a cluster before using CountVectorizer.
## Row i of transformed_input was built from the i-th key of tfidf_dict, so the
## i-th paper_id belongs to cluster_labels[i].
cluster_content = {idx: [] for idx in range(num_of_clusters)}
for idx, paper_id in enumerate(tfidf_dict):
    cluster_content[cluster_labels[idx]].append(paper_text_by_id[paper_id])   ## Grouping articles according to clusters they are part of

## Applying Count vectoriser on individual clusters
count_vectors_output = [count_vectors[idx].fit_transform(cluster_content[idx]) for idx in range(num_of_clusters)]

#Performing LDA
numberOfTopics = 15

## Constructing one LDA model per cluster; random_state is fixed so that
## re-running the notebook gives the same topics.
LDA_models = [
    LatentDirichletAllocation(n_components=numberOfTopics, max_iter=20, learning_method='online', random_state=42)
    for _ in range(num_of_clusters)
]

## Document-topic matrix of every cluster
lda_output = []
for idx in range(num_of_clusters):
    lda_cluster_output = LDA_models[idx].fit_transform(count_vectors_output[idx])
    lda_output.append(lda_cluster_output)


## Helper funtion to find keywords from each topic in a cluster
def fetch_keywords(lda_model, cluster_vector, number_of_words):
    """Return the highest-weighted vocabulary terms of every topic.

    Args:
        lda_model: Fitted topic model exposing ``components_`` (one row of
            term weights per topic).
        cluster_vector: Fitted vectorizer whose feature names line up with the
            columns of ``components_``.
        number_of_words: Number of top terms to return per topic.

    Returns:
        One list per topic, each holding ``(term, weight)`` tuples ordered
        from the highest to the lowest weight.
    """
    # scikit-learn 1.0 added get_feature_names_out and 1.2 removed
    # get_feature_names, so support both. The names are fetched once instead
    # of once per keyword.
    name_getter = getattr(cluster_vector, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = cluster_vector.get_feature_names
    feature_names = [str(name) for name in name_getter()]

    keywords_for_cluster = []
    for each_topic in lda_model.components_:
        # argsort is ascending; the reversed tail slice yields the top indices first
        top_indices = each_topic.argsort()[:-number_of_words - 1:-1]
        keywords_for_cluster.append([(feature_names[i], each_topic[i]) for i in top_indices])   ## Topic wise top keywords

    return keywords_for_cluster

## Finding important keywords for each cluster (top 4 keywords per topic)
keywords_all_clusters = []
for idx, each_model in enumerate(LDA_models):
    if count_vectors[idx] is not None:
        keywords_in_cluster = fetch_keywords(each_model, count_vectors[idx], 4)
        keywords_all_clusters.append(keywords_in_cluster)

## We will store keywords and cluster info in a file

## Row i of transformed_input was built from the i-th key (paper_id) of tfidf_dict.
## The previous code reset index_docID_dict to {} and then indexed it, which
## raised KeyError for the very first document.
paper_ids = list(tfidf_dict.keys())

## cluster index -> list of paper_ids assigned to that cluster
cluster_info = {idx: [] for idx in range(num_of_clusters)}
for idx, paper_id in enumerate(paper_ids):
    cluster_info[cluster_labels[idx]].append(paper_id)

## `with` closes the file, so the pickle is fully flushed to disk
with open("cluster_info", "wb") as cluster_info_file:
    pickle.dump(cluster_info, cluster_info_file)

## We will make another dictionary with keywords as keys and clusters linked to it as values
keywords_cluster_info = {}

for idx, each_cluster in enumerate(keywords_all_clusters):
    for each_topic in each_cluster:
        for each_keyword in each_topic:
            ## each_keyword is a (term, weight) tuple; a term is listed once per topic it appears in
            keywords_cluster_info.setdefault(each_keyword[0], []).append(idx)


with open("keywords_cluster_info", "wb") as keywords_file:
    pickle.dump(keywords_cluster_info, keywords_file)

In [None]:
!pip install pyspark
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
## Creating SparkContext and SQLContext
## getOrCreate reuses an already running context; constructing SparkContext(...)
## directly raises "Cannot run multiple SparkContexts at once" when this cell is re-run.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local").setAppName("cord_19"))
sqlContext = SQLContext(sc)

In [None]:
import pandas as pd
import json
# Relative path: resolved against the kernel's current working directory.
metadata_path = "metadata.csv"
# RDD of the raw CSV lines; not referenced again in the visible notebook.
csv_file = sc.textFile(metadata_path)
# pandas view of the same file. NOTE(review): the name `df` is rebound to a
# Spark DataFrame by a later cell, so this frame is only valid until then.
df = pd.read_csv(metadata_path)
df

In [None]:
import glob
# Base directory of the extracted article JSON files.
root_path = '/content/document_parses'
# Every *.json file below root_path, searched recursively.
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
# First 100 paths. NOTE(review): later cells use `few_jsons` (built from
# `all_json_paths`), so `few_json` looks unused in the visible notebook.
few_json = all_json[:100]

In [None]:
import json
#NLP 
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_sm  # model downloaded in previous step
import string
def fetch_data(file_path):
    """Load one article JSON and return its id together with its full body text.

    Args:
        file_path: Path to a JSON file containing a ``paper_id`` key and a
            ``body_text`` list whose entries each have a ``text`` key.

    Returns:
        A ``(paper_id, body)`` tuple; ``body`` is the ``text`` of every
        ``body_text`` entry joined with single spaces.

    Raises:
        KeyError: If ``paper_id``, ``body_text`` or an entry's ``text`` is missing.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
    with open(file_path, encoding='utf-8') as file:
        content = json.load(file)
    body = [entry['text'] for entry in content['body_text']]
    return (content['paper_id'], ' '.join(body))

# ASCII punctuation as one string. Membership tests against it are substring
# tests, so an empty token also counts as "punctuation" and is filtered out.
punctuations = string.punctuation
stopwords = list(STOP_WORDS)        # English stop words shipped with spaCy

# Extra stop_words which frequently appear in medical articles
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
                'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
                'al.', 'Elsevier', 'PMC', 'CZI', 'www'
                ]
# Appending the extra stop words to the English stop words.
# spacy_tokenizer lower-cases tokens before the stop-word lookup, so entries are
# stored lower-cased; otherwise 'Elsevier', 'PMC' and 'CZI' could never match.
for w in custom_stop_words:
    w = w.lower()
    if w not in stopwords:
        stopwords.append(w)

# Parser for parsing the text in the article (tagger and NER components disabled)
parser = en_core_sci_sm.load(disable=["tagger", "ner"])
parser.max_length = 3000000         # raise the parser's maximum accepted text length

  
## Helper function to tokenize the full text in an article
def spacy_tokenizer(text):
    """Turn article text into a list of normalized tokens.

    The text is run through the module-level ``parser``. Each token is replaced
    by its lower-cased, stripped lemma, except tokens whose lemma is the
    ``-PRON-`` placeholder, which keep their lower-cased surface form. Tokens
    found in ``stopwords`` or in ``punctuations`` are dropped.

    Args:
        text: Full text of one article.

    Returns:
        List of the remaining token strings, in document order.
    """
    lemmas = []
    for token in parser(text):
        if token.lemma_ == "-PRON-":
            lemmas.append(token.lower_)
        else:
            lemmas.append(token.lemma_.lower().strip())
    return [lemma for lemma in lemmas if not (lemma in stopwords or lemma in punctuations)]

root_path = '/content/document_parses'                                ## Defining base path
## Fetching all the json paths in our dataset (around 85k).
## glob returns paths in arbitrary order; sorting makes the sample below reproducible.
all_json_paths = sorted(glob.glob(f'{root_path}/**/*.json', recursive=True))
few_jsons = all_json_paths[:100]                                      ## Sample of 100 articles
few_jsons[0]

In [None]:
## RDD over the sampled json paths (few_jsons)
json_file_paths = sc.parallelize(few_jsons)
## (paper_id, body text) for every sampled path
papers = json_file_paths.map(fetch_data)
## (paper_id, token list) for every sampled paper
processed_papers = papers.map(lambda paper: (paper[0], spacy_tokenizer(paper[1])))
processed_papers.take(1)

In [None]:
from pyspark.sql.types import Row

def row_conversion(tup):
    """Map a positional record onto the DataFrame column names.

    Args:
        tup: Sequence whose first element is the paper id and whose second
            element is the body text (here: the token list).

    Returns:
        Dict with the keys ``paper_id`` and ``body_text`` for the positions
        present in ``tup``.

    Raises:
        IndexError: If ``tup`` has more than two elements.
    """
    column_names = ('paper_id', 'body_text')
    return {column_names[pos]: tup[pos] for pos in range(len(tup))}

## Converting RDD to DataFrame

# Each (paper_id, tokens) tuple becomes a Row with the columns paper_id / body_text.
# This rebinds `df`, which an earlier cell used for the pandas metadata frame.
df = processed_papers.map(lambda record: Row(**row_conversion(record))).toDF()
df.printSchema()

In [None]:
from pyspark.ml.feature import HashingTF, IDF
# Spark's CountVectorizer is imported under an alias: a later cell imports
# sklearn's CountVectorizer under the plain name, and re-running this cell
# afterwards would otherwise call the sklearn class with Spark-only arguments.
from pyspark.ml.feature import CountVectorizer as sparkCountVectorizer
# from pyspark.sql import Row

## Featurizing processed text into TF-IDF vectors
cv = sparkCountVectorizer(inputCol = 'body_text',outputCol = 'tf_vector')
cv_model = cv.fit(df)
tf_df = cv_model.transform(df)      ## New column tf_vector with the term-frequency vectors
tf_df.take(1)

In [None]:
## Standardizing TF vectors into TF-IDF vectors
## The IDF model is fitted on tf_df and adds the column tfidf_vector.
idf = IDF(inputCol='tf_vector',outputCol='tfidf_vector')
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(tf_df)
## Show the first row as a sanity check
tfidf_df.take(1)

[Row(body_text=['r', 'egelmatig', 'zijn', 'muggen', 'zoals', 'de', 'tijgermug', 'het', 'nieuws', 'meet', 'koppen', 'als', 'weer', 'tijgermuggen', 'aangetroffen', 'bij', 'bandenimporteurs', 'dit', 'leidt', 'tot', 'vragen', 'exotische', 'muggen', 'bij', 'de', 'ggd', 'bij', 'dit', 'contact', 'blijkt', 'een', 'deel', 'van', 'de', 'vragenstellers', 'door', 'de', 'huisarts', 'naar', 'ons', 'te', 'zijn', 'verwezen', 'het', 'onderwerp', 'leeft', 'en', 'zeker', 'niet', 'alleen', 'bij', 'omwonenden', 'van', 'locaties', 'waar', 'muggen', 'zijn', 'aangetroffen', 'een', 'vraag', 'die', 'telkens', 'terugkomt', 'lopen', 'mensen', 'nederland', 'risico', 'op', 'het', 'oplopen', 'van', 'een', 'infectieziekte', 'inheemse', 'exotische', 'muggen', 'dit', 'artikel', 'gaan', 'op', 'de', 'risico', 'op', 'infectieziekten', 'muggen', 'en', 'op', 'de', 'bestrijding', 'van', 'exotische', 'muggen', 'om', 'de', 'risico', 'beter', 'te', 'kunnen', 'verduidelijken', 'bespreken', 'eerst', 'de', 'biologische', 'eigensch

In [None]:
# First row of the TF-IDF DataFrame
print_res = tfidf_df.take(1)[0]
# First seven tokens of that document
print(print_res['body_text'][:7])
# NOTE(review): the six numbers below are hardcoded literals, not values read
# from print_res — presumably copied by hand from an earlier run; verify
# before relying on them.
print(0.8082)
print(4.2547)
print(0.9827)
print(4.6846)
print(1.324)
print(1.7162)
# TF-IDF vector of the same document (cell output)
print_res['tfidf_vector']

['r', 'egelmatig', 'zijn', 'muggen', 'zoals', 'de', 'tijgermug']
0.8082
4.2547
0.9827
4.6846
1.324
1.7162


SparseVector(22006, {1: 0.5426, 4: 0.3591, 6: 0.1616, 10: 0.1225, 21: 0.5968, 24: 0.8039, 33: 0.28, 56: 0.2984, 86: 1.4796, 96: 0.2331, 101: 0.7233, 120: 0.7333, 132: 0.2844, 139: 0.5376, 140: 0.381, 144: 0.381, 146: 0.2713, 162: 0.3666, 168: 0.5208, 188: 0.762, 192: 0.762, 194: 0.762, 196: 210.5059, 203: 0.5547, 219: 0.488, 220: 0.472, 222: 0.5042, 244: 1.5241, 245: 0.488, 254: 0.488, 259: 21.7189, 265: 3.1793, 279: 0.5208, 299: 8.5209, 312: 0.5898, 340: 15.395, 353: 186.5656, 359: 0.6639, 360: 1.7247, 396: 0.6448, 418: 0.9015, 427: 356.8996, 520: 3.6851, 522: 2.714, 630: 0.765, 687: 1.5706, 706: 2.3623, 779: 167.899, 797: 215.7085, 801: 1.0042, 802: 2.9593, 819: 5.3384, 918: 33.1656, 931: 2.1302, 986: 1.2829, 1039: 1.6707, 1045: 67.8053, 1081: 160.8009, 1095: 4.1003, 1157: 10.2509, 1164: 1.6707, 1226: 106.5513, 1307: 3.6851, 1330: 125.5031, 1412: 2.5357, 1430: 117.6592, 1463: 113.7372, 1478: 113.7372, 1661: 56.2641, 1691: 94.1274, 1745: 54.89, 1858: 82.3614, 1916: 82.3614, 1957: 78.4

In [None]:
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.linalg import DenseVector

## Helper function to convert sparse vector to dense vector
def sparse_to_dense(v):
    """Return the entries of a vector as a plain Python list of floats.

    Args:
        v: Vector accepted by ``DenseVector`` (here: the TF-IDF vector of one paper).

    Returns:
        List with one ``float`` per vector component.
    """
    return [float(component) for component in DenseVector(v)]

## Converting back to RDD
## (paper_id, dense TF-IDF list) pairs
papers_rdd = tfidf_df.select('paper_id', 'tfidf_vector').rdd.map(lambda t: (t['paper_id'], sparse_to_dense(t['tfidf_vector'])))

## Bring at most 100 papers to the driver as {paper_id: tfidf list}
tfidf_dict = dict(papers_rdd.take(100))
## Show only the number of papers: displaying the dict itself writes every
## dense vector (one float per vocabulary term, per paper) into the cell output.
len(tfidf_dict)

In [None]:
# Row position -> (paper_id, body text), in the key order of tfidf_dict.
# Ids starting with 'PMC' are read from pmc_json/<id>.xml.json, all others
# from pdf_json/<id>.json.
index_docID_dict = {}
for position, key in enumerate(tfidf_dict):
    if key.startswith('PMC'):
        folder, suffix = 'pmc_json', '.xml.json'
    else:
        folder, suffix = 'pdf_json', '.json'
    index_docID_dict[position] = fetch_data('/content/document_parses/' + folder + '/' + key + suffix)
print(list(index_docID_dict.items())[:1])

In [None]:
import numpy as np

tfidf_matrix = np.array(list(tfidf_dict.values()))

In [None]:
tfidf_matrix[0]

array([0.        , 0.54263019, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [None]:
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 95%) of the variance.
pca = PCA(n_components = 0.95)
# Rows stay in the order of tfidf_matrix.
transformed_input = pca.fit_transform(tfidf_matrix)
print(transformed_input.shape)
# Cell output: the reduced matrix itself
transformed_input

In [None]:
import numpy as np
import tensorflow as tf

## TENSORFLOW TO APPLY K-MEANS CLUSTERING (k = 5)

## Helper function to convert matrix to tensor
def train_function():
    """Input function for the K-means estimator.

    Returns:
        The module-level ``transformed_input`` matrix converted to a float32
        tensor and wrapped so that it is served for a single epoch.
    """
    points = tf.convert_to_tensor(transformed_input, dtype=tf.float32)
    return tf.compat.v1.train.limit_epochs(points, num_epochs=1)

## Number of observations and dimensions (not read again in this cell)
instances, features = transformed_input.shape
## K-means estimator with k = 5
kmeans = tf.compat.v1.estimator.experimental.KMeans(num_clusters=5)

# Training
generations = 10				## Number of train() calls; each call sees the data for one epoch
old_centroids = None			## Only assigned below, never compared against
for generation_no in range(generations):
	kmeans.train(train_function)						## One more training pass on the same estimator
	updated_centroids = kmeans.cluster_centers()			## Cluster centers after this pass
	old_centroids = updated_centroids					## Remember the latest centers

## Assigning documents to the respective cluster

## cluster_labels[i] is the cluster index predicted for row i of transformed_input
cluster_labels = list(kmeans.predict_cluster_index(train_function))
for idx, each_vector in enumerate(transformed_input):
	cluster_idx = cluster_labels[idx]
	document_centroid = updated_centroids[cluster_idx]	## Center of the cluster this document was assigned to
	print('Document:', each_vector, 'belongs to ', cluster_idx, ' cluster centered at', document_centroid)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# One CountVectorizer per cluster. A token is a run of at least three letters
# or hyphens. The pattern is a raw string: in a normal string literal '\-' is
# an invalid escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
count_vectors = []

for _ in range(0, 5):
    count_vectors.append(CountVectorizer(stop_words='english', lowercase=True, token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}'))

In [None]:
#We need to collect all documents belonging to a cluster before using CountVectorizer.
cluster_content = {}
for i in range(0,5):
    cluster_content[i] = []

# Row idx of transformed_input belongs to cluster_labels[idx]; element [1] of
# the index_docID_dict entry is the article's body text.
for idx, each_vector in enumerate(transformed_input):
    cluster_content[cluster_labels[idx]].append(index_docID_dict[idx][1])

# Show only the size of cluster 0: printing the list itself dumps the full
# text of every article in the cluster into the cell output.
print(len(cluster_content[0]))

count_vectors_output = []
for i in range(0,5):
    try:
        count_vectors_output.append(count_vectors[i].fit_transform(cluster_content[i]))
    except ValueError:
        # CountVectorizer.fit_transform raises ValueError ("empty vocabulary")
        # for a cluster without usable documents. The old `except KeyError`
        # could never catch it. None marks a cluster without count vectors.
        print("unexpected")
        count_vectors_output.append(None)

# Preview of cluster 0's vocabulary, only if its vectorizer was fitted.
# get_feature_names was removed in scikit-learn 1.2, so prefer get_feature_names_out.
if count_vectors_output[0] is not None:
    name_getter = getattr(count_vectors[0], 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = count_vectors[0].get_feature_names
    print(list(name_getter())[:20])

1


In [None]:
#Performing LDA
numberOfTopics = 10

# Five identically configured LDA models, one per cluster
modelsLDA = [
    LatentDirichletAllocation(n_components=numberOfTopics, max_iter=10, learning_method='online',verbose=False, random_state=42)
    for _ in range(0, 5)
]

modelsLDA[0]

In [None]:
# Document-topic matrix per cluster; None for clusters without count vectors.
lda_output = []

for i in range(0,5):
    # Identity test instead of `!= None`: the non-None entries come from
    # CountVectorizer.fit_transform, and comparison operators on matrix types
    # are element-wise and not meant for None checks.
    if count_vectors_output[i] is not None:
        lda_output.append(modelsLDA[i].fit_transform(count_vectors_output[i]))
    else:
        lda_output.append(None)

In [None]:
def fetch_keywords(lda_model, cluster_vector, number_of_words):
    """Return the highest-weighted vocabulary terms of every topic.

    Args:
        lda_model: Fitted topic model exposing ``components_`` (one row of
            term weights per topic).
        cluster_vector: Fitted vectorizer whose feature names line up with the
            columns of ``components_``.
        number_of_words: Number of top terms to return per topic.

    Returns:
        One list per topic, each holding ``(term, weight)`` tuples ordered
        from the highest to the lowest weight.
    """
    # scikit-learn 1.0 added get_feature_names_out and 1.2 removed
    # get_feature_names, so support both. The names are fetched once instead
    # of once per keyword.
    name_getter = getattr(cluster_vector, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = cluster_vector.get_feature_names
    feature_names = [str(name) for name in name_getter()]

    keywords_for_cluster = []
    for each_topic in lda_model.components_:
        # argsort is ascending; the reversed tail slice yields the top indices first
        top_indices = each_topic.argsort()[:-number_of_words - 1:-1]
        keywords_for_cluster.append([(feature_names[i], each_topic[i]) for i in top_indices])

    return keywords_for_cluster

In [None]:
# Top 3 keywords per topic for every cluster. Position in the list == cluster
# index, because a later cell uses the position as the cluster id.
keywords_all_clusters = []
for idx, lda_model in enumerate(modelsLDA):
    # lda_output[idx] is None when no LDA model was fitted for this cluster.
    # The old check `count_vectors[idx] is not None` was always true
    # (count_vectors holds vectorizer objects) and let unfitted models through.
    if lda_output[idx] is not None:
        keywords_in_cluster = fetch_keywords(lda_model, count_vectors[idx], 3)
        keywords_all_clusters.append(keywords_in_cluster)
    else:
        keywords_all_clusters.append([])   # keep later clusters at their own index

print(keywords_all_clusters[0])

# Keyword terms of cluster 0, one per line
for l in keywords_all_clusters[0]:
    for k in l:
        print(k[0])

In [None]:
#We will store keywords and cluster info in a file
import pickle

# cluster index -> list of paper_ids assigned to that cluster
cluster_info = {}

for i in range(0,5):
    cluster_info[i] = []

# Element [0] of the index_docID_dict entry is the paper_id of row idx.
for idx,_ in enumerate(transformed_input):
    cluster_info[cluster_labels[idx]].append(index_docID_dict[idx][0])

# `with` closes the file, so the pickle is fully flushed to disk.
with open("cluster_info","wb") as cluster_info_file:
    pickle.dump(cluster_info, cluster_info_file)

In [None]:
#We will make another dictionary with keywords as keys and clusters linked to it as values
keywords_cluster_info = {}

for idx,each_cluster in enumerate(keywords_all_clusters):
    for each_topic in each_cluster:
        for each_keyword in each_topic:
            # each_keyword is a (term, weight) tuple; a term is listed once
            # per topic it appears in, so cluster indices may repeat.
            keywords_cluster_info.setdefault(each_keyword[0], []).append(idx)

# `with` closes the file, so the pickle is fully flushed to disk.
with open("keywords_cluster_info","wb") as keywords_file:
    pickle.dump(keywords_cluster_info, keywords_file)