# Notebook for training information retrieval models

### Import packages

In [154]:
# Import packages
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.functions import udf, size, explode, col, countDistinct, collect_list
from pyspark.ml import Pipeline

from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS as gensim_words
import spacy
sp = spacy.load('en_core_web_sm')

from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Normalizer as Normalizer_L2

In [38]:
nltk_stopwords = set(stopwords.words('english')) \
                    .union(set(stopwords.words('german'))) \
                    .union(set(stopwords.words('french')))
gensim_stopwords = set(gensim_words)
spacy_stopwords = sp.Defaults.stop_words
# https://countwordsfree.com/stopwords
cwf_stopwords = set(line.strip() for line in open('stop_words.txt'))

all_stopwords = list( nltk_stopwords \
                        .union(gensim_stopwords) \
                        .union(spacy_stopwords) \
                        .union(cwf_stopwords) )

### Create Spark Context and SQL Context

In [39]:
# Start spark session configured for spark nlp
spark = SparkSession.builder \
        .master('local[*]') \
        .appName('SDDM') \
        .config('spark.driver.memory', '200g') \
        .config('spark.executor.memory', '200g') \
        .config('spark.executor.cores', '32') \
        .config('spark.memory.fraction', '0.8') \
        .config('spark.local.dir', '/data/s1847503/SDDM/tmp') \
        .config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.0') \
        .getOrCreate()
print("Created a SparkSession")
sc = spark.sparkContext
print("Created a SparkContext")
sqlContext = SQLContext(sc)
print("Created a SQLContext")

# .config('spark.memory.fraction', '0.8') \

Created a SparkSession
Created a SparkContext
Created a SQLContext


### Load the data into a SQLContext Dataframe

In [40]:
df = sqlContext.read.format('csv').options(header='true', maxColumns=2000000) \
      .load('/data/s1847503/SDDM/newdata/relevant.csv')
df.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|            paper_id|               title|        list_authors|           full_text|            sections|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|df97f804b68dcf16f...|Articles The effe...|['Kiesha Prem', '...|"Severe acute res...| respectively. Ho...|
|  1|0475f4122241f4008...|Impact of Social ...|['Xutong Wang', '...|75% or 90%. We co...|"['75% or 90%. We...|
|  2|94586be17a5f9eca8...|Lower State COVID...|  ['Emily Rauscher']|Evidence is mixed...|['Evidence is mix...|
|  3|           question0|                   -|                   -|Effectiveness of ...|                   -|
|  4|           question1|                   -|                   -|Methods to unders...|                   -|
|  5|           question2|                   -|                   -|Evidence that dom...|                   -|
|

### Initialize Annotators

In [41]:
# Pipeline for text
document_assembler = DocumentAssembler() \
                        .setInputCol('full_text') \
                        .setOutputCol('document')

# Tokenizer divides the text into tokens
tokenizer = Tokenizer() \
                .setInputCols(['document']) \
                .setOutputCol('tokens')

# Finisher converts tokens to human-readable output (we need the tokens for determining the text lengths)
finisher_tokens = Finisher() \
                        .setInputCols(['tokens']) \
                        .setCleanAnnotations(False)

# Normalizer removes punctuation, numbers etc.
normalizer = Normalizer() \
                .setInputCols(['tokens']) \
                .setOutputCol('normalized') \
                .setLowercase(True)

# Lemmatizer changes each word to its lemma
lemmatizer = LemmatizerModel.pretrained() \
                .setInputCols(['normalized']) \
                .setOutputCol('lemma')

# StopWordsCleaner removes stop words    
stopwords_cleaner = StopWordsCleaner() \
                        .setInputCols(['lemma']) \
                        .setOutputCol('clean_lemma') \
                        .setCaseSensitive(False).setStopWords(all_stopwords)

# word_embeddings = BertEmbeddings.pretrained('bert_base_cased', 'en') \
#                           .setInputCols(["document", "clean_lemma"]) \
#                           .setOutputCol("embeddings")

# Finisher converts clean tokens to human-readable output
finisher = Finisher() \
            .setInputCols(['clean_lemma']) \
            .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


### Create Pipeline

In [42]:
# Pipeline for fully preprocessing the text
pipeline = Pipeline() \
            .setStages([
                document_assembler,
                tokenizer,
                normalizer,
                lemmatizer,
                stopwords_cleaner,
                finisher_tokens,
                finisher
             ])

# bert_pipeline = Pipeline() \
#                     .setStages([
#                         document_assembler,
#                         tokenizer,
#                         normalizer,
#                         lemmatizer,
#                         stopwords_cleaner,
#                         word_embeddings
#                      ])

### Preprocess questions

In [162]:
# Select the question from 0 to 11
question_num = 7

questions = sqlContext.read.format('csv').options(header='true').load('/data/s1847503/SDDM/newdata/questions.csv')
questions_clean = pipeline.fit(questions).transform(questions)
questions_clean = questions_clean.select('question_id', 'full_text', col('finished_clean_lemma').alias('preprocessed'))
questions_clean = questions_clean.filter(questions_clean.question_id > question_num)
question_nums = [int(n) for n in questions_clean.select(collect_list('question_id')).first()[0]]
questions_clean.first().preprocessed

['effectiveness',
 'multifactorial',
 'strategy',
 'prevent',
 'secondary',
 'transmission']

### Preprocess text

In [44]:
# Peprocess the data
df = pipeline.fit(df).transform(df)
df = df.select('*', size('finished_tokens').alias('text_length'))

# Keep only papers with a text length of greater than 10
print("Before removing empty papers: {} rows.".format(df.count()))
df = df.dropna(subset='full_text')
# df = df.dropduplicates(subset='title')
# print("Removed duplicates")
# df = df.filter(df['text_length'] > 10)
print("After removing empty papers: {} rows.".format(df.count()))
print()

df = df.select(
                col('_c0').alias('id'),
                'paper_id',
                'title',
                'full_text',
                'text_length',
                col('finished_clean_lemma').alias('preprocessed')
            )

df.show()

Before removing empty papers: 15 rows.
After removing empty papers: 15 rows.

+---+--------------------+--------------------+--------------------+-----------+--------------------+
| id|            paper_id|               title|           full_text|text_length|        preprocessed|
+---+--------------------+--------------------+--------------------+-----------+--------------------+
|  0|df97f804b68dcf16f...|Articles The effe...|"Severe acute res...|        938|[severe, acute, r...|
|  1|0475f4122241f4008...|Impact of Social ...|75% or 90%. We co...|       4369|[compare, early, ...|
|  2|94586be17a5f9eca8...|Lower State COVID...|Evidence is mixed...|        703|[evidence, mix, e...|
|  3|           question0|                   -|Effectiveness of ...|          6|[effectiveness, i...|
|  4|           question1|                   -|Methods to unders...|          9|[method, understa...|
|  5|           question2|                   -|Evidence that dom...|         14|[evidence, domest...|
|  6

### TF-IDF

In [163]:
# Compute TF-IDF matrix for papers
tf_p = []
tf_idf_papers = []

tf_p = HashingTF(inputCol='preprocessed', outputCol='tf') \
                    .transform(df)

tf_idf_papers = IDF(inputCol='tf', outputCol='feature') \
                        .fit(tf_p) \
                        .transform(tf_p)

In [164]:
# Compute TF-IDF matrix for questions
tf_q = []
tf_idf_questions = []

tf_q = HashingTF(inputCol='preprocessed', outputCol='tf') \
                    .transform(questions_clean)

tf_idf_questions = IDF(inputCol='tf', outputCol='feature') \
                        .fit(tf_p) \
                        .transform(tf_q)

In [165]:
# tf_idf_papers.show()
tf_idf_questions = tf_idf_questions.drop('norm')
tf_idf_papers = tf_idf_papers.drop('norm')

In [166]:
# Compute L2-norm for papers and questions
normalizer_L2 = Normalizer_L2(inputCol='feature', outputCol='norm')
tf_idf_papers = normalizer_L2.transform(tf_idf_papers)
tf_idf_questions = normalizer_L2.transform(tf_idf_questions)

In [None]:
# Save dataframes to csv files
# tf_idf_papers.select('id', 'norm').write.csv('/data/s1847503/SDDM/newdata/tfidf_papers.csv')
# tf_idf_questions.select('id', 'norm').write.csv('/data/s1847503/SDDM/newdata/tfidf_questions.csv', index=False)

In [167]:
# Compute similarity matrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
matrix_q = IndexedRowMatrix(
                tf_idf_questions.select('question_id', 'norm') \
                .rdd.map(lambda row: IndexedRow(row.question_id, row.norm.toArray()))
            ).toBlockMatrix()

matrix_p = IndexedRowMatrix(
                tf_idf_papers.select('id', 'norm') \
                .rdd.map(lambda row: IndexedRow(row.id, row.norm.toArray()))
            ).toBlockMatrix()

In [168]:
sim_matrix = matrix_p.multiply(matrix_q.transpose())

In [169]:
sim_matrix = sim_matrix.toLocalMatrix().toArray()

In [170]:
import pandas as pd
d = {}
for n in question_nums:
    d[str(n)] = sim_matrix[:, n]
sim = pd.DataFrame(d)
sim

Unnamed: 0,8,9,10,11
0,0.007647,0.004753,0.018852,0.067077
1,0.022548,0.011434,0.022214,0.004014
2,0.001634,0.002492,0.001452,0.0
3,0.054711,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.132749
6,0.081821,0.0,0.0,0.0
7,0.400923,0.054761,0.031917,0.0
8,0.061298,0.0,0.0,0.0
9,0.245242,0.033497,0.019524,0.0


In [35]:
df_rel = sqlContext.read.format('csv').options(header='true') \
      .load('/data/s1847503/SDDM/newdata/relevant.csv')
df_rel.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|            paper_id|               title|        list_authors|           full_text|            sections|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|df97f804b68dcf16f...|Articles The effe...|['Kiesha Prem', '...|"Severe acute res...| respectively. Ho...|
|  1|0475f4122241f4008...|Impact of Social ...|['Xutong Wang', '...|75% or 90%. We co...|"['75% or 90%. We...|
|  2|94586be17a5f9eca8...|Lower State COVID...|  ['Emily Rauscher']|Evidence is mixed...|['Evidence is mix...|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+



In [36]:
relevant = [0, 1, 2]
for r in relevant:
    print(df_rel.filter(df_rel._c0 == r).first().title)
    print()

Articles The effect of control strategies to reduce social mixing on outcomes of the COVID-19 epidemic in Wuhan, China: a modelling study

Impact of Social Distancing Measures on COVID-19 Healthcare Demand in Central Texas

Lower State COVID-19 Deaths and Cases with Earlier School Closure in the U.S



### Word2Vec

In [9]:
from pyspark.ml.feature import Word2Vec
word2Vec = Word2Vec(inputCol='preprocessed', outputCol='word_vector')

In [10]:
model = word2Vec.fit(df)
df = model.transform(df)

In [11]:
df.show()

+---+--------------------+--------------------+-----------+--------------------+--------------------+
| id|               title|           full_text|text_length|        preprocessed|         word_vector|
+---+--------------------+--------------------+-----------+--------------------+--------------------+
|  0|Generation of pre...|"The infection of...|        723|[infection, newly...|[0.04855843337612...|
|  1|Zoonotic disease ...|"Veterinary profe...|       1756|[veterinary, prof...|[-0.0769096092344...|
|  2|Current and Novel...|"Influenza viruse...|        919|[influenza, virus...|[0.06106203970513...|
|  3|MERS: Progress on...|Since its identif...|       3942|[identification, ...|[-0.1404365635089...|
|  4|Hepatologie Akute...|"Das akute Leberv...|       1832|[akute, lebervers...|[9.99946369923657...|
|  5|Novel approach to...|"Introduction | T...|        448|[introduction, qu...|[0.05870716313081...|
|  6|On the electrific...|Scientists, polic...|        880|[scientist, polic...|[-

In [12]:
questions_clean = model.transform(questions_clean)
ques_vec = questions_clean.first().word_vector

In [13]:
# Calculate cosine similarity between a document vector and a question vector
def cossim(doc_vec): 
    global ques_vec
    sim = np.dot(doc_vec, ques_vec) / np.sqrt(np.dot(doc_vec, ques_vec)) / np.sqrt(np.dot(doc_vec, ques_vec)) 
    return float(sim)

cossim_udf = udf(cossim)

In [27]:
df2 = df.select('id', cossim_udf('word_vector').alias('similarity'))

In [28]:
df2.show()

+---+------------------+
| id|        similarity|
+---+------------------+
|  0|0.9999999999999998|
|  1|               1.0|
|  2|               1.0|
|  3|1.0000000000000002|
|  4|               NaN|
|  5|               1.0|
|  6|0.9999999999999999|
|  7|0.9999999999999999|
|  8|               1.0|
|  9|               1.0|
| 10|1.0000000000000002|
| 11|0.9999999999999998|
| 12|1.0000000000000002|
| 13|               NaN|
| 14|1.0000000000000002|
| 15|0.9999999999999999|
| 16|               1.0|
| 17|1.0000000000000002|
| 18|1.0000000000000002|
| 19|1.0000000000000002|
+---+------------------+
only showing top 20 rows



In [29]:
# print("Before removing empty papers: {} rows.".format(df2.count()))
print("x")
df2 = df2.filter(df2.similarity.isNotNull())
print("After removing empty papers 1: {} rows.".format(df2.count()))
df2 = df2.filter(df2.similarity != 'NaN')
# df2 = df2.dropna(subset='similarity')
print("After removing empty papers 2: {} rows.".format(df2.count()))
df2 = df2.orderBy('similarity', ascending=False).show()

x
After removing empty papers 1: 59561 rows.
After removing empty papers 2: 53611 rows.
+-----+------------------+
|   id|        similarity|
+-----+------------------+
|40821|1.0000000000000002|
|40931|1.0000000000000002|
|40823|1.0000000000000002|
|40776|1.0000000000000002|
|40831|1.0000000000000002|
|40784|1.0000000000000002|
|40832|1.0000000000000002|
|40792|1.0000000000000002|
|40843|1.0000000000000002|
|40803|1.0000000000000002|
|40851|1.0000000000000002|
|40813|1.0000000000000002|
|40854|1.0000000000000002|
|40816|1.0000000000000002|
|40862|1.0000000000000002|
|40870|1.0000000000000002|
|40875|1.0000000000000002|
|40774|1.0000000000000002|
|40877|1.0000000000000002|
|40785|1.0000000000000002|
+-----+------------------+
only showing top 20 rows



In [34]:
relevant = [40821, 40931, 40823, 40776, 40831]
for r in relevant:
    print(df.filter(df.id == 40821).first().title)
    print()

Buried treasure: evolutionary perspectives on microbial iron piracy An evolving view of host-microbe interactions

Buried treasure: evolutionary perspectives on microbial iron piracy An evolving view of host-microbe interactions

Buried treasure: evolutionary perspectives on microbial iron piracy An evolving view of host-microbe interactions

Buried treasure: evolutionary perspectives on microbial iron piracy An evolving view of host-microbe interactions

Buried treasure: evolutionary perspectives on microbial iron piracy An evolving view of host-microbe interactions



### Close Spark Context when done

In [None]:
sc.stop()