# Notebook for training information retrieval models

### Import packages

In [104]:
# Import packages
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.functions import udf, size, explode, col, countDistinct
from pyspark.ml import Pipeline

from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS as gensim_words
import spacy
sp = spacy.load('en_core_web_sm')

from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner
from sparknlp.embeddings import *

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.common import *

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Normalizer as Normalizer_L2

In [2]:
nltk_stopwords = set(stopwords.words('english')) \
                    .union(set(stopwords.words('german'))) \
                    .union(set(stopwords.words('french')))
gensim_stopwords = set(gensim_words)
spacy_stopwords = sp.Defaults.stop_words
# https://countwordsfree.com/stopwords
cwf_stopwords = set(line.strip() for line in open('stop_words.txt'))

all_stopwords = list( nltk_stopwords \
                        .union(gensim_stopwords) \
                        .union(spacy_stopwords) \
                        .union(cwf_stopwords) )

### Create Spark Context and SQL Context

In [3]:
# Start spark session configured for spark nlp
spark = SparkSession.builder \
        .master('local[*]') \
        .appName('SDDM') \
        .config('spark.driver.memory', '200g') \
        .config('spark.executor.memory', '200g') \
        .config('spark.executor.cores', '32') \
        .config('spark.memory.fraction', '0.8') \
        .config('spark.local.dir', '/data/s1847503/SDDM/tmp') \
        .config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.0') \
        .getOrCreate()
print("Created a SparkSession")
sc = spark.sparkContext
print("Created a SparkContext")
sqlContext = SQLContext(sc)
print("Created a SQLContext")

# .config('spark.memory.fraction', '0.8') \

Created a SparkSession
Created a SparkContext
Created a SQLContext


### Load the data into a SQLContext Dataframe

In [95]:
df = sqlContext.read.format('csv').options(header='true', maxColumns=2000000) \
      .load('/data/s1847503/SDDM/newdata/data.csv')
df.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|            paper_id|               title|        list_authors|           full_text|            sections|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|1329bb2f949e74925...|Generation of pre...|['Xue Wu Zhang', ...|"The infection of...| 30 drugs were se...|
|  1|dc079a2e9cf98fad0...|Zoonotic disease ...|['Charlotte Robin...|"Veterinary profe...| based on the par...|
|  2|75af9aa0e63889abd...|Current and Novel...|['Erasmus Kotey',...|"Influenza viruse...| although LAIVs a...|
|  3|1755c4785f87bca19...|MERS: Progress on...|['*', 'Ryan Aguan...|Since its identif...|['Since its ident...|
|  4|cc829c0f2ab2e110b...|Hepatologie Akute...|['Karoline Rutter...|"Das akute Leberv...| nach Ausschluss ...|
|  5|ece3d68d9b996c917...|Novel approach to...|['Ivan Timokhin',...|"Introduction | T...|      diameter 12 mm|
|

### Initialize Annotators

In [105]:
# Pipeline for text
document_assembler = DocumentAssembler() \
                        .setInputCol('full_text') \
                        .setOutputCol('document')

# Tokenizer divides the text into tokens
tokenizer = Tokenizer() \
                .setInputCols(['document']) \
                .setOutputCol('tokens')

# Finisher converts tokens to human-readable output (we need the tokens for determining the text lengths)
finisher_tokens = Finisher() \
                        .setInputCols(['tokens']) \
                        .setCleanAnnotations(False)

# Normalizer removes punctuation, numbers etc.
normalizer = Normalizer() \
                .setInputCols(['tokens']) \
                .setOutputCol('normalized') \
                .setLowercase(True)

# Lemmatizer changes each word to its lemma
lemmatizer = LemmatizerModel.pretrained() \
                .setInputCols(['normalized']) \
                .setOutputCol('lemma')

# StopWordsCleaner removes stop words    
stopwords_cleaner = StopWordsCleaner() \
                        .setInputCols(['lemma']) \
                        .setOutputCol('clean_lemma') \
                        .setCaseSensitive(False).setStopWords(all_stopwords)

# word_embeddings = BertEmbeddings.pretrained('bert_base_cased', 'en') \
#                           .setInputCols(["document", "clean_lemma"]) \
#                           .setOutputCol("embeddings")

# Finisher converts clean tokens to human-readable output
finisher = Finisher() \
            .setInputCols(['clean_lemma']) \
            .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


### Create Pipeline

In [106]:
# Pipeline for fully preprocessing the text
pipeline = Pipeline() \
            .setStages([
                document_assembler,
                tokenizer,
                normalizer,
                lemmatizer,
                stopwords_cleaner,
                finisher_tokens,
                finisher
             ])

bert_pipeline = Pipeline() \
                    .setStages([
                        document_assembler,
                        tokenizer,
                        normalizer,
                        lemmatizer,
                        stopwords_cleaner,
                        word_embeddings
                     ])

### Preprocess questions

In [108]:
# Select the question from 0 to 11
question_num = 3

questions = sqlContext.read.format('csv').options(header='true').load('/data/s1847503/SDDM/newdata/questions.csv')
questions_clean = pipeline.fit(questions).transform(questions)
questions_clean = questions_clean.select('question_id', 'full_text', col('finished_clean_lemma').alias('preprocessed'))
questions_clean = questions_clean.filter(questions_clean.question_id == question_num)
questions_clean.first().full_text

### Preprocess text

In [97]:
# Peprocess the data
df = pipeline.fit(df).transform(df)
df = df.select('*', size('finished_tokens').alias('text_length'))

# Keep only papers with a text length of greater than 10
print("Before removing empty papers: {} rows.".format(df.count()))
df = df.dropna(subset='full_text')
# df = df.dropduplicates(subset='title')
# print("Removed duplicates")
# df = df.filter(df['text_length'] > 10)
print("After removing empty papers: {} rows.".format(df.count()))
print()

df = df.select(
                col('_c0').alias('id'),
                'title',
                'full_text',
                'text_length',
                col('finished_clean_lemma').alias('preprocessed')
            )

df.show()

Before removing empty papers: 59561 rows.
After removing empty papers: 59561 rows.

+---+--------------------+--------------------+-----------+--------------------+
| id|               title|           full_text|text_length|        preprocessed|
+---+--------------------+--------------------+-----------+--------------------+
|  0|Generation of pre...|"The infection of...|        723|[infection, newly...|
|  1|Zoonotic disease ...|"Veterinary profe...|       1756|[veterinary, prof...|
|  2|Current and Novel...|"Influenza viruse...|        919|[influenza, virus...|
|  3|MERS: Progress on...|Since its identif...|       3942|[identification, ...|
|  4|Hepatologie Akute...|"Das akute Leberv...|       1832|[akute, lebervers...|
|  5|Novel approach to...|"Introduction | T...|        448|[introduction, qu...|
|  6|On the electrific...|Scientists, polic...|        880|[scientist, polic...|
|  7|A dynamic model f...|Infectious diseas...|       7535|[infectious, dise...|
|  8|Critical evaluati...

### TF-IDF

In [37]:
# Compute TF-IDF matrix for questions
tf_q = HashingTF(inputCol='clean_question', outputCol='tf') \
                    .transform(questions_clean)

tf_idf_questions = IDF(inputCol='tf', outputCol='feature') \
                        .fit(tf_q) \
                        .transform(tf_q)

In [38]:
# Compute TF-IDF matrix for papers
tf_p = HashingTF(inputCol='preprocessed', outputCol='tf') \
                    .transform(df)

tf_idf_papers = IDF(inputCol='tf', outputCol='feature') \
                        .fit(tf_q) \
                        .transform(tf_p)

In [19]:
# tf_idf_papers.show()

+---+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+
| id|               title|           full_text|text_length|        preprocessed|                  tf|             feature|
+---+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  0|Articles The effe...|"Severe acute res...|        938|[severe, acute, r...|(262144,[1769,236...|(262144,[1769,236...|
|  1|Impact of Social ...|75% or 90%. We co...|       4369|[compare, early, ...|(262144,[353,925,...|(262144,[353,925,...|
|  2|Lower State COVID...|Evidence is mixed...|        703|[evidence, mix, e...|(262144,[3483,512...|(262144,[3483,512...|
+---+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+



In [39]:
# Compute L2-norm for papers and questions
normalizer_L2 = Normalizer_L2(inputCol='feature', outputCol='norm')
tf_idf_papers = normalizer_L2.transform(tf_idf_papers)
tf_idf_questions = normalizer_L2.transform(tf_idf_questions)

In [13]:
# Save dataframes to csv files
# tf_idf_papers.select('id', 'norm').write.csv('/data/s1847503/SDDM/newdata/tfidf_papers.csv')
# tf_idf_questions.select('id', 'norm').write.csv('/data/s1847503/SDDM/newdata/tfidf_questions.csv', index=False)

In [40]:
# Compute similarity matrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
matrix_q = IndexedRowMatrix(
                tf_idf_questions.select('question_id', 'norm') \
                .rdd.map(lambda row: IndexedRow(row.question_id, row.norm.toArray()))
            ).toBlockMatrix()

matrix_p = IndexedRowMatrix(
                tf_idf_papers.select('id', 'norm') \
                .rdd.map(lambda row: IndexedRow(row.id, row.norm.toArray()))
            ).toBlockMatrix()

In [46]:
matrix_q.numCols()

262144

In [42]:
sim_matrix = matrix_p.multiply(matrix_q.transpose())

In [43]:
sim_matrix = sim_matrix.toLocalMatrix().toArray()

In [48]:
sim_matrix

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [47]:
import pandas as pd
d = {}
# d[str(question_num)] = sim_matrix[:, i]
sim = pd.DataFrame(d)
sim

Unnamed: 0,3
0,0.0
1,0.0
2,0.0


### Word2Vec

In [98]:
from pyspark.ml.feature import Word2Vec
word2Vec = Word2Vec(inputCol='preprocessed', outputCol='word_vector')

In [99]:
model = word2Vec.fit(df)
df = model.transform(df)

KeyboardInterrupt: 

In [None]:
df.show()

In [90]:
questions_clean = model.transform(questions_clean)
ques_vec = questions_clean.first().word_vector

In [92]:
# Calculate cosine similarity between a document vector and a question vector
def cossim(doc_vec): 
    global ques_vec
    sim = np.dot(doc_vec, ques_vec) / np.sqrt(np.dot(doc_vec, ques_vec)) / np.sqrt(np.dot(doc_vec, ques_vec)) 
    return float(sim)

cossim_udf = udf(cossim)

In [93]:
df2 = df.select('id', cossim_udf('word_vector').alias('similarity'))

In [94]:
df2.show()

+---+----------+
| id|similarity|
+---+----------+
|  0|       NaN|
|  1|       NaN|
|  2|       NaN|
+---+----------+



### Close Spark Context when done

In [None]:
sc.stop()