# Notebook for training information retrieval models

### Import packages

In [17]:
# Import packages
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.functions import udf, size, explode, col, countDistinct
from pyspark.ml import Pipeline

from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS as gensim_words
import spacy
sp = spacy.load('en_core_web_sm')

from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, StopWordsCleaner

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import Normalizer as Normalizer_L2

In [2]:
# Collect stop words from several sources; the combined list is handed to the
# StopWordsCleaner stage of the preprocessing pipeline.
nltk_stopwords = set(stopwords.words('english')) \
                    .union(set(stopwords.words('german'))) \
                    .union(set(stopwords.words('french')))
gensim_stopwords = set(gensim_words)
spacy_stopwords = sp.Defaults.stop_words

# Extra list from https://countwordsfree.com/stopwords, read one entry per line.
# The context manager guarantees the file handle is closed again, and blank
# lines are skipped so the empty string never ends up as a stop word.
with open('stop_words.txt') as stop_words_file:
    cwf_stopwords = {line.strip() for line in stop_words_file if line.strip()}

all_stopwords = list(
    nltk_stopwords
    .union(gensim_stopwords)
    .union(spacy_stopwords)
    .union(cwf_stopwords)
)

### Create Spark Context and SQL Context

In [3]:
# Create (or reuse) a local SparkSession with the Spark NLP package on its
# classpath, then derive the SparkContext and SQLContext from it.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('SDDM')
    .config('spark.driver.memory', '64g')
    .config('spark.executor.memory', '32g')
    .config('spark.executor.cores', '8')
    .config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.0')
    .getOrCreate()
)
print("Created a SparkSession")

sc = spark.sparkContext
print("Created a SparkContext")

sqlContext = SQLContext(sc)
print("Created a SQLContext")

# .config('spark.memory.fraction', '0.8') \

Created a SparkSession
Created a SparkContext
Created a SQLContext


### Load the data into a SQLContext Dataframe

In [4]:
# Read the papers CSV (the first line holds the column names) and preview it.
# maxColumns is set to 2,000,000 for the CSV parser.
df = (
    sqlContext.read
    .format('csv')
    .option('header', 'true')
    .option('maxColumns', 2000000)
    .load('/data/s1847503/SDDM/newdata/data.csv')
)
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|            paper_id|               title|        list_authors|           full_text|            sections|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|1329bb2f949e74925...|Generation of pre...|['Xue Wu Zhang', ...|"The infection of...| 30 drugs were se...|
|dc079a2e9cf98fad0...|Zoonotic disease ...|['Charlotte Robin...|"Veterinary profe...| based on the par...|
|75af9aa0e63889abd...|Current and Novel...|['Erasmus Kotey',...|"Influenza viruse...| although LAIVs a...|
|1755c4785f87bca19...|MERS: Progress on...|['*', 'Ryan Aguan...|Since its identif...|['Since its ident...|
|cc829c0f2ab2e110b...|Hepatologie Akute...|['Karoline Rutter...|"Das akute Leberv...| nach Ausschluss ...|
|ece3d68d9b996c917...|Novel approach to...|['Ivan Timokhin',...|"Introduction | T...|      diameter 12 mm|
|9cd0f74020b0db181...|On the electrif

### Initialize Annotators

In [7]:
# Spark NLP stages used by the preprocessing pipeline.

# Wrap the raw 'full_text' column into Spark NLP documents
document_assembler = (
    DocumentAssembler()
    .setInputCol('full_text')
    .setOutputCol('document')
)

# Split every document into tokens
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('tokens')
)

# Expose the raw tokens as a plain output column; they are needed later to
# determine the text lengths. Annotation columns are kept (cleanAnnotations=False).
finisher_tokens = (
    Finisher()
    .setInputCols(['tokens'])
    .setCleanAnnotations(False)
)

# Strip punctuation, numbers etc. from the tokens and lowercase them
normalizer = (
    Normalizer()
    .setInputCols(['tokens'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Map every normalized token to its lemma using the default pretrained model
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Drop stop words (case-insensitive match against all_stopwords)
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(all_stopwords)
)

# Expose the cleaned lemmas as a plain output column
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


### Create Pipeline

In [8]:
# Full preprocessing pipeline: assemble -> tokenize -> normalize -> lemmatize
# -> remove stop words, followed by the two finishers that expose the raw
# tokens and the cleaned lemmas as plain columns.
pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher_tokens,
    finisher,
])

### Preprocess questions

In [9]:
# Load the questions CSV (with header row) and push it through the
# preprocessing pipeline; keep only the id and the cleaned lemmas.
questions = (
    sqlContext.read
    .format('csv')
    .option('header', 'true')
    .load('/data/s1847503/SDDM/newdata/questions.csv')
)
questions_clean = (
    pipeline.fit(questions)
    .transform(questions)
    .select('question_id', col('finished_clean_lemma').alias('clean_question'))
)

# questions = [q.full_text for q in questions.collect()]
# questions_clean = [q.clean_question for q in questions_clean.collect()]
# print(questions)
# print()
# print(questions_clean)

### Preprocess text

In [10]:
# Preprocess the data.
#
# Rows without a full text are dropped *before* the NLP pipeline is applied, so
# the pipeline is never asked to process rows that are discarded anyway.
# NOTE(review): this assumes the pipeline stages neither add nor remove rows, so
# the printed counts match counting after the transform - confirm.
print("Before removing empty papers: {} rows.".format(df.count()))
df = df.dropna(subset=['full_text'])
# De-duplicating on 'title' and requiring text_length > 10 are both currently
# disabled; only rows with a missing full_text are removed.
print("After removing empty papers: {} rows.".format(df.count()))
print()

df = pipeline.fit(df).transform(df)

# text_length is the number of raw tokens; 'preprocessed' holds the cleaned lemmas
df = df.select(
    'paper_id',
    'title',
    'full_text',
    size('finished_tokens').alias('text_length'),
    col('finished_clean_lemma').alias('preprocessed'),
)

df.show()

Before removing empty papers: 1329677 rows.
After removing empty papers: 406784 rows.

+--------------------+--------------------+--------------------+-----------+--------------------+
|            paper_id|               title|           full_text|text_length|        preprocessed|
+--------------------+--------------------+--------------------+-----------+--------------------+
|1329bb2f949e74925...|Generation of pre...|"The infection of...|        723|[infection, newly...|
|dc079a2e9cf98fad0...|Zoonotic disease ...|"Veterinary profe...|       1756|[veterinary, prof...|
|75af9aa0e63889abd...|Current and Novel...|"Influenza viruse...|        919|[influenza, virus...|
|1755c4785f87bca19...|MERS: Progress on...|Since its identif...|       3942|[identification, ...|
|cc829c0f2ab2e110b...|Hepatologie Akute...|"Das akute Leberv...|       1832|[akute, lebervers...|
|ece3d68d9b996c917...|Novel approach to...|"Introduction | T...|        448|[introduction, qu...|
|9cd0f74020b0db181...|On the el

### TF-IDF

In [None]:
# # Explode text
# tf_idf = df.withColumn('token', explode(col('preprocessed')))
# tf_idf.show()

In [None]:
# # Get raw term frequencies
# tf_idf = tf_idf \
#             .groupBy('paper_id', 'token') \
#             .count()
# tf_idf = tf_idf.select('paper_id', 'token', col('count').alias('tf_raw'))
# tf_idf.show()

In [None]:
# # Take the log to scale better
# def tf(f):
#     return 1 + math.log(f)

# tf_udf = udf(tf)
# tf_idf = tf_idf.select('paper_id', 'token', 'tf_raw', tf('tf_raw').alias('tf'))
# tf_idf.show()

In [None]:
# # Get document frequencies
# tf_idf = tf_idf \
#             .groupBy('token') \
#             .count()
# tf_idf = tf_idf.select('paper_id', 'token', 'tf_raw', 'tf', col('count').alias('df'))
# tf_idf.show()

In [None]:
# # Calculate inverse document frequency
# num_docs = df.count()
# def idf(df):
#     return math.log(num_docs/df)

# idf_udf = udf(idf)
# tf_idf = tf_idf.select('paper_id', 'token', 'tf_raw', 'tf', 'df', idf_udf('count(DISTINCT paper_id)').alias('idf'))
# tf_idf.show()

In [14]:
# TF-IDF for the papers: hash the preprocessed tokens into term-frequency
# vectors, then fit IDF weights on those vectors and apply them.
tf_p = (
    HashingTF()
    .setInputCol('preprocessed')
    .setOutputCol('tf')
    .transform(df)
)

tf_idf_papers = (
    IDF()
    .setInputCol('tf')
    .setOutputCol('feature')
    .fit(tf_p)
    .transform(tf_p)
)

In [15]:
# TF-IDF for the questions: term frequencies come from the questions
# themselves, but the IDF weights are fitted on the papers' term frequencies
# (tf_p).
tf_q = (
    HashingTF()
    .setInputCol('clean_question')
    .setOutputCol('tf')
    .transform(questions_clean)
)

tf_idf_questions = (
    IDF()
    .setInputCol('tf')
    .setOutputCol('feature')
    .fit(tf_p)
    .transform(tf_q)
)

In [18]:
# Normalize the TF-IDF vectors of both papers and questions with the same
# pyspark.ml Normalizer (default p); the result goes into the 'norm' column.
normalizer_L2 = Normalizer_L2(inputCol='feature', outputCol='norm')
tf_idf_papers, tf_idf_questions = (
    normalizer_L2.transform(frame)
    for frame in (tf_idf_papers, tf_idf_questions)
)

In [20]:
# Preview the TF-IDF vector ('feature') next to its normalized version ('norm') per paper
tf_idf_papers.select('paper_id', 'feature', 'norm').show()

+--------------------+--------------------+--------------------+
|            paper_id|             feature|                norm|
+--------------------+--------------------+--------------------+
|1329bb2f949e74925...|(262144,[2626,462...|(262144,[2626,462...|
|dc079a2e9cf98fad0...|(262144,[353,1466...|(262144,[353,1466...|
|75af9aa0e63889abd...|(262144,[1466,170...|(262144,[1466,170...|
|1755c4785f87bca19...|(262144,[353,1024...|(262144,[353,1024...|
|cc829c0f2ab2e110b...|(262144,[879,1006...|(262144,[879,1006...|
|ece3d68d9b996c917...|(262144,[1466,613...|(262144,[1466,613...|
|9cd0f74020b0db181...|(262144,[632,882,...|(262144,[632,882,...|
|0b70c1fd82bd1962a...|(262144,[661,1466...|(262144,[661,1466...|
|94e8acc14db64cbb1...|(262144,[162,1466...|(262144,[162,1466...|
|d4b11ed79efbb3cd5...|(262144,[170,269,...|(262144,[170,269,...|
|68a2a48d4c67318b0...|(262144,[1284,146...|(262144,[1284,146...|
|0ccdc351858fd7dfe...|(262144,[619,1156...|(262144,[619,1156...|
|81059d5922e947ca8...|(26

In [24]:
# Look at the normalized vector ('norm' column) of the first question
tf_idf_questions.select('question_id', 'feature', 'norm').first()['norm']

SparseVector(262144, {51736: 0.4027, 73185: 0.4073, 75086: 0.4216, 147758: 0.703})

In [25]:
# Compute similarity matrix (questions x papers) as the product of the
# normalized TF-IDF vectors.
#
# IndexedRow needs an integer row index. paper_id is a hash string, so each
# paper gets a positional index instead; paper_ids[i] gives the paper_id that
# belongs to column i of sim_matrix.
# The vectors are converted with Vectors.fromML so they stay sparse instead of
# being expanded to dense arrays with one slot per hashed feature.
from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# NOTE(review): assumes question_id values are integer strings - confirm.
matrix_q = IndexedRowMatrix(
    tf_idf_questions.select('question_id', 'norm')
    .rdd.map(lambda row: IndexedRow(int(row.question_id), MLlibVectors.fromML(row.norm)))
).toBlockMatrix()

# Cache so the id lookup and the matrix are built from the same indexed rows
papers_indexed = tf_idf_papers.select('paper_id', 'norm').rdd.zipWithIndex().cache()
paper_ids = dict(
    papers_indexed.map(lambda pair: (pair[1], pair[0].paper_id)).collect()
)

matrix_p = IndexedRowMatrix(
    papers_indexed.map(lambda pair: IndexedRow(pair[1], MLlibVectors.fromML(pair[0].norm)))
).toBlockMatrix()

sim_matrix = matrix_q.multiply(matrix_p.transpose())
sim_matrix = sim_matrix.toLocalMatrix().toArray()

### Similarity (copied reference code, used as inspiration)

In [None]:
# Similarity ----------------------------------------------------------------------
def calc_simlarity_score(question_list, text_list, threshold=None, top=None):
    """Rank the texts against every question by TF-IDF cosine similarity.

    A TF-IDF model is fitted on ``text_list`` and applied to ``question_list``.

    Args:
        question_list: The questions to score.
        text_list: The texts (papers) to rank.
        threshold: If given, keep every text whose score is at least this
            value (must lie in [0, 1]). Takes precedence over ``top``.
        top: If ``threshold`` is None, keep only this many best-scoring texts.

    Returns:
        A tuple ``(dic, sim_matrix)``. ``dic`` maps each question index to a
        list of ``(paper_idx, score)`` tuples sorted by descending score;
        ``sim_matrix`` has one row per text and one column per question.

    Raises:
        ValueError: If both ``threshold`` and ``top`` are None, or if
            ``threshold`` is outside [0, 1].
    """
    if threshold is None and top is None:
        raise ValueError("Parameter `threshold` and `top` cannot both be None")
    # Validate once, up front, instead of once per question inside the loop
    if threshold is not None and (threshold > 1 or threshold < 0):
        raise ValueError("Please enter a value from 0 to 1 for parameter `threshold`")

    # NOTE(review): TfidfVectorizer and cosine_similarity are not imported in
    # the visible part of this file - presumably scikit-learn; confirm.
    tfidf = TfidfVectorizer()
    corpus_tfidf_matrix = tfidf.fit_transform(text_list)
    ques_tfidf_matrix = tfidf.transform(question_list)
    sim_matrix = cosine_similarity(corpus_tfidf_matrix, ques_tfidf_matrix)

    dic = {}
    for ques_idx in range(sim_matrix.shape[1]):
        scores = sim_matrix[:, ques_idx]
        if threshold is not None:
            hits = [
                (paper_idx, score)
                for paper_idx, score in enumerate(scores)
                if score >= threshold
            ]
            dic[ques_idx] = sorted(hits, key=lambda item: item[1], reverse=True)
        else:
            # Rank by THIS question's column (the ranking used to be taken
            # from column 0 for every question).
            ranked = sorted(
                range(len(scores)), key=lambda i: scores[i], reverse=True
            )[:top]
            dic[ques_idx] = [(paper_idx, scores[paper_idx]) for paper_idx in ranked]
    return dic, sim_matrix

# Retrieve relevant paper----------------------------------------------------------------------
def retrieve_paper(df, dic, question_list=None):
    """Build one result frame per question from the ranked paper indices.

    Args:
        df: pandas DataFrame of papers; rows are selected by position.
        dic: Mapping of question index to a list of ``(paper_idx, score)``
            items, as returned by ``calc_simlarity_score``.
        question_list: Optional sequence of question texts indexed by question
            index. When omitted, the module-level ``questions`` is used.

    Returns:
        A dict mapping each question index to a new DataFrame holding the
        selected rows of ``df`` plus a ``score`` and a ``question`` column.
        ``df`` itself is not modified.
    """
    question_source = questions if question_list is None else question_list
    df_dic = {}
    for ques_idx, hits in dic.items():
        # Copy right after selecting so the new columns are added to an
        # independent frame rather than to a slice of df.
        new_df = df.iloc[[item[0] for item in hits], :].copy()
        new_df['score'] = [item[1] for item in hits]
        new_df['question'] = question_source[ques_idx]
        df_dic[ques_idx] = new_df
    return df_dic

### Close Spark Context when done

In [None]:
# Shut down the SparkContext once all work in the notebook is finished
sc.stop()