# Initialization and Environment Setup

In [4]:
!pip install pypdf --quiet
!pip install rank_bm25 --quiet
!pip install sentence-transformers --quiet

import os

import numpy

!pip install pymupdf --quiet
import fitz

from rank_bm25 import BM25Okapi as bm25fn
from pypdf import PdfReader as pypdf_reader
from re import sub as regex_sub

import torch

from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer


from nltk.corpus import stopwords
from nltk import download as nltk_download

nltk_download('punkt_tab'), nltk_download('stopwords')
ENGLISH_STOPWORDS = stopwords.words('english')

# Provide Search Queries Here
SEARCH_QUERIES = \
[
    string.lower().split() for string in \
    [
        "racial bias nurses",
        "minority health nurses bias",
        "class bias race healthcare",
        "weight loss",
        "tea powder weight loss",
        "berry health benefit",
    ]
]

# SEARCH_QUERIES = [ query.lower().split() for query in SEARCH_QUERIES ]
TARGET_RESULTS = [ "PaperA", "PaperA", "PaperA", "PaperB", "PaperB", "PaperC" ]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Prepare Data
Separately generate cleaned and uncleaned sets of extracted pdf data

In [5]:
datasets_path = "/content/drive/MyDrive/Dataset Corpus"
for file in os.scandir(datasets_path):
  if file.is_file() and os.path.splitext(file.path)[1] == ".pdf":

    file_text = ""

    pdf_reader = pypdf_reader(file.path)
    for pgidx in range(0, len(pdf_reader.pages)):
      file_text += pdf_reader.pages[pgidx].extract_text()

    with open(datasets_path + "/raw/" + os.path.splitext(file.name)[0] + ".raw.txt", "w") as outfile:
      outfile.write(file_text)

    with open(datasets_path + "/clean/" + os.path.splitext(file.name)[0] + ".clean.txt", "w") as outfile:
      cleaned_text = regex_sub("[0-9]", " ", regex_sub("[\\W]", " ", file_text.lower()))
      for word in cleaned_text.split():
        if word not in ENGLISH_STOPWORDS and len(word.strip()) != 0: outfile.write(word + " ")

    #with open(datasets_path + "/raw/" + os.splitext(file.name)[0] + ".raw.txt", "w") as outfile:
      #outfile.write(file_text)

# Unclean BM25

In [6]:
corpus, file_order = [ [], [] ]

datasets_path = "/content/drive/MyDrive/Dataset Corpus/raw"
for file in os.scandir(datasets_path):
  if file.is_file() and os.path.splitext(file.path)[1] == ".txt":
    with open(file.path, "r") as readfile:
      file_order.append(file.name.split('.')[0])
      corpus.append(readfile.read().split())


bm25, scores = [ bm25fn(corpus), [] ]

print(file_order, "\n")
for query in SEARCH_QUERIES:
  score = numpy.asarray(bm25.get_scores(query))

  score = numpy.exp(score) / numpy.sum(numpy.exp(score))
  scores.append(score)

  for s in score: print(f"{s:.4e}", end=",")
  print("\b;")
  # print("\n", file_order[score.argmax()], "\n")

['PaperA', 'PaperB', 'PaperC', 'PaperD'] 

9.8571e-01,4.7618e-03,4.7618e-03,4.7618e-03,;
9.4040e-01,2.2144e-02,2.2303e-02,1.5149e-02,;
9.2257e-01,2.1241e-02,2.6805e-02,2.9383e-02,;
2.0727e-01,3.0559e-01,2.4308e-01,2.4406e-01,;
2.0727e-01,3.0559e-01,2.4308e-01,2.4406e-01,;
1.1715e-01,1.1262e-01,6.9319e-01,7.7046e-02,;


# Clean BM25

In [7]:
corpus, file_order = [ [], [] ]

datasets_path = "/content/drive/MyDrive/Dataset Corpus/clean"
for file in os.scandir(datasets_path):
  if file.is_file() and os.path.splitext(file.path)[1] == ".txt":
    with open(file.path, "r") as readfile:
      file_order.append(file.name.split('.')[0])
      corpus.append(readfile.read().split())


bm25, scores = [ bm25fn(corpus), [] ]

print(file_order, "\n")
for query in SEARCH_QUERIES:
  score = numpy.asarray(bm25.get_scores(query))

  score = numpy.exp(score) / numpy.sum(numpy.exp(score))
  scores.append(score)

  for s in score: print(f"{s:e}", end=",")
  print("\b;")
  # print("\n", file_order[score.argmax()], "\n")

['PaperA', 'PaperB', 'PaperC', 'PaperD'] 

9.911254e-01,2.958214e-03,2.958214e-03,2.958214e-03,;
9.489823e-01,1.872674e-02,1.886416e-02,1.342676e-02,;
9.904502e-01,2.681216e-03,3.246431e-03,3.622198e-03,;
2.136262e-01,2.975674e-01,2.433182e-01,2.454882e-01,;
2.136262e-01,2.975674e-01,2.433182e-01,2.454882e-01,;
1.108634e-01,1.082399e-01,7.032906e-01,7.760615e-02,;


# Regular Sentence Modeler

In [8]:
datasets_path = "/content/drive/MyDrive/Dataset Corpus/clean"
corpus_documents = [
    open(file.path, "r").read() for file in os.scandir(datasets_path)
]

simple_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = simple_model.encode(corpus_documents)

embedded_queries = simple_model.encode(SEARCH_QUERIES)
scores = [
    numpy.dot(embedded_queries, document_embedding)
      / numpy.linalg.norm(embedded_queries) * numpy.linalg.norm(document_embedding)
    for document_embedding in embeddings
]

scores

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[array([ 0.10444192,  0.15823638,  0.05270284,  0.02067065, -0.00804291,
         0.07414407], dtype=float32),
 array([0.00336828, 0.0646854 , 0.00331024, 0.10876233, 0.11648379,
        0.12265363], dtype=float32),
 array([-0.03165048,  0.0159296 , -0.04470716,  0.04504631,  0.08562095,
         0.1610292 ], dtype=float32),
 array([-0.01375962,  0.00978606, -0.00355179, -0.02444826, -0.0407578 ,
         0.01588146], dtype=float32)]

# SciBERT Uncleaned



In [19]:
datasets_path = "/content/drive/MyDrive/Dataset Corpus/raw"

access_order = []
corpus_documents = []
for file in os.scandir(datasets_path):
  access_order.append(file.name)
  corpus_documents.append(open(file.path, "r").read())

scibert_model = SentenceTransformer("pritamdeka/S-Scibert-snli-multinli-stsb")
document_embeddings = scibert_model.encode(corpus_documents)

query_embeddings = scibert_model.encode(SEARCH_QUERIES)

scores, results = [ [], [] ]
for qemb in query_embeddings:
  score = []
  for demb in document_embeddings:
    score.append(numpy.dot(qemb, demb) / (numpy.linalg.norm(qemb) * numpy.linalg.norm(demb)))
  scores.append(score)
  results.append(access_order[numpy.argmax(score)])


print("Target\t\tResult")
for i in range(len(scores)):
  # [ print(f"{s:.2E}", end="\t") for s in scores[i] ]
  # print()
  print(TARGET_RESULTS[i], "\t\t", results[i].split('.')[0])


#for score in scores:
#  print(numpy.exp(scores) / numpy.sum(numpy.exp(score)))

Target		Result
PaperA 		 PaperB
PaperA 		 PaperA
PaperA 		 PaperB
PaperB 		 PaperB
PaperB 		 PaperB
PaperC 		 PaperC


# SciBERT Cleaned

In [20]:
datasets_path = "/content/drive/MyDrive/Dataset Corpus/clean"

access_order = []
corpus_documents = []
for file in os.scandir(datasets_path):
  access_order.append(file.name)
  corpus_documents.append(open(file.path, "r").read())

scibert_model = SentenceTransformer("pritamdeka/S-Scibert-snli-multinli-stsb")
document_embeddings = scibert_model.encode(corpus_documents)

query_embeddings = scibert_model.encode(SEARCH_QUERIES)

scores, results = [ [], [] ]
for qemb in query_embeddings:
  score = []
  for demb in document_embeddings:
    score.append(numpy.dot(qemb, demb) / (numpy.linalg.norm(qemb) * numpy.linalg.norm(demb)))
  scores.append(score)
  results.append(access_order[numpy.argmax(score)])


print("Target\t\tResult")
for i in range(len(scores)):
  # [ print(f"{s:.2E}", end="\t") for s in scores[i] ]
  # print()
  print(TARGET_RESULTS[i], "\t\t", results[i].split('.')[0])


Target		Result
PaperA 		 PaperA
PaperA 		 PaperA
PaperA 		 PaperA
PaperB 		 PaperB
PaperB 		 PaperB
PaperC 		 PaperB
