### Install packages

In [None]:
# Use the %pip magic (rather than !pip) so the packages are installed into the
# environment of the running kernel, not whatever `pip` the subshell resolves to.
%pip install contractions
%pip install textsearch
%pip install tqdm
%pip install -U sentence-transformers rank_bm25
%pip install -U -q PyDrive
%pip install python-docx
%pip install -U pypdfium2

In [2]:
import docx
import glob
import io
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import pypdfium2 as pdfium
import requests
import re
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import unicodedata

from nltk.tokenize import sent_tokenize
# The NLTK tokenizers used in this notebook need the Punkt models. Recent NLTK
# releases load them from the 'punkt_tab' resource while older releases use
# 'punkt', so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Replace `folder_id` with the ID of the Google Drive folder where the tutorial files were saved

In [3]:
# Google Drive folder ID; the download cell lists the non-trashed files whose
# parent is this folder and fetches each of them.
folder_id = '1O4AO1HxRGbL-IfG__UpJORo50Hl0oN9z'

### Download documents to the file section of the notebook

In [4]:
# Colab / PyDrive helpers used only by this download cell.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive and build the Drive client from them.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Folders to mirror; currently only the one configured in `folder_id`.
folders = [folder_id]

for folder in folders:
  # Drive query: files whose parent is `folder` and that are not in the trash.
  file_list = drive.ListFile({'q': f"'{folder}' in parents and trashed=false"}).GetList()
  for file in file_list:
    file_id = file['id']
    file_name = file['title']  # NOTE(review): not used below; the title is re-read from the fetched metadata

    # Re-create a handle from the id, load its full metadata and save the
    # content to a local file named after the Drive title.
    # NOTE(review): the path is relative, so the file presumably lands in the
    # current working directory (/content on Colab, where the PDF cell looks) - confirm.
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.FetchMetadata(fetch_all=True)
    downloaded.GetContentFile(downloaded.metadata['title'])


### Normalize Text

In [11]:
def normalize_sentence(sentence):
    """Lower-case and trim ``sentence``, then re-join its NLTK word tokens with single spaces.

    Args:
        sentence: Text of one sentence.

    Returns:
        The normalized sentence as a single space-separated string.
    """
    cleaned = sentence.lower().strip()
    return ' '.join(nltk.word_tokenize(cleaned))

def normalize_doc(doc):
  """Split ``doc`` into sentences and normalize each one.

  The text is cut at every newline and at whitespace that follows '.', '!' or
  '?'. Pieces that are empty after trimming are dropped; the rest are passed
  through ``normalize_sentence``.

  Args:
      doc: Full document text.

  Returns:
      List of normalized sentences, in document order.
  """
  normalized_lst = []
  for piece in re.split(r'\n|(?<=[.!?])\s+', doc):
    piece = piece.strip()
    if piece:
      normalized_lst.append(normalize_sentence(piece))
  return normalized_lst

def normalize_pdf(pdf):
  """Lower-case the extracted PDF text and split it into sentences with NLTK.

  Args:
      pdf: Text extracted from a PDF file.

  Returns:
      The list of lower-cased sentences produced by ``nltk.sent_tokenize``.
  """
  return nltk.sent_tokenize(pdf.lower())


### Encoder

In [6]:
# Bi-encoder: embeds the corpus sentences and the query for the first-stage semantic search.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Cross-encoder: scores (query, sentence) pairs to re-rank the bi-encoder's candidates.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

Downloading (…)5fedf/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)2cb455fedf/README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading (…)b455fedf/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)edf/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5fedf/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)fedf/train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading (…)2cb455fedf/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)455fedf/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Create data structures to save the encoded vectors

In [8]:
# One row per encoded sentence: its embedding as a NumPy array and the sentence text.
encoded_df = pd.DataFrame(columns=['Encoded_Vector','Text'])
# The same embeddings kept as tensors, appended in the same order as the rows of
# encoded_df; search() maps a hit's corpus_id back to a row by position.
biencoder_vec = []

### Compute the encoded vectors for the tutorial PDFs

In [9]:
def pdfium_get_text(data: bytes) -> str:
    """Extract the text of a PDF given as raw bytes.

    Every page is read in order and its text is followed by a newline, so the
    result for an N-page document contains N newline-terminated page texts
    (and is the empty string for a document without pages).

    Args:
        data: Content of the PDF file.

    Returns:
        The concatenated text of all pages.
    """
    from contextlib import closing

    page_texts = []
    # Release the pdfium handles deterministically instead of waiting for
    # garbage collection; children (text page, page) are closed before their
    # parents (page, document).
    with closing(pdfium.PdfDocument(data)) as pdf:
        for i in range(len(pdf)):
            with closing(pdf.get_page(i)) as page, closing(page.get_textpage()) as textpage:
                page_texts.append(textpage.get_text_range())
    return "".join(page_text + "\n" for page_text in page_texts)

In [None]:
# Encode every sentence of every downloaded PDF. Each sentence contributes one
# tensor to `biencoder_vec` and one (vector, text) row to `encoded_df`, in the
# same order, so a position in one structure identifies the entry in the other.
pdf_files = sorted(glob.glob('/content/*.pdf'))  # sorted: stable corpus order across runs
new_rows = []
for pdf in pdf_files:
  # Only hold the file open while reading it.
  with open(pdf, "rb") as f:
    data = f.read()

  entire_text = pdfium_get_text(data)
  pre_processed_doc = normalize_pdf(entire_text)
  if not pre_processed_doc:
    continue  # no extractable text in this PDF; nothing to encode

  # Encode all sentences of the document in one batched call instead of one
  # model invocation per sentence; the result has one row per sentence.
  corpus_embeddings_biencoder = bi_encoder.encode(pre_processed_doc, convert_to_tensor=True)
  biencoder_vec.extend(corpus_embeddings_biencoder)

  # Move the batch to the CPU once and store each row as a NumPy array.
  embeddings = corpus_embeddings_biencoder.cpu().numpy()
  new_rows.extend(
      {'Encoded_Vector': vector, 'Text': normalized_sentence}
      for vector, normalized_sentence in zip(embeddings, pre_processed_doc)
  )

# DataFrame.append was removed in pandas 2.0 (and copied the frame on every
# call); collect the rows and extend the frame once instead.
if new_rows:
  encoded_df = pd.concat(
      [encoded_df, pd.DataFrame(new_rows, columns=['Encoded_Vector', 'Text'])],
      ignore_index=True,
  )


### Construct QA engine

In [13]:
def search(query, top_k=5):
    """Print the corpus sentence that best answers ``query``.

    Retrieval is two-stage: the bi-encoder shortlists ``top_k`` candidate
    sentences from ``biencoder_vec`` by semantic search, then the cross-encoder
    re-scores every (query, sentence) pair and the best-scoring sentence is
    printed.

    Args:
        query: Question text.
        top_k: Number of bi-encoder candidates passed to the cross-encoder.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        None. The question and the selected sentence are printed.

    Raises:
        ValueError: If no corpus embeddings have been computed yet.
    """
    print("Input question:", query)

    if not biencoder_vec:
        raise ValueError(
            "The corpus is empty: encode the PDF documents before calling search()."
        )

    ##### Bi-Encoder: Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Put the query on the device that holds the corpus embeddings rather than
    # forcing CUDA, so the search also works on a CPU-only runtime.
    question_embedding = question_embedding.to(biencoder_vec[0].device)
    hits = util.semantic_search(question_embedding, biencoder_vec, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    if not hits:
        print("\n-------------------------\n")
        print("No matching passage found.")
        return

    ##### Cross-Encoder: Re-Ranking #####
    # Score all retrieved passages with the cross-encoder
    cross_inp = [(query, encoded_df.iloc[hit['corpus_id']]['Text']) for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of the top-1 hit from the re-ranker (first one wins on ties)
    print("\n-------------------------\n")
    print("Answer")
    best_hit = max(hits, key=lambda hit: hit['cross-score'])
    text = encoded_df.iloc[best_hit['corpus_id']]['Text']
    print(text)

### Sample QA

In [14]:
# Example query; search() prints the question and the top re-ranked sentence.
search(query = 'what is a random variable?')

Input question: what is a random variable?

-------------------------

Answer
a random variable is a mapping1
x : ω → r
that assigns a real number x(ω) to each outcome ω.
at a certain point in most probability courses, the sample space is rarely
mentioned anymore and we work directly with random variables.


In [15]:
# Second example query against the same encoded corpus.
search(query = 'what is the central limit theorem')

Input question: what is the central limit theorem

-------------------------

Answer
5.8 theorem (the central limit theorem (clt)).
