In [None]:
# @title Pull Local RAG repository https://github.com/relyativist/rag-gain

In [None]:
# Print the current working directory of the notebook via the shell.
!echo $PWD

In [None]:
%%bash
  # Install the `tree` utility, clone the rag-gain repository into ./code/rag-gain and
  # print the code directory. The steps are chained with &&, so each one runs only
  # if the previous step succeeded.
  echo "Installing apt packages ...\n" && \
  apt-get install -y tree && \
  echo "Clonning rag git repos ... \n" && \
  git clone https://github.com/relyativist/rag-gain.git code/rag-gain && \
  echo $PWD/code
  #rm -r /content/sample_data
  #tree

## Implement .pdf parser, chunking and embed chunks

Parse .pdf doc of choice. ([Feature]: Any kind of document)

Steps to implement:



1.   Import .pdf;
2.   Parse text;
3.   Implement multiple chunking strategies to compare performance:
  3.1 Paragraph chunking;
  3.2 Fixed size chunking;
  3.3 LangChain (e.g. [RecursiveCharacterTextSplitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter/), [SemanticChunker](https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker/))
4.   Embedding with SentenceTransformers
5.   Populate the LanceDB vector database


Example *.pdf*s are taken from europarl.europa.eu on AI regulations;

[AIDA public hearing on AI and the Data Strategy of 30 September 2021 (PDF - 458 KB)](https://www.europarl.europa.eu/cmsdata/242360/AIDA_Verbatim_30_September_2021_EN.pdf)

[AIDA-AGRI public hearing on AI in agriculture and food security of 14 June 2021 (PDF - 401 KB)](https://www.europarl.europa.eu/cmsdata/238861/AIDA_Verbatim_14_June_2021_EN.pdf)

[AIDA-EMPL public hearing on AI and the Labour Market of 25 May 2021 (PDF - 453 KB)](https://www.europarl.europa.eu/cmsdata/238560/AIDA_Verbatim_25_May_2021_EN.pdf)

[AIDA public hearing on AI and Health of 2 December 2020 (PDF - 550 KB)](https://www.europarl.europa.eu/cmsdata/222231/AIDA_Verbatim_2_December_2020_EN.pdf)

In [None]:
import os, re
from pathlib import Path
import requests
import fitz
from tqdm.auto import tqdm
from typing import List, Dict, Tuple

from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from spacy.lang.en import English
import pdb


def import_pdf(path):
    """
    Return the path to a pdf, downloading the predefined sample pdfs first if it is missing.

    If `path` does not point to an existing file, every document of the predefined
    europarl.europa.eu url list is downloaded into the parent directory of `path`
    (created when needed), each one stored under the file name taken from its url.

    args:
      path : str - path to pdf document

    return : pathlib.Path - the requested pdf path. If a download answers with a
             non-200 status, that integer status code is returned instead and the
             remaining urls are not fetched.
    """

    pdf_path = Path(path)

    if pdf_path.is_file():
        return pdf_path

    urls = ["https://www.europarl.europa.eu/cmsdata/242360/AIDA_Verbatim_30_September_2021_EN.pdf",
            "https://www.europarl.europa.eu/cmsdata/238861/AIDA_Verbatim_14_June_2021_EN.pdf",
            "https://www.europarl.europa.eu/cmsdata/238560/AIDA_Verbatim_25_May_2021_EN.pdf",
            "https://www.europarl.europa.eu/cmsdata/222231/AIDA_Verbatim_2_December_2020_EN.pdf"]

    # Store the downloads next to the requested file so that `path` can exist afterwards.
    target_dir = pdf_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    for url in urls:
        filename = url.split("/")[-1]
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(url, timeout=60)

        # The status check and the write must happen for every url; doing them once
        # after the loop would only ever store the last document.
        if response.status_code != 200:
            return response.status_code

        with open(target_dir / filename, "wb") as file:
            file.write(response.content)

    return pdf_path


def text_processor(text: str) -> str:
    """Flatten the text onto a single line by turning every newline into one space."""
    return " ".join(text.split("\n"))


def pdf_parser(pdf_path : str) -> List[Dict]:
    """
    Read a pdf page by page and collect the cleaned text with simple size statistics.

    args:
      pdf_path : str - path of the pdf document to open
    return : List[Dict] - one dict per page with the keys
      "page_chars"           : number of characters of the processed page text
      "page_words"           : number of space separated items
      "page_sentences_naive" : number of items after splitting on ". "
      "page_tokens"          : rough token estimate (characters / 4)
      "text"                 : page text after text_processor
    """
    pages = []

    # The context manager closes the document once all pages are read, also on error.
    with fitz.open(pdf_path) as doc:
        # Iterating the document directly lets tqdm pick up its length for the progress bar.
        for page in tqdm(doc):
            text = text_processor(page.get_text())
            pages.append({"page_chars": len(text),
                          "page_words": len(text.split(" ")),
                          "page_sentences_naive": len(text.split(". ")),
                          "page_tokens": len(text) / 4,  # ~ 1 token is 4 chars https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                          "text": text
                          })

    return pages


def custom_textsplitter(text, chunk_size : int = 4):

    """
    Group sentences in chunks
    args:
      text : str
      chunk_size : int  - number of sentences in one chunk, e.g. 25 sentences with
                          chunk_size 10 give chunks of 10, 10 and 5 sentences
    return : List[Dict] - one dict per chunk with the keys "sentence_chunk" (joined text),
             "chunk_char_count", "chunk_word_count", "chunk_sentence_count" and
             "chunk_token_count" (rough estimate, characters / 4)
    raises : ValueError if chunk_size is smaller than 1
    """

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive number of sentences")

    nlp = English()
    nlp.add_pipe("sentencizer")

    # Strip every sentence and drop whitespace-only ones, so that they neither take a
    # slot in a chunk nor distort the sentence count.
    sentences = [str(sent).strip() for sent in nlp(text).sents]
    sentences = [sent for sent in sentences if sent]

    sentence_chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    text_chunks = []

    for chunk in sentence_chunks:
        chunkd = {}
        # Join with a space: the stripped sentences carry no trailing whitespace, and
        # joining with "" glues sentences ending in "?" or "!" to the next one.
        joined_chunk = " ".join(chunk).replace("  ", " ").strip()
        joined_chunk = re.sub(r"\.([A-Z])", r". \1", joined_chunk)  # add space to sentence ending dot, e.g. .A => . A
        chunkd["sentence_chunk"] = joined_chunk
        chunkd["chunk_char_count"] = len(joined_chunk)
        chunkd["chunk_word_count"] = len(joined_chunk.split(" "))
        # The sentencizer already tells how many sentences are in the chunk; counting
        # "." over-counts (trailing dot, abbreviations, decimals).
        chunkd["chunk_sentence_count"] = len(chunk)
        chunkd["chunk_token_count"] = len(joined_chunk) / 4
        text_chunks.append(chunkd)

    return text_chunks


def lc_rec_textsplitter(text, chunk_size):
    """
    Split text with LangChain's RecursiveCharacterTextSplitter and its default separators
    (double newline, newline, space, empty string), which tries to keep paragraphs,
    then sentences, then words together as long as possible.

    args:
      text : str
      chunk_size : int - chunk size, measured with len
    return : result of split_text for the given text
    """

    splitter_options = dict(
        chunk_size=chunk_size,   # mean doc token len is 836, with maximum of 7549, and min of 130
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,    # just to check the index of particular chunk
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


def chunking(
        text : str,
        fixed_chunk_size : int = 256,
        sentence_chunk_size : int = 4,
        method : str ="langchain"
        ) -> List:

    """
    Chunk input text with the method defined.
    In case of "custom" method, the sentence_chunk_size is used as the number of sentences in one chunk, e.g. 10 sentences in one chunk
    In case of "langchain" method, the fixed_chunk_size defines the number of chars in one chunk, e.g. 256 in one chunk
    args:
      text : str
      fixed_chunk_size : int
      sentence_chunk_size : int
      method : str - "langchain" or "custom"
    return : List - chunks as returned by lc_rec_textsplitter ("langchain")
             or by custom_textsplitter ("custom")
    raises : AttributeError for any other method
    """

    # One if/elif chain: with two separate `if` statements the trailing `else`
    # belongs only to the second test and rejects "langchain".
    if method == "langchain":
        chunks = lc_rec_textsplitter(text, fixed_chunk_size)
    elif method == "custom":
        chunks = custom_textsplitter(text, sentence_chunk_size)
    else:
        raise AttributeError(
            f"wrong chunking method {method!r}: choose from supported options: "
            "'langchain', 'custom'"
        )

    return chunks


In [None]:
import re

# File names of the sample hearings, e.g. AIDA_Verbatim_14_June_2021_EN.pdf
doc_regex = r"\bAIDA.*\.pdf\b"

# Relative to the working directory: the setup cell clones the repository into
# ./code/rag-gain, so an absolute "/code/rag-gain" only exists when the notebook runs from "/".
directory = "code/rag-gain"

# One entry per document; each entry is the list of page dicts returned by pdf_parser.
doc_lists = []

# sorted() keeps the document order identical between runs.
for file in sorted(os.listdir(directory)):

    if re.match(doc_regex, file):
        pdf_sample = import_pdf(os.path.join(directory, file))
        pdf_page_texts = pdf_parser(pdf_sample)
        for item in tqdm(pdf_page_texts):
            # Attach the sentence chunks of the page and how many chunks it produced.
            item["sentence_chunks"] = chunking(item["text"], method="custom")
            item["page_sentences_splitter"] = len(item["sentence_chunks"])
        doc_lists.append(pdf_page_texts)

### Embedder 

Embed the text chunks of the *.pdf* documents with SentenceTransformers and populate LanceDB.
Input: doc_lists
Output: .lancedb vector database

In [None]:
import numpy as np
import pandas as pd
import lancedb
import pyarrow as pa

from sentence_transformers import SentenceTransformer


# DEFINE variables for sentence embedder and lanceDB

SENTS_EMBEDDER_MODEL = "all-mpnet-base-v2"                       #  sentence embedder model

sents_embedder = SentenceTransformer(SENTS_EMBEDDER_MODEL, device="cuda")
sents_embedder.eval()

BATCH_SIZE = 32                                                  #  batches for embedder, we don't want to embed all sentences at once
LANCE_DB_LOC = "./.lancedb"                                      #  location for .lancedb on host
MAX_SEQ_LENGTH = sents_embedder.max_seq_length                   #  max token length of embedder model
LANCE_DB_TABLENAME = SENTS_EMBEDDER_MODEL + f"_{MAX_SEQ_LENGTH}" #  vector table name: <model>_<max sequence length>
EMBEDDING_DIM_MODEL = 768                                        #  Transformer embedding dimension
NUM_PARTITIONS_VEC = 128                                         #  num_partitions passed to the vector index
NUM_SUB_VEC = 96                                                 #  num_sub_vectors passed to the vector index; must divide EMBEDDING_DIM_MODEL
VEC_COLUMN = "vector"                                            #  vector table embeddings column name
TEXT_COLUMN = "text"                                             #  vector table text column name


Collect the sentence chunks of all documents into `sentences` (`List[str]`) and embed every chunk:

In [None]:
%%time

# Flat, parallel lists over every chunk of every page of every document:
# the chunk text and its embedding tensor.
sentences = []
all_docs_embeds = []

for doc in tqdm(doc_lists):
    for pages in doc:
        for chunk in pages["sentence_chunks"]:
            chunk_text = chunk["sentence_chunk"]
            chunk_vector = sents_embedder.encode(chunk_text, convert_to_tensor=True)
            # The embedding is also kept on the chunk dict itself.
            chunk["chunk_embeds"] = chunk_vector
            sentences.append(chunk_text)
            all_docs_embeds.append(chunk_vector)

Populate DB:

In [None]:
assert EMBEDDING_DIM_MODEL % NUM_SUB_VEC == 0, \
        "Embedding size must be divisible by the num of sub vectors"

db = lancedb.connect(LANCE_DB_LOC)

# Two columns: a fixed-size float32 vector and the text it was computed from.
schema = pa.schema(
      [
          pa.field(VEC_COLUMN, pa.list_(pa.float32(), EMBEDDING_DIM_MODEL)),
          pa.field(TEXT_COLUMN, pa.string())
      ]
    )

# mode="overwrite" replaces a table of the same name left by an earlier run.
tbl = db.create_table(LANCE_DB_TABLENAME, schema=schema, mode="overwrite")

num_batches = int(np.ceil(len(sentences) / BATCH_SIZE))
failed_batches = []   # indices of batches that could not be stored

for i in tqdm(range(num_batches)):
    batch = [sent for sent in sentences[i * BATCH_SIZE:(i + 1) * BATCH_SIZE] if len(sent) > 0]
    if not batch:
        # Nothing to embed once the empty strings are removed.
        continue

    try:
        encoded = sents_embedder.encode(batch, normalize_embeddings=True, device="cuda")
        encoded = [list(vec) for vec in encoded]

        df = pd.DataFrame({
            VEC_COLUMN: encoded,
            TEXT_COLUMN: batch
        })
        tbl.add(df)
    except Exception as err:
        # Stay best-effort (one bad batch must not abort the whole import),
        # but report the failure instead of hiding it.
        failed_batches.append(i)
        print(f"[WARN] batch {i} was not added to '{LANCE_DB_TABLENAME}': {err!r}")

if failed_batches:
    print(f"[WARN] {len(failed_batches)} of {num_batches} batches failed: {failed_batches}")

tbl.create_index(
        num_partitions=NUM_PARTITIONS_VEC,
        num_sub_vectors=NUM_SUB_VEC,
        vector_column_name=VEC_COLUMN
    )

In [None]:
# Connect to the LanceDB location and open the table named LANCE_DB_TABLENAME for querying.
db = lancedb.connect(LANCE_DB_LOC)
table = db.open_table(LANCE_DB_TABLENAME)

In [None]:
# Instance of the same embedder model loaded on the CPU; search() uses it to encode queries.
query_model = SentenceTransformer(SENTS_EMBEDDER_MODEL, device="cpu")
def search(query, top_k = 20):
    """
    Encode the query and build a vector search against the opened table.

    args:
      query : str - free text query
      top_k : int - maximum number of hits to request
    return : the object returned by table.search(...).limit(top_k); the notebook
             calls .to_pandas() on it to get the hits
    """

    # Encode the query the same way the stored sentences were encoded
    # (normalize_embeddings=True), so both sides are on the same scale.
    query_vector = query_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)

    return table.search(query_vector).limit(top_k)

In [None]:
# Example queries shown to the user before the prompt.
query_examples_for_test = [
    " * Rules on artificial intelligence in healthcare\n",
    " * Artificial intelligence in agriculture\n",
    " * Policies in data privacy\n",
    " * AI and labour market\n",
]

print("[EXAMPLES]:\n")
print(*query_examples_for_test, sep="\n")

# Read the query from the user and echo it back.
print("[QUERY]: Enter query to vector DB ->\n")
query = input()
print(f"[USER QUERY]:\n{query}\n")

# Fetch the 20 nearest hits as a DataFrame, dropping rows without text.
raw_hits = search(query, top_k = 20).to_pandas()
search_results = raw_hits.dropna(subset = "text").reset_index(drop=True)

print("[Vector DB search]:\n")
for t in search_results.text:
    print(t)
    

In [None]:
# Run the vector search for the current query again and keep the hits as a DataFrame
# (rows without text dropped, index reset).
search_results = search(query, top_k = 20).to_pandas().dropna(subset = "text").reset_index(drop=True)


In [None]:
# Rank of every hit in the plain vector search: 1 = smallest "_distance" (most similar).
# Series.rank aligns on the index, so each row gets its own rank whatever the row order;
# method="first" keeps the ranks unique when distances tie.
search_results["old_similarity_rank"] = (
    search_results["_distance"].rank(method="first", ascending=True).astype(int)
)
search_results

In [None]:

import torch
from sentence_transformers import CrossEncoder


# Name of the cross-encoder checkpoint used for re-ranking.
reranker_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Load the cross-encoder on the GPU; rerank() uses it to score (query, text) pairs.
cross_encoder_model = CrossEncoder(reranker_model_name, device="cuda")

def rerank(query, search_results, K : int = 5):
    """
    Score every retrieved text against the query with the cross-encoder and keep the best K.

    args:
      query : str - user query
      search_results : DataFrame with a "text" column; a "_distance_reranked"
                       column with the sigmoid-activated scores is written into it
      K : int - number of rows to keep
    return : DataFrame - the K rows with the highest "_distance_reranked" value, best first
    """
    pairs = []
    for sent in search_results["text"]:
        pairs.append([query, sent])

    scores = cross_encoder_model.predict(pairs, activation_fct=torch.nn.Sigmoid())
    search_results["_distance_reranked"] = scores

    ranked = search_results.sort_values(by="_distance_reranked", ascending=False)
    return ranked.iloc[:K]

# Re-rank the current search hits with the cross-encoder (default K = 5).
new_df = rerank(query, search_results)

print(f"[Reranked Vector DB search]:\n")

# Print the text of every kept hit, highest re-ranking score first.
for t in new_df.text:
    print(t)
