In [62]:
# configurations

import os  # needed in this cell for the environment lookup below

# Central settings read by the helper functions and cells of this notebook.
config = {
    # NOTE(review): absolute, machine-specific path — adjust for your environment.
    'data_path' : '/Users/RajivGaba/aiml_projects/Semantic Spotter/Data/',
    # Passed to RecursiveCharacterTextSplitter as chunk_size / chunk_overlap.
    'chunk_size' : 1000,
    'chunk_overlap' : 200,
    # Local path where the FAISS index is saved and loaded.
    'vector_store_name' : "faiss_index",
    'hf_token' : "*****",
    'embedding_model' : 'all-MiniLM-L6-v2',
    # 'Y' requests a rebuild of the vector store; any other value reuses the saved one.
    'refresh_vector_store' : 'N',
    'cross_encoder_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
    # The API key is a secret and must not be stored in the notebook. Export
    # PPLX_API_KEY before starting the kernel; an empty string is used when it is
    # missing. Any key that was ever committed here should be rotated.
    'PPLX_API_KEY' : os.environ.get('PPLX_API_KEY', ''),
}

In [None]:
# install required packages

! pip install -qU langchain-community pymupdf
! pip install -qU langchain_huggingface
! pip install sentence-transformers

In [None]:
# import libraries

import os, glob
import importlib

# Re-execute the importlib module itself.
# NOTE(review): this does not restart the kernel and does not refresh any other
# package — confirm whether this step is still needed.
try:
    importlib.reload(importlib)
except Exception:
    # Best effort: a failed reload must not stop the notebook. Only ordinary
    # exceptions are ignored, so KeyboardInterrupt/SystemExit still propagate.
    pass

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import CrossEncoder, util

In [None]:
# Define reusable functions

def get_data_chunks(pdf_file):
    """Load one PDF and split its documents into text chunks.

    The file is read with PyMuPDFLoader and split by a
    RecursiveCharacterTextSplitter configured from config['chunk_size'] and
    config['chunk_overlap'].

    Args:
        pdf_file: Path of the PDF file to load.

    Returns:
        The chunks returned by the splitter's split_documents call.
    """
    loaded_docs = PyMuPDFLoader(pdf_file).load()

    # Splitter settings come from the notebook-level config dict.
    splitter_options = {
        'chunk_size': config['chunk_size'],
        'chunk_overlap': config['chunk_overlap'],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents=loaded_docs)

def get_all_pdf_files(data_path):
    """Return the paths of the ``*.pdf`` files located directly in ``data_path``.

    Args:
        data_path: Directory to search (not searched recursively).

    Returns:
        A list of matching paths in sorted order. The list is empty when nothing
        matches or the directory does not exist.
    """
    pattern = os.path.join(data_path, "*.pdf")
    # glob returns matches in arbitrary order; sorting keeps the ingestion order
    # the same from run to run.
    return sorted(glob.glob(pattern))

def get_embeddings_model():
    """Create the embedding model named in config['embedding_model'].

    Returns:
        A HuggingFaceEmbeddings instance constructed with show_progress=True.
    """
    return HuggingFaceEmbeddings(
        model_name=config['embedding_model'],
        show_progress=True,
    )

def create_vector_store(text_chunks, embedding_model):
    """Build or load the FAISS index stored at config['vector_store_name'].

    The index is built from ``text_chunks`` and saved when
    config['refresh_vector_store'] is 'Y' or when nothing exists at the store
    path yet. Otherwise the saved index is loaded.

    Args:
        text_chunks: Documents to index when the store has to be built.
        embedding_model: Embedding model passed to FAISS for building/loading.

    Returns:
        The FAISS vector store that was built or loaded.

    Raises:
        ValueError: If the store has to be built but ``text_chunks`` is empty.
    """
    store_path = config['vector_store_name']
    refresh_requested = config['refresh_vector_store'] == 'Y'

    # Build when a refresh is requested OR when there is nothing to load yet.
    # (Requiring both at once meant a missing index was never created.)
    if refresh_requested or not os.path.exists(store_path):
        if not text_chunks:
            raise ValueError(
                "Cannot build the vector store: no text chunks were provided."
            )
        vector_store = FAISS.from_documents(text_chunks, embedding_model)
        vector_store.save_local(store_path)
    else:
        # SECURITY: allow_dangerous_deserialization=True unpickles data from disk.
        # Only load an index that this notebook (or another trusted source) wrote.
        vector_store = FAISS.load_local(
            store_path,
            embedding_model,
            allow_dangerous_deserialization=True,
        )
    return vector_store

def get_cross_encoder_score(query, results):
    """Score every retrieved document against the query with the cross-encoder.

    The model named in config['cross_encoder_model'] is loaded and all
    (query, document text) pairs are scored in a single predict call. Each score
    is printed, as before, and the scores are also returned so callers can use
    them.

    Args:
        query: The search query string.
        results: Retrieved documents; each must expose ``page_content``.

    Returns:
        A list of float scores, one per entry of ``results`` and in the same
        order. Empty when ``results`` is empty (the model is not loaded then).
    """
    if not results:
        return []

    cross_encoder = CrossEncoder(config['cross_encoder_model'])
    # One batched call instead of one model invocation per document.
    pairs = [(query, res.page_content) for res in results]
    scores = [float(score) for score in cross_encoder.predict(pairs)]
    for score in scores:
        print(score)
    return scores

In [38]:
if __name__ == "__main__":
    # Chunk every PDF found in the configured data directory.
    chunked_data = []
    pdf_files = get_all_pdf_files(config['data_path'])
    for pdf_file in pdf_files:
        text_chunks = get_data_chunks(pdf_file)
        chunked_data.extend(text_chunks)

    embedding_model = get_embeddings_model()

    # Use the store handed back by create_vector_store when there is one, so the
    # index is not read from disk a second time. If nothing is returned, fall
    # back to loading the persisted index.
    vector_store = create_vector_store(chunked_data, embedding_model)
    if vector_store is None:
        # SECURITY: this unpickles data from disk — only load a trusted index.
        vector_store = FAISS.load_local(
            config['vector_store_name'],
            embedding_model,
            allow_dangerous_deserialization=True,
        )
    # Re-ranking with get_cross_encoder_score(query, results) is not wired in here.

Batches: 100%|██████████| 5/5 [00:00<00:00, 11.29it/s]


In [57]:
# Ask the vector store for up to 10 chunks similar to the question and print
# each one with its rank, followed by blank lines as a separator.
query = "what is the global Webserver plug-in Configuration required for installation"
results = vector_store.similarity_search(query, k=10)
separator = "\n" * 3
for rank, hit in enumerate(results):
    print(rank, hit.page_content, separator)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]

0 Manual Post Installation from Solutions .................................................................... 21 
6.2. 
Update global Webserver plug-in Configuration (Only for WAS cluster webserver 
setup) ........................................................................................................................ 21 
6.3. 
Restart OHS Server (Only for WEBLOGIC Cluster webserver setup) ...................... 21 
6.4. 
Change Binding for jBoss .......................................................................................... 22 
6.5. 
Moving all DB Objects to Default TableSpace in Postgres ....................................... 22 
6.6. 
Restart JBPM ............................................................................................................ 22 
6.7. 
CSIS DB Configuration .............................................................................................. 23 
6.8. 



1 Ip address : 10.11.32.29 User Name:Rajiv Gaba       Time Stamp : 3/7/2




In [59]:
! pip install -qU "langchain-perplexity"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [60]:
from langchain.chat_models import init_chat_model

# Export the configured key as PPLX_API_KEY before the chat model is created.
os.environ["PPLX_API_KEY"] = config['PPLX_API_KEY']

# Create the Perplexity-backed chat model and send a short smoke-test prompt;
# the reply is the output of this cell.
model = init_chat_model(
    "llama-3.1-sonar-small-128k-online",
    model_provider="perplexity",
)
model.invoke("hello dear")

In [None]:
def get_llm_response():
    """Placeholder for the LLM response step; not implemented yet.

    Returns:
        None.
    """
    return None