In [None]:
pip -q install openai PyPDF2 sentence-transformers qdrant_client


In [None]:

import os
import re

import openai
import PyPDF2
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer


In [None]:
# Sentence-embedding model used by generate_embeddings() for both the PDF
# chunks and the user query.
embeddings = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-mpnet-base-v2"
)

# Qdrant client opened on the special ":memory:" location.
qdrant_client = QdrantClient(location=":memory:")


In [None]:

# Read the source PDF and keep the extracted text of each page, in page order.
pdf_reader = PyPDF2.PdfReader("soum_paul_profile_pdf.pdf")
pdf_corpus = [page.extract_text() for page in pdf_reader.pages]

In [None]:
def tokenize_paragraphs(pdf_corpus):
    """Split each page's text into paragraph chunks tagged with their position.

    A paragraph boundary is a period immediately followed by a line break;
    that two-character delimiter is dropped from the resulting chunks.

    Args:
        pdf_corpus: Iterable of page texts, in page order.

    Returns:
        A list of ``[paragraph_text, {'page_no': int, 'section_no': int}]``
        entries. Both counters start at 1, and ``section_no`` restarts on
        every page.
    """
    return [
        [chunk, {'page_no': page_idx, 'section_no': chunk_idx}]
        for page_idx, page_text in enumerate(pdf_corpus, start=1)
        for chunk_idx, chunk in enumerate(page_text.split(".\n"), start=1)
    ]

# Each entry returned by tokenize_paragraphs has the form:
#   [raw_text, {'page_no': int, 'section_no': int}]


In [None]:
def initialize_qdrant(length: int, collection_name: str = "CHATBOT"):
    """Create the Qdrant collection for the chatbot if it does not exist yet.

    The collection is configured for cosine distance. When a collection with
    exactly this name is already present nothing is done, so the cell can be
    re-run safely.

    Args:
        length: Dimensionality of the vectors that will be stored.
        collection_name: Name of the collection to create. Defaults to
            "CHATBOT", the name the rest of the notebook uses.
    """
    # Check every existing collection by exact name. Looking only at the first
    # collection, or using a substring match, can both skip a needed creation
    # and attempt to re-create a collection that already exists.
    existing_names = {
        collection.name
        for collection in qdrant_client.get_collections().collections
    }
    if collection_name in existing_names:
        return

    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=length, distance=Distance.COSINE),
    )


In [None]:
def generate_embeddings(data_text):
    """Encode ``data_text`` with the module-level ``embeddings`` model.

    Args:
        data_text: Input handed unchanged to ``embeddings.encode``.

    Returns:
        Whatever ``embeddings.encode`` returns for that input.
    """
    encoded = embeddings.encode(data_text)
    return encoded

In [None]:
def prepare_embeddings(data, batch_size=10):
    """Embed every text chunk and pair it with its metadata.

    Texts are sent to ``generate_embeddings`` in groups of ``batch_size``
    instead of one call per chunk.

    Args:
        data: List of ``[raw_text, {'page_no': ..., 'section_no': ...}]``
            entries.
        batch_size: Number of chunks encoded per ``generate_embeddings`` call.
            Values below 1 (or ``None``) fall back to 1.

    Returns:
        A list of ``[{'raw_text', 'page_no', 'section_no'}, vector]`` entries
        in the same order as ``data``. Empty input yields an empty list.
    """
    step = max(1, int(batch_size or 1))
    final_data = []
    for start in range(0, len(data), step):
        batch = data[start:start + step]
        # One call per batch; assumes generate_embeddings returns one vector
        # per input text, in input order, when given a list of texts.
        vectors = generate_embeddings([item[0] for item in batch])
        for item, vector in zip(batch, vectors):
            final_data.append([
                {
                    "raw_text": item[0],
                    "page_no": item[1]['page_no'],
                    "section_no": item[1]['section_no'],
                },
                vector,
            ])
    return final_data
# Each entry of final_data has the form:
#   [{'raw_text': str, 'page_no': int, 'section_no': int}, embedding_vector]


In [None]:
def qdrant_entry(final_data):
    """Upsert the embedded chunks into the "CHATBOT" collection.

    Args:
        final_data: List of ``[{'raw_text', 'page_no', 'section_no'}, vector]``
            entries. The position of an entry in the list becomes its point
            id, so re-running with the same list overwrites the same points.

    Prints the client's collection listing after the upsert.
    """
    points = []
    for point_id, entry in enumerate(final_data):
        meta, vector = entry[0], entry[1]
        # Embedding models typically return NumPy arrays, while PointStruct
        # is documented to take plain Python floats; convert when possible.
        if hasattr(vector, "tolist"):
            vector = vector.tolist()
        points.append(
            PointStruct(
                id=point_id,
                vector=vector,
                payload={
                    'raw_context': meta['raw_text'],
                    'page_no': meta['page_no'],
                    'section_no': meta['section_no'],
                },
            )
        )
    qdrant_client.upsert(collection_name="CHATBOT", points=points)
    print(qdrant_client.get_collections())


In [None]:
def query_qdrant(query, collection_name='CHATBOT', limit=8):
    """Search the vector store for the points closest to ``query``.

    Args:
        query: Text to embed with ``generate_embeddings`` and search for.
        collection_name: Collection to search.
        limit: Maximum number of hits requested from the client.

    Returns:
        The result of ``qdrant_client.search``; stored vectors are not
        requested back (``with_vectors=False``).
    """
    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=generate_embeddings(query),
        limit=limit,
        with_vectors=False,
    )


In [None]:
def prepare_llm_context(result):
    """Collect the ``'raw_context'`` payload text of every search hit.

    Args:
        result: Sequence of hits, each exposing a ``payload`` mapping with a
            ``'raw_context'`` key.

    Returns:
        The list of ``'raw_context'`` values, in the same order as ``result``.
    """
    return [hit.payload['raw_context'] for hit in result]

In [None]:
def query_llm(context, query):
    """Ask the hosted Llama 3 endpoint to answer ``query`` from ``context``.

    Args:
        context: List of text chunks; they are joined with "--" and embedded
            in the user message.
        query: The user's question.

    Returns:
        The message content of the first completion choice.

    Raises:
        RuntimeError: If the ``E2E_API_TOKEN`` environment variable is unset
            or empty.
    """
    # Credentials must never live in the notebook source: read the bearer
    # token from the environment so it is not committed or shared by accident.
    token = os.environ.get("E2E_API_TOKEN")
    if not token:
        raise RuntimeError(
            "Set the E2E_API_TOKEN environment variable to the inference "
            "endpoint's API token before calling query_llm()."
        )
    openai.api_key = token
    openai.base_url = "https://infer.e2enetworks.net/project/p-1450/endpoint/is-2778/v1/"

    completion = openai.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=[
            # System and user prompts are separate messages. Putting both
            # role/content pairs in one dict keeps only the last pair, which
            # silently drops the system prompt.
            {
                "role": "system",
                "content": "You are an answer generation agent, you'll be given context and query, generate answer in human readable form",
            },
            {
                "role": "user",
                "content": f"here's the question {query} and here's the context {'--'.join(context)}",
            },
        ],
    )
    return completion.choices[0].message.content

In [None]:

# Split the extracted page texts into paragraph chunks with position metadata.
data = tokenize_paragraphs(pdf_corpus)   # entries: [raw_text, {'page_no': ..., 'section_no': ...}]


In [None]:
# Embed every chunk, size the collection from the first embedding's length,
# then store all points.
final_data = prepare_embeddings(data)
vector_dim = len(final_data[0][1])
initialize_qdrant(vector_dim)
qdrant_entry(final_data)


In [None]:
# Question to answer; it is used both for retrieval and in the LLM prompt.
query = "Where is soum currently working at ? "

In [None]:
# Retrieve the stored chunks closest to the question, using query_qdrant's
# default collection name and limit.
result = query_qdrant(query)

In [None]:
# Turn the retrieved hits into plain-text context and ask the LLM.
llm_context = prepare_llm_context(result)
response = query_llm(llm_context, query)

# Show the generated answer; without this the cell produces no visible output.
print(response)