In [None]:
!pip install langchain pinecone-client sentence-transformers streamlit PyMuPDF bitsandbytes langchain-pinecone pinecone-client langchain_huggingface langchainhub

In [None]:
!pip install accelerate -U

In [None]:
!pip install -U langchainimport fitz  # PyMuPDF

In [None]:
!pip install -U langchain-community

# Set up VectorDB

In [None]:
import fitz  # PyMuPDF

#extract text from pdf

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    # Opening the document as a context manager guarantees it is closed
    # (releasing the underlying file handle) even if a page fails to extract.
    with fitz.open(pdf_path) as document:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in document)

# Source PDFs, one per company. The order matters: a later cell labels each
# document's company by its position in this list.
pdf_paths = [
    "GOOG.pdf",
    "TSLA.pdf",
    "UBER.pdf",
]

# texts[i] holds the full extracted text of pdf_paths[i].
texts = list(map(extract_text_from_pdf, pdf_paths))

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers embedding model. Its `encode` method is used
# further down to turn the document chunks into vectors.

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Chunk each document's text into smaller pieces.
# CharacterTextSplitter measures length with len() by default, so both
# chunk_size and chunk_overlap below are counted in characters, not tokens.

text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 1024,  # maximum chunk length, in characters
    chunk_overlap  = 20 # characters shared between consecutive chunks
)

# docs[i] is the list of chunk strings produced from texts[i].
docs = [text_splitter.split_text(text) for text in texts]

# Display only the number of chunks per document; echoing every chunk of
# every filing floods the cell output.
[len(chunks) for chunks in docs]

In [None]:
# manually chunk the text for granular control

def split_text(text, chunk_size=500, overlap=10):
    """Split text into chunks of whole lines, with line-level overlap.

    The text is split on newlines; lines ("paragraphs") are accumulated until
    adding the next one would push the running length past ``chunk_size``, at
    which point the accumulated lines are joined with single spaces and
    emitted as a chunk.

    Args:
        text: The text to split.
        chunk_size: Soft limit on the summed character length of the lines in
            one chunk. The joining spaces are not counted, and a single line
            longer than ``chunk_size`` is kept whole rather than cut.
        overlap: Number of trailing lines of a finished chunk that are carried
            over to the start of the next chunk. Values <= 0 disable overlap.

    Returns:
        A list of chunk strings; an empty list when ``text`` is empty.
    """
    if not text:
        return []

    chunks = []
    current_chunk = []
    current_length = 0

    for paragraph in text.split('\n'):
        # Only flush when something has been accumulated; otherwise an
        # oversized first line would emit an empty chunk.
        if current_chunk and current_length + len(paragraph) > chunk_size:
            chunks.append(" ".join(current_chunk))
            # `lst[-0:]` is the whole list, so overlap == 0 must be handled
            # explicitly or nothing would ever be dropped from the chunk.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else []
            current_length = sum(len(p) for p in current_chunk)
        current_chunk.append(paragraph)
        current_length += len(paragraph)

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Parameters for the manual splitter: character budget per chunk and the
# number of trailing lines repeated at the start of the next chunk.
chunk_size, overlap = 500, 10

# chunked_texts[i] is the list of manually produced chunks for texts[i].
chunked_texts = [
    split_text(document_text, chunk_size, overlap)
    for document_text in texts
]

In [None]:
# Encode every document's chunks with the embedding model.
# chunked_embeddings[i] holds the embeddings for the chunks in docs[i].

chunked_embeddings = []
for chunks in docs:
    chunked_embeddings.append(embedding_model.encode(chunks))

In [None]:
# Build the (id, values, metadata) tuples to upsert into Pinecone.

# Company label for each document, in the same order as `docs`.
companies = ["GOOG", "TSLA", "UBER"]

# Fail loudly instead of silently reusing the last label for extra documents.
if len(docs) > len(companies):
    raise ValueError(
        f"Got {len(docs)} documents but only {len(companies)} company labels"
    )

vectors = []
for doc_id, (company, chunks, embeddings) in enumerate(zip(companies, docs, chunked_embeddings)):
    for chunk_id, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        vector_id = f"{doc_id}_{chunk_id}"
        vectors.append((vector_id, # id to uniquely identify the vector
                        embedding,
                        # metadata: original text for retrieval, company for filtering
                        {"text": chunk,
                         "company": company}
                        ))

In [None]:
from pinecone import Pinecone, ServerlessSpec
import os

# Create the Pinecone client. The API key is read from the environment so it
# never appears in the notebook. Cloud and region are properties of an index
# and are supplied through ServerlessSpec when an index is created, so they
# are not passed to the client (ServerlessSpec also requires both `cloud` and
# `region`, which the previous single-argument call did not provide).

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [None]:
# Name of the Pinecone index that stores the chunk vectors.

index_name = "contentai"

In [None]:
# Create the index only if it does not exist yet, so this cell can be re-run
# (create_index raises when an index with the same name already exists).

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384, # dimension of the embedding vectors
        metric="cosine", # distance metric for similarity search
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# Get a handle to the index; it is used below to upsert the vectors.
index = pc.Index(index_name)

In [None]:
# Upsert vectors into the Pinecone index in batches; sending everything in a
# single request is rejected by the API.
BATCH_SIZE = 100
FALLBACK_BATCH_SIZE = 50  # used to retry a batch that failed at full size

for i in range(0, len(vectors), BATCH_SIZE):
    batch = vectors[i: i + BATCH_SIZE]
    try:
        index.upsert(vectors=batch)
    except Exception as e:
        # Retry the *whole* failed batch in smaller pieces. The loop stride
        # stays at BATCH_SIZE, so retrying only the first piece would silently
        # drop the remaining vectors of this window.
        print(f"Batch starting at index {i} failed ({e!r}); "
              f"retrying in batches of {FALLBACK_BATCH_SIZE}")
        for j in range(0, len(batch), FALLBACK_BATCH_SIZE):
            index.upsert(vectors=batch[j: j + FALLBACK_BATCH_SIZE])
    print("Upserted batch starting at index:", i)

# Query the VectorDB

In [None]:
from pinecone import Pinecone, ServerlessSpec
import os

# Set up the Pinecone client and connect to the existing index.
# Only the API key is needed here: cloud and region belong to the index and
# were fixed when it was created, and ServerlessSpec requires both `cloud`
# and `region`, which the previous single-argument call did not provide.

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "contentai"

index = pc.Index(index_name)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model used to encode the query text. It is the same
# model name that was used to embed the document chunks.

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the chatbot model and tokenizer and wrap them in a text-generation
# pipeline.
# NOTE(review): `model` is passed in already instantiated, so `device_map`
# may have no effect on where it runs — confirm against the transformers
# `pipeline` documentation (the `device` argument may be what is intended).
# NOTE(review): `pipe` is not referenced by any other cell visible here.

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model,
                tokenizer=tokenizer,
                device_map="cuda")

In [None]:
# Example query; it is embedded and used to retrieve context in the next cell.
query_text = "What are the differences in the business of Tesla and Uber?"

In [None]:
# Embed the query text
encoded_query = embedding_model.encode(query_text).tolist()

k_responses = 2 # number of chunks to fetch from each document


def _query_company(company):
    """Return the top `k_responses` matches for the query within one company's chunks."""
    return index.query(vector=encoded_query,
                       filter={"company": company},
                       top_k=k_responses,
                       include_metadata=True # metadata carries the original chunk text
                       )


goog_results = _query_company("GOOG")
tsla_results = _query_company("TSLA")
uber_results = _query_company("UBER")

# Assemble the retrieved text into one context string, one labelled section
# per company. Each header is paired with that company's own results (the
# Google and Uber sections were previously filled from each other's results).
context_sections = []
for label, results in (("Google", goog_results),
                       ("Tesla", tsla_results),
                       ("Uber", uber_results)):
    section_text = "".join(match['metadata']['text'] for match in results['matches'])
    context_sections.append(f"======== {label} Document Context ========\n{section_text}")

context = "\n".join(context_sections)
context

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Ready the chatbot model, using the GPU when one is available.

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

# Prompt messages for the chatbot. Only roles understood by the tokenizer's
# chat template are used: the retrieved context and the question are sent as
# a single user turn, and the assistant turn is opened by
# `add_generation_prompt=True` below rather than by an empty placeholder
# message with a made-up role.

messages = [
    {
        "role": "system",
        "content": "You are an AI assistant tasked with analyzing and comparing multiple PDF documents, specifically Form 10-K filings for Alphabet Inc., Tesla, Inc., and Uber Technologies, Inc. You are to provide clear, concise, and accurate responses for each query based on the extracted document content provided as context.",
    },
    {
        "role": "user",
        "content": (
            f"Document Content: {context}\n\n=======================User Query =======================\n\n"
            f"{query_text} \n\n=======================End Of User Query =======================\n\n"
        ),
    },
]

# Format the conversation with the model's own chat template, so the prompt
# contains the special role markers the chat model was fine-tuned on, instead
# of hand-written "role: content" lines.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Encode the prompt; keep the attention mask so generate() does not have to guess it
inputs = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = inputs["input_ids"]

# Generate output
with torch.no_grad():
    outputs = model.generate(input_ids=input_ids,
                             attention_mask=inputs["attention_mask"],
                             max_new_tokens=512,
                             min_new_tokens=256,
                             # Greedy decoding, as before: `temperature` only
                             # applies when sampling, so it was a no-op here.
                             do_sample=False,
                             repetition_penalty=1.2
                             )

# Decode the output (prompt followed by the generated answer)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)
