# Stage1 - PDFreader with pdfplumber

In [7]:
#%pip install pdfplumber

In [8]:
#%pip show pdfplumber

In [1]:
import os 
import pdfplumber

def pdf_folder_reader(folder_path):

    pdf_text =""

    for filename in os.listdir(folder_path):

        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)

            #read the pdf
            with pdfplumber.open(file_path) as file:
                for page in file.pages:
                    pdf_text += page.extract_text()
    return pdf_text
folder_path = f"../resources/uploads"
document = pdf_folder_reader(folder_path)

In [10]:
#document

# Stage2 - text preprocessing
 - steps include:
   1. Removing unwanted characters (dots, spaces, etc.).
   2. Normalizing whitespace.
   3. Handling line breaks.

In [2]:

import re

def preprocess_text(raw_text):
    # Remove dots
    text_without_dots = raw_text.replace('.', '.')
    
    # Normalize whitespace
    text_normalized = ' '.join(text_without_dots.split())
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    
    # Use the sub method to replace matched characters with an empty string
    cleaned_text = re.sub(pattern, '', text_normalized)

    return cleaned_text 

cleaned_text = preprocess_text(document)


In [3]:

def remove_special_characters(text):
    # Define a regular expression to match any non-alphanumeric character
    pattern = re.compile(r'[^a-zA-Z0-9\s]')
    
    # Use the sub method to replace matched characters with an empty string
    cleaned_text = re.sub(pattern, '', text)
    
    return cleaned_text
cleaned_text = remove_special_characters(cleaned_text)


In [None]:
cleaned_text

# Stage3: chunking & embedding


## Text Splitter 

In [4]:
from llama_index.core.node_parser import SentenceSplitter

In [5]:

custom_separator = '\n'
custom_chunk_size = 5000
custom_chunk_overlap = 50
custom_paragraph_separator = '\n\n\n'
custom_regex = '[^,.;。？！]+[,.;。？！]?'


custom_sentence_splitter = SentenceSplitter(
    separator=custom_separator,
    chunk_size=custom_chunk_size,
    chunk_overlap=custom_chunk_overlap,
    paragraph_separator=custom_paragraph_separator,    
)

# Split the text into chunks using custom parameters
custom_chunks = custom_sentence_splitter.split_text(cleaned_text)


In [None]:
# Print the result
for i, chunk in enumerate(custom_chunks, start=1):
    print(f"Chunk {i}:\n{chunk}\n")

## Embedding

In [6]:
import os

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [7]:
GOOGLE_API_KEY

'AIzaSyDpBJeTmKU1yHCUxF-T85nxTi3dMxqbZTY'

In [8]:
safety_settings = [
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_ONLY_HIGH"
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_ONLY_HIGH"
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_ONLY_HIGH"
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_ONLY_HIGH"
  },
]

In [9]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini

gemini_embedding = GeminiEmbedding(model_name="models/embedding-001", api_key=GOOGLE_API_KEY)
gemini = Gemini(model_name="models/gemini-pro", temperature=1, max_tokens=2048, safety_settings=safety_settings)

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
#embeddings = gemini_embedding.get_text_embedding(custom_chunks)

In [22]:
#print(f"Dimension of embeddings: {len(embeddings)}")
#print(embeddings[0][:5])
#print(embeddings[1][:5])

## Emdedding function

In [10]:
def text_to_embedding(text):
    embeddings = gemini_embedding.get_text_embedding(text)
    return embeddings
    

In [11]:
embeddings = []
for chunk in custom_chunks:
  chunk_embedding = text_to_embedding(chunk)
  embeddings.append(chunk_embedding)

In [25]:
#embeddings

# Embeddings and Vector store

In [14]:
len(custom_chunks)

1

In [27]:
len(embeddings)

55

In [16]:
data = []
for i, chunk in enumerate(custom_chunks):
  # Create a dictionary for each data point
  data_point = {
      #"text": chunk,  # Original text chunk
      "embedding": embeddings[i],  # Corresponding embedding for the chunk
      "id": str(f"chunk_{i}")  # Unique identifier for each chunk
  }
  data.append(data_point)


In [17]:
ids = [item['id'] for item in data]

In [30]:
data_embeddings = [item['embedding'] for item in data]

In [14]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader, load_index_from_storage
from IPython.display import Markdown

In [17]:
#Llama global   Settings
from llama_index.core import Settings

#as default chroma db uses the open ai embeddings
#setting up Gemini pro -llm and gemini embedding
Settings.llm = gemini
Settings.embed_model = gemini_embedding
Settings.text_splitter = custom_sentence_splitter

## Chorma PersistentClient

### Load the data
getting your data from wherever it lives, whether that’s unstructured text, PDFs, databases, or APIs to other applications. 

In [21]:
documents = SimpleDirectoryReader("../resources/HR_Documents").load_data()

### Indexing and Storing
Indexing is designed to enable querying by an LLM. 

#### Store Using Vector Stores

In [10]:
# save to disk 
db = chromadb.PersistentClient(path="../resources/chroma_db")   
chroma_collection = db.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_context = StorageContext.from_defaults(vector_store=vector_store)

# build index
index = VectorStoreIndex.from_documents(
    documents=documents, storage_context=vector_context, embed_model=gemini_embedding, transformations=[custom_sentence_splitter]
)

In [None]:
# load from disk
index = VectorStoreIndex.from_vector_store(vector_store)
#OR#
#index = load_index_from_storage(storage_context=storage_context)

#### Store using .persist()

In [31]:
PERSIST_DIR ="../resources/persist"
persist_client = chromadb.PersistentClient(path=PERSIST_DIR)
persist_collection = persist_client.get_or_create_collection("persist")

persist_store = ChromaVectorStore(chroma_collection=persist_collection)
persist_context = StorageContext.from_defaults(vector_store=persist_store)
# load the data and create index
index = VectorStoreIndex.from_documents(documents=documents, storage_context=persist_context, embed_model=gemini_embedding)
index.storage_context.persist(persist_dir=PERSIST_DIR)



In [32]:
# load the exiting index
#storage_context= StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index=load_index_from_storage(storage_context=persist_context)



### Query

##### Query Data from the stored vector

In [29]:
#Without retriever
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine()
response = query_engine.query("what are key points in context")
display(Markdown(f"<b>{response}</b>"))

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


<b>1. Vibhor Steel Tubes IPO is set for launch on 13 February 2024 with a price band of ₹141-151 per share.
2. The company's strengths include its partnership with Jindal Pipes, experienced promoters, and a large workforce.
3. Key risks to consider include the company's dependence on Jindal Pipes, negative cash flow from financing and investing activities, and volatile steel prices.
4. The company's financial performance has shown growth in recent years, with an upward trend in profitability and key ratios such as Return on Equity (ROE).
5. Vibhor Steel Tubes has the lowest Price to Earnings (P/E) ratio among its peers.</b>

In [None]:
#With COntextChat engine
from llama_index.core.memory import ChatMemoryBuffer

memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
#VectorStoreIndex.refresh(vector_store)

index = VectorStoreIndex.from_vector_store(vector_store)
chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    system_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk and can greet"
        " you are a chatbot well trained on IPO context and its working "
    ),

)

response = chat_engine.chat("when ipo is opening for bidding")
display(Markdown(f"<b>{response}</b>"))

In [28]:
#With retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

#load index
index = VectorStoreIndex.from_vector_store(vector_store)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=1,
)
# configure response synthesizer
response_synthesizer = get_response_synthesizer()

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0)],
)

# query
response = query_engine.query("provide the ipo details?")
print(response)

Number of requested results 8 is greater than number of elements in index 1, updating n_results = 1


The details for the Vibhor Steel Tubes IPO are as follows:

- IPO is scheduled from 13th to 15th February 2024.
- Face value of ₹10 per share.
- Price range is ₹141-152 per share.
- Total IPO size is ₹72.17 crore.
- No Offer for Sale (OFS) component.
- Fresh issue of ₹72.17 crore.


##### Query Data from persisted data

In [34]:
#Without retriever
index=load_index_from_storage(storage_context=persist_context)
query_engine = index.as_query_engine()
response = query_engine.query("Summarize the IPO")
display(Markdown(f"<b>{response}</b>"))

<b>Vibhor Steel Tubes IPO opens for subscription from Feb 13 to Feb 15, 2024, with a price band of ₹141-152 per share. The IPO has a face value of ₹10 per share and comprises a fresh issue of ₹72.17 crores. The company’s strengths include a partnership with Jindal Pipes, diverse networking channels, and a workforce of 636 employees. However, the company faces risks related to negative cash flow from financing and investing activities, volatile steel prices, and continuous increase in total borrowing.</b>

In [33]:
#With retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

#load index
index = load_index_from_storage(storage_context=persist_context)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=6,
)
# configure response synthesizer
response_synthesizer = get_response_synthesizer()

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0)],
)

# query
response = query_engine.query("how much of the ipo size and provide the investors portions for the ipo with how many crores can each group can invest?")
print(response)

The IPO size is 72.17 crores.

QIBs can invest 50% of this amount which is 36.085 crores

NIIs can invest 35% of this amount which is 25.2595 crores

Retail investors can invest 35% of this amount which is 25.2595 crores


## Default llama-index Vector Store
Store your indexed data is to use the built-in `StorageContext.persist()` method of every Index, which writes all the data to disk at the location specified. 

In [13]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.vector_stores import SimpleVectorStore

In [22]:


# not specfic to chroma db
PERSIST_DIR ="../resources/persistenDB"
if not os.path.exists(PERSIST_DIR):
    # create storage context using default stores
    storage_context = StorageContext.from_defaults(
        docstore=SimpleDocumentStore(),
        vector_store=SimpleVectorStore(),
        index_store=SimpleIndexStore(),
    )
    # load the data and create index
    index = VectorStoreIndex.from_documents(documents=documents, embed_model=gemini_embedding)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the exitiong index
    storage_context= StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index=load_index_from_storage(storage_context=storage_context)


In [29]:
from llama_index.core import PromptTemplate
custom_qa_prompt_str=(
    "Given the context information and not prior knowledge, "
    "answer the query in the style of a Francis Beaumont play.\n"
    "Query: {query_str}\n"
    "Answer: "
) 
qa_prompt_tmpl = PromptTemplate(custom_qa_prompt_str)

query_engine=index.as_query_engine(text_qa_template = qa_prompt_tmpl)

query_engine=index.as_query_engine()
prompts_dict = query_engine.get_prompts()
#print(list(prompts_dict))

response = query_engine.query("Summarize the IPO details")
Markdown(f"<b>{response.response}</b>") 

<b>Vibhor Steel Tubes IPO, set to open for subscription from February 13 to 15, 2024, offers a price band of ₹141-152 per share. The IPO size is ₹72.17 crore, and the lot size is 99 shares. The issue comprises only fresh issue and has reserved shares for qualified institutional buyers (QIBs), non-institutional investors (NIIs), and retail investors.</b>

In [None]:
response.source_nodes

# Setting up the Chatbot Agent
Testing the Agent
https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/chatbots/building_a_chatbot.html

Data Agents are LLM-powered knowledge workers in LlamaIndex that can intelligently perform various tasks over your data, in both a “read” and “write” function. They are capable of the following:

Perform automated search and retrieval over different types of data - unstructured, semi-structured, and structured.

Calling any external service API in a structured fashion, and processing the response + storing it for later.

In [None]:
from llama_index.core.agent import ReActAgent

# Manual Embeddings

In [61]:
# Create an ephemeral ChromaDB client
chroma_client = chromadb.PersistentClient(path="../resources/manual")
#chroma_collection = chroma_client.create_collection(path="../resources/embeddingsDB")

# Create a Chroma collection
chroma_collection = chroma_client.get_or_create_collection(name="embed_collecting")


# Add embeddings to the Chroma collection
chroma_collection.upsert(ids=ids, embeddings=data_embeddings)


In [43]:
# save to disk 
#vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
#storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [12]:
from IPython.display import Markdown
# load from disk
db2 = chromadb.PersistentClient(path="../resources/manual")
chroma_collection = db2.get_or_create_collection("embed_collecting")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context= storage_context
)
#embeddings
data = []
for i, chunk in enumerate(custom_chunks):
  # Create a dictionary for each data point
  data_point = {
      #"text": chunk,  # Original text chunk
      "embedding": embeddings[i],  # Corresponding embedding for the chunk
      "id_": str(f"chunk_{i}")  # Unique identifier for each chunk
  }
  data.append(data_point)

index.insert(data)


# Query Data from the persisted index
query_engine = index.as_query_engine()
response = query_engine.query("explain few shift allowance benifts")
display(Markdown(f"<b>{response}</b>"))

NameError: name 'chromadb' is not defined

In [None]:
response = collection.query(
    query_embeddings=[[1.1, 2.3, 3.2], [5.1, 4.3, 2.2]],
    n_results=2,
    where={"style": "style2"}
)

InvalidDimensionException: Embedding dimension 3 does not match collection dimensionality 768

"""index = VectorStoreIndex(
    vector_store,
    storage_context=StorageContext.in_memory(),
    filename='vector_index.llama',
)"""

In [47]:
from IPython.display import Markdown
# load from disk
db2 = chromadb.PersistentClient(path="../resources/manual")
chroma_collection = db2.get_or_create_collection("embed_collecting")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model= gemini_embedding,
)

# Query Data from the persisted index
query_engine = index.as_query_engine()
#response = query_engine.query("which is comapny that coming for ipo")
display(Markdown(f"<b>{response}</b>"))

<b>The IPO dates are from 13th February 2024 to 15th February 2024.</b>

In [None]:
#Defalut chroma
if os.path.exists("../resources/manual"):
    storage_context= StorageContext.from_defaults(vector_store=vector_store)
    index=load_index_from_storage(storage_context=storage_context)

ValueError: No index in storage context, check if you specified the right persist_dir.

In [None]:

# Set up ChromaVectorStore with the required config value 'chroma_server_host'
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Load data into the ChromaVectorStore
#vector_store.load(collection=chroma_collection, field='embedding')

# Create a vector index

#storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=gemini_embedding
)


In [None]:
# Query Data from the persisted index
query_engine = index.as_query_engine()
response = query_engine.query("what are key points in context")
display(Markdown(f"<b>{response}</b>"))


In [49]:
chroma_collection.count()

55

In [None]:
results = chroma_collection.query(
    query_texts=["what is this document about"],
    n_results=1
)

In [None]:
results

{'ids': [[]],
 'distances': [[]],
 'metadatas': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None}

In [None]:
# how to use chunks to create vector index,

In [35]:

# build index
def build_index(dir, documents, key):
    PERSIST_DIR = dir
    persist_client = chromadb.PersistentClient(path=PERSIST_DIR)
    persist_collection = persist_client.get_or_create_collection("persist")

    persist_store = ChromaVectorStore(chroma_collection=persist_collection)
    persist_context = StorageContext.from_defaults(vector_store=persist_store)
    # load the data and create index
    index = VectorStoreIndex.from_documents(documents=documents, storage_context=persist_context, embed_model=gemini_embedding)
    index.storage_context.persist(persist_dir=PERSIST_DIR)



  

In [36]:
PERSIST_DIR = f"C:/Users/SUBOMMAS/LLM_Projects/HRBOT/resources/finetuned2"
PERSIST_DIR ="../resources/persist"
if os.path.exists(PERSIST_DIR):
  index=load_index_from_storage(storage_context=persist_context)
else:
 index = build_index(dir=PERSIST_DIR, documents=documents, key=GOOGLE_API_KEY)

In [37]:
query_engine=index.as_query_engine()
response = query_engine.query("Summarize the IPO details")
print(response)

The Vibhor Steel Tubes IPO will take place between 13th and 15th February 2024, with a price band of ₹141-151 per share. The face value of each share is ₹10, and the total size of the IPO is ₹72.17 crores. The entire issue is a fresh issue, and no offer for sale is included.


Huggingface space to host