# SCOPE and OBJECTIVE
- Observe changes in the `uploads` folder.
- If anything changes, get the path of the changed file and read it.
- Update the vector store and index (see Document management):
  - https://docs.llamaindex.ai/en/stable/module_guides/indexing/document_management.html
- Test that the LLM is able to answer questions from the updated context.

In [9]:
import os

# Read the Gemini API key from the environment.
# Never echo the raw value as a cell result: notebook outputs are saved with
# the file, so displaying the key leaks the secret to anyone who can read it.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
print("GOOGLE_API_KEY is set" if GOOGLE_API_KEY else "GOOGLE_API_KEY is NOT set")

'<REDACTED: an API key was exposed here in the saved cell output; revoke and rotate it>'

In [10]:
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding

# Every harm category gets the same threshold, so build the list from the
# category names instead of spelling out four near-identical dicts.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_ONLY_HIGH"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Chat model used to answer questions.
# NOTE(review): no api_key is passed here, unlike the embedding model below;
# presumably the client picks the key up from the environment - confirm.
gemini = Gemini(
    model_name="models/gemini-pro",
    temperature=1,
    safety_settings=safety_settings,
)

# Embedding model used to vectorise documents and queries.
gemini_embedding = GeminiEmbedding(
    model_name="models/embedding-001",
    api_key=GOOGLE_API_KEY,
)

In [11]:
from llama_index.core.settings import Settings

# Register the Gemini chat and embedding models on the shared Settings object.
Settings.llm, Settings.embed_model = gemini, gemini_embedding


In [13]:
from llama_index.core import SimpleDirectoryReader

# Load every file found in the HR documents folder.
documents = SimpleDirectoryReader(
    input_dir="../resources/HR_Documents",
).load_data()

In [14]:
# load_data() returns a list of Document objects, so merge it with extend();
# append() would nest the whole list as one element inside `documents`.
document = SimpleDirectoryReader("../resources/uploads").load_data()
documents.extend(document)

In [None]:
# Display the loaded documents as the cell result (notebook inspection only).
documents

In [18]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

# NOTE(review): this reloads only the HR documents and rebinds `documents`,
# dropping anything merged into it by earlier cells - confirm that is intended.
documents = SimpleDirectoryReader("../resources/HR_Documents").load_data()

# Chunking parameters for the splitter below.
# NOTE(review): a chunk size of 5000 may exceed the input limit of the
# embedding model in use - verify against the model's documentation.
custom_separator = '\n'
custom_chunk_size = 5000
custom_chunk_overlap = 50
custom_paragraph_separator = '\n\n\n'
custom_regex = '[^,.;。？！]+[,.;。？！]?'

# custom_regex was previously defined but never handed to the splitter;
# pass it explicitly so the configured pattern is the one actually used.
custom_sentence_splitter = SentenceSplitter(
    separator=custom_separator,
    chunk_size=custom_chunk_size,
    chunk_overlap=custom_chunk_overlap,
    paragraph_separator=custom_paragraph_separator,
    secondary_chunking_regex=custom_regex,
)

In [32]:

import re
import os 
import pdfplumber

def pdf_folder_reader(folder_path):
    """Return the text of every readable file directly inside *folder_path*.

    ``.pdf`` files (any letter case) are parsed page by page with pdfplumber.
    Every other regular file is read as UTF-8 text. Sub-directories and files
    that are not valid UTF-8 are skipped. Files are visited in sorted name
    order so the result is deterministic.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The extracted text, with pages and files separated by a newline so
        words at a page or file boundary do not run together. An empty string
        when nothing readable is found.
    """
    parts = []

    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # Directories cannot be opened as files; skip them.
            continue

        if filename.lower().endswith(".pdf"):
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # A page without a text layer may yield None or "".
                    page_text = page.extract_text()
                    if page_text:
                        parts.append(page_text)
        else:
            # Plain file objects have no `.pages`; read the content as text.
            try:
                with open(file_path, encoding="utf-8") as text_file:
                    content = text_file.read()
            except UnicodeDecodeError:
                # Binary or differently-encoded file: nothing usable to index.
                continue
            if content:
                parts.append(content)

    return "\n".join(parts)

def preprocess_text(raw_text):
    """Collapse every run of whitespace in *raw_text* into a single space.

    Leading and trailing whitespace is dropped. Punctuation, including dots,
    is left untouched: the former ``replace('.', '.')`` step was a no-op and
    has been removed.

    Args:
        raw_text: Text to normalise.

    Returns:
        The whitespace-normalised text.
    """
    return ' '.join(raw_text.split())


def remove_special_characters(text):
    """Strip everything except ASCII letters, digits and whitespace from *text*.

    Args:
        text: Input string.

    Returns:
        *text* with every other character deleted (not replaced by a space).
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

In [44]:
# Extract the text of the uploaded files, clean it, then split it into chunks.
folder_path = "../resources/uploads"
document = pdf_folder_reader(folder_path)

# Whitespace is normalised first, then non-alphanumeric characters are removed.
cleaned_text = remove_special_characters(preprocess_text(document))

custom_chunks = custom_sentence_splitter.split_text(cleaned_text)

In [None]:
# Display the chunk list as the cell result (notebook inspection only).
custom_chunks

In [34]:
# Dump each chunk under a 1-based label so chunk boundaries are easy to see.
for number, piece in enumerate(custom_chunks, 1):
    print(f"Chunk {number}:\n{piece}\n")

Chunk 1:




In [39]:
import chromadb
from llama_index.core import (
    Document,
    StorageContext,
    VectorStoreIndex,
    embeddings,
    load_index_from_storage,
)
from llama_index.vector_stores.chroma import ChromaVectorStore

doc = SimpleDirectoryReader("../resources/HR_Documents").load_data()

# The Chroma collection that backs the index.
client = chromadb.EphemeralClient()
collection = client.get_or_create_collection("management")

store = ChromaVectorStore(chroma_collection=collection)
context = StorageContext.from_defaults(vector_store=store)

# The vector store reaches the index through the storage context; the extra
# `vector_store=` keyword previously passed here was redundant and is dropped.
index = VectorStoreIndex.from_documents(
    documents=doc,
    storage_context=context,
    embed_model=gemini_embedding,
)


In [46]:
# Wrap each chunk in a Document with a stable, position-based id.
# Blank chunks are skipped: they carry no content worth embedding. The id
# still uses the chunk's original position, so ids of kept chunks are unchanged.
doc_chunks = [
    Document(text=text, id_=f"doc_id_{i}")
    for i, text in enumerate(custom_chunks)
    if text.strip()
]

# Add the uploaded content to the existing index, one document at a time.
for doc_chunk in doc_chunks:
    index.insert(doc_chunk)

In [47]:


# Ask a question that can only be answered from the newly inserted chunks.
question = "indetail about the Gopal snacks ipo"
query_engine = index.as_query_engine()
response = query_engine.query(question)

# Show just the answer text as the cell result.
response.response

"Gopal Snacks, founded in 1999, is a well-known FMCG company that offers a variety of snack products such as namkeen, wafers, and extruded snacks. As of September 2023, the company sold its products to over 523 locations across 10 states and two Union Territories. Its revenue increased by 31%, and profit after tax (PAT) rose by 170.52% between the financial years ending March 31, 2022, and March 31, 2023. The company's ROE, ROCE, and Debt/Equity ratios are 16.05%, 20.83%, and 0.08, respectively. The Pre-IPO EPS is Rs 9.02, while the Post-IPO EPS is Rs 8.92. The company plans to use the proceeds from the IPO to meet its working capital requirements, repay certain debts, and fund capital expenditures."

In [None]:
# When the uploads folder changes:
# read the changed file and add it to the existing documents (or create new documents) - use a Sequence
# then call VectorStoreIndex.refresh_ref_docs

## folder observer 

In [2]:
%pip install watchdog

Collecting watchdog
  Downloading watchdog-4.0.0-py3-none-win_amd64.whl.metadata (37 kB)
Downloading watchdog-4.0.0-py3-none-win_amd64.whl (82 kB)
   ---------------------------------------- 0.0/82.9 kB ? eta -:--:--
   ---------------------------------------- 82.9/82.9 kB 1.5 MB/s eta 0:00:00
Installing collected packages: watchdog
Successfully installed watchdog-4.0.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class FolderEventHandler(FileSystemEventHandler):
    """Load a file from the watched folder whenever it is created or modified."""

    def on_any_event(self, event):
        """Read the file that triggered *event* and print the loaded documents.

        Directory events and event types other than "created" / "modified"
        are ignored. Only the changed file (``event.src_path``) is read,
        rather than re-reading the whole folder through a global path.

        Args:
            event: The watchdog file-system event being dispatched.
        """
        if event.is_directory or event.event_type not in ("created", "modified"):
            return

        # Imported here so this cell does not depend on an earlier cell
        # having been executed in the same kernel.
        from llama_index.core import SimpleDirectoryReader

        try:
            document = SimpleDirectoryReader(
                input_files=[event.src_path]
            ).load_data()
        except (OSError, ValueError) as error:
            # The file may have vanished or still be locked by the writer;
            # report it and keep the watcher running.
            print(f"Could not read {event.src_path}: {error}")
            return

        print(document)

if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path; consider making it configurable.
    folder_path = "C:/Users/SUBOMMAS/LLM_Projects/HRBOT/frontend/resources/uploads"

    event_handler = FolderEventHandler()
    observer = Observer()
    observer.schedule(event_handler, path=folder_path, recursive=True)
    observer.start()

    print(f"Watching folder: {folder_path}")
    try:
        # Join with a timeout so the main thread wakes up regularly and can
        # receive Ctrl+C / a kernel interrupt instead of blocking forever.
        while observer.is_alive():
            observer.join(timeout=1)
    except KeyboardInterrupt:
        # An interrupt is the expected way to end the watch; fall through to cleanup.
        pass
    finally:
        # Always stop the observer thread and wait for it to finish.
        observer.stop()
        observer.join()
        print("Watcher stopped")


Watching folder: C:/Users/SUBOMMAS/LLM_Projects/HRBOT/frontend/resources/uploads


: 

In [None]:
# Smoke-test cell: prints a fixed message to show that cell execution works.
print("it runs")

* Issue Type: Book Built Issue * Offer for Sale: 16209476 shares aggregating up to cid065000 Cr * Face Value: cid01 per share * Price Band: cid0381 to cid0401 per share * Minimum Lot Size: 37 Shares * Minimum Amount of Investment for Retail Investors: cid014837 * IPO Date: March 6, 2024 to March 11, 2024 * Listing Date: Tentatively set as Thursday, March 14, 2024 * Listing at: BSE NSE