# Load Environment

In [59]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [60]:

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# LLM -Gemini Pro libraries

In [61]:
%pip install llama-index-multi-modal-llms-gemini
%pip install llama-index-llms-gemini

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [62]:
from llama_index.multi_modal_llms.gemini import GeminiMultiModal
from llama_index.llms.gemini import Gemini

# Embedding libraries

In [64]:
%pip install llama-index-embeddings-gemini
%pip install llama-index-vector-stores-chroma


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [65]:
from llama_index.embeddings.gemini import GeminiEmbedding

In [66]:
%pip install llama-index --quiet
%pip install chromadb
%pip install sentence-transformers
%pip install pydantic==1.10.11


Note: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [67]:
safety_settings = [
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_ONLY_HIGH"
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_ONLY_HIGH"
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_ONLY_HIGH"
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_ONLY_HIGH"
  },
]

# Configure Gemini LLM and Emdedding modals

In [117]:
 # not used videos images
gemini_pro = GeminiMultiModal(model_name="models/gemini-pro-vision", safety_settings=safety_settings)
#gemini = Gemini(model_name="models/gemini-pro", safety_settings=safety_settings, api_key=GOOGLE_API_KEY)
gemini = Gemini(model_name="models/gemini-pro", temperature=1, max_tokens=2048, safety_settings=safety_settings)
#used
gemini_embedding = GeminiEmbedding(model_name="models/embedding-001", api_key=GOOGLE_API_KEY)



# Libraries for Vector DB

In [69]:
# import
from llama_index.core import ServiceContext, VectorStoreIndex, download_loader, set_global_service_context
from llama_index.core import  SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from IPython.display import Markdown, display
import chromadb

# HR documents 

In [50]:
# Not using the below data
import urllib.request

# Create the directory
os.makedirs('data/paul_graham/', exist_ok=True)

# Download the file
url = 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt'
urllib.request.urlretrieve(url, 'data/paul_graham/paul_graham_essay.txt')


('data/paul_graham/paul_graham_essay.txt',
 <http.client.HTTPMessage at 0x1d4e1ba58a0>)

In [None]:
import os

def read_folder(folder):
    content = []
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                print(f.read())
        except UnicodeDecodeError:
            # If decoding with utf-8 fails, try 'ISO-8859-1'
            with open(file_path, 'r', encoding='ISO-8859-1') as f:
                print(f.read())
    return content

folder = "C:\\Users\\SUBOMMAS\\LLM_Projects\\HRBOT\\resources\\HR_Documents"
docs = read_folder(folder)


In [93]:
documents = SimpleDirectoryReader("../resources/HR_Documents").load_data()

In [71]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

In [None]:
#Dont'run
def load_corpus(documents, verbose=False):
    if verbose:
        print(f"Loading files {documents}")
    if verbose:
        print(f"Loaded {len(documents)} docs")

    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(documents, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [None]:
#Dont'run
train_nodes = load_corpus(documents, verbose=True)

# Configure Vector DB and Vector Index

In [94]:

# create client and a new collection
chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.create_collection("hraiboting")

# Define which embedding model to use "models/embedding-001"
#gemini_embed_model = GeminiEmbedding(model_name="models/embedding-001")


"""
#ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
from llama_index.core import Settings
# Using the embedding model to Gemini
Settings.embed_model = GeminiEmbedding(
    model_name="models/embedding-001", api_key=GOOGLE_API_KEY
)
Settings.llm = Gemini(api_key=GOOGLE_API_KEY)

"""

UniqueConstraintError: Collection hraiboting already exists

In [118]:
from llama_index.core import Settings
#as default chroma db uses the open ai embeddings
#setting up Gemini pro -llm and gemini embedding
Settings.llm = gemini
Settings.embed_model = gemini_embedding

In [119]:

# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context)

In [120]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x1d4e6df2620>

# Query prompt from index query engine

In [121]:
# Query Data
query_engine = index.as_query_engine()
response = query_engine.query("Which is the company, how much is the size of the ipo?")

In [None]:
import pathlib
import textwrap
from IPython.display import  Markdown
from IPython.display import  display

def to_markdown(text):
    #text = text.replace(".", "*")
    return Markdown(textwrap.indent(text,'> ', predicate=lambda _:True))
    

# Prompt response

In [122]:
display(Markdown(f"<b>{response}</b>"))

<b>The company is Vibhor Steel Tubes, and the size of the IPO is ₹72.17 crores.</b>