### This notebook creates embeddings from the given text data, answers questions over them with a RAG chain, and saves the vector store

In [1]:
# importing necessary libraries
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 1. Load the Document ---
# Make sure your file path is correct
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from dotenv import load_dotenv

# Use a glob pattern to load all .txt files from the 'content' directory
loader = DirectoryLoader(
    "./content/",
    glob="*.txt",
    loader_cls=TextLoader
)
data = loader.load()

# --- 2. Split the Document (with larger chunk size) ---
# We use a larger chunk size to get more context per document chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
docs = text_splitter.split_documents(data)
print(f"Total number of documents (chunks): {len(docs)}")

# --- 3. Create Embeddings and Vector Store ---
# The embedding model converts text to numerical vectors
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Give every chunk a stable id ("<source path>:<chunk position>"). Without
# explicit ids the store assigns fresh random ones on each call, so re-running
# this cell in the same kernel can add the same chunks a second time and make
# the retriever return duplicates. With stable ids a re-run overwrites instead.
chunk_ids = [f"{doc.metadata.get('source', '')}:{i}" for i, doc in enumerate(docs)]

# Create the in-memory vector store and a retriever that gets more documents
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding, ids=chunk_ids)
# We set 'k' to 10 to retrieve the top 10 most relevant documents
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# --- 4. Define the RAG Prompt and Language Model ---
# We define a custom prompt to encourage a more detailed answer.
# The placeholders {context} and {question} are filled in by the chain.
template = """
You are a helpful assistant. Use the following context to answer the question.
If you don't know the answer, just say that you don't know. Be very detailed and comprehensive in your answer, providing a thorough summary based on the given context.

Context:
{context}

Question:
{question}

Helpful Answer:"""

prompt = PromptTemplate.from_template(template)

# Load variables from a local .env file (if present) into the environment
load_dotenv()
# Initialize the Gemini Language Model
# Note: the API key is read from the GOOGLE_API_KEY environment variable
# (for example via the .env file loaded above); it is None if the variable is unset.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.7, google_api_key=os.environ.get('GOOGLE_API_KEY'))

# --- 5. Build the RAG Chain ---
def format_docs(docs):
    """Merge the text of the retrieved documents into one context string.

    The ``page_content`` of each item in ``docs`` is taken in order and the
    pieces are joined with a blank line between them.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Inputs for the prompt: "context" is the retrieved chunks rendered by
# format_docs, "question" is the user's question passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: build prompt inputs -> fill the prompt -> call the LLM -> plain string
rag_chain = rag_inputs | prompt | llm | StrOutputParser()


Total number of documents (chunks): 268


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
E0000 00:00:1758172797.290475   44401 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


### Inference with the RAG chain

In [3]:
# --- 6. Invoke the Chain ---
# Ask one comparison question through the RAG chain and print the answer text.
summary_question = "Compare settlement of 2017-2021 to 2025 what are major changes ?"

print("\nGenerating summary...\n")
result = rag_chain.invoke(summary_question)
# The chain ends in StrOutputParser, so the result is printed as plain text
print(result)


Generating summary...

Based on the provided text, a direct comparison of the 2017-2021 and 2025 settlements is difficult because the 2025 settlement is not fully detailed.  The documents describe the 2025 settlement as a conclusion to demands raised in 2021, 2024, and a management proposal from 2021.  The specifics of those demands and proposals are not included in this excerpt.  However, we can make some observations regarding differences based on what is available:


**Differences that can be inferred:**

* **Duration:** The 2017-2021 settlement covered a four-year period (48 months). The 2025 settlement's duration isn't explicitly stated but it's implied to be at least from 01.12.2021 to 31.12.2026 (5 years), based on the statement that the next settlement will be due from January 1st, 2027.

* **Scope:** The 2017-2021 settlement applied to permanent workmen in salary/wage categories MC1 to MC6, MT1 to MT4, and G03 to G10.  The 2025 settlement also includes the resolution of deman

### Save the vector store

In [4]:
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

def save_vector_store(
    persist_directory="vector_db",
    content_directory="./content/",
    glob="*.txt",
    chunk_size=2000,
    chunk_overlap=200,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """
    Loads documents, creates embeddings, and saves the vector store to a directory.

    Args:
        persist_directory: Directory the Chroma vector store is written to.
        content_directory: Directory the text files are loaded from.
        glob: Glob pattern selecting the files inside ``content_directory``.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        model_name: Name of the HuggingFace embedding model.

    Raises:
        ValueError: If no document chunks were produced (nothing matched the
            glob, or every matched file was empty).

    Calling the function again with unchanged inputs overwrites the existing
    entries instead of adding duplicates, because each chunk gets a stable id.
    Chunks left over from a file that has since shrunk or been removed are
    not deleted.
    """
    import warnings

    print("Saving vector store...")

    # 1. Load documents
    loader = DirectoryLoader(
        content_directory,
        glob=glob,
        loader_cls=TextLoader
    )
    data = loader.load()

    # 2. Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(data)
    if not docs:
        # Fail early with a clear message rather than handing Chroma an empty batch
        raise ValueError(
            f"No document chunks found for pattern '{glob}' in '{content_directory}'"
        )

    # 3. Create embeddings
    embedding = HuggingFaceEmbeddings(model_name=model_name)

    # 4. Build stable ids of the form "<source path>:<position within that source>".
    # Without explicit ids every call inserts the chunks under new random ids,
    # so each re-run would duplicate the whole corpus in the persisted store.
    chunks_per_source = {}
    chunk_ids = []
    for doc in docs:
        source = doc.metadata.get("source", "")
        position = chunks_per_source.get(source, 0)
        chunks_per_source[source] = position + 1
        chunk_ids.append(f"{source}:{position}")

    # 5. Create and persist the vector store
    # This automatically saves the embeddings to the specified directory
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        ids=chunk_ids,
        persist_directory=persist_directory
    )

    # persist() is deprecated in recent langchain-community releases (the data
    # is written automatically) but is kept for older installations that still
    # need it; the deprecation warning is silenced to keep the output clean.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            persist()
    print(f"Vector store saved to '{persist_directory}'")

# Execute the function with its default arguments, which saves the vector
# store to the 'vector_db' directory
save_vector_store()

Saving vector store...
Vector store saved to 'vector_db'


  vectorstore.persist()


### Inference after saving the vector store

In [5]:
# --- ONE-TIME SETUP ---

# Import necessary libraries
import os
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import create_retrieval_chain
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Define the persistence directory for your vector store
PERSIST_DIRECTORY = "vector_db"

# 1. Initialize the embedding model (only once)
# Must be the same model that was used when the store was saved.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Load the persisted vector store (only once)
# Stop here if the directory is missing. Merely printing a message would let
# the following lines fail with a NameError, or silently reuse a `vectorstore`
# left in the kernel by an earlier cell.
if not os.path.isdir(PERSIST_DIRECTORY):
    raise FileNotFoundError(
        f"Error: Vector store directory '{PERSIST_DIRECTORY}' not found. "
        "Run save_vector_store() first."
    )
vectorstore = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embedding
)

# 3. Create the retriever (only once); returns the top 5 chunks per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# 4. Define the RAG prompt (only once)
# The placeholders {context} and {input} are filled in by the retrieval chain.
# NOTE(review): the wording asks for both a to-the-point and a very detailed
# answer; the text is left unchanged so the model's behaviour stays the same.
template = """
You are a helpful assistant. Search for the following context and answer to the point for the given question..
If you don't know the answer, just say that you don't know. Be very detailed and comprehensive in your answer, providing a detailed on the given context.

Context:
{context}

Question:
{input}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

# 5. Initialize the LLM (only once)
# The API key is read from the GOOGLE_API_KEY environment variable.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    temperature=0.7,
    google_api_key=os.environ.get('GOOGLE_API_KEY')
)

# 6. Build the RAG chain (only once)
# The chain is invoked with {"input": ...} and its result is read via the "answer" key.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# --- INFERENCE FUNCTION ---

def inference_with_rag(query):
    """
    Answer ``query`` with the module-level ``rag_chain``.

    The query is echoed to stdout first. If no global named ``rag_chain``
    exists, a fixed explanatory message is returned instead of an answer;
    otherwise the chain is invoked with ``{"input": query}`` and the value
    under the ``'answer'`` key of its result is returned.
    """
    print(f"\nQuerying: '{query}'")

    # Guard clause: setup did not run (or failed), so there is nothing to call
    if 'rag_chain' not in globals():
        return "RAG chain not initialized. Check for errors during setup."

    response = rag_chain.invoke({"input": query})
    return response['answer']


  vectorstore = Chroma(
E0000 00:00:1758098251.139389   37409 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
