### This notebook creates embeddings from the given text data and saves the resulting vector store

In [3]:
# importing necessary libraries
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate

In [12]:
# --- 1. Load the Document ---
# Make sure your file path is correct
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from dotenv import load_dotenv

# Use a glob pattern to load all .txt files from the 'content' directory
loader = DirectoryLoader(
    "./content/",
    glob="*.txt",
    loader_cls=TextLoader
)
data = loader.load()

# Fail fast with a clear message instead of building a vector store from nothing.
if not data:
    raise FileNotFoundError("No .txt documents were found in './content/'; nothing to embed.")

# --- 2. Split the Document (with larger chunk size) ---
# We use a larger chunk size to get more context per document chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
docs = text_splitter.split_documents(data)
print(f"Total number of documents (chunks): {len(docs)}")

# Files were found but produced no chunks (e.g. they are all empty).
if not docs:
    raise ValueError("The documents in './content/' produced no text chunks; nothing to embed.")

# --- 3. Create Embeddings and Vector Store ---
# The embedding model converts text to numerical vectors
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the vector store. No persist_directory is passed here, so this cell
# does not ask Chroma to write the store to a directory.
# NOTE(review): re-running this cell in the same kernel may add the same chunks
# to the store again — confirm against the installed Chroma version.
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)
# We set 'k' to 10 to retrieve the top 10 most relevant documents
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# --- 4. Define the RAG Prompt and Language Model ---
# Custom prompt: it asks for a detailed answer grounded in the retrieved
# context and exposes two variables, {context} and {question}.
template = """
You are a helpful assistant. Use the following context to answer the question.
If you don't know the answer, just say that you don't know. Be very detailed and comprehensive in your answer, providing a thorough summary based on the given context.

Context:
{context}

Question:
{question}

Helpful Answer:"""

prompt = PromptTemplate.from_template(template)

# Read variables from a .env file (if any) before the API key is looked up.
load_dotenv()

# Initialize the Gemini language model.
# The API key is taken from the GOOGLE_API_KEY environment variable.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    temperature=0.7,
    google_api_key=os.environ.get('GOOGLE_API_KEY'),
)

# --- 5. Build the RAG Chain ---
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# The incoming question feeds both prompt variables: "context" is the retrieved
# documents joined by format_docs, "question" is the question passed through as-is.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# inputs -> prompt -> LLM -> plain string
rag_chain = chain_inputs | prompt | llm | StrOutputParser()


Total number of documents (chunks): 7


E0000 00:00:1758041892.351137   10385 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


### Inferencing with the model

In [13]:
# --- 6. Invoke the Chain ---
# Ask the chain for a summary; the chain ends in a string parser,
# so the result can be printed directly.
summary_question = "Generate a summary of the complete document"

print("\nGenerating summary...\n")
result = rag_chain.invoke(summary_question)
print(result)


Generating summary...

This document presents a blend of fictional narratives and recipes, creating a cohesive story-driven culinary experience.  The core narrative is "The Ballad of the Last Starship," detailing the Odyssey's journey to a "black hole" that turns out to be an event horizon leading to a dimension of pure information. This dimension is described as a vast, sentient crystalline mind containing all knowledge, including the recipe for a specific dessert.  The crew's final log entry reveals their integration into this cosmic network, concluding their journey not in silence but in a chorus of whispered secrets; they were never alone, merely a part of a larger interconnected system.

The story is interwoven with descriptions of other locations and cultures, enhancing the world-building. "Further Journeys: The Ochre Oasis" introduces the Silt-Dwellers, a community living in a petrified fungal forest, who communicate through rhythmic tapping and create bioluminescent tapestries

### Save the vector store

In [14]:
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

def _stable_chunk_ids(docs):
    """Return one deterministic id per chunk: '<source>::chunk-<n>', n counted per source."""
    seen_per_source = {}
    chunk_ids = []
    for doc in docs:
        source = str(doc.metadata.get("source", ""))
        position = seen_per_source.get(source, 0)
        seen_per_source[source] = position + 1
        chunk_ids.append(f"{source}::chunk-{position}")
    return chunk_ids


def save_vector_store(persist_directory="vector_db", source_directory="./content/"):
    """
    Loads documents, creates embeddings, and saves the vector store to a directory.

    Args:
        persist_directory: Directory the Chroma vector store is written to.
        source_directory: Directory whose ``*.txt`` files are loaded and embedded.
            Defaults to ``"./content/"``, the location used previously.

    Raises:
        ValueError: If the source directory yields no text chunks.

    Each chunk is stored under a deterministic id (source path plus its position
    within that source), so running this function again over unchanged files
    targets the same entries instead of inserting a second copy of every chunk.
    Entries written by earlier runs under other ids are not removed here.
    """
    import warnings

    print("Saving vector store...")

    # 1. Load documents
    loader = DirectoryLoader(
        source_directory,
        glob="*.txt",
        loader_cls=TextLoader
    )
    data = loader.load()

    # 2. Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    docs = text_splitter.split_documents(data)
    if not docs:
        # Nothing to embed: stop here rather than write an empty store.
        raise ValueError(f"No text chunks could be produced from '{source_directory}'.")

    # 3. Create embeddings
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # 4. Create and persist the vector store
    # Explicit ids make repeated runs idempotent for unchanged sources.
    # NOTE(review): relies on Chroma writing by id (upsert) — see the
    # Chroma.from_documents `ids` parameter documentation.
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        ids=_stable_chunk_ids(docs),
        persist_directory=persist_directory
    )

    # Older Chroma integrations need an explicit persist(); newer ones save
    # automatically and only emit a deprecation warning (or lack the method).
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            persist()
    print(f"Vector store saved to '{persist_directory}'")

# Execute the function to save the vector store.
# Called without arguments, so it writes to the default 'vector_db' directory.
save_vector_store()

Saving vector store...
Vector store saved to 'vector_db'


  vectorstore.persist()


### Inference after saving the vector store

In [18]:
# --- ONE-TIME SETUP ---

# Import necessary libraries
import os
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import create_retrieval_chain
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Define the persistence directory for your vector store
PERSIST_DIRECTORY = "vector_db"

# 1. Initialize the embedding model (only once)
# Same model name as the one used when the store was saved.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Load the persisted vector store (only once)
# Stop immediately if the directory is missing. Printing and carrying on would
# either raise NameError on `vectorstore` below or, worse, silently reuse a
# `vectorstore` left in the kernel by an earlier cell.
if not os.path.isdir(PERSIST_DIRECTORY):
    raise FileNotFoundError(
        f"Vector store directory '{PERSIST_DIRECTORY}' not found. "
        "Run save_vector_store() first."
    )

vectorstore = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embedding
)

# 3. Create the retriever (only once)
# 'k' = 10: retrieve the 10 most relevant chunks per query
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# 4. Define the RAG prompt (only once)
# The prompt exposes {context} and {input}; the inference function below
# invokes the chain with an "input" key.
template = """
You are a helpful assistant. Use the following context to answer the question.
If you don't know the answer, just say that you don't know. Be very detailed and comprehensive in your answer, providing a thorough summary based on the given context.

Context:
{context}

Question:
{input}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

# 5. Initialize the LLM (only once)
# The API key is read from the GOOGLE_API_KEY environment variable.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.7, google_api_key=os.environ.get('GOOGLE_API_KEY'))

# 6. Build the RAG chain (only once)
# First the chain that combines the LLM with the prompt, then the retrieval
# chain that wraps it together with the retriever.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# --- INFERENCE FUNCTION ---

def inference_with_rag(query):
    """
    Run one query through the module-level ``rag_chain`` and return its answer.

    The query is echoed to stdout first. If no global named ``rag_chain``
    exists, a fixed explanatory message is returned instead of an answer.
    """
    print(f"\nQuerying: '{query}'")

    # Guard clause: setup never produced a chain.
    if 'rag_chain' not in globals():
        return "RAG chain not initialized. Check for errors during setup."

    response = rag_chain.invoke({"input": query})
    return response['answer']


E0000 00:00:1758042691.554042   10385 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [19]:
# --- EXAMPLE USAGE (looping for multiple queries) ---

while True:
    # Strip surrounding whitespace so inputs like 'exit ' are still recognised.
    user_query = input("\nEnter your query (or type 'exit' to quit): ").strip()

    if not user_query:
        # Blank input: ask again instead of sending an empty query to the chain.
        continue

    if user_query.lower() == 'exit':
        print("Exiting chatbot. Goodbye!")
        break

    response = inference_with_rag(user_query)
    print("\n-------------------\n")
    print(f"Answer: {response}")
    print("\n-------------------\n")


Querying: 'What is in the given context?'

-------------------

Answer: The provided text contains several distinct pieces of fictional writing, including:

1. **The Ballad of the Last Starship:** A science fiction story about a starship, the Odyssey, nearing the end of its journey.  The crew discovers a "perfect sine wave of pure thought" emanating from a black hole system, leading them to a sphere of light which turns out to be a node in a vast cosmic network. The ship is absorbed into this network, suggesting a connection between all things across space and time.  The final log entry hints at this network being a conscious entity containing all knowledge and history.

2. **The Unforeseen Synthesis of Cryptobotanical Mycelia and Quantum Entanglement in the Anthropocene:** A scientific paper exploring the theoretical and practical application of using underground fungal networks (mycelia) as a communication medium.  By leveraging quantum entanglement, the researchers aim to achieve i