In [None]:
!pip install -qU langchain-google-genai langchain-core langgraph langchain-community

In [None]:
import os

import getpass

# LangSmith tracing configuration.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# Never overwrite a real key with a placeholder: keep a key that is already
# exported and otherwise ask for one, as the Gemini key cell does.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass(
        "Enter API key for LangSmith (leave blank to disable tracing): "
    )

# Enable tracing only when a key is actually available.
os.environ["LANGCHAIN_TRACING_V2"] = (
    "true" if os.environ["LANGCHAIN_API_KEY"] else "false"
)

In [None]:
# Gemini API key handed to the chat model later in this notebook.
# Read it from the environment instead of pasting it into the notebook;
# it stays an empty string when GOOGLE_API_KEY has not been exported yet.
api_key = os.environ.get("GOOGLE_API_KEY", "")

In [None]:
import getpass

# Prompt for the Gemini key only when the environment variable is missing or empty.
if os.environ.get("GOOGLE_API_KEY", "") == "":
    os.environ["GOOGLE_API_KEY"] = getpass.getpass(
        "Enter API key for Google Gemini: "
    )


In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Gemini embedding client built on the "models/embedding-001" model.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

# Empty in-memory vector store that embeds with the `embeddings` client.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the source text. The encoding is stated explicitly so the non-ASCII
# text is decoded the same way regardless of the platform default.
loader = TextLoader("/content/Tagore10.txt", encoding="utf-8")

data = loader.load()

# RecursiveCharacterTextSplitter is already imported from
# `langchain_text_splitters` at the top of this cell, so the second import
# from the legacy `langchain.text_splitter` path is not repeated here.
text_splitterr = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    separators=["।"],  # split on the danda, i.e. at sentence ends (not paragraph breaks)
    add_start_index=True
)
texts = text_splitterr.split_documents(data)

# Show how many chunks were produced and a small sample from the middle.
len(texts), texts[200:203]

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Local HuggingFace embedding model: multilingual E5 (small), placed on the CPU,
# with the encoder asked to normalise the embeddings it returns.
model_name = "intfloat/multilingual-e5-small"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Rebind `embeddings` to a client for the "models/gemini-embedding-exp-03-07" model.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-exp-03-07",
)

In [None]:
from langchain_experimental.text_splitter import SemanticChunker

# Semantic splitter driven by the local `hf` embeddings, using the "gradient"
# breakpoint rule and a requested chunk count of 1000.
text_splitters = SemanticChunker(
    hf,
    number_of_chunks=1000,
    breakpoint_threshold_type="gradient",
)

In [None]:
# This is a long document we can split up.
# Read it from the same path the TextLoader call uses, and decode it explicitly
# as UTF-8 so the result does not depend on the platform default encoding.
with open("/content/Tagore10.txt", encoding="utf-8") as f:
    tagore = f.read()


In [None]:
# Split the raw text into semantically grouped Document chunks.
docs = text_splitters.create_documents(texts=[tagore])


In [None]:
# Chunk count and a short preview taken from the same collection (`docs`),
# so the sample shown matches the count reported.
len(docs), docs[:3]

In [None]:
# FAISS index over the character-split chunks (`texts`), embedded with `hf`.
vectorstorer = FAISS.from_documents(documents=texts, embedding=hf)


In [None]:
# FAISS index over the semantically split chunks (`docs`), embedded with `hf`.
vectorstores = FAISS.from_documents(documents=docs, embedding=hf)


In [None]:
# Retriever over the character-split index, configured with k=10.
retriever = vectorstorer.as_retriever(search_kwargs=dict(k=10))


In [None]:
# Sanity-check retrieval with a sample question. The hits get their own name so
# `docs` (the semantic chunks used to build `vectorstores`) is not overwritten
# and the earlier cells stay safe to re-run.
retrieved_preview = retriever.invoke("ভারতের পশ্চিম অঞ্চলকে কী বলে?")
for doc in retrieved_preview:
    print(doc.page_content)
    print("-------------")

In [None]:
# Prompt text for the answer step. It must contain a {question} placeholder:
# `generate` supplies both "question" and "context", and without the
# placeholder the model would only ever see the retrieved context.
prompt = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question in 1 or 2 words.
Question: {question}
Context: {context}
Answer:
"""


from langchain_core.prompts import ChatPromptTemplate

# Single human-message chat template built from the prompt text above.
template = ChatPromptTemplate([
    ("human", prompt),
])
template

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI, HarmBlockThreshold, HarmCategory

In [None]:
### Generate

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
# Take the safety enums from langchain_google_genai (the same import the
# previous cell uses) rather than from the separate google.generativeai
# package, so they are not shadowed by a different pair of enum classes.
from langchain_google_genai import HarmBlockThreshold, HarmCategory


# Prompt
# prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_output_tokens=100,
    stream=True,
    # `api_key` may be an empty string; in that case fall back to the
    # GOOGLE_API_KEY environment variable instead of sending an empty key.
    google_api_key=api_key or os.environ.get("GOOGLE_API_KEY"),
    safety_settings={
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
    },
)
# Chain: prompt template -> chat model -> plain-string output
rag_chain = template | llm | StrOutputParser()


In [None]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    """State dictionary shared by the `retrieve` and `generate` graph nodes."""

    # The user's question; read by both `retrieve` and `generate`.
    question: str
    # Documents written by `retrieve` and read by `generate`.
    context: List[Document]
    # Model reply text written by `generate`.
    answer: str

In [None]:
def retrieve(state: State):
    """Look up the 10 chunks most similar to the question.

    Reads ``state["question"]``, searches the ``vectorstorer`` index and
    returns a partial state update of the form ``{"context": [...]}``.
    """
    query = state["question"]
    return dict(context=vectorstorer.similarity_search(query, k=10))


def generate(state: State):
    """Produce an answer from the retrieved context.

    Joins the page content of every document in ``state["context"]`` with
    blank lines, fills ``template`` with that text and the question, calls
    ``llm`` and returns ``{"answer": <reply content>}``.
    """
    passages = [doc.page_content for doc in state["context"]]
    prompt_value = template.invoke(
        {
            "context": "\n\n".join(passages),
            "question": state["question"],
        }
    )
    reply = llm.invoke(prompt_value)
    return dict(answer=reply.content)

In [None]:
from langgraph.checkpoint.memory import MemorySaver

# Checkpointer instance; it is handed to `graph_builder.compile(checkpointer=...)`
# when the graph is compiled.
memory = MemorySaver()


In [None]:
from langgraph.graph import START, StateGraph

# Build the two-node pipeline START -> retrieve -> generate over `State`,
# then compile it with the `memory` checkpointer.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

graph = graph_builder.compile(checkpointer=memory)

In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
graph_png = graph.get_graph().draw_mermaid_png()
display(Image(graph_png))

In [None]:
# Thread ID passed to the graph through the run config.
config = {"configurable": {"thread_id": "abc125"}}
input_message = "ভারতের পশ্চিম অঞ্চলকে কী বলে?"

# Walk the streamed state values and keep the most recent "answer" seen;
# `final_answer` stays None if no streamed value carries one.
final_answer = None
for step in graph.stream(
    {"question": input_message},
    config=config,
    stream_mode="values",
):
    final_answer = step.get("answer", final_answer)

print(final_answer)