## Installation

In [None]:
%%capture --no-stderr
!pip install --quiet langchain_community
!pip install --quiet pypdf
!pip install --quiet langchain_weaviate
!pip install --quiet rank_bm25
!pip install --quiet bitsandbytes
!pip install --quiet accelerate
!pip install --quiet transformers datasets accelerate nvidia-ml-py3
!pip install --quiet transformers
!pip install --quiet numpy==1.24.4
!pip install -U sentence-transformers
!pip install  --quiet langchain_huggingface
!pip install --quiet langchain_experimental langchain_openai
# !pip install transformers_stream_generator
! pip install --upgrade transformers

In [None]:
%%capture --no-stderr
# LangGraph supplies the StateGraph / START / END names imported further down;
# langsmith is upgraded together with it.
%pip install -U langgraph langsmith

# Used for this tutorial; not a requirement for LangGraph
# NOTE(review): no visible cell imports langchain_anthropic — confirm it is still needed.
%pip install -U langchain_anthropic

## Library imports

In [None]:
# import weaviate
# from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
# from langchain.chains import RetrievalQA
# from langchain.llms import OpenAI
# from langchain.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline,TextStreamer,TextIteratorStreamer,AutoModelForSequenceClassification )
from langchain import HuggingFacePipeline

from typing import Annotated

from typing_extensions import TypedDict

from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages

# from langchain.retrievers.multi_query import MultiQueryRetriever
from threading import Thread
from langchain_core.callbacks import streaming_stdout
from langchain_core.prompts import PromptTemplate


## Loading the LLM

In [None]:
def load_quantized_model(model_name: str):
    """
    Load a causal language model with 4-bit bitsandbytes quantization.

    model_name: Name or path of the model to be loaded.
    return: The model returned by ``AutoModelForCausalLM.from_pretrained``,
        configured for NF4 4-bit weights with double quantization and
        bfloat16 as both compute dtype and torch dtype.
    """
    # Quantization settings handed to from_pretrained below.
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_cfg,
        torch_dtype=torch.bfloat16,
    )

# initializing tokenizer
def initialize_tokenizer(model_name: str):
    """
    Build the tokenizer that pairs with ``model_name``.

    model_name: Name or path of the model for tokenizer initialization.
    return: Tokenizer loaded with ``return_token_type_ids=False`` and with its
        beginning-of-sentence token id set to 1.
    """
    tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
    # Pin the beginning-of-sentence token id to 1.
    tok.bos_token_id = 1
    return tok

# Checkpoint identifier shared by the tokenizer and the model
model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# build the tokenizer for that checkpoint
tokenizer = initialize_tokenizer(model_path)

# load the 4-bit quantized weights of the same checkpoint
model = load_quantized_model(model_path)


In [None]:
# Streamer object from an earlier experiment; the pipeline below does not use it
# (its `streamer=` argument is commented out).
streamer = TextIteratorStreamer(tokenizer)

# Text-generation pipeline around the already-loaded quantized model and tokenizer.
# NOTE(review): `model` is already instantiated, so device_map="auto" is presumably
# redundant here — confirm before removing.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,  # sampling: repeated calls can return different text
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    return_full_text=False,  # return only the generated continuation, not the prompt
    # streamer=streamer
)

# Stdout streaming callback; not attached to `llm` in this cell.
callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()]

# Wrap the pipeline as a LangChain LLM. `pipeline_kwargs` is reserved for arguments
# of the pipeline call itself, and "streaming" is not one of them, so it is no
# longer passed. Streaming is requested per call through `llm.stream(...)`.
llm = HuggingFacePipeline(pipeline=pipe)

# llm = HuggingFacePipeline(
#     pipeline=pipe,
#     # callbacks=callbacks,  # Pass your callbacks here
#     streaming=True,  # Enable streaming
#     verbose=True)

In [None]:
# template = """Question: {question}

# Answer: Let's think step by step."""
# prompt = PromptTemplate.from_template(template)

# chain = prompt | llm

# question = "What is electroencephalography?"

# for chunk in chain.stream(question):
#     if chunk and chunk!="\n":
#         print(chunk)

## LangGraph definition

In [None]:
class State(TypedDict):
    """Schema of the graph state; it holds a single ``messages`` list."""

    # `add_messages` (from langgraph.graph.message) is attached as annotation
    # metadata for this key — presumably it merges node output into the existing
    # list instead of overwriting it; confirm against the LangGraph docs.
    messages: Annotated[list, add_messages]


def chatbot(state: State):
    """
    Graph node: generate one reply to the conversation held in ``state``.

    The model is run exactly once. Its output is streamed to stdout as it
    arrives, and the same text is returned as the node's state update, so the
    reply shown to the user and the reply stored in the state are identical.

    state: Current graph state; only ``state["messages"]`` is read.
    return: ``{"messages": [("assistant", reply_text)]}``. The role tuple keeps
        the plain-string LLM output from being stored as a user message.
    """
    pieces = []
    for piece in llm.stream(state["messages"]):
        # Show tokens as they are produced instead of after the full generation.
        print(piece, end="", flush=True)
        pieces.append(piece)
    print()  # finish the streamed line

    reply_text = "".join(pieces)
    return {"messages": [("assistant", reply_text)]}



# Create a graph builder that uses State as its schema, then register the
# chatbot function as the node named "chatbot".
graph_builder = StateGraph(State)
graph_builder.add_node("chatbot", chatbot)


In [None]:
# Wire the single node into a linear flow: START -> "chatbot" -> END.
graph_builder.add_edge(START, "chatbot")

graph_builder.add_edge("chatbot", END)
# Compile the builder; `graph` is the object invoked in the cells below.
graph = graph_builder.compile()


In [None]:
# Run the compiled graph once with a single ("user", text) message as input and
# keep whatever graph.invoke returns for inspection in a later cell.
results = graph.invoke({"messages": ("user", "what is the function calling means ?")})

In [None]:
## Idea: if the LLM is called a lot, we can add LLM response caching.
## Rather than caching on the exact prompt string, could we cache the prompt embedding itself
## and reuse a cached response when a new prompt is close enough to a cached one, using some
## similarity threshold to decide?

In [None]:
# `results` is the state mapping returned by graph.invoke, so iterating over it
# directly only yields its keys (the previous output was just ['messages']).
# Print every message in the conversation instead.
for message in results["messages"]:
    message.pretty_print()