In [1]:
!pip install transformers langchain langchain-community duckduckgo-search
!pip install bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [16]:
!pip install pydantic



## Import Libraries

In [1]:
import os

import torch
from langchain import LLMChain, PromptTemplate
from langchain.agents import initialize_agent, Tool
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_core.language_models.llms import BaseLLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

## Load Model and Tokenizer

In [2]:
# 4-bit NF4 quantization with double quantization enabled; compute dtype is float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Checkpoint shared by the tokenizer and the model.
model_name = "unsloth/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", quantization_config=quant_config
)

# Report where the loaded model ended up.
print(f"Model is on: {model.device}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Model is on: cuda:0


## Custom Hugging Face LLM Integration for LangChain

In [58]:
from langchain_core.outputs import LLMResult, Generation
from pydantic import Field

class CustomHuggingFaceLLM(BaseLLM):
    """LangChain LLM wrapper around a locally loaded Hugging Face causal LM.

    The wrapper tokenizes each prompt, calls ``model.generate`` with a budget
    of 500 new tokens and returns only the newly generated text: the prompt
    tokens are sliced off before decoding, and the text is cut at the first
    occurrence of any requested stop sequence.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)`` and
            a ``device`` attribute.
        tokenizer: Callable returning a batch with ``input_ids`` that can be
            moved with ``.to(device)``; must also expose ``decode``.
    """

    # Generation backend; only `.generate()` and `.device` are used here.
    model: object = Field(...)
    # Tokenizer paired with `model`; used for encoding prompts and decoding outputs.
    tokenizer: object = Field(...)

    class Config:
        extra = "allow"

    def __init__(self, model, tokenizer):
        super().__init__(model=model, tokenizer=tokenizer)

    def _complete(self, prompt, stop=None):
        """Generate a completion for one prompt, without echoing the prompt.

        Args:
            prompt: Text to continue.
            stop: Optional list of strings; the completion is truncated at the
                earliest occurrence of any of them.

        Returns:
            The decoded completion text.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        prompt_length = inputs["input_ids"].shape[1]
        outputs = self.model.generate(**inputs, max_new_tokens=500)

        # The generated sequence starts with the prompt tokens; keep only what
        # comes after them so callers never see the prompt repeated back.
        completion = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        if stop:
            hits = [completion.find(marker) for marker in stop if marker]
            hits = [position for position in hits if position != -1]
            if hits:
                completion = completion[:min(hits)]
        return completion

    def _generate(self, prompts, stop=None, run_manager=None, **kwargs):
        """Run generation for a batch of prompts.

        ``run_manager`` and extra keyword arguments are accepted for signature
        compatibility and are not used.

        Returns:
            An ``LLMResult`` holding one single-item generation list per prompt.
        """
        generations = [
            [Generation(text=self._complete(prompt, stop))] for prompt in prompts
        ]
        return LLMResult(generations=generations)

    def _call(self, prompt: str, stop=None) -> str:
        """Return the completion text for a single prompt."""
        return self._complete(prompt, stop)

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "custom_huggingface_llm"

In [59]:
# Wrap the loaded model and tokenizer so they can be passed wherever an LLM object is expected.
custom_llm = CustomHuggingFaceLLM(model, tokenizer)

## DuckDuckGo Search Tool Setup

In [60]:
# Search backend from langchain_community.
ddg_search = DuckDuckGoSearchResults()

# Single tool entry that forwards to the search backend's `run` method.
search_tool = Tool(
    description="Useful to browse information from the Internet.",
    func=ddg_search.run,
    name="DuckDuckGo Search",
)

tools = [search_tool]

## Agent Prompt Template Definition

In [65]:
# Raw template text; `{input}` and `{agent_scratchpad}` are the two placeholders.
AGENT_PROMPT_TEXT = """Answer the query: {input}

Agent Scratchpad: {agent_scratchpad}

Format:
Question: {input}
Thought: What to do
Action: [DuckDuckGo Search]
Action Input: Search query
Observation: Search result
Thought: Final answer
Final Answer: Answer to query"""

prompt_template = PromptTemplate(
    template=AGENT_PROMPT_TEXT,
    input_variables=["input", "agent_scratchpad"],
)

## Agent Chain Creation

In [66]:
# Combine the custom LLM with the prompt template into a chain.
# NOTE(review): `llm_chain` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
llm_chain = LLMChain(
    llm=custom_llm,
    prompt=prompt_template
)

## Agent Initialization

In [67]:
# Build the agent from the tool list and the custom LLM, selecting the
# "zero-shot-react-description" agent type.
# NOTE(review): neither `prompt_template` nor `llm_chain` is passed here, so the
# custom prompt is presumably not what this agent uses — confirm intent.
agent = initialize_agent(
    tools=tools,
    llm=custom_llm,
    agent="zero-shot-react-description",
    verbose=True,
    handle_parsing_errors=True
)

## Query Execution

In [None]:
# Send a sample question through the agent and print what it returns.
query = "Weather in London in the coming 3 days"
response = agent.run(query)
print(response)

In [None]:
import torch
# BitsAndBytesConfig is exported by transformers; the bitsandbytes package only
# needs to be installed, it does not provide this class.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from duckduckgo_search import DDGS
import re

# NOTE(review): this repeats the model/tokenizer loading from the
# "Load model and Tokenizer" cell with identical settings — confirm the second
# load is intended when both cells run in the same kernel.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model_name = "unsloth/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)

print(f"Model is on: {model.device}")

def duckduckgo_search(query):
    """Return the body text of the top DuckDuckGo text hit for ``query``.

    Args:
        query: Search string passed to ``DDGS.text``.

    Returns:
        The ``"body"`` field of the first result; ``"No results found"`` when
        there are no results or the first result has no body; or
        ``"Search error: <message>"`` if anything raises along the way.
    """
    fallback = "No results found"
    try:
        with DDGS() as client:
            hits = list(client.text(query, max_results=1))
            if not hits:
                return fallback
            return hits[0].get("body", fallback)
    except Exception as err:
        # Best effort: report the failure as text instead of raising.
        return f"Search error: {str(err)}"

def parse_model_output(output):
    """Extract the next agent step from raw model text.

    An ``Action:`` line immediately followed by an ``Action Input:`` line takes
    precedence over a ``Final Answer:`` marker. Whitespace after the colons is
    optional. A final answer runs from the marker to the end of the text, so
    multi-line answers are kept whole.

    Args:
        output: Text produced by the model; may be empty or ``None``.

    Returns:
        ``{"action": ..., "action_input": ...}`` for a tool request,
        ``{"final_answer": ...}`` for a final answer, or ``None`` when neither
        pattern is present.
    """
    if not output:
        return None

    # Action name and input are each confined to a single line.
    action_match = re.search(
        r"Action:[ \t]*(.*?)[ \t]*\n[ \t]*Action Input:[ \t]*(.*?)[ \t]*(?:\n|$)",
        output,
    )
    if action_match:
        return {
            "action": action_match.group(1).strip(),
            "action_input": action_match.group(2).strip(),
        }

    # DOTALL + greedy capture: everything after the marker belongs to the answer.
    final_answer_match = re.search(r"Final Answer:\s*(.*)", output, re.DOTALL)
    if final_answer_match:
        return {"final_answer": final_answer_match.group(1).strip()}
    return None

def run_agent(query, max_iterations=3):
    """Drive a small search-then-answer loop with the module-level model.

    Each iteration generates text from the current prompt, parses it with
    ``parse_model_output`` and either returns the final answer or runs a
    DuckDuckGo search and appends the action and observation to the prompt.

    Args:
        query: The user question inserted into the prompt.
        max_iterations: Maximum number of generate/parse rounds.

    Returns:
        The final answer text, or one of the strings
        ``"Error: Invalid model output"``, ``"Error: Invalid action"``,
        ``"Error: Max iterations reached without final answer"``.
    """
    prompt_template = """Answer the query: {query}

Available tool: DuckDuckGo Search
Use this format if searching:
Action: DuckDuckGo Search
Action Input: [your search query]

If you know the final answer:
Final Answer: [your answer]
"""
    current_prompt = prompt_template.format(query=query)

    for _ in range(max_iterations):
        inputs = tokenizer(current_prompt, return_tensors="pt").to(model.device)
        input_length = inputs["input_ids"].shape[1]
        print(f"Input token length: {input_length}")

        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

        # Drop the prompt by token count rather than by character count: the
        # decoded prompt is not guaranteed to match the original string exactly,
        # so a character offset can cut into the generated text.
        new_tokens = outputs[0][input_length:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        parsed_output = parse_model_output(response)

        if not parsed_output:
            return "Error: Invalid model output"

        if "final_answer" in parsed_output:
            return parsed_output["final_answer"]

        if parsed_output["action"] == "DuckDuckGo Search":
            search_query = parsed_output["action_input"]
            search_result = duckduckgo_search(search_query)
            # Feed the tool call and its result back to the model for the next round.
            current_prompt += f"\nAction: DuckDuckGo Search\nAction Input: {search_query}\nObservation: {search_result}\n"
        else:
            return "Error: Invalid action"

    return "Error: Max iterations reached without final answer"

# Test the agent
query = "Weather in London in the coming 3 days"
response = run_agent(query)
print("Final Answer:", response)
