In [16]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import os

In [22]:
def save_gemma_locally(save_path="D:/models/gemma-2b", model_name="google/gemma-2b", token=None):
    """
    Download a causal-LM checkpoint and its tokenizer from the Hugging Face Hub
    and write both into a local directory.

    Args:
        save_path: Directory that receives the tokenizer and model files.
        model_name: Hub repository id to download.
        token: Optional Hugging Face access token. Required for gated
            repositories when no login is cached on this machine. ``None``
            keeps the library's default credential lookup, so existing
            callers behave exactly as before.
    """
    print("📥 Downloading and saving model locally...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
    model = AutoModelForCausalLM.from_pretrained(model_name, token=token)

    tokenizer.save_pretrained(save_path)
    model.save_pretrained(save_path)

    print(f"✅ Model saved to {save_path}")

In [13]:
def load_gemma_pipeline_pre(token, model_name="google/gemma-2b"):
    """
    Build a LangChain LLM backed by a text-generation pipeline whose model and
    tokenizer are fetched by name.

    Args:
        token: Hugging Face access token forwarded to both `from_pretrained` calls.
        model_name: Repository id (or path) handed to `from_pretrained`.

    Returns:
        HuggingFacePipeline wrapping the configured text-generation pipeline.
    """
    # Arguments shared by the tokenizer and model loaders.
    hub_kwargs = {"token": token, "trust_remote_code": True}

    tok = AutoTokenizer.from_pretrained(model_name, **hub_kwargs)

    # bfloat16 when CUDA is visible, full precision otherwise.
    if torch.cuda.is_available():
        weights_dtype = torch.bfloat16
    else:
        weights_dtype = torch.float32

    lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=weights_dtype,
        **hub_kwargs,
    )

    # Sampling configuration for generation.
    generation_settings = dict(
        max_new_tokens=512,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.1,
        do_sample=True,
    )
    text_gen = pipeline("text-generation", model=lm, tokenizer=tok, **generation_settings)

    return HuggingFacePipeline(pipeline=text_gen)

In [17]:
def load_gemma_pipeline(token, model_path="D:/models/gemma-2b", model_name="google/gemma-2b"):
    """
    Load Gemma from a local directory (downloading it first when the local copy
    is missing) and wrap it as a LangChain LLM, allowing weights to be
    offloaded to disk.

    Args:
        token: Hugging Face access token, forwarded to the tokenizer loader.
        model_path: Local directory holding (or receiving) the saved model.
        model_name: Hub repository id used when the local copy is missing.

    Returns:
        HuggingFacePipeline wrapping a text-generation pipeline.
    """
    # `from_pretrained` needs config.json inside the directory, so test for
    # that file rather than for the folder alone: a pre-created or partially
    # written folder would otherwise skip the download and fail on load.
    # NOTE: the download below is invoked without `token`, so a gated
    # repository relies on a cached Hugging Face login.
    if not os.path.isfile(os.path.join(model_path, "config.json")):
        save_gemma_locally(save_path=model_path, model_name=model_name)

    print("🧠 Loading Gemma model with disk offloading...")

    # Load tokenizer from local folder
    tokenizer = AutoTokenizer.from_pretrained(model_path, token=token, trust_remote_code=True)

    # Load model; `device_map="auto"` lets accelerate place the weights and
    # spill what does not fit into `offload_folder`.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        trust_remote_code=True,
        offload_folder="./offload",  # Folder for offloaded weights
        offload_state_dict=True      # Temporarily offload the CPU state dict to disk while loading
    )

    # Create text generation pipeline
    gemma_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.1,
        do_sample=True
    )

    return HuggingFacePipeline(pipeline=gemma_pipe)

In [None]:
import os
from dotenv import load_dotenv
from langchain.utilities import SQLDatabase

# Pull settings from the local .env file into the process environment.
load_dotenv()

db_uri = os.getenv("POSTGRES_URI")

# Fail early with a clear message; an empty value is as unusable as a missing one.
if not db_uri:
    raise ValueError("POSTGRES_URI not found in .env")

# Ask for 3 sample rows per table in the generated table info.
db = SQLDatabase.from_uri(db_uri, sample_rows_in_table_info=3)

print(db.table_info)
print(db)

In [3]:
# Read the Hugging Face access token from .env / the process environment.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

# Stop here when the variable is absent.
if hf_token is None:
    raise ValueError("could not find hugging face token")

In [19]:
from langchain_experimental.sql import SQLDatabaseChain

In [None]:
# Read the Hugging Face access token from .env / the process environment.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

# Stop here when the variable is absent.
if hf_token is None:
    raise ValueError("could not find hugging face token")

In [None]:
# Build the Gemma-backed LangChain LLM; `hf_token` comes from the HF_TOKEN cell above.
llm = load_gemma_pipeline(token=hf_token)

In [None]:
# Build the SQL question-answering chain over `db` using the current `llm`.
db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)
# `invoke` replaces the deprecated direct call `db_chain(...)`; the returned dict is the same.
qns1 = db_chain.invoke("How many t-shirts do we have left for nike in extra small size and white color?")

In [26]:
def load_tinyLlama_pipeline(token, model_path="D:/models/TinyLlama-1.1B-Chat-v1.0", model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
    """
    Load TinyLlama from a local directory (downloading it first when the local
    copy is missing) and wrap it as a LangChain LLM, allowing weights to be
    offloaded to disk.

    Args:
        token: Hugging Face access token, forwarded to the tokenizer loader.
        model_path: Local directory holding (or receiving) the saved model.
        model_name: Hub repository id used when the local copy is missing.

    Returns:
        HuggingFacePipeline wrapping a text-generation pipeline.
    """
    # `from_pretrained` needs config.json inside the directory, so test for
    # that file rather than for the folder alone: a pre-created or partially
    # written folder would otherwise skip the download and fail on load.
    # (Despite its name, `save_gemma_locally` works for any model id.)
    if not os.path.isfile(os.path.join(model_path, "config.json")):
        save_gemma_locally(save_path=model_path, model_name=model_name)

    print("🧠 Loading TinyLlama model with disk offloading...")

    # Load tokenizer from local folder
    tokenizer = AutoTokenizer.from_pretrained(model_path, token=token, trust_remote_code=True)

    # Load the model exactly once. `device_map="auto"` already loads and
    # places the weights, so a follow-up `load_checkpoint_and_dispatch` would
    # only re-read the checkpoint and recast it to float16, overriding the
    # dtype selected here.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        trust_remote_code=True,
        offload_folder="./offload",  # Folder for offloaded weights
        offload_state_dict=True      # Temporarily offload the CPU state dict to disk while loading
    )

    # Create text generation pipeline
    tinyllama_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.1,
        do_sample=True
    )

    return HuggingFacePipeline(pipeline=tinyllama_pipe)

In [27]:
# Rebind `llm` to the TinyLlama-backed pipeline (replaces any earlier `llm` binding).
llm = load_tinyLlama_pipeline(token=hf_token)

🧠 Loading TinyLlama model with disk offloading...


Device set to use cpu


In [None]:
# Rebuild the SQL question-answering chain so it uses the current `llm`.
db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)
# `invoke` replaces the deprecated direct call `db_chain(...)`; the returned dict is the same.
qns1 = db_chain.invoke("How many t-shirts do we have left for nike in extra small size and white color?")

In [None]:
# `run` is deprecated; `invoke` returns a dict whose "result" entry is the value `run` returned.
qns2 = db_chain.invoke("SELECT SUM(price*stock_quantity) FROM t_shirts WHERE size = 'S'")["result"]

In [None]:
# Send the query text through the chain and keep the full response dict.
response = db_chain.invoke(
    "SELECT SUM(price*stock_quantity) FROM t_shirts WHERE size = 'S'"
)

# The final answer is stored under the "result" key.
qns2 = response["result"]

In [None]:
# Rebuild the SQL question-answering chain so it uses the current `llm`.
db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)
# `invoke` replaces the deprecated direct call `db_chain(...)`; the returned dict is the same.
qns3 = db_chain.invoke("How many records are in the database?")