In [30]:
from langchain_huggingface import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_huggingface.chat_models import ChatHuggingFace
from dotenv import load_dotenv
from pydantic_settings import BaseSettings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import os
from huggingface_hub import login
load_dotenv("./.env", override=True)
login(token=os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [31]:
class HuggingFaceConfig(BaseSettings):
    hf_token: str
    model_id: str

config = HuggingFaceConfig()

In [None]:
class HuggingFaceChatmodel(HuggingFacePipeline):
    def __init__(self, config: HuggingFaceConfig):
        # Set manual seed for reproducibility
        torch.manual_seed(0)
        
        # Load tokenizer with proper authentication
        tokenizer = AutoTokenizer.from_pretrained(
            config.model_id,
            token=config.hf_token
        )
        
        # Try loading model with GPU support, fallback to CPU if needed
        try:
            # First try with device_map="auto"
            model = AutoModelForCausalLM.from_pretrained(
                config.model_id,
                torch_dtype=torch.float16,
                device_map="auto",
                token=config.hf_token
            )
            print("Model loaded with GPU support")
        except Exception as e:
            print(f"GPU loading failed ({e}), trying CPU...")
            # Fallback to CPU
            model = AutoModelForCausalLM.from_pretrained(
                config.model_id,
                torch_dtype=torch.float32,
                token=config.hf_token
            )
            # Move to GPU manually if available
            if torch.cuda.is_available():
                model = model.cuda()
                print("Model moved to GPU manually")
            else:
                print("Using CPU")
        
        # Create pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            return_full_text=False
        )

        super().__init__(pipeline=pipe)

In [33]:
llm = HuggingFaceChatmodel(config=config)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [None]:
# Test your configuration first
print(f"Model ID: {config.model_id}")
print(f"Token (first 10 chars): {config.hf_token[:10]}...")

# Check if you have access to the Llama model
from huggingface_hub import model_info
try:
    info = model_info(config.model_id, token=config.hf_token)
    print(f"✅ Access confirmed to {config.model_id}")
    print(f"Model type: {info.config.get('model_type', 'Unknown')}")
except Exception as e:
    print(f"❌ Cannot access {config.model_id}: {e}")
    print("\nOptions:")
    print("1. Request access to Llama at: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct")
    print("2. Or switch to a public model like microsoft/DialoGPT-medium")
    print("3. Or use GPT-2 (completely free)")