In [1]:
from dotenv import load_dotenv, find_dotenv 
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from langchain.vectorstores import FAISS, Weaviate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.schema import Document

#import weaviate
#from weaviate.embedded import EmbeddedOptions

from huggingface_hub import list_models
import os


  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:

# Load variables from the nearest .env file (if one is found) into the process environment.
_ = load_dotenv(find_dotenv())

# 1. Set your Hugging Face token
# The token must already be available via .env or the shell. The previous
# `os.environ[...] = os.getenv(...)` self-assignment was a no-op when the token
# existed and crashed with "str expected, not NoneType" when it did not, so
# check explicitly and fail with an actionable message instead.
if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your .env file or "
        "export it in the environment before running this notebook."
    )

# 2. Retrieve Hugging Face model cards (limited to top N)
def fetch_model_cards(n=5, task="text-classification"):
    """Return model-card URLs for Hub models matching a task filter.

    Args:
        n: Maximum number of models to request from ``list_models``.
        task: Filter tag forwarded to ``list_models``. Defaults to
            ``"text-classification"``, the value that used to be hard-coded,
            so existing callers see no change.

    Returns:
        list[str]: One ``https://huggingface.co/<modelId>`` URL per model
        returned by ``list_models`` (requested sorted by downloads).
    """
    model_infos = list_models(filter=task, sort="downloads", limit=n)
    return [f"https://huggingface.co/{model.modelId}" for model in model_infos]

# 3. Load model cards from the web
def load_and_split_documents(urls):
    """Fetch the given pages and cut them into overlapping chunks.

    Args:
        urls: Page URLs handed to ``WebBaseLoader``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on the
        loaded pages, using a chunk size of 1000 and an overlap of 200.
    """
    pages = WebBaseLoader(urls).load()

    chunker = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
    chunks = chunker.split_documents(pages)
    return chunks

# 4. Create a retriever using FAISS
def create_retriever(documents):
    """Build a FAISS vector store from ``documents`` and return its retriever.

    Embeddings are produced by ``HuggingFaceEmbeddings`` with the
    ``sentence-transformers/all-mpnet-base-v2`` model, run on CPU, with
    embedding normalisation switched off.

    Args:
        documents: Documents passed to ``FAISS.from_documents``.

    Returns:
        The object returned by the store's ``as_retriever()``.
    """
    # Alternative model left by the author: sentence-transformers/all-MiniLM-L6-v2
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )
    faiss_store = FAISS.from_documents(documents, embedder)
    return faiss_store.as_retriever()

# 4.2. Create a retriever using Weaviate
def create_retriever_weaviate(documents, embedding=None):
    """Index ``documents`` in an embedded Weaviate client and return a retriever.

    Args:
        documents: Documents passed to ``Weaviate.from_documents``.
        embedding: Optional embeddings object to use. When omitted, an
            ``OpenAIEmbeddings()`` instance is created, which is what the
            function always attempted before this parameter existed.

    Returns:
        The object returned by the vector store's ``as_retriever()``.

    Raises:
        ImportError: If the ``weaviate`` client package (or, for the default
            embedding, ``langchain.embeddings``) cannot be imported.
    """
    # The notebook's top-level weaviate imports are commented out and
    # OpenAIEmbeddings was never imported, so this function used to fail with
    # NameError. Import the optional dependencies here, only when it is called.
    import weaviate
    from weaviate.embedded import EmbeddedOptions

    if embedding is None:
        from langchain.embeddings import OpenAIEmbeddings

        embedding = OpenAIEmbeddings()

    client = weaviate.Client(embedded_options=EmbeddedOptions())

    vectorstore = Weaviate.from_documents(
        client=client,
        documents=documents,
        embedding=embedding,
        by_text=False,
    )
    return vectorstore.as_retriever()

# 5. LLM loaders: Llama 3 via the Hugging Face Hub API, or Llama 4 loaded locally with transformers
def load_llama_llm():
    """Create a ``HuggingFaceHub`` LLM for ``meta-llama/Meta-Llama-3-8B-Instruct``.

    Returns:
        A ``HuggingFaceHub`` instance configured with temperature 0.5 and a
        512 new-token limit.
    """
    generation_settings = {"temperature": 0.5, "max_new_tokens": 512}
    # Alternative repo noted by the author: meta-llama/Llama-4-Scout-17B-16E-Instruct
    hub_llm = HuggingFaceHub(
        repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
        model_kwargs=generation_settings,
    )
    return hub_llm
def load_llama4(model_id="meta-llama/Llama-4-Scout-17B-16E-Instruct"):
    """Load a causal LM with transformers and wrap it as a LangChain LLM.

    Args:
        model_id: Hub repository to load the model and tokenizer from.
            Defaults to the Llama 4 Scout checkpoint that was previously
            hard-coded in both ``from_pretrained`` calls.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline
        (sampling enabled, temperature 0.2, repetition penalty 1.1, full text
        returned, up to 1000 new tokens).

    Raises:
        KeyError: If ``HUGGINGFACEHUB_API_TOKEN`` is not in the environment.
        ImportError: If ``langchain_huggingface`` cannot be imported.
    """
    # HuggingFacePipeline was never imported in the notebook, so the return
    # statement used to raise NameError. Import it first so a missing package
    # is reported before the model weights are fetched.
    from langchain_huggingface import HuggingFacePipeline

    access_token = os.environ["HUGGINGFACEHUB_API_TOKEN"]
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        # load_in_4bit=True,
        # device_map='auto',
        token=access_token,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
    text_generation_pipeline = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        task='text-generation',
        do_sample=True,
        temperature=0.2,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=1000,
    )
    return HuggingFacePipeline(pipeline=text_generation_pipeline)

In [3]:

# 6. Create RAG pipeline
def create_rag_pipeline():
    """Assemble the retrieval-augmented QA chain.

    Steps, in order: fetch 10 model-card URLs, load and chunk them, build the
    FAISS-backed retriever, load the Llama 4 LLM, then combine retriever and
    LLM with ``RetrievalQA.from_chain_type``.

    Returns:
        The chain returned by ``RetrievalQA.from_chain_type``, configured to
        also return its source documents.
    """
    chunks = load_and_split_documents(fetch_model_cards(n=10))

    # Dict entries are evaluated top to bottom, so the retriever is still
    # built before the (expensive) model load.
    chain_parts = {
        "retriever": create_retriever(chunks),
        "llm": load_llama4(),
        "return_source_documents": True,
    }
    return RetrievalQA.from_chain_type(**chain_parts)


In [None]:

# 7. Query the RAG system
if __name__ == "__main__":
    # Build the chain, ask one question, then show the answer and its sources.
    rag = create_rag_pipeline()
    query = "What are the key features of Meta-Llama models?"
    # Calling the chain object directly (rag(query)) goes through the
    # deprecated Chain.__call__; invoke() is the supported entry point.
    # RetrievalQA reads the question from the "query" key.
    result = rag.invoke({"query": query})

    print("\nGenerated Answer:\n", result['result'])
    print("\nSources:\n", [doc.metadata.get('source') for doc in result['source_documents']])


Fetching 50 files:  18%|███████████████████████                                                                                                         | 9/50 [08:47<32:34, 47.67s/it]