### RAG Gold and Silver News Search Bot

Files that will be used in this notebook:
https://pubs.usgs.gov/periodicals/mcs2025/mcs2025-gold.pdf
https://pubs.usgs.gov/periodicals/mcs2025/mcs2025-silver.pdf
https://silverinstitute.org/wp-content/uploads/2025/04/World_Silver_Survey-2025.pdf
https://thedocs.worldbank.org/en/doc/718001587677508339-0050022020/render/CMOApril2020preciousmetals.txt
https://github.com/datasets/gold-prices/raw/main/data/monthly.csv

In [18]:
# === URL fetching ===
import requests
import os

# === Document loading (PDF with tables + TXT) ===
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader, CSVLoader
# Alternative for simple PDFs (no tables): PyPDFLoader

# === Chunking ===
from langchain_text_splitters import RecursiveCharacterTextSplitter

# === Embeddings ===
from langchain_community.embeddings import HuggingFaceEmbeddings  # or HuggingFaceBgeEmbeddings

# === Vector store ===
from langchain_community.vectorstores import FAISS

# === LLM + RAG chain ===
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate
# + your LLM: HuggingFacePipeline, or Ollama, or ChatOpenAI

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate

### 1. Load Documents

In [9]:
# URLs and download folder
URLS = [
    "https://pubs.usgs.gov/periodicals/mcs2025/mcs2025-gold.pdf",
    "https://pubs.usgs.gov/periodicals/mcs2025/mcs2025-silver.pdf",
    "https://silverinstitute.org/wp-content/uploads/2025/04/World_Silver_Survey-2025.pdf",
    "https://thedocs.worldbank.org/en/doc/718001587677508339-0050022020/render/CMOApril2020preciousmetals.txt",
    "https://github.com/datasets/gold-prices/raw/main/data/monthly.csv",
]

DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

In [10]:
def download_file(url: str, save_dir: str) -> str:
    """Download file from URL and return local path. Writes binary to preserve encoding."""
    filename = url.rpartition("/")[-1].split("?")[0]
    filepath = os.path.join(save_dir, filename)
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60)
    response.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(response.content)
    return filepath


downloaded = []
for url in URLS:
    try:
        path = download_file(url, DATA_DIR)
        downloaded.append(path)
        print(f"Downloaded: {path}")
    except Exception as e:
        print(f"Failed {url}: {e}")

Downloaded: data\mcs2025-gold.pdf
Downloaded: data\mcs2025-silver.pdf
Downloaded: data\World_Silver_Survey-2025.pdf
Downloaded: data\CMOApril2020preciousmetals.txt
Downloaded: data\monthly.csv


In [11]:
# Verify downloads
for p in downloaded:
    size = os.path.getsize(p) if os.path.exists(p) else 0
    print(f"{p}: {size:,} bytes")

data\mcs2025-gold.pdf: 743,291 bytes
data\mcs2025-silver.pdf: 744,783 bytes
data\World_Silver_Survey-2025.pdf: 12,951,882 bytes
data\CMOApril2020preciousmetals.txt: 26,324 bytes
data\monthly.csv: 35,492 bytes


In [12]:
# Load all documents
all_docs = []

for path in downloaded:
    if not os.path.exists(path):
        continue
    ext = os.path.splitext(path)[1].lower()
    try:
        if ext == ".pdf":
            loader = PDFPlumberLoader(path)
        elif ext == ".csv":
            loader = CSVLoader(path)
        else:
            loader = TextLoader(path, encoding="utf-8", autodetect_encoding=True)
        docs = loader.load()
        all_docs.extend(docs)
        print(f"Loaded {len(docs)} doc(s) from {path}")
    except Exception as e:
        print(f"Error loading {path}: {e}")

Loaded 2 doc(s) from data\mcs2025-gold.pdf
Loaded 2 doc(s) from data\mcs2025-silver.pdf
Loaded 92 doc(s) from data\World_Silver_Survey-2025.pdf
Loaded 1 doc(s) from data\CMOApril2020preciousmetals.txt
Loaded 2311 doc(s) from data\monthly.csv


In [16]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)

chunks = text_splitter.split_documents(all_docs)
print(f"Total chunks: {len(chunks)}")
print(f"Sample chunk ({len(chunks[0].page_content)} chars):\n{chunks[0].page_content[:300]}...")

Total chunks: 2744
Sample chunk (627 chars):
82
GOLD
(Data in metric tons,1 gold content, unless otherwise specified)
Domestic Production and Use: In 2024, domestic gold mine production was estimated to be 160 tons; the value
was estimated to be $12 billion, a 9% increase from the value in 2023. Gold was produced at more than 40 lode
mines in ...


## 2. Embed and Store

In [14]:
# Embedding model (sentence-transformers under the hood)
# all-MiniLM-L6-v2: fast, ~384 dims | all-mpnet-base-v2: better quality, slower
EMBED_MODEL = "all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)
print(f"Embeddings ready: {EMBED_MODEL}")

  embeddings = HuggingFaceEmbeddings(


Embeddings ready: all-MiniLM-L6-v2


In [None]:
# Create FAISS vector store from chunks
vectorstore = FAISS.from_documents(chunks, embeddings)

# Optional: save to disk for reuse (no need to re-embed next time)
#Why save it?
# 	Avoid re-embedding and re-building the index every run, which can be time-consuming. You can load the saved index later to quickly restore the vector store without reprocessing the documents.
FAISS_INDEX = "faiss_index"
vectorstore.save_local(FAISS_INDEX)
print(f"Vector store created and saved to {FAISS_INDEX}/ ({len(chunks)} chunks)")

Vector store created and saved to faiss_index/ (2744 chunks)


## 3. Query & Retrieve + Generate

In [31]:
# Retriever: embed question, search FAISS, return top-k chunks
TOP_K = 4
retriever = vectorstore.as_retriever(
    search_type="mmr",  # "similarity" for pure relevance, "mmr" to balance relevance and diversity
    search_kwargs={"k": TOP_K},
)
print(f"Retriever ready (top-{TOP_K} chunks)")
# search_type="mmr" â€“ balances relevance and diversity.

Retriever ready (top-4 chunks)


In [42]:
 
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    do_sample=True,
    temperature=0.3, 
)
#pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=256)
llm = HuggingFacePipeline(pipeline=pipe)
print(f"LLM ready: {model_id}")

Device set to use cpu


LLM ready: google/flan-t5-small


In [None]:
 
 
prompt_template = """Use the context below to answer ONLY the specific question asked. Do NOT use choice letters (A, B, 1, 2, 3, 4) or numbers alone
Context:
{context}

Question: {question}

Answer:"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [None]:
# RAG chain: retriever + LLM
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)
print("RAG chain ready")

RAG chain ready


In [None]:
# Query
query = "What is the current gold production  ?"
result = qa_chain.invoke({"query": query})

print("Answer:", result["result"])
query = "What is the current gold price  ?"
result = qa_chain.invoke({"query": query})

print("Answer:", result["result"])
print("\n--- Source chunks (top-k) ---")
for i, doc in enumerate(result["source_documents"][:2], 1):
    print(f"\n[{i}] {doc.page_content[:200]}...")

Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors


Answer: 3.300 tons
Answer: The estimated price in 2024 increased by 23% and reached a new record-high annual price compared with the previous record-high annual price in 2023.

--- Source chunks (top-k) ---

[1] in coins, and net bullion flow (in tons) to market from foreign stocks at the New York Federal Reserve Bank.
3Includes gold used in the production of consumer purchased bars, coins, and jewelry. Exclu...

[2] Date: 2020-01
Price: 1560.670...


### Gradio

In [None]:
import gradio as gr

def ask(question):
    if not question.strip():
        return "Please enter a question."
    result = qa_chain.invoke({"query": question})
    return result["result"]

demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(label="Ask about gold & silver", placeholder="e.g. What is US gold production?"),
    outputs=gr.Textbox(label="Answer"),
    title="Gold & Silver RAG Bot",
    description="Ask questions about gold and silver markets. Data from USGS, Silver Institute, World Bank.",
)
demo.launch(share=False)  # share=True for public link