In [7]:
import os
import streamlit as st
import pickle
import time
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Constants
FILE_PATH = "faiss_store.pkl"

st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")

# User Input: Dynamic URL Entry
# Split on any line ending and drop blank / whitespace-only lines, so an empty
# box or a trailing newline never produces "" as a URL to fetch.
urls = [
    line.strip()
    for line in st.sidebar.text_area("Enter article URLs (one per line)").splitlines()
    if line.strip()
]
process_url_clicked = st.sidebar.button("Process URLs")

# Load Hugging Face LLM (Replaces OpenAI)
@st.cache_resource
def load_llm():
    """Build the "text2text-generation" pipeline for google/flan-t5-base.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        The object produced by ``transformers.pipeline`` for the
        "text2text-generation" task, wired to the checkpoint's tokenizer
        and seq2seq model.
    """
    checkpoint = "google/flan-t5-base"
    return pipeline(
        "text2text-generation",
        # Tokenizer is listed first so it is loaded before the model weights.
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )

# Load Sentence Transformer model for embeddings
@st.cache_resource
def load_embedding_model():
    """Create the "all-MiniLM-L6-v2" SentenceTransformer used for embeddings.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A ``SentenceTransformer`` instance for "all-MiniLM-L6-v2".
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    return encoder

# Process URLs and store FAISS index
def process_urls(urls, embedding_model):
    """Fetch articles, embed their text chunks and persist a FAISS index.

    Args:
        urls: Iterable of URL strings. Blank or whitespace-only entries are
            ignored; ``None`` is treated as "no URLs".
        embedding_model: Object whose ``encode`` accepts a list of strings and
            returns one embedding row per string (this file passes a
            ``SentenceTransformer``).

    Returns:
        None. Progress and errors are reported through Streamlit. On success
        the tuple ``(index, docs)`` is pickled to ``FILE_PATH``.
    """
    # Drop blank lines up front: `any(urls)` alone would let "" entries reach
    # the loader whenever at least one real URL is present.
    clean_urls = [u.strip() for u in (urls or []) if u and u.strip()]
    if not clean_urls:
        st.error("Please enter at least one valid URL.")
        return

    try:
        st.text("Fetching articles... ⏳")
        loader = UnstructuredURLLoader(urls=clean_urls)
        data = loader.load()

        st.text("Splitting text into chunks... ⏳")
        text_splitter = RecursiveCharacterTextSplitter(separators=['\n\n', '\n', '.', ','], chunk_size=1000)
        docs = text_splitter.split_documents(data)

        # Without chunks the embedding matrix would be empty and
        # `embeddings.shape[1]` would fail with an unhelpful IndexError.
        if not docs:
            st.error("No text could be extracted from the provided URLs.")
            return

        st.text("Generating embeddings... ⏳")
        # Encode all chunks in one call instead of one model call per chunk,
        # and force float32, the dtype FAISS indexes operate on.
        texts = [doc.page_content for doc in docs]
        embeddings = np.array(embedding_model.encode(texts), dtype=np.float32)

        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

        with open(FILE_PATH, "wb") as f:
            pickle.dump((index, docs), f)

        st.text("Processing complete! ✅")
        time.sleep(2)

    except ModuleNotFoundError:
        st.error("Missing dependencies. Install using:")
        st.code("pip install unstructured pdfminer.six unstructured-inference unstructured-pytesseract")
    except Exception as e:
        st.error(f"Error processing URLs: {e}")

# Retrieve answers from FAISS
def query_llm(question, generator, embedding_model):
    """Answer ``question`` using the article chunks stored at ``FILE_PATH``.

    Args:
        question: The user's question as a string.
        generator: Callable invoked as ``generator(prompt, max_length=...,
            do_sample=...)`` that returns a list whose first item has a
            ``'generated_text'`` key (this file passes a transformers pipeline).
        embedding_model: Object whose ``encode`` returns an array embedding
            for a string.

    Returns:
        The generated answer text, or a human-readable message string when no
        data is available or an error occurs. Exceptions are not propagated.
    """
    if not os.path.exists(FILE_PATH):
        return "No data available. Please process URLs first."

    try:
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only load a store this app wrote itself.
        with open(FILE_PATH, "rb") as f:
            index, docs = pickle.load(f)

        if not docs:
            return "No data available. Please process URLs first."

        question_embedding = embedding_model.encode(question).reshape(1, -1)

        # Never request more neighbours than there are chunks: FAISS pads
        # missing results with label -1, and docs[-1] would silently repeat
        # the last chunk in the context.
        k = min(3, len(docs))
        _, neighbour_ids = index.search(question_embedding, k=k)

        relevant_texts = " ".join(
            docs[int(i)].page_content
            for i in neighbour_ids[0]
            if 0 <= int(i) < len(docs)
        )

        prompt = f"Context: {relevant_texts}\n\nQuestion: {question} Answer:"
        response = generator(prompt, max_length=200, do_sample=True)

        return response[0]['generated_text']

    except FileNotFoundError:
        # The file can disappear between the exists() check and open().
        return "Error: FAISS index not found. Please process URLs first."
    except Exception as e:
        return f"Error: {str(e)}"

# Load models
generator = load_llm()
embedding_model = load_embedding_model()

# Process URLs
if process_url_clicked:
    process_urls(urls=urls, embedding_model=embedding_model)

# Question Input
# Statement order below is the on-page order, so the answer is computed before
# the "Answer:" header is emitted.
if query := st.text_input("Ask a question:"):
    answer = query_llm(
        question=query,
        generator=generator,
        embedding_model=embedding_model,
    )
    st.subheader("Answer:")
    st.write(answer)


2025-02-25 16:06:50.476 
  command:

    streamlit run C:\Users\rgurr\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-02-25 16:06:50.495 Session state does not function when running a script without `streamlit run`
Device set to use cpu


In [2]:
# Install the CPU build of FAISS with the explicit %pip magic, so the cell does
# not depend on IPython automagic being enabled.
%pip install faiss-cpu


Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl.metadata (4.5 kB)
Using cached faiss_cpu-1.10.0-cp312-cp312-win_amd64.whl (13.7 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Note: you may need to restart the kernel to use updated packages.


In [4]:

# Install/upgrade the app's dependencies with the explicit %pip magic, so the
# cell does not depend on IPython automagic being enabled.
%pip install --upgrade streamlit transformers sentence-transformers faiss-cpu langchain unstructured pdfminer.six unstructured-inference unstructured-pytesseract


Collecting streamlit
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting langchain
  Using cached langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting unstructured
  Using cached unstructured-0.16.23-py3-none-any.whl.metadata (24 kB)
Collecting pdfminer.six
  Using cached pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting unstructured-inference
  Using cached unstructured_inference-0.8.7-py3-none-any.whl.metadata (5.3 kB)
Collecting unstructured-pytesseract
  Using cached unstructured.pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Using cached huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cach

In [6]:
# Install/upgrade langchain-community with the explicit %pip magic, so the cell
# does not depend on IPython automagic being enabled.
%pip install -U langchain-community

Collecting langchain-community
  Using cached langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Using cached pydantic_settings-2.8.0-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Using cached httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Using cached langchain_community-0.3.18-py3-none-any.whl (2.5 MB)
Using cached httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Using cached pydantic_settings-2.8.0-py3-none-any.whl (30 kB)
Installing collected packages: httpx-sse, pydantic-settings, langchain-community
Successfully installed httpx-sse-0.4.0 langchain-community-0.3.18 pydantic-settings-2.8.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Launch the Streamlit app in main.py through a shell escape.
# NOTE(review): the cell appears to stay busy until the server process is
# interrupted (the recorded output is just ^C).
!streamlit run main.py


^C
