In [2]:
import os
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from pinecone import Pinecone
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Pinecone as PineconeVectorStore
from langchain.chains import ConversationalRetrievalChain

from google.colab import userdata

os.environ["GROQ_API_KEY"] = "xxxxx"
os.environ["PINECONE_API_KEY"] = "xxxxx"

api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=api_key)

In [3]:
embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

index_name = "read-file-chatbot"

vectorstore = PineconeVectorStore.from_existing_index(index_name, embeddings)

retriever = vectorstore.as_retriever()

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=ChatGroq(model_name="llama3-8b-8192"),
    retriever=retriever,
    memory=memory
)

  embeddings = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [4]:
def fetch_arxiv_html(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to fetch URL {url} - Status Code: {response.status_code}")

def extract_text_from_html(html_content, num_pages=3):
    soup = BeautifulSoup(html_content, "html.parser")

    paragraphs = soup.find_all("p")
    extracted_text = [p.get_text() for p in paragraphs]

    text_chunk = " ".join(extracted_text[:num_pages * 500])

    return text_chunk

def index_text_in_pinecone(text):
    """Splits text and indexes it in Pinecone."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = text_splitter.create_documents([text])

    # Store in Pinecone
    vectorstore.add_texts([doc.page_content for doc in documents])
    print("Text indexed successfully!")

def ask_question(query):
    """Queries the chatbot with conversation memory."""
    response = qa_chain.run({"question": query})
    return response

In [5]:
if __name__ == "__main__":
    # Fetch HTML content
    url = "https://arxiv.org/html/2412.19437v1"
    html_content = fetch_arxiv_html(url)

    # Extract text from first 3 pages
    extracted_text = extract_text_from_html(html_content)#, num_pages=3)

    # Index text in Pinecone
    index_text_in_pinecone(extracted_text)

    # Chat loop with memory
    print("\nChatbot ready! Type 'exit' to stop.\n")
    while True:
        query = input("You: ")
        if query.lower() in ["exit", "quit"]:
            print("Chatbot: Goodbye!")
            break
        response = ask_question(query)
        print("Chatbot:", response)

Extracted Text Preview:
 001 




























































































































































































 







































 We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token.
To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE a
Text indexed successfully!

Chatbot ready! Type 'exit' to stop.

You: What is the main contribution of the paper?


  response = qa_chain.run({"question": query})


Chatbot: I don't know the specific paper being referred to, but based on the context provided, it appears to be related to a neural network architecture, possibly a gated fusion network. The notation and symbols used suggest that the paper may be discussing a specific type of expert-based model, where experts are routed or shared based on input tokens.

Without more specific information about the paper, I'm unable to determine the main contribution of the paper. If you could provide more context or details about the paper, I'd be happy to try and help answer your question.
You: Can you explain that in simpler terms?
Chatbot: I'm not entirely sure, as the provided context appears to be a technical description of the paper in a mathematical notation. However, I can try to simplify it:

The paper seems to be about improving the efficiency of a specific type of artificial intelligence model called a transformer. Specifically, it focuses on reducing the memory usage of the model during infe