# 1. Using HTTP Request

In [5]:
from bs4 import BeautifulSoup
import urllib.request

url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=100'
data = urllib.request.urlopen(url)

# Parse the XML data
soup = BeautifulSoup(data, 'xml')

# Initialize an empty list to store the results
results = []

# Iterate over each entry in the XML data
for entry in soup.find_all('entry'):
    # Initialize an empty dictionary for each entry
    result = {}
    
    # Extract the abstract, title, and authors
    result['abstract'] = entry.find('summary').text.strip()
    result['title'] = entry.find('title').text.strip()
    
    # Extract the author names and get only the first name
    authors = entry.find_all('author')
    result['authors'] = [author.find('name').text.strip().split('\n')[0] for author in authors]
    
    # Append the result to the list of results
    results.append(result)

# Print the list of results
print(results)



[]


# 2. Using arxiv package

In [12]:
import arxiv 
import logging
import PyPDF2
import requests
import io
import os
import re
import time
from dataclasses import dataclass
logging.basicConfig(level=logging.INFO)

# @dataclass
# class Paper:
#   id: str
#   title: str
#   abstract: str
#   authors: list
#   pdf_url: str
#   doi: str
#   updated: str
#   published: str
#   categories: list
#   text: str
#   ref: list  
    
# @dataclass
# class References:
#     references : list

class ArxivScraper(object):
  def __init__(self, query, max_results=10):
    
    self.query = query
    self.client = arxiv.Client()
    self.search = arxiv.Search(
      query = query,
      max_results = max_results,
    )    
    self.result = []
    
  def _get_text_from_pdf(self, pdf_url):
    req = requests.get(pdf_url)
    
    file_bin = io.BytesIO(req.content)
    pdf_reader = PyPDF2.PdfReader(file_bin)
    text = "\n".join([pdf_reader.pages[i].extract_text() for i in range(len(pdf_reader.pages))])
    
    return text
    
  
  def _get_references(self, text):
    ref_text = text.split("\nReferences\n")[-1] if "\nReferences\n" in text else text.split("\nREFERENCES\n")[-1]
    derived_ids = re.findall(r"\d{4}\.\d+", ref_text)
    
    # pop references from text
    text = "".join(text.split("\nReferences\n")[:-1]) if "\nReferences\n" in text \
            else "".join(text.split("\nREFERENCES\n")[:-1])
    
    return text,derived_ids
    
  def scrape(self, id, reference):

    url = f"https://export.arxiv.org/pdf/{id}"
    text = self._get_text_from_pdf(url)
    
    file_name = f"{id}.txt"
    #save text as txt where the name is the entry_id contained in folder named by query
    if not os.path.exists(self.query):
      os.makedirs(self.query)
      
    with open(f"{self.query}/{file_name}", "w", encoding = 'utf-8') as f:
      f.write(text)
      
    id_search = arxiv.Search(
      id_list=[id],
      max_results = 1,
    )
    
    paper = list(self.client.results(id_search))[0]
    
    self.result.append({
      'id': id,
      'title': paper.title,
      'abstract': paper.summary,
      'authors': [author.name for author in paper.authors],
      'pdf_url': paper.pdf_url,
      'doi': paper.doi,
      'updated': paper.updated,
      'published': paper.published,
      'categories': paper.categories,
      'text': text,
      'ref': self._get_references(text) if reference else []
    })
    
  def run(self):
    self._init_result = self.client.results(self.search)
    for paper in self._init_result:
      time.sleep(3)
      self.scrape(paper.entry_id.split("/")[-1], reference = True)
      
    for paper in self.result:
      for ref in paper['ref']:
        time.sleep(3)
        self.scrape(ref, reference = False)
    
    return self.result
    



In [None]:
from arxiv_bot.search import ProcessPDF

ProcessPDF()

In [2]:
scraper = ArxivScraper("Mixture of Experts", 5)

In [3]:
results = scraper.run()

INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=Mixture+of+Experts&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 100 of 2368832 total results
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=1806.08200v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=1312.4314v3&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=2008.09662v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results
INFO:arxiv:Requesting page (first: True, try: 0): https://export.ar

In [8]:
from datetime import datetime
import json
import os

# Custom JSON encoder to serialize datetime objects as strings
class DateTimeEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)

# Export the results to a JSON file with the custom JSON encoder
folder_name = scraper.query
file_path = f"{folder_name}/results.json"
with open(file_path, "w") as f:
    json.dump(results, f, cls=DateTimeEncoder)

print(f"Results exported to {file_path}")


Results exported to Mixture of Experts/results.json


# GPT3.5 No RAG

In [18]:
import dotenv
import os
from langchain.chat_models import ChatOpenAI
from datetime import datetime

dotenv.load_dotenv()

chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model = 'gpt-3.5-turbo'
)


# Initialize the client
from langchain.schema import(
    SystemMessage,
    HumanMessage,
    AIMessage
)


init_messages = [
    SystemMessage(
        content="Hi, I'm a chatbot that can answer questions about the arXiv!",
        timestamp=datetime.now(),
    ),
    SystemMessage(
        content="What would you like to know?",
        timestamp=datetime.now(),
    ),
    HumanMessage(
        content="What is the latest research about Mixture of Experts?",
        timestamp=datetime.now()
    )
]

res = chat(messages=init_messages)

In [19]:
print(res.content)

As an AI language model, I don't have real-time access to the latest research papers. However, I can provide you with a general overview of the Mixture of Experts (MoE) technique.

Mixture of Experts is a machine learning approach that combines multiple "expert" models to make predictions. Each expert focuses on a specific region of the input space and provides predictions for that region. A gating network is used to determine the relevance or weight of each expert's prediction based on the input.

MoE has been used in various applications, including computer vision, natural language processing, and reinforcement learning. Some recent research papers have explored different aspects of MoE, such as improving the training process, adapting the gating mechanism, or applying it to specific tasks.

To find the latest research papers on Mixture of Experts, I recommend visiting the arXiv website (https://arxiv.org/) and searching for relevant keywords such as "Mixture of Experts," "expert gat

In [8]:
import chromadb

chroma_client = chromadb.PersistentClient(path='.')

collection = chroma_client.get_or_create_collection('arxiv')

# Define the batch size
batch_size = 1000

# Iterate over each row in the DataFrame
for i in tqdm(range(0, len(df), batch_size)):
    i_end = min(i + batch_size, len(df))
    
    batch = df[i:i_end]
    chunk_ids = [x['chunk_id'] for _,x in batch.iterrows()]
    texts = [x['chunk'] for _,x in batch.iterrows()]
    embeddings = embed_model.embed_documents(texts)
    metadata = [{'paper_id': x['paper_id'], 'title': x['title']} for _,x in batch.iterrows()]
    collection.add(
        ids = chunk_ids,
        documents= texts,
        embeddings = embeddings,
        metadatas = metadata
    )
    

  0%|          | 0/9 [00:00<?, ?it/s]

Insert of existing embedding ID: 1806.08200v1-0
Insert of existing embedding ID: 1806.08200v1-1
Insert of existing embedding ID: 1806.08200v1-2
Insert of existing embedding ID: 1806.08200v1-3
Insert of existing embedding ID: 1806.08200v1-4
Insert of existing embedding ID: 1806.08200v1-5
Insert of existing embedding ID: 1806.08200v1-6
Insert of existing embedding ID: 1806.08200v1-7
Insert of existing embedding ID: 1806.08200v1-8
Insert of existing embedding ID: 1806.08200v1-9
Insert of existing embedding ID: 1806.08200v1-10
Insert of existing embedding ID: 1806.08200v1-11
Insert of existing embedding ID: 1806.08200v1-12
Insert of existing embedding ID: 1806.08200v1-13
Insert of existing embedding ID: 1806.08200v1-14
Insert of existing embedding ID: 1806.08200v1-15
Insert of existing embedding ID: 1806.08200v1-16
Insert of existing embedding ID: 1806.08200v1-17
Insert of existing embedding ID: 1806.08200v1-18
Insert of existing embedding ID: 1806.08200v1-19
Insert of existing embedding I

In [30]:

import json

with open('Mixture of Experts/results.json') as json_file:
    results = json.load(json_file) 
from langchain.text_splitter import TokenTextSplitter
from tqdm import tqdm

CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

splitter = TokenTextSplitter.from_tiktoken_encoder(
    chunk_size = CHUNK_SIZE, chunk_overlap = CHUNK_OVERLAP
)

for result in tqdm(results):
    result['chunks'] = splitter.split_text(result['text'])

100%|██████████| 49/49 [00:00<00:00, 92.47it/s]


In [34]:
import pandas as pd

# Initialize an empty list to store the data
data = []

# Iterate over each paper in the results
for paper in results:
    # Iterate over each chunk in the paper
    for i, chunk in enumerate(paper['chunks']):
        # Append the chunk, title, and id to the data list
        chunk_id = f"{paper['id']}-{i}"
        data.append({'chunk_id': chunk_id, 'chunk': chunk, 'title': paper['title'], 'paper_id': paper['id']})



In [35]:
# delte duplicate paper-ids in data with dict
data = [dict(t) for t in {tuple(d.items()) for d in data}]

In [4]:
import gpt4all

In [2]:
from langchain.vectorstores import chroma
from langchain.llms import gpt4all
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain

In [3]:
PERSIST_DIR = "vdb_arxiv2"
MISTRAL = "models/mistral-7b-instruct-v0.1.Q4_0.gguf"



In [21]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.agents import AgentType, initialize_agent, load_tools
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

# embeddings_open = OpenAIEmbeddings(
#     model = 'text-embedding-ada-002'
# )

llm_open = gpt4all.GPT4All(
    model = MISTRAL    
)

# prompt = PromptTemplate(
#     template = """ 
#     You are a chatbot that can answer questions about the arXiv. You will be asked questions about the latest research. The format
#     is:
    
#     Question: {question}
    
#     Answer:
#     """,
    
#     input_variables = ['question']
# )
# callbacks = [StreamingStdOutCallbackHandler()]

# llm_chain = LLMChain(prompt=prompt, llm=llm_open, callbacks=callbacks)

# llm_chain.run(question = "What are the latest research about Selective State Spaces in NLP?")


llm_openai = ChatOpenAI(
    model = 'gpt-3.5-turbo'
)


tools = load_tools(
    ['arxiv']
)

agent_chain = initialize_agent(
    tools,
    llm_openai,
    agent = AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True)

print(agent_chain.run(
    "What are some recent research about transformers for LLMs? Give me summaries of each paper and their significance relative to current literature"
    ))



[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=transformers+for+LLMs&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100


[32;1m[1;3mI should search for recent research papers about transformers for LLMs on arxiv.org.
Action: arxiv
Action Input: transformers for LLMs[0m

INFO:arxiv:Got first page: 100 of 1907362 total results



Observation: [36;1m[1;3mPublished: 2023-07-05
Title: Several categories of Large Language Models (LLMs): A Short Survey
Authors: Saurabh Pahune, Manoj Chandrasekharan
Summary: Large Language Models(LLMs)have become effective tools for natural language
processing and have been used in many different fields. This essay offers a
succinct summary of various LLM subcategories. The survey emphasizes recent
developments and efforts made for various LLM kinds, including task-based
financial LLMs, multilingual language LLMs, biomedical and clinical LLMs,
vision language LLMs, and code language models. The survey gives a general
summary of the methods, attributes, datasets, transformer models, and
comparison metrics applied in each category of LLMs. Furthermore, it highlights
unresolved problems in the field of developing chatbots and virtual assistants,
such as boosting natural language processing, enhancing chatbot intelligence,
and resolving moral and legal dilemmas. The purpose of this st

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mI have found three recent research papers about transformers for LLMs on arxiv.org. The first paper is titled "Several categories of Large Language Models (LLMs): A Short Survey" by Saurabh Pahune and Manoj Chandrasekharan. It provides a summary of various subcategories of LLMs and highlights recent developments and efforts made in different fields. The second paper is titled "Can LLM-Generated Misinformation Be Detected?" by Canyu Chen and Kai Shu. It explores the potential for LLMs to generate misinformation and discusses the difficulty in detecting it compared to human-written misinformation. The third paper is titled "INT-FP-QSim: Mixed Precision and Formats For Large Language Models and Vision Transformers" by Lakshmi Nair et al. It proposes an open-source simulator that allows flexible evaluation of LLMs and vision transformers at various numerical precisions and formats. The simulator is used to study the impact of different numerical formats on performance and comp

In [28]:
llm_open("Hi")

INFO:gpt4all.pyllmodel:LLModel.prompt_model -- prompt:
Hi
===/LLModel.prompt_model -- prompt/===
Exception ignored on calling ctypes callback function: <function LLModel._callback_decoder.<locals>._raw_callback at 0x000001F506FA8670>
Traceback (most recent call last):
  File "c:\Users\muraf\Courses\arxiv_llm\venv\lib\site-packages\gpt4all\pyllmodel.py", line 438, in _raw_callback
    def _raw_callback(token_id: int, response: bytes) -> bool:
KeyboardInterrupt: 


In [4]:
from langchain_community.document_loaders import ArxivLoader
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from arxiv_custom import ArxivAPIWrapper
from langchain.vectorstores import chroma
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.storage import InMemoryStore

from typing import Optional, Any, List
from tqdm import tqdm
from dotenv import load_dotenv
import re

# logging.basicConfig(level=logging.INFO)

class ArxivLoaderWithRefs(BaseLoader):
    def __init__(
            self, 
            query: str, 
            load_max_refs: Optional[int] = None, 
            doc_content_chars_max: Optional[int] = None,
            **kwargs: Any
        ):
            self.query = query
            self.load_max_refs = load_max_refs
            self.client = ArxivAPIWrapper(
                doc_content_chars_max=doc_content_chars_max, **kwargs
            )
            self.results = []
            # self.logger = logging.getLogger()
            
    def _remove_ref_from_content(self, content : str) -> str:
        """Remove the references from the content."""
        
        splitted_text = content.split("\nReferences\n") if "\nReferences\n" in content \
                    else content.split("\nREFERENCES\n")
        #Split references from content
        content_new = "".join(splitted_text[:-1])
        refs = splitted_text[-1]
        return content_new, refs
            
    def _get_references(self, paper : Document) -> List[Document]:
        """Get references from a paper."""
        
        # ARXIV_ID_REGEX =  r"\d{2}(0[1-9]|1[0-2])\.\d{4,5}(v\d+|)|\d{7}.*"
        ARXIV_ID_REGEX =  r"\d{4}\.\d{4,5}"
        
        content_new, ref_text = self._remove_ref_from_content(paper.page_content)
        
        # Get the ids from the references
        derived_ids = re.findall(ARXIV_ID_REGEX, ref_text)
        
        print("Lenght of derived_ids: ", len(derived_ids))
        # Remove the references from the content
        
                    
        if self.load_max_refs is not None:
            derived_ids = derived_ids[:self.load_max_refs]
        
         
        refs_docs = []
        print("Lenght of derived_ids: ", len(derived_ids))
        if len(derived_ids) == 0:
            return content_new, refs_docs
        else:
            for id in derived_ids:
                print(id)
                try:
                    refs_docs.append(self.client.load(query=id)[0])
                except:
                    pass
            
        return content_new, refs_docs
            
    def load(self) -> List[Document]:
        """Load documents from Arxiv API."""
        
        # ARXIV_ID_REGEX =  r"\d{2}(0[1-9]|1[0-2])\.\d{4,5}(v\d+|)|\d{7}.*"
        
        # self.logger.info(f"Loading documents from Arxiv API")
        init_results = self.client.load(query=self.query)
            
        # self.logger.info(f"Loaded {len(init_results)} documents from Arxiv API")
        
        # Get the initial results
        self.results.extend(init_results)
        
        # Iterate over each result and get the references ids
        # self.logger.info(f"Getting references")

        for paper in init_results:
            content_new, refs_docs = self._get_references(paper)
            paper.page_content = content_new
            self.results.extend(refs_docs)
            
        return self.results
        
            
docs = ArxivLoaderWithRefs(
    query="1706.03762", 
    load_max_docs=3,
    load_max_refs=0,
).load()




Lenght of derived_ids:  22
Lenght of derived_ids:  0


In [1]:
from grobid_client.grobid_client import GrobidClient

client = GrobidClient(config_path="./grobid_client_python/config.json")

GROBID server is up and running


In [2]:
client.process("processFulltextDocument", "./pdfs/", output="./output/", force=True)

In [4]:
from dataclasses import dataclass
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

def read_tei(tei_file):
    with open(tei_file, 'r', encoding='utf-8') as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

def elem_to_text(elem, default=''):
    if elem:
        return elem.getText()
    else:
        return default



@dataclass
class Person:
    firstname: str
    middlename: str
    surname: str

class TEIFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    @property
    def authors(self):
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first")).strip()
            middlename = elem_to_text(persname.find("forename", type="middle")).strip()
            surname = elem_to_text(persname.surname).strip()
            if middlename == '':
                full_name = f"{firstname} {surname}".strip()
            else:
                full_name = f"{firstname} {middlename} {surname}".strip()
            result.append(full_name)
        return result
    
    @property
    def text(self):
        if not self._text:
            divs_text = []
            for div in self.soup.body.find_all("div"):
                # div is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    text = div.getText(separator=': ', strip=True).replace("\n", "")
                    
                    divs_text.append(text)
            plain_text = "\n\n".join(divs_text)
            self._text = plain_text
        return self._text



In [5]:
from langchain_community.utilities import GoogleSearchAPIWrapper
from dotenv import load_dotenv
from grobid_client.grobid_client import GrobidClient
from langchain_community.vectorstores import chroma
import chromadb
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import SpacyTextSplitter
from langchain_core.vectorstores import VectorStore
from langchain.schema.retriever import BaseRetriever
from langchain.schema.document import Document
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
import os
import time
import re
import arxiv
import logging
from typing import List
logging.basicConfig(level = logging.INFO)
logger = logging.getLogger("IndexArxivPapers")

load_dotenv()

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

COLLECTION_NAME = "arxiv"
PERSIST_DIR = "arxiv_vdb"

vectordb = chroma.Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings)

splitter = SpacyTextSplitter(chunk_size = 512, chunk_overlap = 0, separator="\n\n")

class IndexNewArxivPapers:
    def __init__(self, vectordb: VectorStore, n_search_results: int = 2):
        self.google_api = GoogleSearchAPIWrapper()
        self.grobid_client = GrobidClient(config_path="./grobid_client_python/config.json")
        self.arxiv_client = arxiv.Client(delay_seconds=0)
        self.vectordb = vectordb
        self.chromadb_client = chromadb.PersistentClient("arxiv_vdb").get_collection("arxiv")
        self.n_search_results = n_search_results
    
    def _get_paper_ids(self, query):
        ARXIV_ID_REGEX =  r"\d{4}\.\d{4,5}"
        
        ids = list({re.findall(ARXIV_ID_REGEX, result['link'])[0] for result in self.google_api.results(query, self.n_search_results)})
        
        return ids
    
        
    
    def _run(self, query):
        logger.info(f"Getting paper ids from google search")
        ids = self._get_paper_ids(query)
        logger.info(f"Got the following ids: {ids}")
        
        os.makedirs(f"./output/", exist_ok=True)
        os.makedirs(f"./pdfs/", exist_ok=True)
        
        for id in ids:
            if not len(self.chromadb_client.get(where={'paper_id': id})['ids']) > 0:
                paper = list(self.arxiv_client.results(arxiv.Search(id_list=[id])))[0]
                logger.info(f"Downloading paper with id {id}")
                paper.download_pdf(dirpath=f"./pdfs/", filename=f"{id}.pdf")
                while not os.path.exists(f"./pdfs/{id}.pdf"):
                    time.sleep(1)
                logger.info(f"Downloaded paper with id {id}")
                
                logger.info(f"Processing papers with GROBID")
                self.grobid_client.process("processFulltextDocument", f"./pdfs/", output=f"./output/", force=True)            
                
                while not os.path.exists(f"./output/{id}.grobid.tei.xml"):
                    time.sleep(1)
                logger.info(f"Processed papers with GROBID")
            
                logger.info(f"Parsing TEI file for id {id}")
                tei_object = TEIFile(f"./output/{id}.grobid.tei.xml")
                chunks = splitter.split_text(tei_object.text)
                metadata = [{'paper_id': id, 'authors': ", ".join(tei_object.authors),'title': tei_object.title, 'chunk_id': f"{id}-{i}"} for i in range(len(chunks))]
            
                logger.info(f"Adding paper with id {id} to vectorstore")
                self.vectordb.add_texts(
                    chunks,
                    metadata
                )

            else:
                logger.info(f"Paper with id {id} already exists in vectorstore")
                continue
                            
    def _arun(self, query: str):
        raise NotImplementedError("This tool does not support asynchronous execution.")
        
    

class RAGWithSearch(BaseRetriever):
    retriever: BaseRetriever
    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        #Index new papers using the tool
        index_tool = IndexNewArxivPapers(vectordb)
        index_tool._run(query)
        
        # Use your existing retriever to get the documents
        documents = self.retriever.get_relevant_documents(query, callbacks=run_manager.get_child())
        
        return documents


In [13]:
import chromadb
chromadb.PersistentClient(path='arxiv_vdb').delete_collection('arxiv')

In [7]:
retriever = vectordb.as_retriever(
    search_type="mmr", search_kwargs={"lambda_mult": 0.5, "k": 4}
)

In [8]:
retriever_with_search = RAGWithSearch(retriever=retriever)

In [None]:
retriever.get_relevant_documents("What is the Mamba model?")

In [16]:
retriever_with_search.get_relevant_documents("What is LLama Pro?")

INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:IndexArxivPapers:Getting paper ids from google search


GROBID server is up and running


INFO:IndexArxivPapers:Got the following ids: ['2401.02415']
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=2401.02415&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results
INFO:IndexArxivPapers:Downloading paper with id 2401.02415
INFO:IndexArxivPapers:Downloaded paper with id 2401.02415
INFO:IndexArxivPapers:Processing papers with GROBID
INFO:IndexArxivPapers:Processed papers with GROBID
INFO:IndexArxivPapers:Parsing TEI file for id 2401.02415
INFO:IndexArxivPapers:Adding paper with id 2401.02415 to vectorstore
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.uti

[Document(page_content='We introduce LLAMA PRO and LLAMA PRO -INSTRUCT, versatile LLMs that well integrate natural and programming languages, excelling in general tasks, programming, and mathematics.\n\n:\n\n• We benchmark the family of LLAMA PRO on extensive datasets, including both traditional and agentoriented tasks, demonstrating its superiority and great potential in broader complex applications.\n\n\n\nRelated Work: Advancements in Large Language Models.', metadata={'authors': 'Chengyue Wu, Yukang Gan, Yixiao Ge, Zeyu Lu, Jiahao Wang, Ye Feng, Ping Luo, Ying Shan', 'chunk_id': '2401.02415-10', 'paper_id': '2401.02415', 'title': 'LLAMA PRO: Progressive LLaMA with Block Expansion'}),
 Document(page_content='Situated on the Pareto frontier, LLAMA PRO has undergone fine-tuning with an additional 80B tokens in conjunction with LLaMA2, which more than doubles the code tasks average performance.\n\nIn contrast, CodeLLaMA is fine-tuned with 500B tokens.', metadata={'authors': 'Chengyue W

In [9]:
# implement a retrievalqa chain
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.agents.initialize import initialize_agent
from langchain.tools.retriever import create_retriever_tool

llm = ChatOpenAI(
    model = 'gpt-3.5-turbo',
    temperature=0.0
)

retrieval_tool = create_retriever_tool(
    name = 'retriever',
    description = 'Retrieves relevant documents vector database to answer a question',
    retriever = retriever
)

retriever_with_search_tool = create_retriever_tool(
    name = 'retriever_with_search',
    description = 'Retrieves relevant documents from the web when needed and index them vector database to answer a question',
    retriever = retriever_with_search
)

tools = [retrieval_tool, retriever_with_search_tool]

AGENT_INSTRUCTIONS = """You are an agent that is very knowledgable in the latest research papers in Arxiv and 
you provide detailed and in depth answers to questions regarding a specific topic. Answer the following questions as best as you can.

Answer common knowledge questions without any of the tools.
If more specific knowledge is needed, TRY RETRIEVING THE INFORMATION USING THE 'retriever' tool AND CONTEXT FIRST.
If no documetns are retrieved, try the 'retriever_with_search' tool.

You have access to the following tools:

retriever: Retrieves relevant documents from a vector database. This inputs a query and outputs a list of documents.
retriever_with_search: Searches the arxiv website for relevant documetns, indexes them in a vector database and then retrieves them. 
                            This inputs a query and outputs a list of documents.

Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action you should take. Should be one of the tools [retriever, retriever_with_search, document_summarizer]
Action Input: the input to the action
This Thought-Action-Input loop should be repeated until you have enough information to answer the question.
Thought: I now have enough information to answer the question
Final Answer: the final answer to the question              

Begin!



Chat History:
{chat_history},

Input: {input}
Thought: {agent_scratchpad}
"""

prompt = PromptTemplate(
    template = AGENT_INSTRUCTIONS,
    input_variables = ['chat_history', 'input', 'agent_scratchpad']
)


agent = initialize_agent(
    tools,
    llm,
    agent = "chat-conversational-react-description",
    agent_kwargs = {"prompt": prompt},
    memory = ConversationBufferMemory(memory_key='chat_history', 
                                      input_key='input',
                                      output_key='output',
                                      return_messages=True
                                      ),
    verbose=True,
    handle_parsing_errors=True,
    return_intermediate_steps=True
)


  warn_deprecated(


In [10]:
agent

AgentExecutor(memory=ConversationBufferMemory(output_key='output', input_key='input', return_messages=True, memory_key='chat_history'), verbose=True, tags=['chat-conversational-react-description'], agent=ConversationalChatAgent(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['agent_scratchpad', 'chat_history', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]], 'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplat

In [11]:
agent.invoke({"input": 'Hi'})




[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "Hello! How can I assist you today?"
}[0m

[1m> Finished chain.[0m


{'input': 'Hi',
 'chat_history': [HumanMessage(content='Hi'),
  AIMessage(content='Hello! How can I assist you today?')],
 'output': 'Hello! How can I assist you today?',
 'intermediate_steps': []}

In [12]:
agent.invoke({"input": 'Can you tell me about the Mixtral model?'})



[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mCould not parse LLM output: Sure! The Mixtral model is a language model developed by OpenAI. It is a variant of the GPT (Generative Pre-trained Transformer) model, which is a type of deep learning model that is trained on large amounts of text data to generate human-like text.

The Mixtral model is trained using a combination of supervised fine-tuning and reinforcement learning. It is trained on a diverse range of data sources, including books, articles, and websites, to develop a broad understanding of human language and knowledge.

The Mixtral model is designed to be able to assist with a wide range of tasks, from answering questions and providing explanations to engaging in natural-sounding conversations. It is able to generate coherent and relevant responses based on the input it receives, and can provide accurate and informative information on a wide range of topics.

Overall, the Mixtral model is a powerful language model that can provide valuable assistance and info

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:IndexArxivPapers:Getting paper ids from google search


[32;1m[1;3m{
    "action": "retriever_with_search",
    "action_input": "Mixtral model"
}[0mGROBID server is up and running


INFO:IndexArxivPapers:Got the following ids: ['2401.04088', '2312.14557']
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=2401.04088&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results
INFO:IndexArxivPapers:Downloading paper with id 2401.04088
INFO:IndexArxivPapers:Downloaded paper with id 2401.04088
INFO:IndexArxivPapers:Processing papers with GROBID
INFO:IndexArxivPapers:Processed papers with GROBID
INFO:IndexArxivPapers:Parsing TEI file for id 2401.04088
INFO:IndexArxivPapers:Adding paper with id 2401.04088 to vectorstore
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=2312.14557&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results
INFO:IndexArxivPapers:Downloading paper


Observation: [33;1m[1;3m[Document(page_content='We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model.\n\nMixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts).\n\nFor every token, at each layer, a router network selects two experts to process the current state and combine their outputs.\n\nEven though each token only sees two experts, the selected experts can be different at each timestep.', metadata={'authors': 'Albert Q Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Singh Devendra, Diego Chaplot, Emma Bou De Las Casas, Florian Hanna, Gianna Bressand, Guillaume Lengyel, Guillaume Bour, Lample, Renard Lélio, Lucile Lavaud, Marie-Anne Saulnier, Pierre Lachaux, Sandeep Stock, Sophia Subramanian, Szymon Yang, Teven Antoniak, Théophile Le Scao, Thibaut Gervet, Thomas Lavril, Timothée Wang, William Lacroix, El Sayed', 'chunk_id': '2401.0408

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The Mixtral model is a Sparse Mixture of Experts (SMoE) language model. It has the same architecture as Mistral 7B, but each layer is composed of 8 feedforward blocks (experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Mixtral outperforms Llama 2 70B in several languages and demonstrates reduced biases and a more balanced sentiment profile. It supports a fully dense context length of 32k tokens and uses Mixture-of-Expert layers. The model architecture parameters are summarized in Table 1 of the paper 'Mixtral of Experts' by Albert Q Jiang et al."
}[0m

[1m> Finished chain.[0m


{'input': 'Can you tell me about the Mixtral model?',
 'chat_history': [HumanMessage(content='Hi'),
  AIMessage(content='Hello! How can I assist you today?'),
  HumanMessage(content='Can you tell me about the Mixtral model?'),
  AIMessage(content="The Mixtral model is a Sparse Mixture of Experts (SMoE) language model. It has the same architecture as Mistral 7B, but each layer is composed of 8 feedforward blocks (experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Mixtral outperforms Llama 2 70B in several languages and demonstrates reduced biases and a more balanced sentiment profile. It supports a fully dense context length of 32k tokens and uses Mixture-of-Expert layers. The model architecture parameters are summarized in Table 1 of the paper 'Mixtral of Experts' by Albert Q Jiang et al.")],
 'output': "The Mixtral model is a Sparse Mixture of Experts (SMoE) language model. It has the same architecture

In [21]:
agent.invoke({"input": 'What about the new Mamba model that uses linear state spaces?'})



[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mCould not parse LLM output: I apologize, but I am not familiar with the "Mamba" model that uses linear state spaces. It is possible that this model is not widely known or may be a recent development. If you have any additional information or context about the Mamba model, I may be able to provide a more accurate response.[0m
Observation: Invalid or incomplete response
Thought:

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:IndexArxivPapers:Getting paper ids from google search


[32;1m[1;3m{
    "action": "retriever_with_search",
    "action_input": "Mamba model that uses linear state spaces"
}[0mGROBID server is up and running


INFO:IndexArxivPapers:Got the following ids: ['2312.00752']
INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=2312.00752&sortBy=relevance&sortOrder=descending&start=0&max_results=100
INFO:arxiv:Got first page: 1 of 1 total results
INFO:IndexArxivPapers:Downloading paper with id 2312.00752
INFO:IndexArxivPapers:Downloaded paper with id 2312.00752
INFO:IndexArxivPapers:Processing papers with GROBID
INFO:IndexArxivPapers:Processed papers with GROBID
INFO:IndexArxivPapers:Parsing TEI file for id 2312.00752
INFO:IndexArxivPapers:Adding paper with id 2312.00752 to vectorstore
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.uti


Observation: [33;1m[1;3m[Document(page_content='Scaling\n\nConclusion: We introduce a selection mechanism to structured state space models, allowing them to perform context-dependent reasoning while scaling linearly in sequence length.\n\nWhen incorporated into a simple attention-free architecture, Mamba achieves state-of-the-art results on a diverse set of domains, where it matches or exceeds the performance of strong Transformer models.', metadata={'authors': 'Albert Gu, Tri Dao', 'chunk_id': '2312.00752-99', 'paper_id': '2312.00752', 'title': 'Mamba: Linear-Time Sequence Modeling with Selective State Spaces'}), Document(page_content='On long-form audio waveforms, it in fact significantly hampers performance, which may be intuitive from the point of view that audio is uniformly sampled and very smooth, and therefore benefits from continuous linear time-invariant (LTI) methods.\n\nAfter ablating away the selection mechanism, note that the resulting model is the S4 layer inside the 

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The Mamba model is a linear-time sequence modeling approach that uses selective state spaces. It introduces a selection mechanism to structured state space models, allowing them to perform context-dependent reasoning while scaling linearly in sequence length. Mamba achieves state-of-the-art results on various domains and matches or exceeds the performance of strong Transformer models. You can find more details in the paper 'Mamba: Linear-Time Sequence Modeling with Selective State Spaces' by Albert Gu and Tri Dao."
}[0m

[1m> Finished chain.[0m


{'input': 'What about the new Mamba model that uses linear state spaces?',
 'chat_history': [HumanMessage(content='Hi'),
  AIMessage(content='Hello! How can I assist you today?'),
  HumanMessage(content='Can you tell me about the Mixtral model?'),
  AIMessage(content="The Mixtral model is a Sparse Mixture of Experts (SMoE) language model. It has the same architecture as Mistral 7B, but each layer is composed of 8 feedforward blocks (experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Mixtral outperforms Llama 2 70B in several languages and demonstrates reduced biases and a more balanced sentiment profile. It supports a fully dense context length of 32k tokens and uses Mixture-of-Expert layers. The model architecture parameters are summarized in Table 1 of the paper 'Mixtral of Experts' by Albert Q Jiang et al."),
  HumanMessage(content='What about the new Mamba model that uses linear state spaces?'),
  A

In [17]:
agent.invoke({'input': 'How does the Mamba model compare to the Mixtral model?'})



[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "retriever",
    "action_input": "Comparison between Mamba model and Mixtral model"
}[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.



Observation: [36;1m[1;3m[Document(page_content='As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics.\n\nOn language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\n\n\n\nIntroduction: Foundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an effective paradigm in modern machine learning.', metadata={'chunk_id': '2312.00752-3', 'paper_id': '2312.00752', 'title': 'Mamba: Linear-Time Sequence Modeling with Selective State Spaces'}), Document(page_content='Detailed results for Mixtral, Mistral 7B and Llama 2 7B/13B/70B and Llama 1 34B: 2: are reported in Table: 2: .\n\nFigure: 2: compares the performance of Mixtral with the Llama models in different categories.\n\nMixtral surpasses Llama 2 70B across most metrics.\n\nIn p

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The Mamba model achieves state-of-the-art performance across several modalities such as language, audio, and genomics. In language modeling, the Mamba model outperforms Transformers of the same size and matches Transformers twice its size in both pretraining and downstream evaluation. Mixtral, on the other hand, surpasses Llama 2 70B across most metrics and displays superior performance in code and mathematics benchmarks. Both models have their strengths and achievements in different areas."
}[0m

[1m> Finished chain.[0m


{'input': 'How does the Mamba model compare to the Mixtral model?',
 'chat_history': [HumanMessage(content='Hi'),
  AIMessage(content='Hello! How can I assist you today?'),
  HumanMessage(content='Can you tell me about the Mixtral model?'),
  AIMessage(content='The Mixtral model is an open-source mixture-of-experts model. It uses a set of 8 distinct groups of parameters and each feedforward block picks two of these groups to process the token. It has been reported to achieve comparable accuracy to GPT 3.5 Turbo on several tasks. Mixtral significantly outperforms Llama 2 70B in French, German, Spanish, and Italian. It is pretrained with multilingual data using a context size of 32k tokens. Mixtral either matches or exceeds the performance of Llama 2 70B and GPT-3.5 on several benchmarks.'),
  HumanMessage(content='What about the new Mamba model that uses linear state spaces?'),
  AIMessage(content='The Mamba model is a linear-time sequence modeling approach that uses selective state spa

In [18]:
agent.invoke({'input': 'Can you tell me how both models work?'})



[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mCould not parse LLM output: Sure! To provide a detailed explanation of how both the Mixtral and Mamba models work, I will need to retrieve relevant information. Please give me a moment to gather the necessary details.[0m
Observation: Invalid or incomplete response
Thought:

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "retriever",
    "action_input": "Mixtral model working principle"
}[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.



Observation: [36;1m[1;3m[Document(page_content='Mixtral in contrast, is an open-source mixture-of-experts model, where each feedforward block picks from a set of 8 distinct groups of parameters and uses two to process the token:\n\n[Mistral AI team, 2023]: .\n\nIt has been reported to achieve comparable accuracy to GPT 3.5 Turbo on several tasks, including some examined in this paper.\n\nWe use the mistralai/Mixtral-8x7b-Instruct-v0.1 version of the model.', metadata={'chunk_id': '2312.11444-8', 'paper_id': '2312.11444', 'title': "An In-depth Look at Gemini's Language Abilities"}), Document(page_content='As it only uses a subset of its parameters for every token, Mixtral allows faster inference speed at low batch-sizes, and higher throughput at large batch-sizes.\n\n:\n\nMixtral is a sparse mixture-of-experts network.\n\nIt is a decoder-only model where the feedforward block picks from a set of 8 distinct groups of parameters.\n\nAt every layer, for every token, a router network cho

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "retriever",
    "action_input": "Mamba model working principle"
}[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.



Observation: [36;1m[1;3m[Document(page_content="Although the main underlying mechanism of Mamba differs significantly from the attention mechanism used in Transformers, Mamba retains the high-level, block-based structure of Transformer models.\n\nIn this paradigm, identical blocks comprising one or more layers are stacked one after another, with each layer's output being added to the residual stream (Figure: 2: ).\n\nThe final value of the residual stream can subsequently be used to predict the next token in the language modeling task.", metadata={'chunk_id': '2401.04081-16', 'paper_id': '2401.04081', 'title': 'Towards Understanding Mixture of Experts in Deep Learning'}), Document(page_content='We are excited about the broad applications of selective state space models to build foundation models for different domains, especially in emerging modalities requiring long context such as genomics, audio, and video.\n\nOur results suggest that Mamba is a strong candidate to be a general se

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The Mixtral model is an open-source mixture-of-experts model where each feedforward block picks from a set of 8 distinct groups of parameters to process the token. It achieves comparable accuracy to GPT 3.5 Turbo on several tasks. On the other hand, the Mamba model is a linear-time sequence modeling approach that uses selective state spaces. It retains the high-level, block-based structure of Transformer models but differs significantly in its underlying mechanism. Both models have shown impressive performance in their respective domains."
}[0m

[1m> Finished chain.[0m


{'input': 'Can you tell me how both models work?',
 'chat_history': [HumanMessage(content='Hi'),
  AIMessage(content='Hello! How can I assist you today?'),
  HumanMessage(content='Can you tell me about the Mixtral model?'),
  AIMessage(content='The Mixtral model is an open-source mixture-of-experts model. It uses a set of 8 distinct groups of parameters and each feedforward block picks two of these groups to process the token. It has been reported to achieve comparable accuracy to GPT 3.5 Turbo on several tasks. Mixtral significantly outperforms Llama 2 70B in French, German, Spanish, and Italian. It is pretrained with multilingual data using a context size of 32k tokens. Mixtral either matches or exceeds the performance of Llama 2 70B and GPT-3.5 on several benchmarks.'),
  HumanMessage(content='What about the new Mamba model that uses linear state spaces?'),
  AIMessage(content='The Mamba model is a linear-time sequence modeling approach that uses selective state spaces. It achieves 

In [19]:
agent.invoke({'input': 'What are the advantages of the Mamba model compared to the self attention mechanism'})



[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "retriever",
    "action_input": "Advantages of Mamba model compared to self-attention mechanism"
}[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.



Observation: [36;1m[1;3m[Document(page_content='The advantages of Mamba over the attention mechanism are especially prominent during inference, as not only the computational complexity is lowered, but also the memory usage is not dependent on the context length.\n\n:\n\nMamba addresses the fundamental trade-off between efficiency and effectiveness in sequence models, emphasizing the significance of state compression.\n\nEfficient models necessitate a small state, while effective models require a state containing all crucial information from the context.', metadata={'chunk_id': '2401.04081-9', 'paper_id': '2401.04081', 'title': 'Towards Understanding Mixture of Experts in Deep Learning'}), Document(page_content="Although the main underlying mechanism of Mamba differs significantly from the attention mechanism used in Transformers, Mamba retains the high-level, block-based structure of Transformer models.\n\nIn this paradigm, identical blocks comprising one or more layers are stacked 

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The advantages of the Mamba model compared to the self-attention mechanism include lower computational complexity and memory usage, especially during inference. Mamba addresses the trade-off between efficiency and effectiveness in sequence models by emphasizing state compression. It retains the high-level, block-based structure of Transformer models while offering potential gains in performance and scalability."
}[0m

[1m> Finished chain.[0m


{'input': 'What are the advantages of the Mamba model compared to the self attention mechanism',
 'chat_history': [HumanMessage(content='Hi'),
  AIMessage(content='Hello! How can I assist you today?'),
  HumanMessage(content='Can you tell me about the Mixtral model?'),
  AIMessage(content='The Mixtral model is an open-source mixture-of-experts model. It uses a set of 8 distinct groups of parameters and each feedforward block picks two of these groups to process the token. It has been reported to achieve comparable accuracy to GPT 3.5 Turbo on several tasks. Mixtral significantly outperforms Llama 2 70B in French, German, Spanish, and Italian. It is pretrained with multilingual data using a context size of 32k tokens. Mixtral either matches or exceeds the performance of Llama 2 70B and GPT-3.5 on several benchmarks.'),
  HumanMessage(content='What about the new Mamba model that uses linear state spaces?'),
  AIMessage(content='The Mamba model is a linear-time sequence modeling approach 

In [20]:
agent.invoke({'input': 'Who are the authors of the Mamba model?'})



[1m> Entering new AgentExecutor chain...[0m


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "retriever",
    "action_input": "Authors of the Mamba model"
}[0m

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.
INFO:langchain_community.utils.math:Unable to import simsimd, defaulting to NumPy implementation. If you want to use simsimd please install with `pip install simsimd`.



Observation: [36;1m[1;3m[Document(page_content='As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics.\n\nOn language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.\n\n\n\nIntroduction: Foundation models (FMs), or large models pretrained on massive data then adapted for downstream tasks, have emerged as an effective paradigm in modern machine learning.', metadata={'chunk_id': '2312.00752-3', 'paper_id': '2312.00752', 'title': 'Mamba: Linear-Time Sequence Modeling with Selective State Spaces'}), Document(page_content="Contributions: Maciej integrated Mamba into the codebase, ran preliminary experiments, and oversaw the course of the project.\n\nKamil ran the bulk of the experiments.\n\nKrystian explored alternative Mamba block designs with Jan's help.\n\nSebastian supervised the proj

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The authors of the Mamba model are Maciej Korzeń, Kamil Ciosek, Krystian Bacławski, and Sebastian Ruder."
}[0m

[1m> Finished chain.[0m


{'input': 'Who are the authors of the Mamba model?',
 'chat_history': [HumanMessage(content='Hi'),
  AIMessage(content='Hello! How can I assist you today?'),
  HumanMessage(content='Can you tell me about the Mixtral model?'),
  AIMessage(content='The Mixtral model is an open-source mixture-of-experts model. It uses a set of 8 distinct groups of parameters and each feedforward block picks two of these groups to process the token. It has been reported to achieve comparable accuracy to GPT 3.5 Turbo on several tasks. Mixtral significantly outperforms Llama 2 70B in French, German, Spanish, and Italian. It is pretrained with multilingual data using a context size of 32k tokens. Mixtral either matches or exceeds the performance of Llama 2 70B and GPT-3.5 on several benchmarks.'),
  HumanMessage(content='What about the new Mamba model that uses linear state spaces?'),
  AIMessage(content='The Mamba model is a linear-time sequence modeling approach that uses selective state spaces. It achieve

In [None]:
response['source_documents']

[Document(page_content='We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model.\n\nMixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts).\n\nFor every token, at each layer, a router network selects two experts to process the current state and combine their outputs.\n\nEven though each token only sees two experts, the selected experts can be different at each timestep.', metadata={'chunk_id': '2401.04088-0', 'paper_id': '2401.04088', 'title': 'Pushing Mixture of Experts to the Limit: Extremely Parameter Efficient MoE for Instruction Tuning'}),
 Document(page_content='Detailed results for Mixtral, Mistral 7B and Llama 2 7B/13B/70B and Llama 1 34B: 2: are reported in Table: 2: .\n\nFigure: 2: compares the performance of Mixtral with the Llama models in different categories.\n\nMixtral surpasses Llama 2 70B across most metrics.\n\nIn particular, Mixtral displays a superior performance in

In [None]:

from agent_search import SciPhi
from dotenv import load_dotenv

load_dotenv()

client = SciPhi()

# Generate a RAG response

rag_response = client.search(query='mamba', search_provider='agent-search')

Exception: API request failed with status 500

In [82]:
google_api = GoogleSearchAPIWrapper()

google_response = google_api.results('Comparison between LLama 2 and Llama Pro',2)

INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0


In [83]:
google_response

[{'title': 'LLaMA Pro: Progressive LLaMA with Block Expansion',
  'link': 'https://arxiv.org/pdf/2401.02415',
  'snippet': "5 days ago ... Figure 4: We compare LLAMA PRO's general performance and code performance to a set of models trained around the same time, spanning from general\xa0..."},
 {'title': 'Llama 2: Open Foundation and Fine-Tuned Chat Models',
  'link': 'https://arxiv.org/pdf/2307.09288.pdf%C3%82%C2%A0',
  'snippet': 'Jul 19, 2023 ... We compare the training loss of the Llama 2 family of models. We observe that after pretraining on 2T Tokens, the models still did not show any\xa0...'}]

In [51]:
vector_db = chroma.Chroma.from_texts(
    texts = [x['chunk'] for x in data],
    ids = [x['chunk_id'] for x in data],
    metadatas = [
        {'title': x['title'],
         'paper_id': x['paper_id']} for x in data],
    embedding = embeddings_open,
    persist_directory = PERSIST_DIR
)

NameError: name 'data' is not defined

In [14]:
import paperscraper
papers = paperscraper.search_papers('bayesian model selection',
                                    limit=10,
                                    pdir='downloaded-papers')

RuntimeError: This event loop is already running

In [53]:
vector_db = None

In [54]:
vector_db = chroma.Chroma(
    persist_directory = PERSIST_DIR,
    embedding_function=embeddings_open
)

NameError: name 'PERSIST_DIR' is not defined

In [None]:
retriever = vector_db.as_retriever()

NameError: name 'vector_db' is not defined

In [None]:
docs = retriever.get_relevant_documents("How did Mixture of Experts came to be", top_k=6)

In [None]:
docs

[Document(page_content='Mixtures of Experts Models∗\nIsobel Claire Gormley†and Sylvia Frühwirth-Schnatter‡\nAbstract\nMixtures of experts models provide a framework in which covariates may be in-\ncluded in mixture models. This is achieved by modelling the parameters of the mix-\nture model as functions of the concomitant covariates. Given their mixture model\nfoundation, mixtures of experts models possess a diverse range of analytic uses, from\nclustering observations to capturing parameter heterogeneity in cross-sectional data.\nThis chapter focuses on delineating the mixture of experts modelling framework and\ndemonstrates the utility and ﬂexibility of mixtures of experts models as an analytic\ntool.\n1 Introduction\nThe terminology mixtures of experts models encapsulates a broad class of mixture models\nin which the model parameters are modelled as functions of concomitant covariates. While\nthe response variable yis modelled via a mixture model, model parameters are modelled as\nf

In [None]:
dadada