# Retrieve data from the arXiv library

In this step, we can retrieve data from a variety of sources: scraping the web with Selenium or BeautifulSoup, calling an API, using a client library (as shown in this tutorial), and so on.

In [2]:
import arxiv
import datetime
import pytz  

# Ask arXiv for the 30 most recently submitted papers matching "rag"
# (submission date, descending = newest first).
client = arxiv.Client()
search = arxiv.Search(
    query="rag",
    max_results=30,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)
results = client.results(search)
all_papers = list(results)  # consume the result iterable into a list

# Cut-off for the recency filter: the current UTC time minus seven days.
# A timezone-aware "now" is used so it can be compared with `paper.published`.
utc_timezone = pytz.timezone('UTC')
current_datetime_utc = datetime.datetime.now(utc_timezone)
seven_days_ago = current_datetime_utc - datetime.timedelta(days=7)

# Keep only the papers published on or after the cut-off, then display them.
this_week_papers = list(
    filter(lambda paper: paper.published >= seven_days_ago, all_papers)
)
this_week_papers

[arxiv.Result(entry_id='http://arxiv.org/abs/2406.03963v1', updated=datetime.datetime(2024, 6, 6, 11, 14, 27, tzinfo=datetime.timezone.utc), published=datetime.datetime(2024, 6, 6, 11, 14, 27, tzinfo=datetime.timezone.utc), title='A + B: A General Generator-Reader Framework for Optimizing LLMs to Unleash Synergy Potential', authors=[arxiv.Result.Author('Wei Tang'), arxiv.Result.Author('Yixin Cao'), arxiv.Result.Author('Jiahao Ying'), arxiv.Result.Author('Bo Wang'), arxiv.Result.Author('Yuyue Zhao'), arxiv.Result.Author('Yong Liao'), arxiv.Result.Author('Pengyuan Zhou')], summary='Retrieval-Augmented Generation (RAG) is an effective solution to supplement\nnecessary knowledge to large language models (LLMs). Targeting its bottleneck\nof retriever performance, "generate-then-read" pipeline is proposed to replace\nthe retrieval stage with generation from the LLM itself. Although promising,\nthis research direction is underexplored and still cannot work in the scenario\nwhen source knowled

# Use an LLM to do simple pre-processing

Any pre-processing step could go here. In this tutorial, I first use an LLM to extract three key points from an abstract (problem, solution, result). Second, I use a BGE embedding model to embed the text. The embedding will be used for semantic search.

## Extract problem, solution, result

In [4]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Groq API key read from the environment; None when GROQ_API_KEY is not set.
groq_api = os.environ.get("GROQ_API_KEY")

In [5]:
# Chat model served through Groq; temperature 0 keeps the extraction output
# as repeatable as the model allows.
model = ChatGroq(
    temperature=0,
    model_name="llama3-70b-8192",
    groq_api_key=groq_api,
)

In [6]:
# Schema for the structured summary extracted from a single paper abstract.
# It is handed to JsonOutputParser, whose format instructions are inserted
# into the prompt.
# NOTE(review): the Field descriptions presumably end up in those format
# instructions and therefore act as prompt text -- confirm before rewording.
# Comments are used here instead of a class docstring because a docstring may
# be emitted as part of the generated schema.
class Paper(BaseModel):
    # The research question the paper sets out to address.
    problem: str = Field(description="Extract the main research problem from the abstract")
    # The method or approach the paper proposes.
    solution: str = Field(description="Extract the proposed method, approach, or solution from the abstract. Be concise and specific")
    # The outcome reported after applying the proposed solution.
    result: str = Field(description="A summary of the main findings or outcomes derived from applying the proposed solution.")

In [7]:
# Parser that converts the model's reply into a dict with the fields of `Paper`.
parser = JsonOutputParser(pydantic_object=Paper)
format_instructions = parser.get_format_instructions()

# `format_instructions` is filled in once up front; `query` is supplied per call.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": format_instructions},
)

# Pipeline: render the prompt -> call the LLM -> parse the reply.
chain = prompt | model | parser

# Demo input: the abstract of the second paper in this week's list.
summary = this_week_papers[1].summary
result = chain.invoke({"query": summary})

In [8]:
# Show the parsed output of the extraction chain for the sample abstract.
result

{'problem': 'The scaling laws for designing large language models (LLMs) were studied under the assumption of unlimited computing resources, but how would a resource-constrained computing environment affect the design choices for a personalized LLM?',
 'solution': 'We study the tradeoffs among key design factors, including learning methods, personalized data, LLM types and sizes, compression methods, learning time, and difficulty levels of target use cases, to draw guidelines for deploying LLMs onto resource-constrained devices.',
 'result': 'We found that the optimal choice between parameter learning and RAG varies depending on the difficulty of the downstream task, longer fine-tuning time does not necessarily help the model, and a compressed LLM may be a better choice than an uncompressed LLM to learn from limited personalized data.'}

## Embed the text

In [9]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# BGE embedding model, run on CPU, with embeddings normalised at encode time.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Sanity check: embed a sample string and display the resulting list of floats.
embeddings.embed_query('this is text to embed')

[0.011571184732019901,
 -0.003995150793343782,
 0.012017685920000076,
 -0.0004448783292900771,
 0.03873899579048157,
 0.0047366926446557045,
 -0.009235254488885403,
 -0.026408124715089798,
 0.04920775443315506,
 0.014838860370218754,
 -0.03845566511154175,
 0.0038882913067936897,
 0.025377603247761726,
 0.004450621083378792,
 0.011958242394030094,
 -0.0251295268535614,
 -0.008553103543817997,
 -0.02258511260151863,
 0.0003077227156609297,
 -0.00789567455649376,
 -0.06867167353630066,
 0.047635313123464584,
 -0.10204676538705826,
 0.009220322594046593,
 -0.010319092310965061,
 -4.3260410166112706e-05,
 -0.016111720353364944,
 0.013980735093355179,
 0.045737624168395996,
 0.03448636457324028,
 0.005253073293715715,
 0.052655115723609924,
 -0.005120332818478346,
 -0.02846197970211506,
 -0.014756501652300358,
 0.01373535767197609,
 0.029020046815276146,
 -0.012878031469881535,
 -0.020499661564826965,
 -0.041128113865852356,
 0.0016785095212981105,
 -0.005893446039408445,
 0.024892924353480

## Combine all of the steps

In [12]:
import pandas as pd

# Enrich the extracted summary with the paper's metadata and an embedding of
# the proposed solution; the resulting dict is the record written to the database.
paper = this_week_papers[1]  # the same paper whose abstract was summarised earlier

# Author names joined into one comma-separated string.
result['author'] = ', '.join(author.name for author in paper.authors)
# href of the first link titled 'pdf'; None when the paper has no such link.
result['url'] = next((link.href for link in paper.links if link.title == 'pdf'), None)
# The solution text is stored and later searched against, so it is embedded as
# a document. `embed_query` is meant for search queries: in
# HuggingFaceBgeEmbeddings it prepends the BGE query instruction to the text,
# which should not be applied to stored passages.
result['embedding'] = embeddings.embed_documents([result['solution']])[0]
result['title'] = paper.title
# Publication time rendered as a sortable string, e.g. 20240606_111427.
result['published_date'] = pd.Timestamp(paper.published).strftime("%Y%m%d_%H%M%S")

In [13]:
# Inspect the enriched record: the extracted summary plus author, url,
# embedding, title and published_date.
result

{'problem': 'The scaling laws for designing large language models (LLMs) were studied under the assumption of unlimited computing resources, but how would a resource-constrained computing environment affect the design choices for a personalized LLM?',
 'solution': 'We study the tradeoffs among key design factors, including learning methods, personalized data, LLM types and sizes, compression methods, learning time, and difficulty levels of target use cases, to draw guidelines for deploying LLMs onto resource-constrained devices.',
 'result': 'We found that the optimal choice between parameter learning and RAG varies depending on the difficulty of the downstream task, longer fine-tuning time does not necessarily help the model, and a compressed LLM may be a better choice than an uncompressed LLM to learn from limited personalized data.',
 'author': 'Ruiyang Qin, Dancheng Liu, Zheyu Yan, Zhaoxuan Tan, Zixuan Pan, Zhenge Jia, Meng Jiang, Ahmed Abbasi, Jinjun Xiong, Yiyu Shi',
 'url': 'htt

In [15]:
# List the fields the record carries before it is written to the database.
result.keys()

dict_keys(['problem', 'solution', 'result', 'author', 'url', 'embedding', 'title', 'published_date'])

# Upsert the data into the database

We could use any database. In this tutorial, I'm using Supabase.

In [16]:
from supabase.client import Client, create_client

# Connection settings come from the environment (None when a variable is unset).
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_KEY")
supabase_client = create_client(supabase_url, supabase_key)

# Insert the record, or update the existing row that has the same title.
# returning="minimal" asks the API not to send the written row back.
papers_table = supabase_client.table('research_papers')
upsert_request = papers_table.upsert(result, returning="minimal", on_conflict="title")
upsert_request.execute()

APIResponse[~_ReturnT](data=[], count=0)