<a href="https://colab.research.google.com/github/simonecaletti/arxiv-filter/blob/main/colab/arxiv_filter_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install arxiv
!pip install python-dotenv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting arxiv
  Downloading arxiv-1.4.3-py3-none-any.whl (12 kB)
Collecting feedparser
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6065 sha256=74c407d7c858dae3a1f1cab4f5aa12f94dd592dc8ba5241542a1f7de0f91ff4e
  Stored in directory: /root/.cache/pip/wheels/65/7a/a7/78c287f64e401255dff4c13fdbc672fed5efbfd21c530114e1
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser, arxiv
Successfully installed arxiv-1.4.3 feedparse

In [2]:
import os
import time
import re
from datetime import datetime 
from pytz import timezone
import requests
import arxiv
from dotenv import load_dotenv

load_dotenv()


False

In [171]:
class Query(object):
    def __init__(self, result):
        self.date = result.updated
        self.url = result.entry_id
        self.title = result.title
        self.authors = ', '.join([author.name for author in result.authors])
        self.abstract = result.summary
        # self.date_str = query['published']
        self.id = result.entry_id[result.entry_id.find("/v")+5:]
        self.categories = result.categories

    @property
    def is_recent(self):
        curr_time = datetime.now(timezone('GMT'))
        delta_time = curr_time - self.date
        assert delta_time.total_seconds() > 0
        return delta_time.days < 2

    def __hash__(self):
        return self.id

    def __str__(self):
        s = ''
        s += self.title + '\n'
        s += self.url + '\n'
        s += self.authors + '\n'
        s += ', '.join(self.categories) + '\n'
        s += self.date.ctime() + ' GMT \n'
        s += '\n' + self.abstract + '\n'
        return s
    
    def printQuery(self):
      print("============")
      print(self.date)
      print(self.url)
      print("Title:", self.title)
      print("Authors:", self.authors)

In [207]:
class ArxivFilter(object):
  def __init__(self, categories, titles, authors):
    self._categories = categories
    self._titles = titles
    self._authors = authors

  @property
  def _get_previously_sent_arxivs(self, _prev_arxivs="prev_arxiv.txt"):
      if os.path.exists(_prev_arxivs):
          print("prev_arxiv.txt file does exist!")
          with open(_prev_arxivs, 'r') as f:
              return set(f.read().split('\n'))
      else:
          return set()

  def _save_previously_sent_arxivs(self, _new_queries, _prev_arxivs=".prev_arxiv.txt"):
      prev_arxivs = list(_get_previously_sent_arxivs(_prev_arxivs))
      prev_arxivs += [q.id for q in _new_queries]
      prev_arxivs = list(set(prev_arxivs))
      with open(_prev_arxivs, 'w') as f:
          f.write('\n'.join(prev_arxivs))

  def _get_queries_from_last_day(self, max_results=100):
        queries1 = []
        queries2 = []

        category_query_string = "OR ".join([f"cat:{category} " for category in self._categories]).strip()
        title_query_string = "OR ".join([f"ti:\"{title}\" " for title in self._titles]).strip()
        auth_query_string = "OR ".join([f"au:\"{author}\" " for author in self._authors]).strip()
        query_string1 = f"({category_query_string}) AND ({title_query_string})"
        query_string2 = f"({category_query_string}) AND ({auth_query_string})"
        #print("Query string 1: ", query_string1)
        #print("Query string 2", query_string2)

        # get all queries in the categories in the last day filtered by title keywords
        search1 = arxiv.Search(query=query_string1, sort_by=arxiv.SortCriterion.SubmittedDate, max_results=max_results)
        new_queries1 = [Query(result) for result in search1.results()]
        queries1 += [q for q in new_queries1 if q.is_recent]

        # get all queries in the categories in the last day filtered by author keywords
        search2 = arxiv.Search(query=query_string2, sort_by=arxiv.SortCriterion.SubmittedDate, max_results=max_results)
        new_queries2 = [Query(result) for result in search2.results()]
        queries2 += [q for q in new_queries2 if q.is_recent]

        # merge the two queries and get rid of duplicates
        queries = queries1 + queries2
        queries_dict = {q.id: q for q in queries}
        unique_keys = set(queries_dict.keys())
        unique_queries = [queries_dict[k] for k in unique_keys]

        # sort from most recent to least
        sorted_queries = sorted(unique_queries, key=lambda q: (datetime.now(timezone('GMT')) - q.date).total_seconds())

        # filter if previously sent
        prev_arxivs = _get_previously_sent_arxivs()
        prev_filtered_queries = [q for q in sorted_queries if q.id not in prev_arxivs]
        _save_previously_sent_arxivs(prev_filtered_queries)
        
        return prev_filtered_queries 
    
  def run(self):
    queries = self._get_queries_from_last_day()
    for q in queries:
      printQuery(q)

In [149]:
print("Before")
!ls .
!rm -rf categories.txt
!rm -rf authors.txt
!rm -rf titles.txt
print("After")
!ls .

Before
authors.txt  categories.txt  titles.txt
After


In [150]:
from google.colab import files

categories = files.upload()
titles = files.upload()
authors = files.upload()

Saving categories.txt to categories.txt


Saving titles.txt to titles.txt


Saving authors.txt to authors.txt


In [152]:
with open('categories.txt', 'r') as f:
  categories = [line.strip() for line in f.read().split('\n') if len(line.strip()) > 0]

  print(categories)

['hep-ph', 'hep-th', 'quant-ph']


In [153]:
with open('titles.txt', 'r') as f:
  titles = [line.strip() for line in f.read().split('\n') if len(line.strip()) > 0]

print(titles)

['qcd', 'jets', 'jet', 'parton shower', 'quark', 'gluon', 'resummation', 'thrust', 'event shape', 'angularity', 'angularities', 'jet shape', 'strong coupling', 'Gradient', 'VQA', 'optimization']


In [154]:
with open('authors.txt', 'r') as f:
  authors = [line.strip() for line in f.read().split('\n') if len(line.strip()) > 0]

print(authors)

['Simone Marzani', 'Gregory Soyez', 'Gavin Salam', 'Andrew J. Larkoski', 'Oleh Fedkevych', 'Jesse Thaler', 'Wouter Waalewijn', 'Gehrmann', 'Giovanni Stagnitto', 'Alexander Huss', 'Rhorry Gauld', 'Poncelet', 'Czakon', 'Mitov', 'Matteo Cacciari', 'Catani', 'Giulia Zanderighi', 'Andrea Banfi', 'Giovanni Ridolfi', 'Riccardo Torre']


In [208]:
af = ArxivFilter(categories=categories,
                 titles=titles,
                 authors=authors)
af.run()

2023-03-22 17:51:32+00:00
http://arxiv.org/abs/2303.12781v1
Title: Drell-Yan lepton-pair production: $q_T$ resummation at approximate N$^4$LL+N$^4$LO accuracy
Authors: Stefano Camarda, Leandro Cieri, Giancarlo Ferrera
2023-03-22 16:39:24+00:00
http://arxiv.org/abs/2303.12691v1
Title: Probing quark transverse momentum distributions in the Color Glass Condensate: quark-gluon dijets in Deep Inelastic Scattering at next-to-eikonal accuracy
Authors: Tolga Altinoluk, Nestor Armesto, Guillaume Beuf
2023-03-22 15:10:47+00:00
http://arxiv.org/abs/2303.12630v1
Title: Exclusive heavy vector meson photoproduction on nuclei in NLO perturbative QCD
Authors: K. J. Eskola, V. Guzey, T. Löytäinen, H. Paukkunen, C. A. Flett
2023-03-22 14:38:13+00:00
http://arxiv.org/abs/2303.12601v1
Title: A real world test of Portfolio Optimization with Quantum Annealing
Authors: Wolfgang Sakuler, Johannes M. Oberreuter, Riccardo Aiolfi, Luca Asproni, Branislav Roman, Jürgen Schiefer
2023-03-22 14:32:43+00:00
http://ar

In [209]:
!ls --all

.  ..  authors.txt  categories.txt  .config  .prev_arxiv.txt  titles.txt


In [212]:
!cat .prev_arxiv.txt

://arxiv.org/abs/2303.12475v1
://arxiv.org/abs/2303.12520v1
://arxiv.org/abs/2303.12691v1
://arxiv.org/abs/2303.12485v1
://arxiv.org/abs/2303.12063v1
://arxiv.org/abs/2303.12630v1
://arxiv.org/abs/2302.12266v2
://arxiv.org/abs/2303.12595v1
://arxiv.org/abs/2303.12781v1
://arxiv.org/abs/2303.12601v1
://arxiv.org/abs/2303.12140v1

In [200]:
!rm -rf .prev_arxiv.txt