In [1]:
!pip install arxiv

Collecting arxiv
  Downloading arxiv-1.4.2-py3-none-any.whl (11 kB)
Collecting feedparser
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 3.6 MB/s 
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6065 sha256=489e737ee4b1491aefc04128601912bc6c8971096e7ba43d712a547ea9a6efbc
  Stored in directory: /root/.cache/pip/wheels/73/ad/a4/0dff4a6ef231fc0dfa12ffbac2a36cebfdddfe059f50e019aa
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser, arxiv
Successfully installed arxiv-1.4.2 feedparser-6.0.8 sgmllib3k-1.0.0


In [1]:
import arxiv
import pandas as pd
from tqdm import tqdm

query_keywords = [
    "\"image segmentation\"",
    "\"self-supervised learning\"",
    "\"representation learning\"",
    "\"image generation\"",
    "\"object detection\"",
    "\"transfer learning\"",
    "\"transformers\"",
    "\"adversarial training",
    "\"generative adversarial networks\"",
    "\"model compressions\"",
    "\"image segmentation\"",
    "\"few-shot learning\""
]

In [2]:
# Reuse a client with increased number of retries (3 -> 10) and increased page
# size (100->500).
client = arxiv.Client(num_retries=10, page_size=500)

def query_with_keywords(query):
    search = arxiv.Search(
        query=query,
        max_results=3000,
        sort_by=arxiv.SortCriterion.LastUpdatedDate
    )
    terms = []
    titles = []
    abstracts = []
    for res in tqdm(client.results(search), desc=query):
        if res.primary_category in ["cs.CV", "stat.ML", "cs.LG"]:
            terms.append(res.categories)
            titles.append(res.title)
            abstracts.append(res.summary)
    return terms, titles, abstracts

In [3]:
all_titles = []
all_summaries = []
all_terms = []

for query in query_keywords:
    terms, titles, abstracts = query_with_keywords(query)
    all_titles.extend(titles)
    all_summaries.extend(abstracts)
    all_terms.extend(terms)

"image segmentation": 2081it [00:39, 52.98it/s]
"self-supervised learning": 0it [00:03, ?it/s]
"representation learning": 3000it [00:58, 51.15it/s]
"image generation": 1238it [00:26, 47.09it/s]
"object detection": 3000it [01:08, 44.00it/s]
"transfer learning": 3000it [01:08, 44.11it/s]
"transformers": 3000it [01:01, 48.64it/s]
"adversarial training: 0it [00:03, ?it/s]
"generative adversarial networks": 3000it [01:08, 44.08it/s]
"model compressions": 496it [00:10, 47.54it/s]
"image segmentation": 2081it [00:42, 48.94it/s]
"few-shot learning": 0it [00:03, ?it/s]


In [4]:
data = pd.DataFrame({
    'titles': all_titles,
    'summaries': all_summaries,
    'terms': all_terms
})
data.head()

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"[cs.CV, cs.LG]"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"[cs.CV, cs.AI, cs.LG]"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","[cs.CV, cs.AI]"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,[cs.CV]
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","[cs.CV, cs.LG]"


In [5]:
data.to_csv('arxiv_data.csv', index=False)