In [None]:
# Mount Google Drive only when the notebook actually runs on Google Colab.
# Outside Colab the `google.colab` package does not exist, so the import is
# guarded to keep "Restart & Run All" working on a local kernel.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: nothing to mount, data is read from the local filesystem.
    drive = None
else:
    drive.mount("/content/gdrive")

# Search arXiv Link

Imported from [roomylee's implementation](https://github.com/roomylee/ACL-2020-Papers/blob/master/generate_paper_list_with_arxiv_link.ipynb)

In [13]:
import json
import time
import urllib
import urllib.request
from difflib import SequenceMatcher

import arxiv
from bs4 import BeautifulSoup
from googlesearch import search
from tqdm import tqdm


def similarity(a, b):
    """Return the difflib similarity ratio between sequences ``a`` and ``b``.

    The ratio is a float in [0, 1]; 1.0 means the sequences are identical.
    No junk filter is applied (``isjunk`` is ``None``).
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def search_arxiv_link(paper):
    """Search Google for ``paper["title"]`` and attach arXiv data to ``paper``.

    The dict is updated in place and also returned. After the call it always
    contains the keys ``"link"`` and ``"summary"``:

    * when a search hit on ``arxiv.org/abs`` has a page title that is more
      than 0.8 similar to the paper title, ``"link"`` is the hit URL and
      ``"summary"`` is the summary returned by ``arxiv.query`` (if any);
    * when an arXiv hit does not match, both keys are reset to ``""`` and the
      two titles are printed for inspection;
    * when no arXiv hit is returned at all, existing values are kept and
      missing ones default to ``""``.

    Args:
        paper: dict with at least a ``"title"`` string.

    Returns:
        The same ``paper`` dict.
    """
    # Make sure downstream code can rely on both keys, even when the search
    # yields nothing or the arXiv query returns no entry.
    paper.setdefault("link", "")
    paper.setdefault("summary", "")

    # The scraped title is lower-cased and whitespace-normalised below, so the
    # reference title must be normalised the same way before comparing.
    expected_title = ' '.join(paper["title"].lower().split())

    for url in search(paper["title"], tld="co.in", num=10, stop=1, pause=1.0, user_agent="acl2020"):
        if 'arxiv.org/abs' not in url:
            continue

        # Close the HTTP response as soon as the page has been parsed.
        with urllib.request.urlopen(url) as thepage:
            soup = BeautifulSoup(thepage, "html.parser")

        # Drop the first token of the page title before comparing
        # (presumably the leading "[arXiv-id]" tag - confirm against a real page).
        page_title = soup.title.text if soup.title is not None else ""
        searched_title = ' '.join(page_title.lower().split()[1:])

        if similarity(expected_title, searched_title) > 0.8:
            paper["link"] = url
            # Everything after ".../abs/" is the arXiv identifier; this works
            # for http/https and with or without a "www." host prefix.
            arxiv_id = url.partition("arxiv.org/abs/")[2].strip("/")
            if arxiv_id:
                res = arxiv.query(query="", id_list=[arxiv_id])
                for ppr in res:
                    paper["summary"] = ppr["summary"]
            break
        else:
            print("NOT MATCHED")
            paper["link"] = ""
            paper["summary"] = ""
            print(paper["title"])
            print(searched_title)
    return paper

# Find arXiv Data for Extracted Papers

In [15]:
# Conference whose papers are processed. Can be ACL, ICML, ICLR, CVPR.
# The value is interpolated into the "conferences/<CONFERENCE>/data/..." paths
# used when reading and writing the JSON files in this notebook.
CONFERENCE = "ICML"

In [3]:
# Load the papers extracted for the selected conference.
# The encoding is given explicitly so that non-ASCII titles decode the same
# way on every platform instead of depending on the locale default.
with open("conferences/{}/data/extracted_papers.json".format(CONFERENCE), encoding="utf-8") as f:
    extracted_papers = json.load(f)

In [None]:
import multiprocessing as mp

# Look up every extracted paper in a pool of worker processes.
# Pool.apply() blocks until each single call has finished, so calling it in a
# loop runs the lookups one after another; Pool.map() distributes them across
# the workers and still returns the results in input order.
# The `with` block shuts the pool down once all results are collected.
# NOTE(review): truly parallel Google queries may hit rate limits sooner than
# the former serial behaviour - lower the pool size if requests get blocked.
with mp.Pool(mp.cpu_count()) as pool:
    results = pool.map(search_arxiv_link, extracted_papers)

In [None]:
# Persist the enriched papers. The file name must match the one read by
# get_data() ("papers_with_arxiv_data.json"); it was previously misspelled
# as "arixv", so the statistics section could not find the file written here.
with open("conferences/{}/data/papers_with_arxiv_data.json".format(CONFERENCE), "w") as f:
    json.dump(results, f)

# Generate Statistics

In [2]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re

# English stop words from NLTK's stopwords corpus, stored as a set.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand on a fresh machine - confirm.
stop_words = set(stopwords.words('english'))

In [4]:
def get_data(conference):
    """Load the arXiv-enriched paper list of one conference.

    Args:
        conference: conference directory name, interpolated into
            ``conferences/<conference>/data/papers_with_arxiv_data.json``.

    Returns:
        The parsed JSON content of that file.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = "conferences/{}/data/papers_with_arxiv_data.json".format(conference)
    # Use a context manager so the file handle is closed deterministically,
    # and fix the encoding so decoding does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [7]:
# Load the papers of the conference selected in CONFERENCE, so the statistics
# always describe the same conference whose keywords.json is written later.
data = get_data(CONFERENCE)

In [8]:
# Count how often each whitespace-separated word occurs across all paper
# titles. Titles are lower-cased and hyphens are removed before splitting.
counter = Counter(
    word
    for entry in data
    for word in entry["title"].lower().replace("-", "").split()
)

In [12]:
# Domain-specific words that are too generic to be informative keywords.
stopwords_deep_learning = [
    "", "deep", "learning", "neural", "network", "networks", "via", "using", "based", "towards",
    "text", "natural", "language", "model", "models", "approach", "improving", "data", "fast", 
    "analysis", "methods", "method"
]

# Build the exclusion set once: the English stop words already prepared in
# `stop_words` plus the domain list. Re-reading stopwords.words('english')
# for every counted word repeated the corpus load and a linear list search
# on each iteration.
excluded_words = stop_words | set(stopwords_deep_learning)

# Keep (word, count) pairs in descending-frequency order, minus excluded words.
keywords = [w for w in counter.most_common() if w[0] not in excluded_words]

keywords[:20]

[('optimization', 64),
 ('reinforcement', 58),
 ('stochastic', 45),
 ('adversarial', 45),
 ('graph', 39),
 ('optimal', 35),
 ('gradient', 34),
 ('robust', 33),
 ('efficient', 32),
 ('inference', 31),
 ('generative', 31),
 ('training', 29),
 ('linear', 29),
 ('bayesian', 29),
 ('online', 28),
 ('generalization', 25),
 ('bandits', 23),
 ('representations', 23),
 ('sampling', 23),
 ('gaussian', 23)]

In [17]:
# Write the (word, count) keyword list for the selected conference as JSON.
keywords_path = "conferences/{}/data/keywords.json".format(CONFERENCE)
with open(keywords_path, "w") as keywords_file:
    keywords_file.write(json.dumps(keywords))