In [67]:
import urllib
import feedparser
import pickle
import pandas as pd
from collections import defaultdict
from urllib.request import urlretrieve
import wget
import time

### Read ICLR paper titles

In [2]:
with open("../features/all_data_features_17_20.pkl", "rb") as f:
    papers_df = pickle.load(f)
    
print("Total papers: ", len(papers_df))
print(papers_df["label"].unique())

Total papers:  4897
['Reject' 'Accept']


In [3]:
iclr_papers = papers_df[['id', 'title', 'abstract', 'authors', 'label']].to_dict('index')

In [4]:
list(iclr_papers.keys())[0:5]

['2020_HJgcvJBFvB',
 '2019_S1lTg3RcFm',
 '2017_rJ0-tY5xe',
 '2018_HkbmWqxCZ',
 '2017_Hkz6aNqle']

In [5]:
iclr_papers["2017_r1VdcHcxx"]

{'abstract': 'We propose a reparameterization of LSTM that brings the benefits of batch normalization to recurrent neural networks. Whereas previous works only apply batch normalization to the input-to-hidden transformation of RNNs, we demonstrate that it is both possible and beneficial to batch-normalize the hidden-to-hidden transition, thereby reducing internal covariate shift between time steps.\n\nWe evaluate our proposal on various sequential problems such as sequence classification, language modeling and question answering. Our empirical results show that our batch-normalized LSTM consistently leads to faster convergence and improved generalization.',
 'authors': ['Tim Cooijmans',
  'Nicolas Ballas',
  'César Laurent',
  'Çağlar Gülçehre',
  'Aaron Courville'],
 'id': '2017_r1VdcHcxx',
 'label': 'Accept',
 'title': 'Recurrent Batch Normalization'}

### Find arxiv id for ICLR from existing dump

In [39]:
%%bash
cd ../../../axcell_ws/
mkdir data
cd data
wget https://github.com/paperswithcode/axcell/releases/download/v1.0/arxiv-papers.csv.xz

--2020-06-09 05:06:41--  https://github.com/paperswithcode/axcell/releases/download/v1.0/arxiv-papers.csv.xz
Resolving github.com (github.com)... 13.234.176.102
Connecting to github.com (github.com)|13.234.176.102|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/194139116/d0a4ac80-905e-11ea-9927-3372c1ff8c95?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200608%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200608T233708Z&X-Amz-Expires=300&X-Amz-Signature=c6d55401b939747d95405c88d9e647a10a1a2ab9b0f3f49c5821811046723148&X-Amz-SignedHeaders=host&actor_id=0&repo_id=194139116&response-content-disposition=attachment%3B%20filename%3Darxiv-papers.csv.xz&response-content-type=application%2Foctet-stream [following]
--2020-06-09 05:06:42--  https://github-production-release-asset-2e65be.s3.amazonaws.com/194139116/d0a4ac80-905e-11ea-9927-3372c1ff8c95?X-Amz-Algorithm=AWS4-HMAC-SH

In [6]:
axcell_arxiv_dump = pd.read_csv("/home/shruti/Desktop/iitgn/courses/SEM2/ML/Project/code/axcell_ws/data/arxiv-papers.csv.xz")
axcell_arxiv_dump.head(3)

Unnamed: 0,arxiv_id,archive_size,sha256,title,sections,tables,status
0,0704.0004v1,9486,83b5c83d0963d796ed61fae5ed47cac55d2c942d41e03f...,A determinant of Stirling cycle numbers counts...,1,0,success
1,0704.0010v1,45695,6dd40a2af3e336e0a8e94a5a20a1075819af829f1fcef7...,"Partial cubes: structures, characterizations, ...",0,0,no-tex
2,0704.0012v1,9560,7f7997eee4e571f22551c06bf25e2315ac27fc663273c1...,Distribution of integral Fourier Coefficients ...,7,0,success


In [7]:
# Create reverse map by constructing title-to-arxivId dictionary from 10k arxiv papers

def clean_paper_title(org_title):
    return ''.join(filter(str.isalpha, org_title.strip().lower()))

arxiv_papers_subset = axcell_arxiv_dump[["arxiv_id", "title", "tables", "status"]]
arxiv_papers_subset["clean_title"] = axcell_arxiv_dump['title'].apply(clean_paper_title)

arxiv_dict = dict(zip(arxiv_papers_subset["clean_title"], arxiv_papers_subset["arxiv_id"]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [32]:
# Search the titles in arxiv dump

found_paper_keys = []
not_found_year_wise = {k:defaultdict(int) for k in [2017, 2018, 2019, 2020]}
not_found_pids = []

for k,v in iclr_papers.items():
    t = v["title"].lower()
    t = ''.join(filter(str.isalpha, t))
    if t in arxiv_dict:
        iclr_papers[k]["found"] = True
        iclr_papers[k]["arxivId"] = arxiv_dict[t]
        found_paper_keys.append(k)
    else:
        iclr_papers[k]["found"] = False
        not_found_year_wise[int(k.split("_")[0])][iclr_papers[k]["label"]] += 1
        not_found_pids.append(k)
#         decision[iclr_papers[k]["label"]] += 1
        
print("Found: {}/{}".format(len(found_paper_keys), len(iclr_papers)))
print("Not found papers year wise: ", not_found_year_wise)

Found: 2506/4897
Not found papers year wise:  {2017: defaultdict(<class 'int'>, {'Reject': 117, 'Accept': 28}), 2018: defaultdict(<class 'int'>, {'Reject': 294, 'Accept': 49}), 2019: defaultdict(<class 'int'>, {'Reject': 568, 'Accept': 96}), 2020: defaultdict(<class 'int'>, {'Reject': 938, 'Accept': 301})}


In [26]:
d = {k.split("_")[0]:defaultdict(int) for k in iclr_papers}
for k, v in iclr_papers.items():
    d[k.split("_")[0]][v["label"]] += 1

print(d)

{'2019': defaultdict(<class 'int'>, {'Reject': 917, 'Accept': 502}), '2018': defaultdict(<class 'int'>, {'Reject': 486, 'Accept': 336}), '2020': defaultdict(<class 'int'>, {'Reject': 1526, 'Accept': 687}), '2017': defaultdict(<class 'int'>, {'Reject': 245, 'Accept': 198})}


| Year | NotFound:Rejected | NotFound:Accepted |
| --- | --- | --- |
| 2017 | 117/245 | 28/195 |
| 2018 | 294/486 | 49/336 |
| 2019 | 568/917 | 96/502 |
| 2020 | 938/1526 | 301/687 |

In [29]:
for k,v in iclr_papers.items():
    if not v["found"] and v["label"] == "Accept" and k.split("_")[0]=="2020":
        print(v)
        break

{'id': '2020_r1g87C4KwB', 'abstract': 'Understanding the optimization trajectory is critical to understand training of deep neural networks. We show how the hyperparameters of stochastic gradient descent influence the covariance of the gradients (K) and the Hessian of the training loss (H) along this trajectory. Based on a theoretical model, we predict that using a high learning rate or a small batch size in the early phase of training leads SGD to regions of the parameter space with (1) reduced spectral norm of K, and (2) improved conditioning of K and H. We show that the point on the trajectory after which these effects hold, which we refer to as the break-even point, is reached early during training. We demonstrate these effects empirically for a range of deep neural networks applied to multiple different tasks. Finally, we apply our analysis to networks with batch normalization (BN) layers and find that it is necessary to use a high learning rate to achieve loss smoothing effects a

### Download arxiv sources

In [34]:
ls

DownloadArxivForICLR.ipynb


In [56]:
base_url = "http://export.arxiv.org/e-print/"

for i in range(0, len(found_paper_keys), 4):
    for j in range(0, 4):
        try:
            paper_url = base_url + iclr_papers[found_paper_keys[i]]["arxivId"]
            dest_file = "./"+iclr_papers[found_paper_keys[i]]["arxivId"]
            resp = wget.download(paper_url, dest_file)
        except Exception as ex:
            print(ex.__dict__)
            if ex.code == 403:
                print(ex.msg)
                time.sleep(3)
        time.sleep(3)

In [66]:
#Scratch
try:
    resp1 = wget.download(paper_url[:-1], "./test_"+iclr_papers[found_paper_keys[i]]["arxivId"])
except Exception as ex:
    print(ex.__dict__)
    if ex.code == 403:
        print(ex.msg)

{'hdrs': <http.client.HTTPMessage object at 0x7fe264cc55f8>, 'msg': 'Not Found', 'url': 'http://export.arxiv.org/e-print/1910.05396v', 'name': '<urllib response>', 'delete': False, 'fp': <http.client.HTTPResponse object at 0x7fe2261dec88>, 'code': 404, 'file': <http.client.HTTPResponse object at 0x7fe2261dec88>, '_closer': <tempfile._TemporaryFileCloser object at 0x7fe264cf03c8>}
Not Found


### Find arxiv id

In [2]:
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';

In [None]:
search_query = 'ti:electron' # search for electron in all fields
start = 0                     # retreive the first 5 results
max_results = 5

In [None]:
response = urllib.urlopen(base_url+search_query).read()

In [None]:
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

feed = feedparser.parse(response)

In [None]:
# print out feed information
print 'Feed title: %s' % feed.feed.title
print 'Feed last updated: %s' % feed.feed.updated

# print opensearch metadata
print 'totalResults for this query: %s' % feed.feed.opensearch_totalresults

In [None]:
for entry in feed.entries:
    print 'e-print metadata'
    print 'arxiv-id: %s' % entry.id.split('/abs/')[-1]
    print 'Published: %s' % entry.published
    print 'Title:  %s' % entry.title
    
    # feedparser v4.1 only grabs the first author
    author_string = entry.author
    
    # grab the affiliation in <arxiv:affiliation> if present
    # - this will only grab the first affiliation encountered
    #   (the first affiliation for the first author)
    # Please email the list with a way to get all of this information!
    try:
        author_string += ' (%s)' % entry.arxiv_affiliation
    except AttributeError:
        pass
    
    print 'Last Author:  %s' % author_string
    
    # feedparser v5.0.1 correctly handles multiple authors, print them all
    try:
        print 'Authors:  %s' % ', '.join(author.name for author in entry.authors)
    except AttributeError:
        pass

    # get the links to the abs page and pdf for this e-print
    for link in entry.links:
        if link.rel == 'alternate':
            print 'abs page link: %s' % link.href
        elif link.title == 'pdf':
            print 'pdf link: %s' % link.href
    
    # The journal reference, comments and primary_category sections live under 
    # the arxiv namespace
    try:
        journal_ref = entry.arxiv_journal_ref
    except AttributeError:
        journal_ref = 'No journal ref found'
    print 'Journal reference: %s' % journal_ref
    
    try:
        comment = entry.arxiv_comment
    except AttributeError:
        comment = 'No comment found'
    print 'Comments: %s' % comment