In [49]:
import urllib
import feedparser
import pickle
import pandas as pd
from collections import defaultdict
from urllib.request import urlretrieve
import wget
import time
import glob
import re

### Read ICLR paper titles

In [3]:
with open("../features/all_data_features_17_20.pkl", "rb") as f:
    papers_df = pickle.load(f)
    
print("Total papers: ", len(papers_df))
print(papers_df["label"].unique())

Total papers:  4897
['Reject' 'Accept']


In [4]:
iclr_papers = papers_df[['id', 'title', 'abstract', 'authors', 'label']].to_dict('index')

In [5]:
list(iclr_papers.keys())[0:5]

['2017_B1-Hhnslg',
 '2017_B1-q5Pqxl',
 '2017_B16dGcqlx',
 '2017_B184E5qee',
 '2017_B186cP9gx']

In [6]:
iclr_papers["2017_r1VdcHcxx"]

{'id': '2017_r1VdcHcxx',
 'title': 'Recurrent Batch Normalization',
 'abstract': 'We propose a reparameterization of LSTM that brings the benefits of batch normalization to recurrent neural networks. Whereas previous works only apply batch normalization to the input-to-hidden transformation of RNNs, we demonstrate that it is both possible and beneficial to batch-normalize the hidden-to-hidden transition, thereby reducing internal covariate shift between time steps.\n\nWe evaluate our proposal on various sequential problems such as sequence classification, language modeling and question answering. Our empirical results show that our batch-normalized LSTM consistently leads to faster convergence and improved generalization.',
 'authors': ['Tim Cooijmans',
  'Nicolas Ballas',
  'César Laurent',
  'Çağlar Gülçehre',
  'Aaron Courville'],
 'label': 'Accept'}

### Find arxiv id for ICLR from existing dump

In [8]:
%%bash
cd ../../../axcell_ws/
mkdir data
cd data
wget https://github.com/paperswithcode/axcell/releases/download/v1.0/arxiv-papers.csv.xz

--2020-06-09 08:40:02--  https://github.com/paperswithcode/axcell/releases/download/v1.0/arxiv-papers.csv.xz
Resolving github.com (github.com)... 13.234.210.38
Connecting to github.com (github.com)|13.234.210.38|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/194139116/d0a4ac80-905e-11ea-9927-3372c1ff8c95?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200609%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200609T031003Z&X-Amz-Expires=300&X-Amz-Signature=d318ee8f2b29bce7523fd24a697b50583fdde24444ef6cb2fca71c20575e472c&X-Amz-SignedHeaders=host&actor_id=0&repo_id=194139116&response-content-disposition=attachment%3B%20filename%3Darxiv-papers.csv.xz&response-content-type=application%2Foctet-stream [following]
--2020-06-09 08:40:03--  https://github-production-release-asset-2e65be.s3.amazonaws.com/194139116/d0a4ac80-905e-11ea-9927-3372c1ff8c95?X-Amz-Algorithm=AWS4-HMAC-SHA2

In [9]:
axcell_arxiv_dump = pd.read_csv("../../../axcell_ws/data/arxiv-papers.csv.xz")
axcell_arxiv_dump.head(3)

Unnamed: 0,arxiv_id,archive_size,sha256,title,sections,tables,status
0,0704.0004v1,9486,83b5c83d0963d796ed61fae5ed47cac55d2c942d41e03f...,A determinant of Stirling cycle numbers counts...,1,0,success
1,0704.0010v1,45695,6dd40a2af3e336e0a8e94a5a20a1075819af829f1fcef7...,"Partial cubes: structures, characterizations, ...",0,0,no-tex
2,0704.0012v1,9560,7f7997eee4e571f22551c06bf25e2315ac27fc663273c1...,Distribution of integral Fourier Coefficients ...,7,0,success


In [10]:
# Create reverse map by constructing title-to-arxivId dictionary from 10k arxiv papers

def clean_paper_title(org_title):
    return ''.join(filter(str.isalpha, org_title.strip().lower()))

arxiv_papers_subset = axcell_arxiv_dump[["arxiv_id", "title", "tables", "status"]]
arxiv_papers_subset["clean_title"] = axcell_arxiv_dump['title'].apply(clean_paper_title)

arxiv_dict = dict(zip(arxiv_papers_subset["clean_title"], arxiv_papers_subset["arxiv_id"]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [11]:
# Search the titles in arxiv dump

found_paper_keys = []
not_found_year_wise = {k:defaultdict(int) for k in [2017, 2018, 2019, 2020]}
not_found_pids = []

for k,v in iclr_papers.items():
    t = v["title"].lower()
    t = ''.join(filter(str.isalpha, t))
    if t in arxiv_dict:
        iclr_papers[k]["found"] = True
        iclr_papers[k]["arxivId"] = arxiv_dict[t]
        found_paper_keys.append(k)
    else:
        iclr_papers[k]["found"] = False
        not_found_year_wise[int(k.split("_")[0])][iclr_papers[k]["label"]] += 1
        not_found_pids.append(k)
#         decision[iclr_papers[k]["label"]] += 1
        
print("Found: {}/{}".format(len(found_paper_keys), len(iclr_papers)))
print("Not found papers year wise: ", not_found_year_wise)

Found: 2506/4897
Not found papers year wise:  {2017: defaultdict(<class 'int'>, {'Reject': 117, 'Accept': 28}), 2018: defaultdict(<class 'int'>, {'Reject': 294, 'Accept': 49}), 2019: defaultdict(<class 'int'>, {'Reject': 568, 'Accept': 96}), 2020: defaultdict(<class 'int'>, {'Accept': 301, 'Reject': 938})}


In [12]:
# OUTDATED
d = {k.split("_")[0]:defaultdict(int) for k in iclr_papers}
for k, v in iclr_papers.items():
    d[k.split("_")[0]][v["label"]] += 1

print(d)

{'2017': defaultdict(<class 'int'>, {'Reject': 245, 'Accept': 198}), '2018': defaultdict(<class 'int'>, {'Accept': 336, 'Reject': 486}), '2019': defaultdict(<class 'int'>, {'Reject': 917, 'Accept': 502}), '2020': defaultdict(<class 'int'>, {'Accept': 687, 'Reject': 1526})}


| Year | NotFound:Rejected | NotFound:Accepted |
| --- | --- | --- |
| 2017 | 117/245 | 28/195 |
| 2018 | 294/486 | 49/336 |
| 2019 | 568/917 | 96/502 |
| 2020 | 938/1526 | 301/687 |

In [13]:
for k,v in iclr_papers.items():
    if not v["found"] and v["label"] == "Accept" and k.split("_")[0]=="2020":
        print(v)
        break

{'id': '2020_B1e3OlStPB', 'title': 'DeepSphere: a graph-based spherical CNN', 'abstract': 'Designing a convolution for a spherical neural network requires a delicate tradeoff between efficiency and rotation equivariance. DeepSphere, a method based on a graph representation of the discretized sphere, strikes a controllable balance between these two desiderata. This contribution is twofold. First, we study both theoretically and empirically how equivariance is affected by the underlying graph with respect to the number of pixels and neighbors. Second, we evaluate DeepSphere on relevant problems. Experiments show state-of-the-art performance and demonstrates the efficiency and flexibility of this formulation. Perhaps surprisingly, comparison with previous work suggests that anisotropic filters might be an unnecessary price to pay.', 'authors': ['Michaël Defferrard', 'Martino Milani', 'Frédérick Gusset', 'Nathanaël Perraudin'], 'label': 'Accept', 'found': False}


### Save the iclr paper arxiv ids dict

In [154]:
# 1806.03852v4 ['2018_ByJWeR1AW', '2020_H1eqOnNYDH']
iclr_papers["2018_ByJWeR1AW"]["arxivId"] = "1806.03852v1"

In [157]:
# 1802.04412v4 ['2018_Bk6qQGWRb', '2019_B1e7hs05Km']
iclr_papers["2018_Bk6qQGWRb"]["arxivId"] = "1802.04412v1"

In [159]:
# 1810.05934v4 ['2018_S1Y7OOlRZ', '2019_S1MAriC5F7']
iclr_papers["2018_S1Y7OOlRZ"]["arxivId"] = "1810.05934v1"

In [158]:
# 1706.01566v4 ['2018_HyBbjW-RW', '2019_SJf_XhCqKm']
iclr_papers["2018_HyBbjW-RW"]["arxivId"] = "1706.01566v1"

In [160]:
# 1802.04948v3 ['2018_SJvu-GW0b', '2019_Ske7ToC5Km']
iclr_papers["2018_SJvu-GW0b"]["arxivId"] = "1802.04948v1"

In [161]:
# 1711.01970v2 ['2018_SyBBgXWAZ', '2019_BklCusRct7']
iclr_papers["2018_SyBBgXWAZ"]["arxivId"] = "1711.01970v1"

In [165]:
# 1812.10607v1 ['2019_Bkeuz20cYm', '2020_ByedzkrKvH']
iclr_papers["2019_Bkeuz20cYm"]["arxivId"] = "1812.10607v1"
iclr_papers["2020_ByedzkrKvH"]["found"] = False
# del iclr_papers["2020_ByedzkrKvH"]["arxivId"]
print(iclr_papers["2020_ByedzkrKvH"].keys())

dict_keys(['id', 'title', 'abstract', 'authors', 'label', 'found'])


In [166]:
# 1806.04640v2 ['2019_H1eRBoC9FX', '2020_S1et1lrtwr']
iclr_papers["2019_H1eRBoC9FX"]["arxivId"] = "1806.04640v1"

In [168]:
# 1907.03179v1 ['2019_S14h9sCqYm', '2020_SygfNCEYDH']
iclr_papers["2020_SygfNCEYDH"]["arxivId"] = "1907.03179v1"
iclr_papers["2019_S14h9sCqYm"]["found"] = False
del iclr_papers["2019_S14h9sCqYm"]["arxivId"]
print(iclr_papers["2019_S14h9sCqYm"].keys())

dict_keys(['id', 'title', 'abstract', 'authors', 'label', 'found'])


In [169]:
# 1805.09980v2 ['2019_SJz6MnC5YQ', '2020_r1e0G04Kvr']
iclr_papers["2019_SJz6MnC5YQ"]["arxivId"] = "1805.09980v1"

In [170]:
# 1810.06544v4 ['2019_SyehMhC9Y7', '2020_Skl4mRNYDr']
iclr_papers["2019_SyehMhC9Y7"]["arxivId"] = "1810.06544v1"

In [171]:
# 1805.09208v2 ['2019_rklwwo05Ym', '2020_rJxwDTVFDB']
iclr_papers["2019_rklwwo05Ym"]["arxivId"] = "1805.09208v1"

In [172]:
with open("../features/iclr_arxiv_map.pkl", "wb") as f:
    pickle.dump(iclr_papers, f)

### Download arxiv sources

In [14]:
ls

DownloadArxivForICLR.ipynb


In [21]:
for k in found_paper_keys:
#     print(k)
    if iclr_papers[k]["arxivId"] == "1703.05175v2":
        print(iclr_papers[k])
        break

{'id': '2017_B1-Hhnslg', 'title': 'Prototypical Networks for Few-shot Learning', 'abstract': 'A recent approach to few-shot classification called matching networks has demonstrated the benefits of coupling metric learning with a training procedure that mimics test. This approach relies on a complicated fine-tuning procedure and an attention scheme that forms a distribution over all points in the support set, scaling poorly with its size. We propose a more streamlined approach, prototypical networks, that learns a metric space in which few-shot classification can be performed by computing Euclidean distances to prototype representations of each class, rather than individual points. Our method is competitive with state-of-the-art one-shot classification approaches while being much simpler and more scalable with the size of the support set. We empirically demonstrate the performance of our approach on the Omniglot and mini-ImageNet datasets. We further demonstrate that a similar idea can 

In [25]:
len(glob.glob("/home/singh_shruti/workspace/ICLR_arxiv_dump/*"))

82

In [37]:
base_url = "http://export.arxiv.org/e-print/"
total_count = len(found_paper_keys)
existing_files = glob.glob("/home/singh_shruti/workspace/ICLR_arxiv_dump/*")

for i in range(0, total_count, 4):
    print(i)    
    #if i%10 == 0:
    #    print("Done {} out of {}".format(i, len(found_paper_keys)))
    for j in range(0, 4):
        try:
            paper_url = base_url + iclr_papers[found_paper_keys[i+j]]["arxivId"]
            dest_file = "/home/singh_shruti/workspace/ICLR_arxiv_dump/" + iclr_papers[found_paper_keys[i+j]]["arxivId"]
            if not dest_file in existing_files:
                resp = wget.download(paper_url, dest_file)
        except Exception as ex:
            print(ex.__dict__)
            if "code" in ex.__dict__ and ex.code == 403:
                print(ex.msg)
                time.sleep(3)
    time.sleep(2)

0
4
8
12
16
20
24
28
32
36
40
44
48
52
56
60
64
68
72
76
80
84
88
92
96
100
104
108
112
116
120
124
128
132
136
140
144
148
152
156
160
164
168
172
176
180
184
188
192
196
200
204
208
212
216
220
224
228
232
236
240
244
248
252
256
260
264
268
272
276
280
284
288
292
296
300
304
308
312
316
320
324
328
332
336
340
344
348
352
356
360
364
368
372
376
380
384
388
392
396
400
404
408
412
416
420
424
428
432
436
440
444
448
452
456
460
464
468
472
476
480
484
488
492
496
500
504
508
512
516
520
524
528
532
536
540
544
548
552
556
560
564
568
572
576
580
584
588
592
596
600
604
608
612
616
620
624
628
632
636
640
644
648
652
656
660
664
668
672
676
680
684
688
692
696
700
704
708
712
716
720
724
728
732
736
740
744
748
752
756
760
764
768
772
776
780
784
788
792
796
800
804
808
812
816
820
824
828
832
836
840
844
848
852
856
860
864
868
872
876
880
884
888
892
896
900
904
908
912
916
920
924
928
932
936
940
944
948
952
956
960
964
968
972
976
980
984
988
992
996
1000
1004
1008
1012
1016
102

#### Inspect downloaded data

In [40]:
%%bash
cd /home/singh_shruti/workspace/ICLR_arxiv_dump/
ls -l | wc -l

2509


In [48]:
tmp_files = glob.glob("/home/singh_shruti/workspace/ICLR_arxiv_dump/*.tmp")
print(tmp_files)

['/home/singh_shruti/workspace/ICLR_arxiv_dump/1812.04606v3oq0_2u9i.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1910.08264v1j47xz40w.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1909.05352v26hesqrjy.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1909.11321v1m1bn33_v.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1906.04304v1m89heh_l.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1812.11240v27rf8ovsc.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1910.14634v2opgkmer7.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1901.09491v2ssabjlx9.tmp', '/home/singh_shruti/workspace/ICLR_arxiv_dump/1902.00275v2n564cf6t.tmp']


In [93]:
re_down = []

for f in tmp_files:
    ext = re.search(".*/[0-9]*\.[0-9]*v[0-9][0-9]?", f)
    if ext:
        org_arxiv_id = ext[0].rsplit("/", 1)[1]
#         print('"{}"'.format(f.rsplit("/", 1)[1]), end=" ")
        print(org_arxiv_id)
        re_down.append(org_arxiv_id)
    else:
        print(f)

1812.04606v3
1910.08264v1
1909.05352v26
1909.11321v1
1906.04304v1
1812.11240v27
1910.14634v2
1901.09491v2
1902.00275v2


In [90]:
#!/bin/bash
# files_array = ("1812.04606v3oq0_2u9i.tmp" "1910.08264v1j47xz40w.tmp" "1909.05352v26hesqrjy.tmp" "1909.11321v1m1bn33_v.tmp" "1906.04304v1m89heh_l.tmp" "1812.11240v27rf8ovsc.tmp" "1910.14634v2opgkmer7.tmp" "1901.09491v2ssabjlx9.tmp" "1902.00275v2n564cf6t.tmp")
# for val in "${files_array[*]}"; do
#     echo $val
# done

In [144]:
%%bash
cd /home/singh_shruti/workspace/ICLR_arxiv_dump/
ls 1902.00275v2* 

1902.00275v2
1902.00275v2n564cf6t.tmp


In [145]:
%%bash
cd /home/singh_shruti/workspace/ICLR_arxiv_dump/
rm 1902.00275v2n564cf6t.tmp
# wget http://export.arxiv.org/e-print/1902.00275v2

In [147]:
tmp_files = glob.glob("/home/singh_shruti/workspace/ICLR_arxiv_dump/*.tmp")
print(tmp_files)

[]


In [32]:
#Scratch
try:
    resp1 = wget.download(paper_url[:-1], "./test_"+iclr_papers[found_paper_keys[i]]["arxivId"])
except Exception as ex:
    print(ex.__dict__)
    if 'code' in ex.__dict__ and ex.code == 404:
        print(ex.msg)

{'code': 404, 'msg': 'Not Found', 'hdrs': <http.client.HTTPMessage object at 0x7fab74491fd0>, 'fp': <http.client.HTTPResponse object at 0x7fab74491208>, 'file': <http.client.HTTPResponse object at 0x7fab74491208>, 'name': '<urllib response>', 'delete': False, '_closer': <tempfile._TemporaryFileCloser object at 0x7fab74491d30>, 'url': 'http://export.arxiv.org/e-print/1705.02670v'}
Not Found


### Find arxiv id

In [2]:
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';

In [None]:
search_query = 'ti:electron' # search for electron in all fields
start = 0                     # retreive the first 5 results
max_results = 5

In [None]:
response = urllib.urlopen(base_url+search_query).read()

In [None]:
feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

feed = feedparser.parse(response)

In [None]:
# print out feed information
print 'Feed title: %s' % feed.feed.title
print 'Feed last updated: %s' % feed.feed.updated

# print opensearch metadata
print 'totalResults for this query: %s' % feed.feed.opensearch_totalresults

In [None]:
for entry in feed.entries:
    print 'e-print metadata'
    print 'arxiv-id: %s' % entry.id.split('/abs/')[-1]
    print 'Published: %s' % entry.published
    print 'Title:  %s' % entry.title
    
    # feedparser v4.1 only grabs the first author
    author_string = entry.author
    
    # grab the affiliation in <arxiv:affiliation> if present
    # - this will only grab the first affiliation encountered
    #   (the first affiliation for the first author)
    # Please email the list with a way to get all of this information!
    try:
        author_string += ' (%s)' % entry.arxiv_affiliation
    except AttributeError:
        pass
    
    print 'Last Author:  %s' % author_string
    
    # feedparser v5.0.1 correctly handles multiple authors, print them all
    try:
        print 'Authors:  %s' % ', '.join(author.name for author in entry.authors)
    except AttributeError:
        pass

    # get the links to the abs page and pdf for this e-print
    for link in entry.links:
        if link.rel == 'alternate':
            print 'abs page link: %s' % link.href
        elif link.title == 'pdf':
            print 'pdf link: %s' % link.href
    
    # The journal reference, comments and primary_category sections live under 
    # the arxiv namespace
    try:
        journal_ref = entry.arxiv_journal_ref
    except AttributeError:
        journal_ref = 'No journal ref found'
    print 'Journal reference: %s' % journal_ref
    
    try:
        comment = entry.arxiv_comment
    except AttributeError:
        comment = 'No comment found'
    print 'Comments: %s' % comment