In [133]:
# File utils
import json
import pickle
import glob

# Preprocessing utils
import unidecode
import re
from collections import defaultdict
import editdistance

# Debugging
from IPython.core.debugger import Tracer

In [132]:
!pip install editdistance

Collecting editdistance
  Using cached editdistance-0.5.3-cp37-cp37m-manylinux1_x86_64.whl (179 kB)
Installing collected packages: editdistance
Successfully installed editdistance-0.5.3


In [3]:
from axcell.helpers.datasets import read_arxiv_papers

# Read all paper titles and create a hashtable

##### a. Read axcell paper data

In [4]:
arxiv_axcell = read_arxiv_papers("../../data/arxiv-papers.csv.xz")
arxiv_axcell.head(2)

Unnamed: 0,arxiv_id,archive_size,sha256,title,sections,tables,status
0,0704.0004v1,9486,83b5c83d0963d796ed61fae5ed47cac55d2c942d41e03f...,A determinant of Stirling cycle numbers counts...,1,0,success
1,0704.0010v1,45695,6dd40a2af3e336e0a8e94a5a20a1075819af829f1fcef7...,"Partial cubes: structures, characterizations, ...",0,0,no-tex


In [5]:
yearwise_axcell = defaultdict(int)

for aid in arxiv_axcell["arxiv_id"]:
    yearwise_axcell[aid[0:2]] += 1

In [6]:
yearwise_axcell

defaultdict(int,
            {'07': 48,
             '08': 87,
             '09': 122,
             '10': 189,
             '11': 298,
             '12': 652,
             '13': 3575,
             '14': 5193,
             '15': 6975,
             '16': 10845,
             '17': 16177,
             '18': 24760,
             '19': 35144,
             '20': 645})

##### b. Read full arxiv data from the ECIR leaderboard paper 

In [7]:
with open("/home/singh_shruti/data/arxiv/All_Title_Dump.json", "r") as f:
    arxiv_ecir = json.load(f)

In [8]:
print(len(arxiv_ecir), list(arxiv_ecir.items())[0:2])

1152195 [('1405.1888', 'One positive and two negative results for derived categories of\n  algebraic stacks'), ('nucl-th_0512069', 'Helicity amplitudes and electromagnetic decays of strange baryon\n  resonances')]


In [9]:
yearwise_ecir = defaultdict(int)

for aid in arxiv_ecir:
    yearwise_ecir[aid[0:2]] += 1

In [10]:
list(arxiv_ecir.keys())[0:5]

['1405.1888', 'nucl-th_0512069', '1504.05139', '1504.05138', '1112.0655']

In [11]:
sorted(yearwise_ecir.items(), key=lambda x: x[0])

[('07', 30196),
 ('08', 43896),
 ('09', 59045),
 ('10', 62827),
 ('11', 67544),
 ('12', 75470),
 ('13', 81528),
 ('14', 83613),
 ('15', 89883),
 ('16', 97725),
 ('17', 107288),
 ('18', 34326),
 ('ac', 2),
 ('ad', 304),
 ('al', 1164),
 ('ao', 13),
 ('as', 34667),
 ('at', 67),
 ('ba', 11),
 ('ch', 1881),
 ('cm', 892),
 ('co', 67679),
 ('cs', 7090),
 ('dg', 543),
 ('fu', 257),
 ('gr', 16048),
 ('he', 77200),
 ('ma', 53208),
 ('mt', 151),
 ('nl', 5175),
 ('nu', 14511),
 ('pa', 440),
 ('ph', 15248),
 ('pl', 28),
 ('q-', 3120),
 ('qu', 18320),
 ('so', 767),
 ('su', 68)]

##### c. Collate both data

Steps:
1. Decode all accented chars from title
2. Clean all non alpha-numeric chars from the title
3. Collate titles from both dataset and add both ids

In [385]:
paper_id_titles = defaultdict(list)

for aid, title in arxiv_ecir.items():
    title_lower = title.lower()
    unaccented_title = unidecode.unidecode(title_lower)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title:
        paper_id_titles[clean_title].append("e_" + aid)
    else:
        print(title)

for entry in arxiv_axcell[["arxiv_id", "title"]].iterrows():
    aid = entry[1].arxiv_id
    title_lower = entry[1].title.lower()
    
    unaccented_title = unidecode.unidecode(title_lower)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title:
        paper_id_titles[clean_title].append("a_" + aid)
    else:
        print(title)

In [386]:
paper_global_id_dict = {}

for seqid, k in enumerate(paper_id_titles):
    paper_global_id_dict[k] = seqid

In [607]:
c = 0
for k, v in paper_id_titles.items():
    if len(k) < 20:
        print(k, v)
        c += 1
print(c)

rayleighmatroids ['e_math_0307096']
informationloss ['e_1703.02140']
mesh ['e_0904.3715']
statusofstrongchpt ['e_0904.3713']
subsetseedautomaton ['e_1408.6198']
stablequasimaps ['e_1106.0804']
minimaltoriins3 ['e_math_0407304']
regulators ['e_math_0407308']
therackspace ['e_math_0304228']
soficdyckshifts ['e_1305.7413']
anarrayalgebra ['e_0812.4986']
im2cad ['e_1608.05137', 'a_1608.05137v2']
chainminorsarefpt ['e_1304.5849']
asteroseismology ['e_1205.6407']
multilinearpagerank ['e_1409.1465']
notesontheoctonions ['e_1005.2820']
fermiandszilard ['e_physics_0207094']
serreweightsforun ['e_1405.3014']
isrphysicsatbabar ['e_0911.0235', 'e_hep-ex_0601020']
globalactions ['e_1506.08876']
polyexponentials ['e_0710.1332']
nominalcunification ['e_1709.05384']
tracetest ['e_1608.00540']
arithmeticoncurves ['e_math_0302158']
nonextremalbranes ['e_1412.5547']
probabilistictime ['e_1002.2593']
sumsoftoricideals ['e_1211.5386']
maximaltrees ['e_1611.08150']
q ['e_1611.08152']
thequantumsphaleron ['e

lvdhighlights ['e_hep-ex_0608061']
aninaccessiblegraph ['e_1006.3852']
neutrinos ['e_hep-ph_9804426', 'e_physics_0103091', 'e_hep-ph_0012312', 'e_1310.4340']
galaxyclustershapes ['e_astro-ph_9912533']
quantummatrixpairs ['e_math_9911015']
gcain2d ['e_0912.1090']
onhigherstructures ['e_1509.00403']
phenixhighlights ['e_nucl-ex_0404009', 'e_1211.0324']
scaledfreeobjects ['e_1011.0717']
theeosfamilyhalo ['e_1302.1447']
paxosmadeswitchy ['e_1511.04985']
clepercolations ['e_1602.03884']
tetracomposition ['e_math_0105231']
newphysicsat1tev ['e_1602.02380']
liftingcurvessimply ['e_1501.00295']
cryptanalysisofhfe ['e_cs_0305034']
thepowerofmtheory ['e_hep-th_9510086']
virtualknotgroups ['e_math_9907172']
theslippageparadox ['e_1103.2214']
octtreemethodongpu ['e_0909.0541']
theprimalframeworki ['e_math_9201241']
holographicunhiggs ['e_0810.4940']
singularities ['e_math_9801123']
thewhilelanguage ['e_1603.08949']
beyondlyapunov ['e_1008.2664']
theweavelofarsurvey ['e_1611.02706']
andtherewasligh

angularlens ['e_1711.03266']
activebrownianrods ['e_1512.07567']
thedarkmassproblem ['e_1308.0249']
arithmeticalmeadows ['e_0909.2088']
attainableknowledge ['e_1705.10760']
ongnhilbofnhilb ['e_1108.2310']
blackholesqueezers ['e_1706.09117']
galoisgothisgun ['e_1111.0655']
algorithmsforgroups ['e_math_9406203']
kermions ['e_1310.5064']
howtoencodeatree ['e_1710.08463']
gaugedqfields ['e_hep-th_9906135']
cmbmaprestoration ['e_1111.3149']
bayesianadaptation ['e_1407.2219']
onm9branes ['e_hep-th_9806069']
adualroleforsex ['e_0809.0029']
minkowskigames ['e_1609.07048']
liveanddeadnodes ['e_physics_0505049']
nomadsofthegalaxy ['e_1201.2687']
whoseknowledge ['e_quant-ph_0107151']
quantumblobs ['e_1106.5468']
largencmeansnc3 ['e_hep-ph_0204252']
leftmodularelements ['e_math_0001055']
higgsbosononyourown ['e_1303.2732']
blpisneveramenable ['e_0907.3984']
1dominationofknots ['e_1511.07073']
tilingphosphorene ['e_1411.6318']
vaccinateyourtrees ['e_1801.08705']
scalartrefoils ['e_0708.2288']
vivap

secondsetofspaces ['e_1006.5941']
kaonraredecays ['e_hep-ex_0112016']
faintbluegalaxies ['e_astro-ph_9704019']
practicalstatistics ['e_1708.01007']
aliceoverview ['e_1112.0700', 'e_1603.03320', 'e_1711.03369']
nlgvstemplates ['e_cmp-lg_9504013']
algebraiccuts ['e_alg-geom_9608028']
theinfatidata ['e_cs_0410001']
threadablecurves ['e_1801.08003']
thehuntforoldnovae ['e_1303.1712']
minorsanddimension ['e_1407.4066']
hdecompositions ['e_0709.2525']
anyonsonatorus ['e_hep-th_9210112']
nucleonexcitations ['e_1001.2997']
maximumfidelity ['e_1301.5186']
tbranesandmonodromy ['e_1010.5780']
onurkdvandurkp ['e_solv-int_9402004']
isnatureoo ['e_1108.0014']
thegroundaxiom ['e_math_0609064', 'e_1607.00723']
darkmatter2013 ['e_1310.5217']
generalltesequence ['e_1509.03288']
newphysicsatcdf ['e_1006.1142']
theechosciencecase ['e_1502.05747']
problemswithpopper ['e_1202.4792']
thompsonsgroupf ['e_0708.3609']
cubicaltokensystems ['e_math_0612696']
newsfromkm3net ['e_1403.4065']
onplanarvaluedcsps ['e_1

beyonduncountable ['e_math_0312360']
physicsattesla ['e_hep-ex_0104044']
mathbbf1foreveryone ['e_1801.05337']
divisionbyfour ['e_1504.01402']
bestbuddiestracking ['e_1611.00148', 'a_1611.00148v1']
onbimeasurings ['e_math_0409522']
cloudcomputing ['e_1003.4074']
s3coversofschemes ['e_0804.4658']
fidealsofdegree2 ['e_1101.1780']
lowxqcdfromcms ['e_1310.3154']
onpowerstableideals ['e_0705.1286']
gapforcing ['e_math_9808011']
coisotropicpairs ['e_1408.5620']
onmathmatterandmind ['e_physics_0510188']
diamonddicing ['e_1006.3726']
frozenfootprints ['e_0811.4603']
akindofmagic ['e_1707.02072']
semanticadvertising ['e_1309.5018', 'a_1309.5018v1']
dinvandarea ['e_1609.04480']
muonsgravityandtime ['e_1508.02339']
nanopercolation ['e_cond-mat_0412089']
paperwavesinthewind ['e_1511.01750']
classicalmechanics ['e_math-ph_0504085', 'e_physics_9909035', 'e_1208.5402']
mixedgraphstates ['e_1506.03635']
bosonickernels ['e_1204.3693']
newresultsinbdecays ['e_1408.6060']
radiativespacetimes ['e_gr-qc_020

halflifeof14o ['e_nucl-ex_0601028']
goldoilandstocks ['e_1308.0210']
periodicgraphs ['e_0806.2074']
dbraneprimer ['e_hep-th_0007170']
thedensestgalaxy ['e_1307.7707']
aquantumfredkingate ['e_1603.08086']
newcharmresonances ['e_hep-ex_0612042']
supersolutions ['e_hep-th_9901094']
ontheszegometric ['e_1109.3484']
hybridbaryons ['e_nucl-th_0204031']
topicsinjetphysics ['e_hep-ph_9610234']
thehubbledeepfields ['e_astro-ph_0004319']
computingztop ['e_1408.1240']
quantumspaceattacks ['e_0711.3019']
inflationunloaded ['e_hep-th_0412055']
linesinhypergraphs ['e_1112.0376']
qualitonsfromqcd ['e_hep-ph_9311244']
statusoflatticeqcd ['e_hep-ph_9311242']
rationalityandpower ['e_1409.3790']
spinsumrulesatlowq2 ['e_0907.1475']
acanonicalds2317 ['e_hep-ph_0608011']
cohiggsbundlesonp1 ['e_1010.2526']
thestatusofcpt ['e_hep-ph_9810365']
cbsecaseenvironment ['e_1508.06208']
palindromicdensity ['e_1604.02327']
iloveirrotationally ['e_1504.07335']
katetovfunctors ['e_1412.1850']
creatableuniverses ['e_0705

qcdinthedeltaregime ['e_1103.3311']
dependencespaces ['e_0906.1132']
inducedqcditheory ['e_1609.06466']
na57mainresults ['e_0710.2849']
minimalcosmography ['e_1511.02169']
colorchiralsolitons ['e_hep-ph_0212385']
cobeandsusy ['e_hep-ph_9211235']
aseparatehiggs ['e_hep-ph_9211234']
dissolutioninafield ['e_cond-mat_0105615']
whatisaperiod ['e_1407.2388']
gaiaarchive ['e_1603.07347']
messageinthesky ['e_physics_0510102']
mathfrakp0spaces ['e_1311.1468']
thetensortrackiii ['e_1311.1461']
spherepackingsi ['e_math_9811073']
blockingwythoffnim ['e_1010.5816']
takahasisemigroups ['e_1504.00219']
thedoors ['e_1105.5809']
onmesonmasses ['e_hep-ph_0512196']
fullfieldalgebras ['e_math_0511328']
arithmeticdynamics ['e_math_0203024']
anoteonshelling ['e_math_0203025']
jumpingsequences ['e_0807.2890']
spectraofcoronae ['e_1111.1200']
photonicclusters ['e_cond-mat_0501733']
znquasialgebras ['e_math_9903128']
ag2qcdneutronstar ['e_1609.06979']
committeeranking ['e_1508.04013']
bloisvsummarytalk ['e_hep

riskwithoutreturn ['e_1307.0114']
qchkahlersurfacesii ['e_1802.10470']
yangmillsandbeyond ['e_1111.1247']
strict2toposes ['e_math_0606393']
jetsatcdf ['e_hep-ex_0608021']
gpgpucomputing ['e_1408.6923']
theherbrandtopos ['e_1112.3837']
renormingsoflplq ['e_math_9804002']
anewsymmetryforqed ['e_hep-th_9306132']
updateonfb ['e_hep-lat_9608092']
compoundpolarcodes ['e_1302.0265']
2dsolarmodeling ['e_0912.4998']
rhicspinphysics ['e_hep-ex_9807033']
darkmatterparticles ['e_astro-ph_9610263']
toolbox ['e_0811.3402']
smartphoneschlieren ['e_1609.04298']
threefermionsinabox ['e_0810.2331']
tverbergplusminus ['e_1612.05630']
yangbaxterequations ['e_math-ph_0606053']
oncosmicrayssources ['e_0809.3670']
smallhorizons ['e_1109.1566']
sotformot ['e_1712.01059', 'a_1712.01059v1']
freeextremevalues ['e_math_0501274']
theinformationsieve ['e_1507.02284', 'a_1507.02284v3']
clusterlenses ['e_1202.0185']
dirbecomettrails ['e_1408.1466']
gelfondbeziercurves ['e_1111.3405']
randomgexpectations ['e_1009.2168

In [391]:
with open("IMP_paper_global_id_dict.pkl", "wb") as f:
    pickle.dump(paper_global_id_dict, f)

with open("IMP_paper_id_titles.pkl", "wb") as f:
    pickle.dump(paper_id_titles, f)

In [387]:
#latest after lowercasing titles which I catastrophically forgot to do :/
len(paper_id_titles), len(paper_global_id_dict)

(1205268, 1205268)

In [388]:
len(paper_id_titles), len(paper_global_id_dict)

(1205268, 1205268)

In [389]:
ecir_dups = 0
axcell_dups = 0

for compressed_title, id_list in paper_id_titles.items():
    if len(id_list) > 1:
        
        dups_e = list(filter(lambda x: x.startswith('e_'), id_list))
        if dups_e and len(dups_e) > 1:
            ecir_dups +=1
            #print(compressed_title, id_list, arxiv_ecir[dups_e[0][2:]], arxiv_ecir[dups_e[1][2:]])
        
        dups_a = list(filter(lambda x: x.startswith('a_'), id_list))
        if dups_a and len(dups_a) > 1:
            axcell_dups +=1
        
print("Duplicate counts:\nECIR: {}\nAXCELL:{}".format(ecir_dups, axcell_dups))

Duplicate counts:
ECIR: 2817
AXCELL:256


In [99]:
s=0
for i in ttttt:
#     if "count" in ttttt[i] and ttttt[i]["count"] > 0:
#         s += 1
    if "refs" in ttttt[i] and "abl" in ttttt[i]["refs"] and ttttt[i]["refs"]["abl"]:
        s+=1
print(s)

170


# Read the 3-way split refs data

##### a. SPLIT_1: Table cell refs

In [15]:
with open("leaderboard_table_refs.pkl", "rb") as f:
    leaderboard_table_refs = pickle.load(f)

In [190]:
with open("new_leaderboard_refs.pkl", "rb") as f:
    leaderboard_refs = pickle.load(f)

In [17]:
with open("TABLE_Label_predicted.pkl", "rb") as f:
    collated_table_labels = pickle.load(f)

In [18]:
len(leaderboard_table_refs)

1988

##### b. SPLIT_2: Table caption refs

In [19]:
with open("refs_table_captions.pkl", "rb") as f:
    table_caption_refs = pickle.load(f)

##### c. SPLIT_3: Full text refs

In [296]:
with open("refs_full_text.pkl", "rb") as f:
    full_text_refs_dict = pickle.load(f)

# Analyse refs and papers loaded from 3 sources

In [293]:
for k,v in leaderboard_refs.items():
    print(k, v)
    break

table_cell_refs_keys = list(leaderboard_refs.keys())
print("\n", len(table_cell_refs_keys))

non_zero = 0
for k,v in leaderboard_refs.items():
    if v["count"] > 0:
        non_zero += 1
print(non_zero, len(table_cell_refs_keys)-non_zero)

2017_B1-Hhnslg {'refs': {'ldb': ["<ref id='bib-bib17'>2016</ref>", "<ref id='bib-bib1'>2013</ref>", "<ref id='bib-bib23'>2016</ref>", "<ref id='bib-bib29'>2016</ref>", "<ref id='bib-bib2'>2015</ref>", "<ref id='bib-bib22'>2017</ref>", "<ref id='bib-bib6'>2017</ref>"], 'abl': []}, 'count': 12}

 1718
419 1299


In [297]:
for k,v in table_caption_refs.items():
    print(k, v)
    break
    
table_cap_refs_keys = list(table_caption_refs.keys())
print("\n", len(table_cap_refs_keys))

2017_SJDaqqveg [(1, 'Table 2: Our IWSLT 2014 machine translation results with a convolutional encoder compared to the previous work by Ranzato et al. Please see 1 for an explanation of abbreviations. The asterisk identifies results from (Ranzato et\xa0al., 2015). The numbers reported with ≤ were approximately read from Figure 6 of (Ranzato et\xa0al., 2015)')]

 45


In [298]:
for k,v in full_text_refs_dict.items():
    print(k, v[0:12])
    break
    
ft_refs_keys = list(full_text_refs_dict.keys())
print("\n", len(ft_refs_keys))

2017_B1-q5Pqxl ['on the development data and that outperformed the DCR model~\\citep{Yu2015rank:arxiv}, which also introd', ' on the development data and that outperformed the DCR model~\\citep{Yu2015rank:arxiv}, which also introdu']

 1088


In [291]:
len(set(table_cell_refs_keys).intersection(set(ft_refs_keys)))

1136

# Read all references of 2504 papers

In [22]:
with open("/home/singh_shruti/workspace/PaperAcceptancePrediction/shruti/features/iclr_arxiv_map.pkl", "rb") as f:
    iclr_arxiv_map = pickle.load(f)

In [348]:
bbl_absent = []
bbl_multiple = []
bbl_unique = []
not_bib = []

exceptions_ids = []

paper_bbl_dict = {}

In [349]:
# DO THIS WITH CAUTION: THIS WILL REQUIRE THE TITLS TO BE DISAMBIGUATED AGAIN WHICH TAKES AROUND 4-5 HOURS
# Basically the portion where each of the reference is compared against all titles in the arxiv dump to map it back to an arxiv id.

for k in leaderboard_table_refs:
    arxivid = iclr_arxiv_map[k]["arxivId"]
    
    dir_name = arxivid.split(".")[0]
    
    # Read the bbl file to extract references
    potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.bbl".format(dir_name, arxivid))
    all_tex_in_dir_for_multiple_bbl = potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.tex".format(dir_name, arxivid))

    if len(potential_bbl_files) >= 1:
        
        annotate_seq = False
        bib_item_seq_number = None
        
        if len(potential_bbl_files) > 1:
#             read_texts_bib_all = []
            
#             NEW_potential_bbl_files = []
#             for bbbbbb in potential_bbl_files:
#                 only_first_name = bbbbbb.replace(".bbl", ".tex")
#                 if only_first_name in all_tex_in_dir_for_multiple_bbl:
#                     NEW_potential_bbl_files.append(bbbbbb)
#             potential_bbl_files = NEW_potential_bbl_files
            
            for bbbbbb in potential_bbl_files:
                with open(bibfileiter, "r", errors="ignore") as f:
                    read_texts_bib_all.append(f.read())
            same_text = False
            for iter_, text_iter in enumerate(read_texts_bib_all):
                if iter_ < len(read_texts_bib_all)-1:
                    if text_iter.lower() == read_texts_bib_all[iter_+1].lower():
                        same_text = True
                    else:
                        same_text = False
                        break
                else:
                    break
            if same_text:
                potential_bbl_files = [potential_bbl_files[0]]
        
        
        if len(potential_bbl_files) == 1:
            annotate_seq = True
            bib_item_seq_number = 1
        
        try:
            paper_bibitems = {}
            
            for bibfileiter in potential_bbl_files:
                with open(bibfileiter, "r", errors="ignore") as f:
                    lines = f.readlines()
                
                bibitem = []
                start_looking = True
                start_appending = False
                
                for l in lines:
                    if start_looking and (l.startswith("\\bibitem[") or l.startswith("\\bibitem{")):
                        start_looking = False
                        start_appending = True
                        bibitem = [l.strip().replace("\n", " ")]
                        continue
                    if start_appending:
                        if l == "\n":
                            bib_text = " ".join(bibitem)
                            cit_key = None
                            
                            if bib_text.startswith("\\bibitem["):
                                if bib_text.startswith("\\bibitem[\\protect\\citeauthoryear"):
                                    cit_key = None
                                    m = re.search('\\\\bibitem\[.*?\]%? ?({.*?})', bib_text)
                                    if m:
                                        cit_key = m.group(1)[1:-1]
                                    else:
                                        print("NOT FOUND CIT KEY[citauthor]: ", bibfileiter, bib_text)
                                else:
                                    m = re.search('\\\\bibitem\[.*?\]({.*?})', bib_text)
                                    if m:
                                        cit_key = m.group(1)[1:-1]
                                    else:
                                        print("NOT FOUND CIT KEY[]: ", bibfileiter, bib_text)
                                    
                            if bib_text.startswith("\\bibitem{"):
                                m = re.search('\\\\bibitem({.*?})', bib_text)
                                if m:
                                    cit_key = m.group(1)[1:-1]
                                else:
                                    print("NOT FOUND CIT KEY{}: ", bibfileiter, bib_text)
                            
                            if not cit_key is None:
                                if bib_item_seq_number:
                                    paper_bibitems[cit_key] = {"text": bib_text, "arxivids": [], "bib-seq": bib_item_seq_number}
                                else:
                                    paper_bibitems[cit_key] = {"text": bib_text, "arxivids": []}

                            bibitem = []
                            start_looking = True
                            start_appending = False
                            if bib_item_seq_number:
                                bib_item_seq_number += 1
                        else:
                            bibitem.append(l.strip().replace("\n", " "))
            
            if paper_bibitems:
                paper_bbl_dict[k] = paper_bibitems
                
        except Exception as ex:
            print(k, ex)
            exceptions_ids.append(k)
        
    elif len(potential_bbl_files) == 0:
        bbl_absent.append(k)

In [488]:
count_multiple_bbl_files_paper = 0
reduced_count_multiple_bbl_files_paper = 0

for k in leaderboard_table_refs:
    arxivid = iclr_arxiv_map[k]["arxivId"]
    
    dir_name = arxivid.split(".")[0]
    
    # Read the bbl file to extract references
    potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.bbl".format(dir_name, arxivid))
    all_tex_in_dir_for_multiple_bbl = potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.tex".format(dir_name, arxivid))

    if len(potential_bbl_files) >= 1:
        
        annotate_seq = False
        bib_item_seq_number = None
        
        if len(potential_bbl_files) > 1:
            count_multiple_bbl_files_paper+=1
#             read_texts_bib_all = []
            
            NEW_potential_bbl_files = []
            for bbbbbb in potential_bbl_files:
                only_first_name = bbbbbb.replace(".bbl", ".tex")
                if only_first_name in all_tex_in_dir_for_multiple_bbl:
                    NEW_potential_bbl_files.append(bbbbbb)
            potential_bbl_files = NEW_potential_bbl_files
            
            for bbbbbb in potential_bbl_files:
                with open(bibfileiter, "r", errors="ignore") as f:
                    read_texts_bib_all.append(f.read())
            same_text = False
            for iter_, text_iter in enumerate(read_texts_bib_all):
                if iter_ < len(read_texts_bib_all)-1:
                    if text_iter.lower() == read_texts_bib_all[iter_+1].lower():
                        same_text = True
                    else:
                        same_text = False
                        break
                else:
                    break
            if same_text:
                potential_bbl_files = [potential_bbl_files[0]]
        if len(potential_bbl_files) == 1:
            reduced_count_multiple_bbl_files_paper+=1


In [489]:
count_multiple_bbl_files_paper, reduced_count_multiple_bbl_files_paper

(1272, 716)

In [350]:
len(set(bbl_absent)), bbl_absent[0:10]

(0, [])

In [351]:
len(paper_bbl_dict)

48

In [347]:
list(paper_bbl_dict.keys())[0:10]

['2017_B1-Hhnslg',
 '2017_B1-q5Pqxl',
 '2017_B184E5qee',
 '2017_B1E7Pwqgl',
 '2017_B1G9tvcgx',
 '2017_B1GOWV5eg',
 '2017_B1Igu2ogg',
 '2017_B1IzH7cxl',
 '2017_B1MRcPclx',
 '2017_B1TTpYKgx']

In [162]:
iclr_arxiv_map["2017_BJYwwY9ll"]["arxivId"]

'1704.00109v1'

In [163]:
 glob.glob("./data/unpacked_sources/1704/1704.00109v1/*.bbl")

['./data/unpacked_sources/1704/1704.00109v1/main.bbl']

In [178]:
pwd

'/home/singh_shruti/workspace/axcell_ws/axcell/notebooks'

In [164]:
cat ./data/unpacked_sources/1704/1704.00109v1/main.bbl

\begin{thebibliography}{43}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Bottou(2010)]{bottou2010large}
L{\'e}on Bottou.
\newblock Large-scale machine learning with stochastic gradient descent.
\newblock In \emph{COMPSTAT}. 2010.

\bibitem[Bucilu�� et~al.(2006)Bucilu��, Caruana, and
  Niculescu-Mizil]{bucilu2006model}
Cristian Bucilu��, Rich Caruana, and Alexandru Niculescu-Mizil.
\newblock Model compression.
\newblock In \emph{KDD}, 2006.

\bibitem[Caruana et~al.(2004)Caruana, Niculescu-Mizil, Crew, and
  Ksikes]{caruana2004ensemble}
Rich Caruana, Alexandru Niculescu-Mizil, Geoff Crew, and Alex Ksikes.
\newblock Ensemble selection from libraries of models.
\newblock In \emph{ICML}, 2004.

\bibitem[Collobert et~al.(2011)Collobert, Kavukcuoglu, and Fara

In [174]:
oiclr_arxiv_map["2018_BkQCGzZ0-"]["arxivId"], glob.glob("./data/unpacked_sources/1801/1801.09797v1/*")

('1801.09797v1',
 ['./data/unpacked_sources/1801/1801.09797v1/fancyhdr.sty',
  './data/unpacked_sources/1801/1801.09797v1/iclr2018_conference.sty',
  './data/unpacked_sources/1801/1801.09797v1/main.tex'])

In [176]:
!tail ./data/unpacked_sources/1801/1801.09797v1/main.tex

  Berg{-}Kirkpatrick]{textvae2}
Zichao Yang, Zhiting Hu, Ruslan Salakhutdinov, and Taylor Berg{-}Kirkpatrick.
\newblock Improved variational autoencoders for text modeling using dilated
  convolutions.
\newblock In \emph{Proceedings of {ICML}'17}, pp.\  3881--3890, 2017.

\end{thebibliography}


\end{document}


# Read all .aux files to see if that is a better option

In [526]:
one_only = []
multiple_only = []
zero_only = []

for k in leaderboard_table_refs:
    arxivid = iclr_arxiv_map[k]["arxivId"]
    
    dir_name = arxivid.split(".")[0]
    
    # Read the bbl file to extract references
    potential_aux_files = glob.glob("./data/unpacked_sources/{}/{}/*.bbl".format(dir_name, arxivid))
#     all_tex_in_dir_for_multiple_bbl = potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.tex".format(dir_name, arxivid))

    if len(potential_aux_files) > 1:
        multiple_only.append(k)
    elif len(potential_aux_files) == 1:
        one_only.append(k)
    else:
        zero_only.append(k)

In [527]:
len(one_only), len(multiple_only), len(zero_only)

(1837, 73, 78)

# Map all references of 1896 papers to arxiv ids if possible based on title subset match

In [393]:
for k in paper_bbl_dict:
    for ref_key in paper_bbl_dict[k]:
        paper_bbl_dict[k][ref_key].pop("seq_id", None)

In [396]:
for k in paper_bbl_dict:
    if True:#k.startswith("2017_"):# or k.startswith("2020_"):
        if status%10==0:
            print("Done {} out of {}".format(status, total_stat))

        for ref_key in paper_bbl_dict[k]:
            clean_ref = unidecode.unidecode(paper_bbl_dict[k][ref_key]["text"])
            clean_ref = clean_ref.lower()
            clean_ref = re.sub('[\W_]', '', clean_ref)
            
            first_half_newblock_ind = clean_ref.find("newblock")
            second_half = clean_ref[first_half_newblock_ind+8:]
            second_half_newblock_ind = second_half.find("newblock")
            
            potential_title_idfied_newblock = second_half[0:second_half_newblock_ind]
            print(potential_title_idfied_newblock, clean_ref)
            break
    break

labelembeddingforattributebasedclassification bibitemakataetal2013akataperronninharchaouiandschmidakata2013labelzeynepakataflorentperronninzaidharchaouiandcordeliaschmidnewblocklabelembeddingforattributebasedclassificationnewblockinemphcomputervisionandpatternrecognitionpages8198262013


In [400]:
status = 0
total_stat = len(paper_bbl_dict)
idfied_working =0 

for k in paper_bbl_dict:
    if True:#k.startswith("2017_"):# or k.startswith("2020_"):
        if status%10==0:
            print("Done {} out of {} and idfied are {}".format(status, total_stat, idfied_working))

        for ref_key in paper_bbl_dict[k]:
            clean_ref = unidecode.unidecode(paper_bbl_dict[k][ref_key]["text"])
            clean_ref = clean_ref.lower()
            clean_ref = re.sub('[\W_]', '', clean_ref)
            
            first_half_newblock_ind = clean_ref.find("newblock")
            second_half = clean_ref[first_half_newblock_ind+8:]
            second_half_newblock_ind = second_half.find("newblock")
            
            potential_title_idfied_newblock = second_half[0:second_half_newblock_ind]
            
            if potential_title_idfied_newblock in paper_global_id_dict:
                idfied_working += 1
                paper_bbl_dict[k][ref_key]["seq_id"] = paper_global_id_dict[potential_title_idfied_newblock]
            else:
                for p_titel in paper_global_id_dict:
                    if clean_ref.find(p_titel) > -1:
                        paper_bbl_dict[k][ref_key]["seq_id"] = paper_global_id_dict[p_titel]
                        break
        status +=1

Done 0 out of 1896 and idfied are 0
Done 10 out of 1896 and idfied are 122
Done 20 out of 1896 and idfied are 353
Done 30 out of 1896 and idfied are 515
Done 40 out of 1896 and idfied are 645
Done 50 out of 1896 and idfied are 811
Done 60 out of 1896 and idfied are 991
Done 70 out of 1896 and idfied are 1187
Done 80 out of 1896 and idfied are 1323
Done 90 out of 1896 and idfied are 1486
Done 100 out of 1896 and idfied are 1660
Done 110 out of 1896 and idfied are 1768
Done 120 out of 1896 and idfied are 1947
Done 130 out of 1896 and idfied are 2126
Done 140 out of 1896 and idfied are 2320
Done 150 out of 1896 and idfied are 2457
Done 160 out of 1896 and idfied are 2610
Done 170 out of 1896 and idfied are 2772
Done 180 out of 1896 and idfied are 2914
Done 190 out of 1896 and idfied are 3075
Done 200 out of 1896 and idfied are 3235
Done 210 out of 1896 and idfied are 3418
Done 220 out of 1896 and idfied are 3569
Done 230 out of 1896 and idfied are 3790
Done 240 out of 1896 and idfied are 

In [491]:
len(paper_bbl_dict)

1896

In [401]:
with open("save_paper_bbl_becoz_ML_deadline_eating_RAM.pkl", "wb") as f:
    pickle.dump(paper_bbl_dict, f)

In [278]:
total = 0
succ = 0

for k in paper_bbl_dict:
    if True:# k.startswith("2017_") or k.startswith("2018_"):
        for ref_key in paper_bbl_dict[k]:
            total += 1
            if "seq_id" in paper_bbl_dict[k][ref_key]:
                succ += 1
print(succ, total)

57636 72619


In [158]:
with open("save_paper_bbl_becoz_ML_deadline_eating_RAM.pkl", "wb") as f:
    pickle.dump(paper_bbl_dict, f)

In [352]:
with open("save_paper_bbl_becoz_ML_deadline_eating_RAM.pkl", "rb") as f:
    paper_bbl_dict = pickle.load(f)

In [353]:
len(paper_bbl_dict)

1896

In [117]:
for k, v in paper_bbl_dict.items():
    if k.startswith("2017") or k.startswith("2018"):
        print(k, paper_bbl_dict[k].keys())
        break

2017_B1-Hhnslg dict_keys(['akata2013label', 'akata2015evaluation', 'lei2015predicting', 'banerjee2005clustering', 'bellet2013survey', 'edwards2017towards', 'elhoseiny2013write', 'goldberger2004neighbourhood', 'hochreiter1997long', 'ioffe2015batch', 'kingma2014adam', 'kingma2013auto', 'koch2015siamese', 'krizhevsky2012imagenet', 'kulis2012metric', 'lake2011one', 'liao2016', 'maaten2008visualizing', 'mensink2013distance', 'miller2000learning', 'min2009deep', 'ravi2017meta', 'reed2016learning', 'rezende2014stochastic', 'rippel2015metric', 'russakovsky2015imagenet', 'salakhutdinov2007learning', 'szegedy2015going', 'vinyals2016matching', 'weinberger2005distance', 'welinder2010caltech'])


# Strict reference-> title mapping/disambiguation

In [629]:
potential_errorenous_entries = []

In [630]:
status = 0
total_stat = len(paper_bbl_dict)
idfied_working =0 

for k in paper_bbl_dict:
    if True:#k.startswith("2017_"):# or k.startswith("2020_"):
        if status%10==0:
            print("Done {} out of {} and idfied are {}".format(status, total_stat, idfied_working))

        for ref_key in paper_bbl_dict[k]:
            clean_ref = unidecode.unidecode(paper_bbl_dict[k][ref_key]["text"])
            clean_ref = clean_ref.lower()
            clean_ref = re.sub('[\W_]', '', clean_ref)
            
            first_half_newblock_ind = clean_ref.find("newblock")
            second_half = clean_ref[first_half_newblock_ind+8:]
            second_half_newblock_ind = second_half.find("newblock")
            
            potential_title_idfied_newblock = second_half[0:second_half_newblock_ind]
            
            if potential_title_idfied_newblock in paper_global_id_dict:
                idfied_working += 1
                paper_bbl_dict[k][ref_key]["strict_seq_id"] = paper_global_id_dict[potential_title_idfied_newblock]
                if len(potential_title_idfied_newblock) < 10:
                    potential_errorenous_entries.append([k, potential_title_idfied_newblock, ref_key])
        status +=1

Done 0 out of 1896 and idfied are 0
Done 10 out of 1896 and idfied are 122
Done 20 out of 1896 and idfied are 353
Done 30 out of 1896 and idfied are 515
Done 40 out of 1896 and idfied are 645
Done 50 out of 1896 and idfied are 811
Done 60 out of 1896 and idfied are 991
Done 70 out of 1896 and idfied are 1187
Done 80 out of 1896 and idfied are 1323
Done 90 out of 1896 and idfied are 1486
Done 100 out of 1896 and idfied are 1660
Done 110 out of 1896 and idfied are 1768
Done 120 out of 1896 and idfied are 1947
Done 130 out of 1896 and idfied are 2126
Done 140 out of 1896 and idfied are 2320
Done 150 out of 1896 and idfied are 2457
Done 160 out of 1896 and idfied are 2610
Done 170 out of 1896 and idfied are 2772
Done 180 out of 1896 and idfied are 2914
Done 190 out of 1896 and idfied are 3075
Done 200 out of 1896 and idfied are 3235
Done 210 out of 1896 and idfied are 3418
Done 220 out of 1896 and idfied are 3569
Done 230 out of 1896 and idfied are 3790
Done 240 out of 1896 and idfied are 

In [631]:
len(potential_errorenous_entries)

152

In [633]:
potential_errorenous_entries[0:3]

[['2017_B1G9tvcgx', 'fastrcnn', 'frcnn'],
 ['2017_BJtNZAFgg', 'fastrcnn', 'fastrcnn'],
 ['2017_HyQJ-mclg', 'fastrcnn', 'Girshick']]

In [635]:
paper_bbl_dict["2017_B1G9tvcgx"]["frcnn"], paper_bbl_dict["2017_BJtNZAFgg"]["fastrcnn"], paper_bbl_dict["2017_HyQJ-mclg"]["Girshick"]

({'text': '\\bibitem[Girshick(2015)]{frcnn} Ross Girshick. \\newblock {Fast R-CNN}. \\newblock In \\emph{ICCV}, 2015.',
  'arxivids': [],
  'bib-seq': 8,
  'seq_id': 894386,
  'strict_seq_id': 894386},
 {'text': '\\bibitem[Girshick(2015)]{fastrcnn} Ross Girshick. \\newblock Fast {R-CNN}. \\newblock In \\emph{ICCV}, 2015.',
  'arxivids': [],
  'bib-seq': 8,
  'seq_id': 894386,
  'strict_seq_id': 894386},
 {'text': '\\bibitem[Girshick(2015)]{Girshick} Ross Girshick. \\newblock Fast r-cnn. \\newblock \\emph{In ICCV}, 2015.',
  'arxivids': [],
  'bib-seq': 5,
  'seq_id': 894386,
  'strict_seq_id': 894386})

In [636]:
potential_errorenous_entries

[['2017_B1G9tvcgx', 'fastrcnn', 'frcnn'],
 ['2017_BJtNZAFgg', 'fastrcnn', 'fastrcnn'],
 ['2017_HyQJ-mclg', 'fastrcnn', 'Girshick'],
 ['2017_HysBZSqlx', 'openaigym', 'brockman2016openai'],
 ['2017_SJNDWNOlg', 'fastrcnn', 'girshick2015_fast_rcnn'],
 ['2017_rkE3y85ee', 'asampling', 'maddison2014sampling'],
 ['2018_BJk59JZ0b', 'openaigym', 'BrockmanEtAl2016'],
 ['2018_H1bM1fZCW', 'maskrcnn', 'maskrcnn'],
 ['2018_H1cKvl-Rb', 'openaigym', 'brockman2016openai'],
 ['2018_Hkbd5xZRb', 'deepsets', 'zaheer2017deep'],
 ['2018_Hkbd5xZRb', 'deepsets', 'manzil_deepsets'],
 ['2018_S14EogZAZ', 'openaigym', 'brockman2016openai'],
 ['2018_S1J2ZyZ0Z', 'fastrcnn', 'Girshick2015'],
 ['2018_S1v4N2l0-', 'fastrcnn', 'girshick2015fast'],
 ['2018_SJA7xfb0b', 'fishergan', 'mroueh2017fisher'],
 ['2018_Sk9yuql0Z', 'fastrcnn', 'Girshick_2015_Fast'],
 ['2018_SyJS-OgR-', 'maskrcnn', 'he2017mask'],
 ['2018_Syg-YfWCW', 'deepsets', 'zaheer2016deepsets'],
 ['2018_Syjha0gAZ', 'maskrcnn', 'he2017mask'],
 ['2018_r1RF3ExCb', '

In [638]:
paper_bbl_dict["2018_Hkbd5xZRb"]["zaheer2017deep"], paper_bbl_dict["2018_Hkbd5xZRb"]["manzil_deepsets"], paper_bbl_dict["2020_ryxF80NYwS"]["deep_sets"]

({'text': '\\bibitem[Zaheer et~al.(2017{\\natexlab{a}})Zaheer, Kottur, Ravanbakhsh, Poczos, Salakhutdinov, and Smola]{zaheer2017deep} M.~Zaheer, S.~Kottur, S.~Ravanbakhsh, B.~Poczos, R.~Salakhutdinov, and A.~Smola. \\newblock Deep sets. \\newblock \\emph{arXiv preprint arXiv:1703.06114}, 2017{\\natexlab{a}}.',
  'arxivids': [],
  'bib-seq': 37,
  'seq_id': 644977,
  'strict_seq_id': 644977},
 {'text': '\\bibitem[Zaheer et~al.(2017{\\natexlab{b}})Zaheer, Kottur, Ravanbakhsh, Poczos, Salakhutdinov, and Smola]{manzil_deepsets} M.~Zaheer, S.~Kottur, S.~Ravanbakhsh, B.~Poczos, R.R. Salakhutdinov, and A.J. Smola. \\newblock Deep sets. \\newblock In \\emph{Advances in Neural Information Processing Systems 30}, pages 3393--3403, 2017{\\natexlab{b}}.',
  'arxivids': [],
  'bib-seq': 38,
  'seq_id': 644977,
  'strict_seq_id': 644977},
 {'text': "\\bibitem[Zaheer et~al.(2017)Zaheer, Kottur, Ravanbakhsh, P{\\'{o}}czos, Salakhutdinov, and Smola]{deep_sets} Zaheer, M., Kottur, S., Ravanbakhsh, S., P

# Extract citation key from citation context and map to unique paper

In [589]:
graph_edges = defaultdict(set)

In [592]:
counting_iclr_refs = defaultdict(list)
sanity_count = 0

**This will be done in 3 parts separately for each of the types of citation extracted from table cells, table captions and table-fulltextsearch**

In [593]:
# DONE = False
found_in_paper_not_arxiv = 0
fipna_pid = []
bib_seq_not_found = []

iclr_yearwise_graph_info = {}
reverse_iclr_yearwise_graph_info = {}

for k in leaderboard_refs:
    title = iclr_arxiv_map[k]["title"].lower()
    unaccented_title = unidecode.unidecode(title)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title in paper_global_id_dict:
        self_global_id = paper_global_id_dict[clean_title]
    else:
        paper_global_id_dict[clean_title] = len(paper_global_id_dict)
        self_global_id = paper_global_id_dict[clean_title]
    
    iclr_yearwise_graph_info[k] = self_global_id
    reverse_iclr_yearwise_graph_info[self_global_id] = k
    
    try:
        if leaderboard_refs[k]["count"] > 0 and k in paper_bbl_dict: #k.startswith("2017"):# or k.startswith("2018"):
            
            ldb_abl_keys = []
            if "ldb" in leaderboard_refs[k]['refs'] and leaderboard_refs[k]['refs']["ldb"]:
                ldb_abl_keys.append("ldb")
            if "abl" in leaderboard_refs[k]['refs'] and leaderboard_refs[k]['refs']["abl"]:
                ldb_abl_keys.append("abl")
            
            for ldbabl_ref_key in ldb_abl_keys:
                for ldb_ref in leaderboard_refs[k]['refs'][ldbabl_ref_key]:
                    m = re.search("bib[a]?-bib([0-9][0-9]?)", ldb_ref)
                    if m:
                        seq_key = int(m.group(1))
                        if k == "2017_BJlxmAKlg":
                            seq_key = seq_key - 1
                        found = False
                        partially_found = False
                        for paper_bib_entry in paper_bbl_dict[k].values():
                            if "bib-seq" in paper_bib_entry and paper_bib_entry["bib-seq"] == seq_key:
                                if "seq_id" in paper_bib_entry:
                                    found = True
                                    graph_edges[self_global_id].add(paper_bib_entry["seq_id"])
                                    sanity_count += 1
                                    counting_iclr_refs[paper_bib_entry["seq_id"]].append(paper_bib_entry)
                                else:
                                    partially_found = True
                                    found_in_paper_not_arxiv += 1
                        if not found and not partially_found:
                            bib_seq_not_found.append((k, ldb_ref))
                    else:
                        print("insucc bib re match: ", k, ldb_ref)
            if not self_global_id in graph_edges:
                fipna_pid.append(k)
    except Exception as ex:
        print("Error: ", k, ex)

insucc bib re match:  2017_BycCx8qex <ref id='S1-F2'>2</ref>
insucc bib re match:  2017_HJ0UKP9ge <ref id='S2-E2'>2</ref>
insucc bib re match:  2017_HJ0UKP9ge <ref id='S2-E1'>1</ref>
insucc bib re match:  2017_HkcdHtqlx <ref id='S3-E5'>5</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS4'>5.4</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS2'>5.2</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS3'>5.3</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E7'>7</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E8'>8</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E5'>5</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E6'>6</ref>
insucc bib re match:  2017_SkxKPDv5xl <ref id='S2-E2'>2</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F2'>2</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S2-F1'>1</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F3'>3</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F2'>2</ref>
insucc bib re 

In [550]:
len(counting_iclr_refs), sanity_count

(988, 2176)

In [499]:
found_in_paper_not_arxiv, len(bib_seq_not_found), bib_seq_not_found[0:5]

(198,
 193,
 [('2017_BJbD_Pqlg', "<ref id='bib-bib1'>2014</ref>"),
  ('2017_HJ1kmv9xx', "<ref id='bib-bib23'>2016</ref>"),
  ('2017_HkpbnH9lx', "<ref id='bib-bib46'>46</ref>"),
  ('2017_HkpbnH9lx', "<ref id='bib-bib34'>34</ref>"),
  ('2017_HkpbnH9lx', "<ref id='bib-bib22'>22</ref>")])

In [405]:
392-198

194

In [368]:
nk = "2017_HkpbnH9lx"
iclr_arxiv_map[nk]["arxivId"]#, paper_bbl_dict[nk]

'1605.08803v3'

In [369]:
found_in_paper_not_arxiv

198

In [383]:
print("AutoencodersandGenerativeAdversarialNetworksforImbalancedSequenceClassification" in paper_global_id_dict)
print("AutoencodersandGenerativeAdversarialNetworksforImbalancedSequenceClassification".lower() in paper_global_id_dict)

True
False


In [382]:
#WRONG!!! DONT DO THIS AGAIN

# caps_titels_list = list(paper_global_id_dict.keys())

# for paper_caps in caps_titels_list:
#     if paper_caps.lower() in paper_global_id_dict:
#         del paper_global_id_dict[paper_caps]

In [384]:
len(paper_global_id_dict)

1713

In [370]:
leaderboard_refs["1702.08811v3"]

{}

In [372]:
del leaderboard_refs["1702.08811v3"]

In [413]:
graph_edges

defaultdict(set,
            {40517: {2895, 45773, 283317, 385142, 490188, 779070},
             419660: {111966, 396737, 861194, 888571, 1082720, 1147022},
             1121619: {117412,
              171477,
              190284,
              309772,
              513240,
              596867,
              907099,
              1039218,
              1140292},
             748140: {2895, 930686, 1044012},
             323641: {154986, 539819, 1107219},
             298789: {408575, 772825, 1082252},
             116985: {81294,
              88529,
              143084,
              416925,
              574288,
              809115,
              920951,
              928211,
              985741,
              1014228},
             57209: {38666, 502887, 545116, 635748, 838523},
             825280: {190284, 730027, 760983},
             106387: {404528, 406679, 416925, 678460, 774827, 818442},
             187357: {2895,
              88529,
              88735,
              

In [411]:
len(iclr_yearwise_graph_info), len(reverse_iclr_yearwise_graph_info)

(1718, 1711)

In [415]:
reverse_iclr_yearwise_graph_info[1121619], leaderboard_refs["2017_B1E7Pwqgl"]

('2017_B1E7Pwqgl',
 {'refs': {'ldb': ["<ref id='bib-bib62'>62</ref>",
    "<ref id='bib-bib19'>19</ref>",
    "<ref id='bib-bib58'>58</ref>",
    "<ref id='bib-bib63'>63</ref>",
    "<ref id='bib-bib59'>59</ref>",
    "<ref id='bib-bib21'>21</ref>",
    "<ref id='bib-bib60'>60</ref>",
    "<ref id='bib-bib29'>29</ref>",
    "<ref id='bib-bib30'>30</ref>",
    "<ref id='bib-bib25'>25</ref>",
    "<ref id='bib-bib44'>44</ref>"]},
  'count': 15})

In [434]:
#calculate ideal number of nodes after step 1

stage1_pot_nodes = []

for k in leaderboard_refs:
    if leaderboard_refs[k]["count"] > 0:
        stage1_pot_nodes.append(k)
print(len(stage1_pot_nodes))

419


In [441]:
len(set(fipna_pid))

57

In [443]:
fipna_pid[0:4]

['2017_BJbD_Pqlg', '2017_BycCx8qex', '2017_HJ1kmv9xx', '2017_HkcdHtqlx']

In [449]:
iclr_arxiv_map["2017_BycCx8qex"]["arxivId"], leaderboard_refs["2017_BycCx8qex"], paper_bbl_dict["2017_BycCx8qex"]

('1703.04474v1',
 {'refs': {'ldb': ["<ref id='S1-F2'>2</ref>"]}, 'count': 1},
 {'andor2016globally': {'text': '\\bibitem[{Andor et~al.(2016)Andor, Alberti, Weiss, Severyn, Presta, Ganchev, Petrov, and Collins}]{andor2016globally} Daniel Andor, Chris Alberti, David Weiss, Aliaksei Severyn, Alessandro Presta, Kuzman Ganchev, Slav Petrov, and Michael Collins. 2016. \\newblock Globally normalized transition-based neural networks. \\newblock In {\\em Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics\\/}. pages 2442--2452.',
   'arxivids': [],
   'seq_id': 160522},
  'attardi2009reverse': {'text': "\\bibitem[{Attardi and Dell'Orletta(2009)}]{attardi2009reverse} Giuseppe Attardi and Felice Dell'Orletta. 2009. \\newblock Reverse revision and linear tree combination for dependency parsing. \\newblock In {\\em Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics,

In [500]:
print(len(graph_edges))

347


In [407]:
ec = 0

for x in graph_edges:
    ec += len(graph_edges[x])
print(ec)

2053


In [408]:
uniq_nodes = set()
for k, v in graph_edges.items():
#     print(k, v)
#     break
    uniq_nodes.add(k)
    for i in v:
        uniq_nodes.add(i)
uniq_nodes = set(uniq_nodes)

In [409]:
len(uniq_nodes)

1271

In [410]:
len(paper_global_id_dict)

1205268

In [377]:
list(paper_global_id_dict.keys())[0]

'onepositiveandtwonegativeresultsforderivedcategoriesofalgebraicstacks'

# Segment 2

In [594]:
found_in_paper_not_arxiv_p2 = []

for k,v in full_text_refs_dict.items():
    
    title = iclr_arxiv_map[k]["title"].lower()
    unaccented_title = unidecode.unidecode(title)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title in paper_global_id_dict:
        self_global_id = paper_global_id_dict[clean_title]
    else:
        paper_global_id_dict[clean_title] = len(paper_global_id_dict)
        self_global_id = paper_global_id_dict[clean_title]
    
    iclr_yearwise_graph_info[k] = self_global_id
    reverse_iclr_yearwise_graph_info[self_global_id] = k
    
    for i in v:
        m = re.search("\\\cite[ptyear\[\]\*]*?\{([^\{]*)\}", i)
        cits = []
        if m:
            p = m.group(1)
            cits = p.split(",")
        if k in paper_bbl_dict:
            for cit_key in cits:
                if cit_key in paper_bbl_dict[k]:
                    if "seq_id" in paper_bbl_dict[k][cit_key]:
                        graph_edges[self_global_id].add(paper_bbl_dict[k][cit_key]["seq_id"])
                        counting_iclr_refs[paper_bbl_dict[k][cit_key]["seq_id"]].append(paper_bbl_dict[k][cit_key])
                        sanity_count +=1
                    else:
                        found_in_paper_not_arxiv_p2.append(paper_bbl_dict[k][cit_key]["text"])
    if not self_global_id in graph_edges:
        print(k)

2017_B1YfAfcgl
2017_BJYwwY9ll
2017_BJh6Ztuxl
2017_By5e2L9gl
2017_HJSCGD9ex
2017_HJpfMIFll
2017_HkNRsU5ge
2017_Hkg4TI9xl
2017_HkwoSDPgg
2017_S13wCE9xx
2017_r1BJLw9ex
2017_r1G4z8cge
2017_r1y1aawlg
2017_rkFBJv9gg
2017_rky3QW9le
2017_ryuxYmvel
2018_B1ZZTfZAW
2018_B1jscMbAW
2018_B1l8BtlCb
2018_B1spAqUp-
2018_B1ydPgTpW
2018_BJvWjcgAZ
2018_BkUp6GZRW
2018_ByOnmlWC-
2018_H113pWZRb
2018_HkXWCMbRW
2018_Hksj2WWAW
2018_HyRVBzap-
2018_HyUNwulC-
2018_HytSvlWRZ
2018_S1GUgxgCW
2018_S1XolQbRW
2018_S1m6h21Cb
2018_SJCq_fZ0Z
2018_SJyEH91A-
2018_SyqShMZRb
2018_rJFOptp6Z
2018_rJTutzbA-
2018_rJg4YGWRb
2018_rk6cfpRjZ
2018_rkA1f3NpZ
2018_rkYTTf-AZ
2018_rkYgAJWCZ
2018_rkZvSe-RZ
2018_ry6-G_66b
2018_ryTp3f-0-
2018_ryazCMbR-
2019_B1G5ViAqFm
2019_BJe-Sn0ctm
2019_BkG5SjR5YQ
2019_BkG8sjR5Km
2019_BkfbpsAcF7
2019_Bkg2viA5FQ
2019_BklAEsR5t7
2019_BklKFo09YX
2019_ByMHvs0cFQ
2019_H1e572A5tQ
2019_H1gTEj09FX
2019_H1ldNoC9tX
2019_HJxfm2CqKm
2019_Hk4dFjR5K7
2019_HkgHk3RctX
2019_HylKJhCcKm
2019_Hyx4knR9Ym
2019_Hyxsl2AqKm
2019_Sk

In [552]:
len(counting_iclr_refs), sanity_count

(2546, 22312)

In [464]:
full_text_refs_dict["2017_ryuxYmvel"]

['\n   local patterns of tokens.\n \\item\n   2-layer 1D CNN with LSTM \\citep{lstm} sequence reduction. This mo',
 ' of\n   local patterns of tokens.\n \\item\n   2-layer 1D CNN with LSTM \\citep{lstm} sequence reduction. Thi',
 'of\n   local patterns of tokens.\n \\item\n   2-layer 1D CNN with LSTM \\citep{lstm} sequence reduction. This m']

In [465]:
m = re.search("\\\cite[ptyear\[\]\*]*?\{([^\{]*)\}", full_text_refs_dict["2017_ryuxYmvel"][0])
m, m.group(1)


(<re.Match object; span=(65, 77), match='\\citep{lstm}'>, 'lstm')

In [466]:
"lstm" in paper_bbl_dict["2017_ryuxYmvel"]

True

In [468]:
iclr_yearwise_graph_info["2017_ryuxYmvel"], 436123 in graph_edges

(436123, False)

In [463]:
paper_bbl_dict["2017_B1MRcPclx"]

{'tensorflow': {'text': '\\bibitem[Abadi et~al.(2016)Abadi, Agarwal, Barham, Brevdo, Chen, Citro, Corrado, Davis, Dean, Devin, et~al.]{tensorflow} Mart{\\i}n Abadi, Ashish Agarwal, Paul Barham, Eugene Brevdo, Zhifeng Chen, Craig Citro, Greg~S Corrado, Andy Davis, Jeffrey Dean, Matthieu Devin, et~al. \\newblock Tensorflow: Large-scale machine learning on heterogeneous distributed systems. \\newblock \\emph{arXiv preprint arXiv:1603.04467}, 2016.',
  'arxivids': [],
  'bib-seq': 1,
  'seq_id': 842029},
 'strnn': {'text': '\\bibitem[Balduzzi and Ghifary(2016)]{strnn} David Balduzzi and Muhammad Ghifary. \\newblock Strongly-typed recurrent neural networks. \\newblock In \\emph{ICML}, 2016.',
  'arxivids': [],
  'bib-seq': 2,
  'seq_id': 1008429},
 'bordes2016learning': {'text': '\\bibitem[Bordes and Weston(2016)]{bordes2016learning} Antoine Bordes and Jason Weston. \\newblock Learning end-to-end goal-oriented dialog. \\newblock \\emph{arXiv preprint arXiv:1605.07683}, 2016.',
  'arxivids':

In [432]:
len(set(found_in_paper_not_arxiv_p2)), len(found_in_paper_not_arxiv_p2)

(988, 3761)

In [502]:
print(len(graph_edges))

1313


In [503]:
ec = 0

for x in graph_edges:
    ec += len(graph_edges[x])
print(ec)

6631


In [505]:
uniq_nodes = set()
for k, v in graph_edges.items():
#     print(k, v)
#     break
    uniq_nodes.add(k)
    for i in v:
        uniq_nodes.add(i)
uniq_nodes = set(uniq_nodes)

In [506]:
len(uniq_nodes)

3641

# Segement 2-b

In [748]:
with open("type1_pdf_to_text.pkl", "rb") as f:
    type1_pdf_to_text = pickle.load(f)

# P1

In [473]:
yw_ec = {str(y): [] for y in range(2017, 2021)}
paper_nodes = {str(y): 0 for y in range(2017, 2021)}

for x in graph_edges:
    year = reverse_iclr_yearwise_graph_info[x][0:4]
    yw_ec[year] += graph_edges[x]
    paper_nodes[year] += 1
    
print([(y, len(yw_ec[y]), len(set(yw_ec[y]))) for y in yw_ec])
print(paper_nodes)

[('2017', 695, 376), ('2018', 1171, 603), ('2019', 1949, 1044), ('2020', 2816, 1503)]
{'2017': 162, '2018': 239, '2019': 389, '2020': 523}


### {'2017': 695, '2018': 1171, '2019': 1949, '2020': 2816}

# P2 

In [425]:
unique_references_set = set()

for y in yw_ec:
    for rrr in yw_ec[y]:
        unique_references_set.add(rrr)
print(len(unique_references_set))

2546


In [469]:
len(graph_edges)

1313

In [496]:
len(set(graph_edges.keys()).intersection(unique_references_set))

242

In [492]:
len(graph_edges)

1337

# Save all data

In [528]:
with open('GRAPH_0.pkl', "wb") as f:
    pickle.dump(graph_edges, f)

In [533]:
with open('iclr_arxiv_gid_map.pkl', "wb") as f:
    pickle.dump(iclr_yearwise_graph_info, f)

In [535]:
with open('reverse_iclr_arxiv_gid_map.pkl', "wb") as f:
    pickle.dump(reverse_iclr_yearwise_graph_info, f)

In [534]:
len(graph_edges), len(iclr_yearwise_graph_info), len(reverse_iclr_yearwise_graph_info)

(1313, 1718, 1711)

In [532]:
reverse_iclr_yearwise_graph_info

{40517: '2017_B1-Hhnslg',
 143084: '2017_B1-q5Pqxl',
 419660: '2017_B184E5qee',
 1121619: '2017_B1E7Pwqgl',
 74018: '2017_B1G9tvcgx',
 722379: '2017_B1GOWV5eg',
 748140: '2017_B1Igu2ogg',
 882646: '2017_B1IzH7cxl',
 121566: '2017_B1MRcPclx',
 477987: '2017_B1YfAfcgl',
 323641: '2017_B1ckMDqlg',
 298789: '2017_B1ewdt9xe',
 994759: '2017_B1gtu5ilg',
 116985: '2017_B1hdzd5lg',
 1079820: '2017_B1kJ6H9ex',
 712962: '2017_B1mAJI9gl',
 57209: '2017_BJ--gPcxl',
 659591: '2017_BJ0Ee8cxx',
 825280: '2017_BJAFbaolg',
 208291: '2017_BJC8LF9ex',
 106387: '2017_BJC_jUqxe',
 568032: '2017_BJK3Xasel',
 117882: '2017_BJYwwY9ll',
 828647: '2017_BJbD_Pqlg',
 917016: '2017_BJh6Ztuxl',
 352448: '2017_BJjn-Yixl',
 187357: '2017_BJlxmAKlg',
 447871: '2017_BJm4T4Kgx',
 767136: '2017_BJtNZAFgg',
 103149: '2017_BJuysoFeg',
 822222: '2017_BJwFrvOeg',
 636277: '2017_Bk8N0RLxx',
 697270: '2017_BkCPyXm1l',
 280424: '2017_BkUDvt5gg',
 147874: '2017_BkdpaH9ll',
 946498: '2017_Bkepl7cee',
 556709: '2017_BkfiXiUlg',
 4

## New papers to download

In [509]:
new_papers_to_download = unique_references_set.difference(set(graph_edges.keys()))

In [510]:
len(new_papers_to_download), len(new_papers_to_download)+218

(2328, 2546)

In [519]:
map_new_papers_to_download_to_arxivId = {}
hacky_list_of_dict = list(paper_global_id_dict.items())

for mid in new_papers_to_download:
    potential_paper = hacky_list_of_dict[mid]
    if mid == potential_paper[1]:
        map_new_papers_to_download_to_arxivId[mid] = [potential_paper[0], paper_id_titles[potential_paper[0]]]
    else:
        print("Not found: ", mid)

In [521]:
list(map_new_papers_to_download_to_arxivId.items())[0:2]

[(1179651, ['searchingformobilenetv3', ['a_1905.02244v5']]),
 (1179653,
  ['mixmatchaholisticapproachtosemisupervisedlearning', ['a_1905.02249v2']])]

In [522]:
with open("map_new_papers_to_download_to_arxiv.pkl", "wb") as f:
    pickle.dump(map_new_papers_to_download_to_arxivId, f)

In [None]:
global_graph_nodes = list(graph_edges.keys())
global_graph_nodes_itercopy = list(graph_edges.keys())
cc = 1
cc_list = defaultdict(set)
idx = 0


for k in global_graph_nodes_itercopy:
    cc_list[idx].add(k)
    for neighs in graph_edges[k]:
        cc_list.add(neighs)

In [476]:
def DFSUtil(temp, v, visited):   
    # Mark the current vertex as visited 
    visited[v] = True

    # Store the vertex to list 
    temp.append(v) 

    # Repeat for all vertices adjacent 
    # to this vertex v 
    for i in graph_edges[v]: 
        if visited[i] == False: 
            temp = DFSUtil(temp, i, visited) 
    return temp 

In [484]:
def connectedComponents(): 
    visited = {}
    cc = [] 
    for i in graph_edges: 
        visited[i] = False
        for j in graph_edges[i]:
            visited[j] = False
    
    for v in graph_edges:
        if visited[v] == False: 
            cc.append(DFSUtil([], v, visited)) 
    print(cc)

In [485]:
connectedComponents()

RuntimeError: dictionary changed size during iteration

In [263]:
for k,v in full_text_refs_dict.items():
    print(k)
    break

2017_B1-q5Pqxl


In [147]:
iclr_arxiv_map["2017_B1-Hhnslg"]["title"]

'Prototypical Networks for Few-shot Learning'

In [142]:
m = re.search("bib-bib([0-9][0-9]?)", leaderboard_refs["2017_B1-Hhnslg"]['refs']['ldb'][0])

In [144]:
m.group(1)

'17'

In [118]:
leaderboard_refs["2017_B1-Hhnslg"]

{'refs': {'ldb': ["<ref id='bib-bib17'>2016</ref>",
   "<ref id='bib-bib1'>2013</ref>",
   "<ref id='bib-bib23'>2016</ref>",
   "<ref id='bib-bib29'>2016</ref>",
   "<ref id='bib-bib2'>2015</ref>",
   "<ref id='bib-bib22'>2017</ref>",
   "<ref id='bib-bib6'>2017</ref>"],
  'abl': []},
 'count': 12}

In [119]:
paper_bbl_dict["2017_B1-Hhnslg"].keys()

dict_keys(['akata2013label', 'akata2015evaluation', 'lei2015predicting', 'banerjee2005clustering', 'bellet2013survey', 'edwards2017towards', 'elhoseiny2013write', 'goldberger2004neighbourhood', 'hochreiter1997long', 'ioffe2015batch', 'kingma2014adam', 'kingma2013auto', 'koch2015siamese', 'krizhevsky2012imagenet', 'kulis2012metric', 'lake2011one', 'liao2016', 'maaten2008visualizing', 'mensink2013distance', 'miller2000learning', 'min2009deep', 'ravi2017meta', 'reed2016learning', 'rezende2014stochastic', 'rippel2015metric', 'russakovsky2015imagenet', 'salakhutdinov2007learning', 'szegedy2015going', 'vinyals2016matching', 'weinberger2005distance', 'welinder2010caltech'])

In [121]:
paper_bbl_dict["2017_B1-Hhnslg"]["welinder2010caltech"]

{'text': '\\bibitem[Welinder et~al.(2010)Welinder, Branson, Mita, Wah, Schroff, Belongie, and Perona]{welinder2010caltech} P.~Welinder, S.~Branson, T.~Mita, C.~Wah, F.~Schroff, S.~Belongie, and P.~Perona. \\newblock {Caltech-UCSD Birds 200}. \\newblock Technical Report CNS-TR-2010-001, California Institute of Technology, 2010.',
 'arxivids': [],
 'bib-seq': 31}

In [123]:
*

SyntaxError: invalid syntax (<ipython-input-123-b462aee1c6c4>, line 1)

In [None]:
graph_edges = defaultdict(set)

In [None]:
for k in leaderboard_refs:

In [230]:
for k,v in paper_id_titles.items():
    print(k, v)
    break

Onepositiveandtwonegativeresultsforderivedcategoriesofalgebraicstacks ['e_1405.1888']


In [603]:
# paper_global_id_dict[2895]
for k,v in paper_global_id_dict.items():
    if v == 2895:
        print(k, v)
        break

q 2895


In [553]:
len(counting_iclr_refs)

2546

In [601]:
list(counting_iclr_refs.items())[0:40]

[(2895,
  [{'text': '\\bibitem[Liao et~al.(2016)Liao, Schwing, Zemel, and Urtasun]{liao2016} Renjie Liao, Alexander Schwing, Richard Zemel, and Raquel Urtasun. \\newblock Learning deep parsimonious representations. \\newblock \\emph{Advances in Neural Information Processing Systems}, 2016.',
    'arxivids': [],
    'bib-seq': 17,
    'seq_id': 2895},
   {'text': '\\bibitem[Socher et~al., 2014]{socher2014grounded} Socher, R., Karpathy, A., Le, Q.~V., Manning, C.~D., and Ng, A.~Y. (2014). \\newblock Grounded compositional semantics for finding and describing images with sentences. \\newblock {\\em Transactions of the Association for Computational Linguistics}, 2:207--218.',
    'arxivids': [],
    'bib-seq': 26,
    'seq_id': 2895},
   {'text': '\\bibitem[\\protect\\citeauthoryear{Chen, Bolton, and Manning}{Chen et~al\\mbox{.}}{2016}]% {ChenACL2016} \\bibfield{author}{\\bibinfo{person}{Danqi Chen}, \\bibinfo{person}{Jason Bolton}, {and} \\bibinfo{person}{Christopher~D Manning}.} \\bibinf

In [623]:
ref_venue_counts = {k: 0 for k in ["nips", "iclr", "icml", "cvpr", "acl", "arxiv_ppt"]}
rest_for_inspection = defaultdict(list)
multiple_venue_inspection = defaultdict(list)

venue_label = {}

for i in counting_iclr_refs:
    local_finder = []
    for one_ref in counting_iclr_refs[i]:
        
        ref_content = one_ref['text'].lower()
        if ref_content.find("neural information processing systems") > -1 or ref_content.find("nips") > -1:
            local_finder.append("nips")
        elif ref_content.find("iclr") > -1 or ref_content.find("international conference on learning representations") > -1 or ref_content.find("conference on learning representation") > -1:
            local_finder.append("iclr")
        elif ref_content.find("icml") > -1 or ref_content.find("international conference on machine learning") > -1:
            local_finder.append("icml")
        elif ref_content.find("cvpr") > -1 or ref_content.find("conference on computer vision and pattern recognition") > -1:
            local_finder.append("cvpr")
        elif ref_content.find("arxiv preprint") > -1:
            local_finder.append("arxiv_ppt")
        elif ref_content.find("association for computational linguistics") > -1 or ref_content.find(" acl ") > -1 or ref_content.find(" acl}") > -1 or ref_content.find("{acl}") > -1 or ref_content.find("{naacl-hlt}") > -1:
            local_finder.append("acl")
        else:
            rest_for_inspection[i].append(ref_content)
        multiple_venue_inspection[i].append(one_ref)
    local_finder = list(set(local_finder))
    if len(local_finder) == 0:
        rest_for_inspection[i].append(ref_content)
    else:
        if len(local_finder) == 1:
            ref_venue_counts[local_finder[0]] += 1
            venue_label[i] = local_finder[0]
        else:
            if "arxiv_ppt" in local_finder:
                local_finder.remove("arxiv_ppt")
            if len(local_finder) == 1:
                ref_venue_counts[local_finder[0]] += 1
                venue_label[i] = local_finder[0]
            else:
                print(i, local_finder)#, list(set(rest_for_inspection[i])))

2895 ['cvpr', 'icml', 'acl', 'iclr', 'nips']
45773 ['cvpr', 'icml', 'acl', 'iclr', 'nips']
111966 ['iclr', 'icml']
171477 ['cvpr', 'icml', 'iclr', 'nips']
545116 ['cvpr', 'icml', 'acl', 'iclr', 'nips']
1045843 ['acl', 'nips']
125627 ['cvpr', 'icml', 'acl', 'nips']
953530 ['nips', 'cvpr', 'icml']
835 ['nips', 'cvpr']
497194 ['cvpr', 'icml', 'acl', 'iclr', 'nips']
490510 ['nips', 'icml']
415153 ['cvpr', 'icml', 'acl', 'iclr', 'nips']
549873 ['nips', 'acl']
446477 ['nips', 'iclr']
1157598 ['nips', 'iclr', 'icml']
39600 ['acl', 'nips', 'icml']
708197 ['iclr', 'icml']
1073979 ['acl', 'iclr']
455790 ['nips', 'iclr']
818725 ['nips', 'iclr']
214306 ['acl', 'nips']
500227 ['nips', 'iclr']
833906 ['cvpr', 'icml']


In [612]:
counting_iclr_refs[214306]

[{'text': "\\bibitem[Camacho-Collados et~al.(2015)Camacho-Collados, Pilehvar, and Navigli]{camacho2015framework} Jos{\\'e} Camacho-Collados, Mohammad~Taher Pilehvar, and Roberto Navigli. \\newblock A framework for the construction of monolingual and cross-lingual word similarity datasets. \\newblock In \\emph{ACL (2)}, pp.\\  1--7, 2015.",
  'arxivids': [],
  'bib-seq': 20,
  'seq_id': 214306},
 {'text': '\\bibitem[Zesch \\& Gurevych(2006)Zesch and Gurevych]{zesch2006automatically} Torsten Zesch and Iryna Gurevych. \\newblock Automatically creating datasets for measures of semantic relatedness. \\newblock In \\emph{Proceedings of the Workshop on Linguistic Distances}, pp.\\ 16--24. Association for Computational Linguistics, 2006.',
  'arxivids': [],
  'bib-seq': 55,
  'seq_id': 214306},
 {'text': '\\bibitem[Zesch \\& Gurevych(2006)Zesch and Gurevych]{zesch2006automatically} Torsten Zesch and Iryna Gurevych. \\newblock Automatically creating datasets for measures of semantic relatedness

In [614]:
ref_venue_counts, len(counting_iclr_refs)

({'nips': 281,
  'iclr': 256,
  'icml': 219,
  'cvpr': 228,
  'acl': 138,
  'arxiv_ppt': 638},
 2546)

In [595]:
nips + iclr + icml + cvpr + acl + arxiv_ppt

1600

In [619]:
list(rest_for_inspection.items())[0:40]

[(2895,
  ['\\bibitem[\\protect\\citeauthoryear{zhang, zhu, chen, dai, wei, and jiang}{zhang et~al\\mbox{.}}{2017}]% {dblp:journals/corr/zhangzcdwj17} \\bibfield{author}{\\bibinfo{person}{junbei zhang}, \\bibinfo{person}{xiao{-}dan zhu}, \\bibinfo{person}{qian chen}, \\bibinfo{person}{li{-}rong dai}, \\bibinfo{person}{si wei}, {and} \\bibinfo{person}{hui jiang}.} \\bibinfo{year}{2017}\\natexlab{}. \\newblock \\showarticletitle{exploring question understanding and adaptation in neural-network-based question answering}. \\newblock \\bibinfo{journal}{{\\em corr\\/}}  \\bibinfo{volume}{abs/1703.04617} (\\bibinfo{year}{2017}). \\newblock',
   '\\bibitem[\\protect\\citeauthoryear{chen, fisch, weston, and bordes}{chen et~al\\mbox{.}}{2017}]% {dblp:journals/corr/chenfwb17} \\bibfield{author}{\\bibinfo{person}{danqi chen}, \\bibinfo{person}{adam fisch}, \\bibinfo{person}{jason weston}, {and} \\bibinfo{person}{antoine bordes}.} \\bibinfo{year}{2017}\\natexlab{}. \\newblock \\showarticletitle{rea

In [621]:
len(venue_label)

1760

In [626]:
venue_label

{490188: 'arxiv_ppt',
 779070: 'nips',
 385142: 'cvpr',
 283317: 'iclr',
 1147022: 'nips',
 396737: 'iclr',
 1082720: 'nips',
 861194: 'arxiv_ppt',
 888571: 'arxiv_ppt',
 190284: 'icml',
 1140292: 'iclr',
 513240: 'nips',
 309772: 'iclr',
 596867: 'iclr',
 907099: 'icml',
 930686: 'acl',
 1044012: 'nips',
 539819: 'acl',
 154986: 'arxiv_ppt',
 1107219: 'arxiv_ppt',
 408575: 'iclr',
 985741: 'arxiv_ppt',
 81294: 'acl',
 809115: 'arxiv_ppt',
 416925: 'acl',
 928211: 'arxiv_ppt',
 574288: 'acl',
 1014228: 'acl',
 88529: 'acl',
 143084: 'arxiv_ppt',
 920951: 'acl',
 635748: 'cvpr',
 38666: 'iclr',
 730027: 'icml',
 760983: 'icml',
 774827: 'acl',
 406679: 'acl',
 818442: 'arxiv_ppt',
 678460: 'arxiv_ppt',
 404528: 'iclr',
 116985: 'iclr',
 766849: 'iclr',
 88735: 'nips',
 959271: 'arxiv_ppt',
 818850: 'iclr',
 172080: 'icml',
 903195: 'icml',
 201733: 'arxiv_ppt',
 551035: 'icml',
 563087: 'arxiv_ppt',
 408784: 'cvpr',
 986408: 'acl',
 802248: 'cvpr',
 351057: 'cvpr',
 692073: 'icml',
 762

In [628]:
with open("graph1/node_venue_labels.pkl", "wb") as f:
    pickle.dump(venue_label, f)

# Strict Graph Construction

In [641]:
strict_graph = defaultdict(set)

In [642]:
strict_counting_iclr_refs = defaultdict(list)
sanity_count = 0

In [643]:
# DONE = False
found_in_paper_not_arxiv = 0
fipna_pid = []
bib_seq_not_found = []

strict_iclr_yearwise_graph_info = {}
strict_reverse_iclr_yearwise_graph_info = {}

for k in leaderboard_refs:
    title = iclr_arxiv_map[k]["title"].lower()
    unaccented_title = unidecode.unidecode(title)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title in paper_global_id_dict:
        self_global_id = paper_global_id_dict[clean_title]
    else:
        continue
        paper_global_id_dict[clean_title] = len(paper_global_id_dict)
        self_global_id = paper_global_id_dict[clean_title]
    
    strict_iclr_yearwise_graph_info[k] = self_global_id
    strict_reverse_iclr_yearwise_graph_info[self_global_id] = k
    
    try:
        if leaderboard_refs[k]["count"] > 0 and k in paper_bbl_dict: #k.startswith("2017"):# or k.startswith("2018"):
            
            ldb_abl_keys = []
            if "ldb" in leaderboard_refs[k]['refs'] and leaderboard_refs[k]['refs']["ldb"]:
                ldb_abl_keys.append("ldb")
            if "abl" in leaderboard_refs[k]['refs'] and leaderboard_refs[k]['refs']["abl"]:
                ldb_abl_keys.append("abl")
            
            for ldbabl_ref_key in ldb_abl_keys:
                for ldb_ref in leaderboard_refs[k]['refs'][ldbabl_ref_key]:
                    m = re.search("bib[a]?-bib([0-9][0-9]?)", ldb_ref)
                    if m:
                        seq_key = int(m.group(1))
                        if k == "2017_BJlxmAKlg":
                            seq_key = seq_key - 1
                        found = False
                        partially_found = False
                        for paper_bib_entry in paper_bbl_dict[k].values():
                            if "bib-seq" in paper_bib_entry and paper_bib_entry["bib-seq"] == seq_key:
                                if "strict_seq_id" in paper_bib_entry:
                                    found = True
                                    strict_graph[self_global_id].add(paper_bib_entry["strict_seq_id"])
                                    sanity_count += 1
                                    strict_counting_iclr_refs[paper_bib_entry["strict_seq_id"]].append(paper_bib_entry)
                                else:
                                    partially_found = True
                                    found_in_paper_not_arxiv += 1
                        if not found and not partially_found:
                            bib_seq_not_found.append((k, ldb_ref))
                    else:
                        print("insucc bib re match: ", k, ldb_ref)
            if not self_global_id in graph_edges:
                fipna_pid.append(k)
    except Exception as ex:
        print("Error: ", k, ex)

insucc bib re match:  2017_BycCx8qex <ref id='S1-F2'>2</ref>
insucc bib re match:  2017_HJ0UKP9ge <ref id='S2-E2'>2</ref>
insucc bib re match:  2017_HJ0UKP9ge <ref id='S2-E1'>1</ref>
insucc bib re match:  2017_HkcdHtqlx <ref id='S3-E5'>5</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS4'>5.4</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS2'>5.2</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS3'>5.3</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E7'>7</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E8'>8</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E5'>5</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E6'>6</ref>
insucc bib re match:  2017_SkxKPDv5xl <ref id='S2-E2'>2</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F2'>2</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S2-F1'>1</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F3'>3</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F2'>2</ref>
insucc bib re 

In [644]:
found_in_paper_not_arxiv_p2 = []

for k,v in full_text_refs_dict.items():
    
    title = iclr_arxiv_map[k]["title"].lower()
    unaccented_title = unidecode.unidecode(title)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title in paper_global_id_dict:
        self_global_id = paper_global_id_dict[clean_title]
    else:
        continue
        paper_global_id_dict[clean_title] = len(paper_global_id_dict)
        self_global_id = paper_global_id_dict[clean_title]
    
    strict_iclr_yearwise_graph_info[k] = self_global_id
    strict_reverse_iclr_yearwise_graph_info[self_global_id] = k
    
    for i in v:
        m = re.search("\\\cite[ptyear\[\]\*]*?\{([^\{]*)\}", i)
        cits = []
        if m:
            p = m.group(1)
            cits = p.split(",")
        if k in paper_bbl_dict:
            for cit_key in cits:
                if cit_key in paper_bbl_dict[k]:
                    if "strict_seq_id" in paper_bbl_dict[k][cit_key]:
                        strict_graph[self_global_id].add(paper_bbl_dict[k][cit_key]["strict_seq_id"])
                        strict_counting_iclr_refs[paper_bbl_dict[k][cit_key]["strict_seq_id"]].append(paper_bbl_dict[k][cit_key])
                        sanity_count +=1
                    else:
                        found_in_paper_not_arxiv_p2.append(paper_bbl_dict[k][cit_key]["text"])
    if not self_global_id in strict_graph:
        print(k)

2017_B1IzH7cxl
2017_B1YfAfcgl
2017_B1gtu5ilg
2017_BJK3Xasel
2017_BJYwwY9ll
2017_BJh6Ztuxl
2017_BJwFrvOeg
2017_Bk8N0RLxx
2017_Bkepl7cee
2017_By5e2L9gl
2017_HJSCGD9ex
2017_HJpfMIFll
2017_HJy_5Mcll
2017_HkNRsU5ge
2017_Hkg4TI9xl
2017_HkwoSDPgg
2017_Hy-lMNqex
2017_HyY4Owjll
2017_S13wCE9xx
2017_Sk2iistgg
2017_SkgewU5ll
2017_SkyQWDcex
2017_r1BJLw9ex
2017_r1G4z8cge
2017_r1kGbydxg
2017_r1te3Fqel
2017_r1y1aawlg
2017_rJJ3YU5ge
2017_rkFBJv9gg
2017_rky3QW9le
2017_ryuxYmvel
2018_B1ZZTfZAW
2018_B1jscMbAW
2018_B1l8BtlCb
2018_B1spAqUp-
2018_B1ydPgTpW
2018_BJ6anzb0Z
2018_BJvWjcgAZ
2018_BkUp6GZRW
2018_ByOnmlWC-
2018_ByS1VpgRZ
2018_ByqFhGZCW
2018_H113pWZRb
2018_HJ4IhxZAb
2018_HJcSzz-CZ
2018_Hk5elxbRW
2018_HkOhuyA6-
2018_HkXWCMbRW
2018_Hksj2WWAW
2018_HkxF5RgC-
2018_HyRVBzap-
2018_HyUNwulC-
2018_HymYLebCb
2018_HytSvlWRZ
2018_S14EogZAZ
2018_S1ANxQW0b
2018_S1GUgxgCW
2018_S1XolQbRW
2018_S1m6h21Cb
2018_SJCq_fZ0Z
2018_SJFM0ZWCb
2018_SJw03ceRW
2018_SJyEH91A-
2018_Sy8XvGb0-
2018_SyELrEeAb
2018_SyqShMZRb
2018_rJFOp

In [646]:
full_text_refs_dict["2017_Hkg4TI9xl"]

[') against each other. The baseline detector has an AUPR approximately equal to the precision \\citep{auprba',
 ') against each other. The baseline detector has an AUPR approximately equal to the precision \\citep{auprba',
 'as does the Area Under the Precision-Recall curve (AUPR) which is sometimes deemed more informative \\citep',
 'as does the Area Under the Precision-Recall curve (AUPR) which is sometimes deemed more informative \\citep',
 'e informative \\citep{manning}. This is because the AUROC is not ideal when the positive class and negative ',
 'e informative \\citep{manning}. This is because the AUROC is not ideal when the positive class and negative ']

In [645]:
len(strict_graph)

1193

In [648]:
st_uniq_nodes = set()
st_ref_nodes = set()

for k, v in strict_graph.items():
#     print(k, v)
#     break
    st_uniq_nodes.add(k)
    for i in v:
        st_ref_nodes.add(i)
        st_uniq_nodes.add(i)
st_uniq_nodes = set(st_uniq_nodes)
st_ref_nodes = set(st_ref_nodes)
print(len(st_uniq_nodes))
print(len(st_ref_nodes))

3408
2418


In [649]:
st_edges_count = []
for k, v in strict_graph.items():
    for i in v:
        st_edges_count.append(i)
print(len(st_edges_count))

5370


In [653]:
with open("STRICT_GRAPH_0.pkl", "wb") as f:
    pickle.dump(strict_graph, f)

In [651]:
with open('strict_iclr_arxiv_gid_map.pkl', "wb") as f:
    pickle.dump(strict_iclr_yearwise_graph_info, f)

In [652]:
with open('strict_reverse_iclr_arxiv_gid_map.pkl', "wb") as f:
    pickle.dump(strict_reverse_iclr_yearwise_graph_info, f)

# Analyzing venues of type1 nodes

In [673]:
ref_venue_counts ={}

In [707]:
strict_ref_venue_counts = {k: 0 for k in ["nips", "iclr", "icml", "cvpr", "acl", "arxiv_ppt", "aaai"]}
rest_for_inspection = defaultdict(list)
multiple_venue_inspection = defaultdict(list)

venue_label = {}

for i in strict_counting_iclr_refs:
    local_finder = []
    for one_ref in strict_counting_iclr_refs[i]:
        
        ref_content = one_ref['text'].lower()
        if ref_content.find("neural information processing systems") > -1 or ref_content.find("nips") > -1 or ref_content.find("{neurips}") > -1:
            local_finder.append("nips")
        elif ref_content.find("{aaai}") > -1:
            local_finder.append("aaai")
            
        elif ref_content.find("iclr") > -1 or ref_content.find("international conference on learning representations") > -1 or ref_content.find("conference on learning representation") > -1:
            local_finder.append("iclr")
        elif ref_content.find("icml") > -1 or ref_content.find("international conference on machine learning") > -1:
            local_finder.append("icml")
        elif ref_content.find("cvpr") > -1 or ref_content.find("computer vision and pattern recognition") > -1 or ref_content.find("international conference on computer vision") > -1 or ref_content.find("european conference on computer vision") > -1 or ref_content.find("{eccv}") > -1 or ref_content.find("{iccv}") > -1 or ref_content.find("\\emph{iccv ") > -1 or ref_content.find("\\emph{eccv ") > -1:
            local_finder.append("cvpr")
        elif ref_content.find("arxiv preprint") > -1 or ref_content.find("{corr}") > -1 or ref_content.find("{arxiv}") > -1 or ref_content.find("arxiv e-prints") > -1 or ref_content.find("{arxiv.org}") > -1 or ref_content.find("\\emph{arxiv:") > -1 or ref_content.find("\\emph{arxiv ") > -1 or ref_content.find("") > -1:
            local_finder.append("arxiv_ppt")
        elif ref_content.find("association for computational linguistics") > -1 or ref_content.find(" acl ") > -1 or ref_content.find(" acl}") > -1 or ref_content.find("{acl}") > -1 or ref_content.find("{acl ") > -1 or ref_content.find("{naacl-hlt}") > -1 or  ref_content.find("{naacl}") > -1 or ref_content.find("emnlp") > -1 or ref_content.find("empirical methods in natural language processing") > -1 or ref_content.find("{eacl}") > -1:
            local_finder.append("acl")
        else:
            rest_for_inspection[i].append(ref_content)
        multiple_venue_inspection[i].append(one_ref)
    local_finder = list(set(local_finder))
    if len(local_finder) == 0:
        rest_for_inspection[i].append(ref_content)
    else:
        if len(local_finder) == 1:
            strict_ref_venue_counts[local_finder[0]] += 1
            venue_label[i] = local_finder[0]
        else:
            if "arxiv_ppt" in local_finder:
                local_finder.remove("arxiv_ppt")
            if len(local_finder) == 1:
                strict_ref_venue_counts[local_finder[0]] += 1
                venue_label[i] = local_finder[0]
            else:
                print(i, local_finder)#, list(set(rest_for_inspection[i])))

111966 ['iclr', 'icml']
708197 ['iclr', 'icml']
1073979 ['acl', 'iclr']
500227 ['nips', 'iclr']
1151032 ['acl', 'iclr']
833906 ['cvpr', 'icml']


In [696]:
print(len(venue_label))

1977


In [747]:
len(strict_counting_iclr_refs), sum(v for v in strict_ref_venue_counts.values()), strict_ref_venue_counts

(2418,
 2191,
 {'nas': 1,
  'nips': 284,
  'iclr': 250,
  'icml': 237,
  'cvpr': 356,
  'acl': 189,
  'arxiv_ppt': 767,
  'aaai': 60,
  'acm': 22,
  'ieee': 25})

In [708]:
counting_iclr_refs[419660]

[{'text': '\\bibitem[Grave et~al.(2016)Grave, Joulin, and Usunier]{grave2016improving} Edouard Grave, Armand Joulin, and Nicolas Usunier. \\newblock Improving neural language models with a continuous cache. \\newblock \\emph{arXiv preprint arXiv:1612.04426}, 2016.',
  'arxivids': [],
  'bib-seq': 13,
  'seq_id': 419660,
  'strict_seq_id': 419660},
 {'text': '\\bibitem[Grave et~al.(2016)Grave, Joulin, and Usunier]{Grave2016} Grave, E., Joulin, A., and Usunier, N. \\newblock Improving neural language models with a continuous cache. \\newblock \\emph{arXiv preprint arXiv:1612.04426}, 2016.',
  'arxivids': [],
  'bib-seq': 11,
  'seq_id': 419660,
  'strict_seq_id': 419660},
 {'text': '\\bibitem[Grave et~al.(2016)Grave, Joulin, and Usunier]{Grave2016} Grave, E., Joulin, A., and Usunier, N. \\newblock Improving neural language models with a continuous cache. \\newblock \\emph{arXiv preprint arXiv:1612.04426}, 2016.',
  'arxivids': [],
  'bib-seq': 11,
  'seq_id': 419660,
  'strict_seq_id': 4

111966 ['iclr', 'icml']  -> ICML
708197 ['iclr', 'icml']  -> ICLR 2018
1073979 ['acl', 'iclr']  -> EMNLP 2014
500227 ['nips', 'iclr']  -> ICLR 2018
1151032 ['acl', 'iclr']  -> EMNLP W 2018
833906 ['cvpr', 'icml']  -> ICML 2017

In [742]:
rest_for_inspection

defaultdict(list,
            {779070: ['\\bibitem[vinyals et~al.(2016)vinyals, blundell, lillicrap, kavukcuoglu, and wierstra]{vinyals16nips} oriol vinyals, charles blundell, timothy lillicrap, koray kavukcuoglu, and daan wierstra. \\newblock matching networks for one shot learning. \\newblock 2016.'],
             1082720: ['\\bibitem[sukhbaatar {\\em et~al.}(2015)sukhbaatar, szlam, weston, and fergus]{sukhbaatar2015end} sukhbaatar, s., szlam, a., weston, j., and fergus, r. (2015). \\newblock end-to-end memory networks. \\newblock {\\em proceedings of nips\\/}.'],
             888571: ['\\bibitem[zaremba et~al.(2014)zaremba, sutskever, and vinyals]{zaremba2014recurrent} wojciech zaremba, ilya sutskever, and oriol vinyals. \\newblock recurrent neural network regularization. \\newblock 2014.'],
             1044012: ['\\bibitem[kiros et~al.(2015)kiros, zhu, salakhutdinov, zemel, torralba, urtasun, and fidler]{kiros2015} ryan kiros, yukun zhu, ruslan salakhutdinov, richard~s. zemel, ant

In [669]:
multiple_venue_inspection[0]

[]

In [None]:

international society for music information retrieval conference
international conference on artificial intelligence and statistics
conference on machine translation (wmt)

{\\em corr}


In [745]:
strict_ref_venue_counts = {k: 0 for k in ["nas", "nips", "iclr", "icml", "cvpr", "acl", "arxiv_ppt", "aaai", "acm", "ieee"]}
rest_for_inspection = defaultdict(list)
multiple_venue_inspection = defaultdict(list)

venue_label = {}

for i in strict_counting_iclr_refs:
    local_finder = []
    for one_ref in strict_counting_iclr_refs[i]:
        
        ref_content = one_ref['text'].lower()
        
        if re.search("neural information processing systems", ref_content) or re.search("[ {\(]neurips[ }\),]", ref_content) or re.search("[ {\(]nips[ }\),]", ref_content):
            local_finder.append("nips")
        elif re.search("aaai[,]? conference on artificial intelligence", ref_content) or re.search("journal of artificial intelligence research", ref_content) or re.search("association for the advancement of artificial intelligence", ref_content) or re.search("international joint conference on artificial intelligence", ref_content) or re.search("[ {\(]aaai[ }\),]", ref_content) or re.search("[ {\(]ijcai[ }\),]", ref_content):
            local_finder.append("aaai")
        elif re.search("[ {\(]iclr[ }\),]", ref_content) or re.search("international conference on learning representations", ref_content) or re.search("conference on learning representation", ref_content):
            local_finder.append("iclr")
        elif re.search("international conference on machine learning", ref_content) or re.search("journal of machine learning research", ref_content) or re.search("[ {\(]icml[ }\),]", ref_content) or re.search("[ {\(]jmlr[ }\),]", ref_content):
            local_finder.append("icml")
        elif re.search("computer vision and pattern recognition", ref_content) or re.search("international conference on computer vision", ref_content) or re.search("european conference on computer vision", ref_content) or re.search("international journal of computer vision", ref_content) or re.search("[ {\(]cvpr[ }\),]", ref_content) or re.search("[ {\(]iccv[ }\),]", ref_content) or re.search("[ {\(]eccv[ }\),]", ref_content) or re.search("[ {\(]ijcv[ }\),]", ref_content):
            local_finder.append("cvpr")
        elif re.search("association for computational linguistics", ref_content) or re.search("empirical methods in natural language processing", ref_content) or re.search("[ {\(]acl[ }\),]", ref_content) or re.search("[ {\(]naacl[ -]hlt[ }\),]", ref_content) or re.search("[ {\(]naacl[ }\),]", ref_content) or re.search("[ {\(]emnlp[ }\),]", ref_content) or re.search("[ {\(]eacl[ }\),]", ref_content) or re.search("[ {\(]tacl[ }\),]", ref_content) or re.search("[ {\(]conll[ }\),]", ref_content):
            local_finder.append("acl")
        elif re.search("http://doi.acm.org/", ref_content) or re.search("international conference on knowledge discovery and data mining", ref_content) or re.search("[ {\(]sigkdd[ }\),]", ref_content) or re.search("[ {\(]sigir[ }\),']", ref_content):
            local_finder.append("acm")
        elif re.search("ieee transactions o[nf]", ref_content) or re.search("ieee symposium on", ref_content):# or re.search("[ {\(]ieee[ }\),]", ref_content):
            local_finder.append("ieee")
        elif re.search("national academy of science", ref_content):
            local_finder.append("nas")
        elif re.search("arxiv preprint", ref_content) or re.search("arxiv e-prints", ref_content) or re.search("[ {\(]corr[ }\),]", ref_content) or re.search("[ {\(]arxiv[: }\),]", ref_content) or re.search("[ {\(]arxiv\.org[ }\),]", ref_content) or re.search("[ {\(]https://arxiv\.org/[ }\),]", ref_content):
            local_finder.append("arxiv_ppt")
        else:
            rest_for_inspection[i].append(ref_content)
        multiple_venue_inspection[i].append(one_ref)
    local_finder = list(set(local_finder))
    if len(local_finder) == 0:
        rest_for_inspection[i].append(ref_content)
    else:
        if len(local_finder) == 1:
            strict_ref_venue_counts[local_finder[0]] += 1
            venue_label[i] = local_finder[0]
        else:
            if "arxiv_ppt" in local_finder:
                local_finder.remove("arxiv_ppt")
            if len(local_finder) == 1:
                strict_ref_venue_counts[local_finder[0]] += 1
                venue_label[i] = local_finder[0]
            else:
                print(i, local_finder)#, list(set(rest_for_inspection[i])))

111966 ['iclr', 'icml']
708197 ['iclr', 'icml']
1073979 ['acl', 'iclr']
658122 ['ieee', 'nips']
500227 ['nips', 'iclr']
1151032 ['acl', 'iclr']
833906 ['cvpr', 'icml']


In [751]:
strict_counting_iclr_refs[111966]

[{'text': '\\bibitem[Zilly et~al.(2016)Zilly, Srivastava, Koutn{\\\'\\i}k, and Schmidhuber]{zilly2016recurrent} Julian~Georg Zilly, Rupesh~Kumar Srivastava, Jan Koutn{\\\'\\i}k, and J{\\"u}rgen Schmidhuber. \\newblock Recurrent highway networks. \\newblock \\emph{arXiv preprint arXiv:1607.03474}, 2016.',
  'arxivids': [],
  'bib-seq': 50,
  'seq_id': 111966,
  'strict_seq_id': 111966},
 {'text': '\\bibitem[Zilly et~al.(2016)Zilly, Srivastava, Koutn{\\\'\\i}k, and Schmidhuber]{Zilly2016} Zilly, Julian~Georg, Srivastava, Rupesh~Kumar, Koutn{\\\'\\i}k, Jan, and Schmidhuber, J{\\"u}rgen. \\newblock {Recurrent Highway Networks}. \\newblock \\emph{arXiv preprint arXiv:1607.03474}, 2016.',
  'arxivids': [],
  'bib-seq': 27,
  'seq_id': 111966,
  'strict_seq_id': 111966},
 {'text': '\\bibitem[Zilly et~al.(2016)Zilly, Srivastava, Koutn{\\\'\\i}k, and Schmidhuber]{zilly2016recurrent} Julian~Georg Zilly, Rupesh~Kumar Srivastava, Jan Koutn{\\\'\\i}k, and J{\\"u}rgen Schmidhuber. \\newblock Recurre

In [746]:
strict_ref_venue_counts

{'nas': 1,
 'nips': 284,
 'iclr': 250,
 'icml': 237,
 'cvpr': 356,
 'acl': 189,
 'arxiv_ppt': 767,
 'aaai': 60,
 'acm': 22,
 'ieee': 25}

In [735]:
strict_ref_venue_counts

{'nas': 1,
 'nips': 284,
 'iclr': 250,
 'icml': 237,
 'cvpr': 353,
 'acl': 186,
 'arxiv_ppt': 768,
 'aaai': 54,
 'acm': 22,
 'ieee': 54}

In [741]:
print(len(venue_label))

2181


In [None]:
acl, naacl, emnlp, eacl, conll
cvpr, iccv, eccv, ijcv
nips
aaai, ijcai, jair
iclr
icml, jmlr
nas
acm, sigir, sigkdd
ieee transactions/symposium

In [750]:
with open("graph1/STRICT_node_venue_labels.pkl", "wb") as f:
    pickle.dump(venue_label, f)