In [133]:
# File utils
import json
import pickle
import glob

# Preprocessing utils
import unidecode
import re
from collections import defaultdict
import editdistance

# Debugging
from IPython.core.debugger import Tracer

In [132]:
!pip install editdistance

Collecting editdistance
  Using cached editdistance-0.5.3-cp37-cp37m-manylinux1_x86_64.whl (179 kB)
Installing collected packages: editdistance
Successfully installed editdistance-0.5.3


In [3]:
from axcell.helpers.datasets import read_arxiv_papers

# Read all paper titles and create a hashtable

##### a. Read axcell paper data

In [4]:
arxiv_axcell = read_arxiv_papers("../../data/arxiv-papers.csv.xz")
arxiv_axcell.head(2)

Unnamed: 0,arxiv_id,archive_size,sha256,title,sections,tables,status
0,0704.0004v1,9486,83b5c83d0963d796ed61fae5ed47cac55d2c942d41e03f...,A determinant of Stirling cycle numbers counts...,1,0,success
1,0704.0010v1,45695,6dd40a2af3e336e0a8e94a5a20a1075819af829f1fcef7...,"Partial cubes: structures, characterizations, ...",0,0,no-tex


In [5]:
yearwise_axcell = defaultdict(int)

for aid in arxiv_axcell["arxiv_id"]:
    yearwise_axcell[aid[0:2]] += 1

In [6]:
yearwise_axcell

defaultdict(int,
            {'07': 48,
             '08': 87,
             '09': 122,
             '10': 189,
             '11': 298,
             '12': 652,
             '13': 3575,
             '14': 5193,
             '15': 6975,
             '16': 10845,
             '17': 16177,
             '18': 24760,
             '19': 35144,
             '20': 645})

##### b. Read full arxiv data from the ECIR leaderboard paper 

In [7]:
with open("/home/singh_shruti/data/arxiv/All_Title_Dump.json", "r") as f:
    arxiv_ecir = json.load(f)

In [8]:
print(len(arxiv_ecir), list(arxiv_ecir.items())[0:2])

1152195 [('1405.1888', 'One positive and two negative results for derived categories of\n  algebraic stacks'), ('nucl-th_0512069', 'Helicity amplitudes and electromagnetic decays of strange baryon\n  resonances')]


In [9]:
yearwise_ecir = defaultdict(int)

for aid in arxiv_ecir:
    yearwise_ecir[aid[0:2]] += 1

In [10]:
list(arxiv_ecir.keys())[0:5]

['1405.1888', 'nucl-th_0512069', '1504.05139', '1504.05138', '1112.0655']

In [11]:
sorted(yearwise_ecir.items(), key=lambda x: x[0])

[('07', 30196),
 ('08', 43896),
 ('09', 59045),
 ('10', 62827),
 ('11', 67544),
 ('12', 75470),
 ('13', 81528),
 ('14', 83613),
 ('15', 89883),
 ('16', 97725),
 ('17', 107288),
 ('18', 34326),
 ('ac', 2),
 ('ad', 304),
 ('al', 1164),
 ('ao', 13),
 ('as', 34667),
 ('at', 67),
 ('ba', 11),
 ('ch', 1881),
 ('cm', 892),
 ('co', 67679),
 ('cs', 7090),
 ('dg', 543),
 ('fu', 257),
 ('gr', 16048),
 ('he', 77200),
 ('ma', 53208),
 ('mt', 151),
 ('nl', 5175),
 ('nu', 14511),
 ('pa', 440),
 ('ph', 15248),
 ('pl', 28),
 ('q-', 3120),
 ('qu', 18320),
 ('so', 767),
 ('su', 68)]

##### c. Collate both data

Steps:
1. Decode all accented chars from title
2. Clean all non alpha-numeric chars from the title
3. Collate titles from both dataset and add both ids

In [385]:
paper_id_titles = defaultdict(list)

for aid, title in arxiv_ecir.items():
    title_lower = title.lower()
    unaccented_title = unidecode.unidecode(title_lower)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title:
        paper_id_titles[clean_title].append("e_" + aid)
    else:
        print(title)

for entry in arxiv_axcell[["arxiv_id", "title"]].iterrows():
    aid = entry[1].arxiv_id
    title_lower = entry[1].title.lower()
    
    unaccented_title = unidecode.unidecode(title_lower)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title:
        paper_id_titles[clean_title].append("a_" + aid)
    else:
        print(title)

In [386]:
paper_global_id_dict = {}

for seqid, k in enumerate(paper_id_titles):
    paper_global_id_dict[k] = seqid

In [391]:
with open("IMP_paper_global_id_dict.pkl", "wb") as f:
    pickle.dump(paper_global_id_dict, f)

with open("IMP_paper_id_titles.pkl", "wb") as f:
    pickle.dump(paper_id_titles, f)

In [387]:
#latest after lowercasing titles which I catastrophically forgot to do :/
len(paper_id_titles), len(paper_global_id_dict)

(1205268, 1205268)

In [388]:
len(paper_id_titles), len(paper_global_id_dict)

(1205268, 1205268)

In [389]:
ecir_dups = 0
axcell_dups = 0

for compressed_title, id_list in paper_id_titles.items():
    if len(id_list) > 1:
        
        dups_e = list(filter(lambda x: x.startswith('e_'), id_list))
        if dups_e and len(dups_e) > 1:
            ecir_dups +=1
            #print(compressed_title, id_list, arxiv_ecir[dups_e[0][2:]], arxiv_ecir[dups_e[1][2:]])
        
        dups_a = list(filter(lambda x: x.startswith('a_'), id_list))
        if dups_a and len(dups_a) > 1:
            axcell_dups +=1
        
print("Duplicate counts:\nECIR: {}\nAXCELL:{}".format(ecir_dups, axcell_dups))

Duplicate counts:
ECIR: 2817
AXCELL:256


In [99]:
s=0
for i in ttttt:
#     if "count" in ttttt[i] and ttttt[i]["count"] > 0:
#         s += 1
    if "refs" in ttttt[i] and "abl" in ttttt[i]["refs"] and ttttt[i]["refs"]["abl"]:
        s+=1
print(s)

170


# Read the 3-way split refs data

##### a. SPLIT_1: Table cell refs

In [15]:
with open("leaderboard_table_refs.pkl", "rb") as f:
    leaderboard_table_refs = pickle.load(f)

In [190]:
with open("new_leaderboard_refs.pkl", "rb") as f:
    leaderboard_refs = pickle.load(f)

In [17]:
with open("TABLE_Label_predicted.pkl", "rb") as f:
    collated_table_labels = pickle.load(f)

In [18]:
len(leaderboard_table_refs)

1988

##### b. SPLIT_2: Table caption refs

In [19]:
with open("refs_table_captions.pkl", "rb") as f:
    table_caption_refs = pickle.load(f)

##### c. SPLIT_3: Full text refs

In [296]:
with open("refs_full_text.pkl", "rb") as f:
    full_text_refs_dict = pickle.load(f)

# Analyse refs and papers loaded from 3 sources

In [293]:
for k,v in leaderboard_refs.items():
    print(k, v)
    break

table_cell_refs_keys = list(leaderboard_refs.keys())
print("\n", len(table_cell_refs_keys))

non_zero = 0
for k,v in leaderboard_refs.items():
    if v["count"] > 0:
        non_zero += 1
print(non_zero, len(table_cell_refs_keys)-non_zero)

2017_B1-Hhnslg {'refs': {'ldb': ["<ref id='bib-bib17'>2016</ref>", "<ref id='bib-bib1'>2013</ref>", "<ref id='bib-bib23'>2016</ref>", "<ref id='bib-bib29'>2016</ref>", "<ref id='bib-bib2'>2015</ref>", "<ref id='bib-bib22'>2017</ref>", "<ref id='bib-bib6'>2017</ref>"], 'abl': []}, 'count': 12}

 1718
419 1299


In [297]:
for k,v in table_caption_refs.items():
    print(k, v)
    break
    
table_cap_refs_keys = list(table_caption_refs.keys())
print("\n", len(table_cap_refs_keys))

2017_SJDaqqveg [(1, 'Table 2: Our IWSLT 2014 machine translation results with a convolutional encoder compared to the previous work by Ranzato et al. Please see 1 for an explanation of abbreviations. The asterisk identifies results from (Ranzato et\xa0al., 2015). The numbers reported with ≤ were approximately read from Figure 6 of (Ranzato et\xa0al., 2015)')]

 45


In [298]:
for k,v in full_text_refs_dict.items():
    print(k, v[0:12])
    break
    
ft_refs_keys = list(full_text_refs_dict.keys())
print("\n", len(ft_refs_keys))

2017_B1-q5Pqxl ['on the development data and that outperformed the DCR model~\\citep{Yu2015rank:arxiv}, which also introd', ' on the development data and that outperformed the DCR model~\\citep{Yu2015rank:arxiv}, which also introdu']

 1088


In [291]:
len(set(table_cell_refs_keys).intersection(set(ft_refs_keys)))

1136

# Read all references of 2504 papers

In [22]:
with open("/home/singh_shruti/workspace/PaperAcceptancePrediction/shruti/features/iclr_arxiv_map.pkl", "rb") as f:
    iclr_arxiv_map = pickle.load(f)

In [348]:
bbl_absent = []
bbl_multiple = []
bbl_unique = []
not_bib = []

exceptions_ids = []

paper_bbl_dict = {}

In [349]:
# DO THIS WITH CAUTION: THIS WILL REQUIRE THE TITLS TO BE DISAMBIGUATED AGAIN WHICH TAKES AROUND 4-5 HOURS

for k in leaderboard_table_refs:
    arxivid = iclr_arxiv_map[k]["arxivId"]
    
    dir_name = arxivid.split(".")[0]
    
    # Read the bbl file to extract references
    potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.bbl".format(dir_name, arxivid))
    all_tex_in_dir_for_multiple_bbl = potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.tex".format(dir_name, arxivid))

    if len(potential_bbl_files) >= 1:
        
        annotate_seq = False
        bib_item_seq_number = None
        
        if len(potential_bbl_files) > 1:
#             read_texts_bib_all = []
            
#             NEW_potential_bbl_files = []
#             for bbbbbb in potential_bbl_files:
#                 only_first_name = bbbbbb.replace(".bbl", ".tex")
#                 if only_first_name in all_tex_in_dir_for_multiple_bbl:
#                     NEW_potential_bbl_files.append(bbbbbb)
#             potential_bbl_files = NEW_potential_bbl_files
            
            for bbbbbb in potential_bbl_files:
                with open(bibfileiter, "r", errors="ignore") as f:
                    read_texts_bib_all.append(f.read())
            same_text = False
            for iter_, text_iter in enumerate(read_texts_bib_all):
                if iter_ < len(read_texts_bib_all)-1:
                    if text_iter.lower() == read_texts_bib_all[iter_+1].lower():
                        same_text = True
                    else:
                        same_text = False
                        break
                else:
                    break
            if same_text:
                potential_bbl_files = [potential_bbl_files[0]]
        
        
        if len(potential_bbl_files) == 1:
            annotate_seq = True
            bib_item_seq_number = 1
        
        try:
            paper_bibitems = {}
            
            for bibfileiter in potential_bbl_files:
                with open(bibfileiter, "r", errors="ignore") as f:
                    lines = f.readlines()
                
                bibitem = []
                start_looking = True
                start_appending = False
                
                for l in lines:
                    if start_looking and (l.startswith("\\bibitem[") or l.startswith("\\bibitem{")):
                        start_looking = False
                        start_appending = True
                        bibitem = [l.strip().replace("\n", " ")]
                        continue
                    if start_appending:
                        if l == "\n":
                            bib_text = " ".join(bibitem)
                            cit_key = None
                            
                            if bib_text.startswith("\\bibitem["):
                                if bib_text.startswith("\\bibitem[\\protect\\citeauthoryear"):
                                    cit_key = None
                                    m = re.search('\\\\bibitem\[.*?\]%? ?({.*?})', bib_text)
                                    if m:
                                        cit_key = m.group(1)[1:-1]
                                    else:
                                        print("NOT FOUND CIT KEY[citauthor]: ", bibfileiter, bib_text)
                                else:
                                    m = re.search('\\\\bibitem\[.*?\]({.*?})', bib_text)
                                    if m:
                                        cit_key = m.group(1)[1:-1]
                                    else:
                                        print("NOT FOUND CIT KEY[]: ", bibfileiter, bib_text)
                                    
                            if bib_text.startswith("\\bibitem{"):
                                m = re.search('\\\\bibitem({.*?})', bib_text)
                                if m:
                                    cit_key = m.group(1)[1:-1]
                                else:
                                    print("NOT FOUND CIT KEY{}: ", bibfileiter, bib_text)
                            
                            if not cit_key is None:
                                if bib_item_seq_number:
                                    paper_bibitems[cit_key] = {"text": bib_text, "arxivids": [], "bib-seq": bib_item_seq_number}
                                else:
                                    paper_bibitems[cit_key] = {"text": bib_text, "arxivids": []}

                            bibitem = []
                            start_looking = True
                            start_appending = False
                            if bib_item_seq_number:
                                bib_item_seq_number += 1
                        else:
                            bibitem.append(l.strip().replace("\n", " "))
            
            if paper_bibitems:
                paper_bbl_dict[k] = paper_bibitems
                
        except Exception as ex:
            print(k, ex)
            exceptions_ids.append(k)
        
    elif len(potential_bbl_files) == 0:
        bbl_absent.append(k)

In [488]:
count_multiple_bbl_files_paper = 0
reduced_count_multiple_bbl_files_paper = 0

for k in leaderboard_table_refs:
    arxivid = iclr_arxiv_map[k]["arxivId"]
    
    dir_name = arxivid.split(".")[0]
    
    # Read the bbl file to extract references
    potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.bbl".format(dir_name, arxivid))
    all_tex_in_dir_for_multiple_bbl = potential_bbl_files = glob.glob("./data/unpacked_sources/{}/{}/*.tex".format(dir_name, arxivid))

    if len(potential_bbl_files) >= 1:
        
        annotate_seq = False
        bib_item_seq_number = None
        
        if len(potential_bbl_files) > 1:
            count_multiple_bbl_files_paper+=1
#             read_texts_bib_all = []
            
            NEW_potential_bbl_files = []
            for bbbbbb in potential_bbl_files:
                only_first_name = bbbbbb.replace(".bbl", ".tex")
                if only_first_name in all_tex_in_dir_for_multiple_bbl:
                    NEW_potential_bbl_files.append(bbbbbb)
            potential_bbl_files = NEW_potential_bbl_files
            
            for bbbbbb in potential_bbl_files:
                with open(bibfileiter, "r", errors="ignore") as f:
                    read_texts_bib_all.append(f.read())
            same_text = False
            for iter_, text_iter in enumerate(read_texts_bib_all):
                if iter_ < len(read_texts_bib_all)-1:
                    if text_iter.lower() == read_texts_bib_all[iter_+1].lower():
                        same_text = True
                    else:
                        same_text = False
                        break
                else:
                    break
            if same_text:
                potential_bbl_files = [potential_bbl_files[0]]
        if len(potential_bbl_files) == 1:
            reduced_count_multiple_bbl_files_paper+=1


In [489]:
count_multiple_bbl_files_paper, reduced_count_multiple_bbl_files_paper

(1272, 716)

In [350]:
len(set(bbl_absent)), bbl_absent[0:10]

(0, [])

In [351]:
len(paper_bbl_dict)

48

In [347]:
list(paper_bbl_dict.keys())[0:10]

['2017_B1-Hhnslg',
 '2017_B1-q5Pqxl',
 '2017_B184E5qee',
 '2017_B1E7Pwqgl',
 '2017_B1G9tvcgx',
 '2017_B1GOWV5eg',
 '2017_B1Igu2ogg',
 '2017_B1IzH7cxl',
 '2017_B1MRcPclx',
 '2017_B1TTpYKgx']

In [162]:
iclr_arxiv_map["2017_BJYwwY9ll"]["arxivId"]

'1704.00109v1'

In [163]:
 glob.glob("./data/unpacked_sources/1704/1704.00109v1/*.bbl")

['./data/unpacked_sources/1704/1704.00109v1/main.bbl']

In [178]:
pwd

'/home/singh_shruti/workspace/axcell_ws/axcell/notebooks'

In [164]:
cat ./data/unpacked_sources/1704/1704.00109v1/main.bbl

\begin{thebibliography}{43}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Bottou(2010)]{bottou2010large}
L{\'e}on Bottou.
\newblock Large-scale machine learning with stochastic gradient descent.
\newblock In \emph{COMPSTAT}. 2010.

\bibitem[Bucilu�� et~al.(2006)Bucilu��, Caruana, and
  Niculescu-Mizil]{bucilu2006model}
Cristian Bucilu��, Rich Caruana, and Alexandru Niculescu-Mizil.
\newblock Model compression.
\newblock In \emph{KDD}, 2006.

\bibitem[Caruana et~al.(2004)Caruana, Niculescu-Mizil, Crew, and
  Ksikes]{caruana2004ensemble}
Rich Caruana, Alexandru Niculescu-Mizil, Geoff Crew, and Alex Ksikes.
\newblock Ensemble selection from libraries of models.
\newblock In \emph{ICML}, 2004.

\bibitem[Collobert et~al.(2011)Collobert, Kavukcuoglu, and Fara

In [174]:
oiclr_arxiv_map["2018_BkQCGzZ0-"]["arxivId"], glob.glob("./data/unpacked_sources/1801/1801.09797v1/*")

('1801.09797v1',
 ['./data/unpacked_sources/1801/1801.09797v1/fancyhdr.sty',
  './data/unpacked_sources/1801/1801.09797v1/iclr2018_conference.sty',
  './data/unpacked_sources/1801/1801.09797v1/main.tex'])

In [176]:
!tail ./data/unpacked_sources/1801/1801.09797v1/main.tex

  Berg{-}Kirkpatrick]{textvae2}
Zichao Yang, Zhiting Hu, Ruslan Salakhutdinov, and Taylor Berg{-}Kirkpatrick.
\newblock Improved variational autoencoders for text modeling using dilated
  convolutions.
\newblock In \emph{Proceedings of {ICML}'17}, pp.\  3881--3890, 2017.

\end{thebibliography}


\end{document}


# Map all references of 1896 papers to arxiv ids if possible based on title subset match

In [393]:
for k in paper_bbl_dict:
    for ref_key in paper_bbl_dict[k]:
        paper_bbl_dict[k][ref_key].pop("seq_id", None)

In [396]:
for k in paper_bbl_dict:
    if True:#k.startswith("2017_"):# or k.startswith("2020_"):
        if status%10==0:
            print("Done {} out of {}".format(status, total_stat))

        for ref_key in paper_bbl_dict[k]:
            clean_ref = unidecode.unidecode(paper_bbl_dict[k][ref_key]["text"])
            clean_ref = clean_ref.lower()
            clean_ref = re.sub('[\W_]', '', clean_ref)
            
            first_half_newblock_ind = clean_ref.find("newblock")
            second_half = clean_ref[first_half_newblock_ind+8:]
            second_half_newblock_ind = second_half.find("newblock")
            
            potential_title_idfied_newblock = second_half[0:second_half_newblock_ind]
            print(potential_title_idfied_newblock, clean_ref)
            break
    break

labelembeddingforattributebasedclassification bibitemakataetal2013akataperronninharchaouiandschmidakata2013labelzeynepakataflorentperronninzaidharchaouiandcordeliaschmidnewblocklabelembeddingforattributebasedclassificationnewblockinemphcomputervisionandpatternrecognitionpages8198262013


In [400]:
status = 0
total_stat = len(paper_bbl_dict)
idfied_working =0 

for k in paper_bbl_dict:
    if True:#k.startswith("2017_"):# or k.startswith("2020_"):
        if status%10==0:
            print("Done {} out of {} and idfied are {}".format(status, total_stat, idfied_working))

        for ref_key in paper_bbl_dict[k]:
            clean_ref = unidecode.unidecode(paper_bbl_dict[k][ref_key]["text"])
            clean_ref = clean_ref.lower()
            clean_ref = re.sub('[\W_]', '', clean_ref)
            
            first_half_newblock_ind = clean_ref.find("newblock")
            second_half = clean_ref[first_half_newblock_ind+8:]
            second_half_newblock_ind = second_half.find("newblock")
            
            potential_title_idfied_newblock = second_half[0:second_half_newblock_ind]
            
            if potential_title_idfied_newblock in paper_global_id_dict:
                idfied_working += 1
                paper_bbl_dict[k][ref_key]["seq_id"] = paper_global_id_dict[potential_title_idfied_newblock]
            else:
                for p_titel in paper_global_id_dict:
                    if clean_ref.find(p_titel) > -1:
                        paper_bbl_dict[k][ref_key]["seq_id"] = paper_global_id_dict[p_titel]
                        break
        status +=1

Done 0 out of 1896 and idfied are 0
Done 10 out of 1896 and idfied are 122
Done 20 out of 1896 and idfied are 353
Done 30 out of 1896 and idfied are 515
Done 40 out of 1896 and idfied are 645
Done 50 out of 1896 and idfied are 811
Done 60 out of 1896 and idfied are 991
Done 70 out of 1896 and idfied are 1187
Done 80 out of 1896 and idfied are 1323
Done 90 out of 1896 and idfied are 1486
Done 100 out of 1896 and idfied are 1660
Done 110 out of 1896 and idfied are 1768
Done 120 out of 1896 and idfied are 1947
Done 130 out of 1896 and idfied are 2126
Done 140 out of 1896 and idfied are 2320
Done 150 out of 1896 and idfied are 2457
Done 160 out of 1896 and idfied are 2610
Done 170 out of 1896 and idfied are 2772
Done 180 out of 1896 and idfied are 2914
Done 190 out of 1896 and idfied are 3075
Done 200 out of 1896 and idfied are 3235
Done 210 out of 1896 and idfied are 3418
Done 220 out of 1896 and idfied are 3569
Done 230 out of 1896 and idfied are 3790
Done 240 out of 1896 and idfied are 

In [491]:
len(paper_bbl_dict)

1896

In [401]:
with open("save_paper_bbl_becoz_ML_deadline_eating_RAM.pkl", "wb") as f:
    pickle.dump(paper_bbl_dict, f)

In [278]:
total = 0
succ = 0

for k in paper_bbl_dict:
    if True:# k.startswith("2017_") or k.startswith("2018_"):
        for ref_key in paper_bbl_dict[k]:
            total += 1
            if "seq_id" in paper_bbl_dict[k][ref_key]:
                succ += 1
print(succ, total)

57636 72619


In [158]:
with open("save_paper_bbl_becoz_ML_deadline_eating_RAM.pkl", "wb") as f:
    pickle.dump(paper_bbl_dict, f)

In [352]:
with open("save_paper_bbl_becoz_ML_deadline_eating_RAM.pkl", "rb") as f:
    paper_bbl_dict = pickle.load(f)

In [353]:
len(paper_bbl_dict)

1896

In [117]:
for k, v in paper_bbl_dict.items():
    if k.startswith("2017") or k.startswith("2018"):
        print(k, paper_bbl_dict[k].keys())
        break

2017_B1-Hhnslg dict_keys(['akata2013label', 'akata2015evaluation', 'lei2015predicting', 'banerjee2005clustering', 'bellet2013survey', 'edwards2017towards', 'elhoseiny2013write', 'goldberger2004neighbourhood', 'hochreiter1997long', 'ioffe2015batch', 'kingma2014adam', 'kingma2013auto', 'koch2015siamese', 'krizhevsky2012imagenet', 'kulis2012metric', 'lake2011one', 'liao2016', 'maaten2008visualizing', 'mensink2013distance', 'miller2000learning', 'min2009deep', 'ravi2017meta', 'reed2016learning', 'rezende2014stochastic', 'rippel2015metric', 'russakovsky2015imagenet', 'salakhutdinov2007learning', 'szegedy2015going', 'vinyals2016matching', 'weinberger2005distance', 'welinder2010caltech'])


# Extract citation key from citation context and map to unique paper

In [497]:
graph_edges = defaultdict(set)

**This will be done in 3 parts separately for each of the types of citation extracted from table cells, table captions and table-fulltextsearch**

In [498]:
# DONE = False
found_in_paper_not_arxiv = 0
fipna_pid = []
bib_seq_not_found = []

iclr_yearwise_graph_info = {}
reverse_iclr_yearwise_graph_info = {}

for k in leaderboard_refs:
    title = iclr_arxiv_map[k]["title"].lower()
    unaccented_title = unidecode.unidecode(title)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title in paper_global_id_dict:
        self_global_id = paper_global_id_dict[clean_title]
    else:
        paper_global_id_dict[clean_title] = len(paper_global_id_dict)
        self_global_id = paper_global_id_dict[clean_title]
    
    iclr_yearwise_graph_info[k] = self_global_id
    reverse_iclr_yearwise_graph_info[self_global_id] = k
    
    try:
        if leaderboard_refs[k]["count"] > 0 and k in paper_bbl_dict: #k.startswith("2017"):# or k.startswith("2018"):
            
            ldb_abl_keys = []
            if "ldb" in leaderboard_refs[k]['refs'] and leaderboard_refs[k]['refs']["ldb"]:
                ldb_abl_keys.append("ldb")
            if "abl" in leaderboard_refs[k]['refs'] and leaderboard_refs[k]['refs']["abl"]:
                ldb_abl_keys.append("abl")
            
            for ldbabl_ref_key in ldb_abl_keys:
                for ldb_ref in leaderboard_refs[k]['refs'][ldbabl_ref_key]:
                    m = re.search("bib[a]?-bib([0-9][0-9]?)", ldb_ref)
                    if m:
                        seq_key = int(m.group(1))
                        if k == "2017_BJlxmAKlg":
                            seq_key = seq_key - 1
                        found = False
                        partially_found = False
                        for paper_bib_entry in paper_bbl_dict[k].values():
                            if "bib-seq" in paper_bib_entry and paper_bib_entry["bib-seq"] == seq_key:
                                if "seq_id" in paper_bib_entry:
                                    found = True
                                    graph_edges[self_global_id].add(paper_bib_entry["seq_id"])
                                else:
                                    partially_found = True
                                    found_in_paper_not_arxiv += 1
                        if not found and not partially_found:
                            bib_seq_not_found.append((k, ldb_ref))
                    else:
                        print("insucc bib re match: ", k, ldb_ref)
            if not self_global_id in graph_edges:
                fipna_pid.append(k)
    except Exception as ex:
        print("Error: ", k, ex)

insucc bib re match:  2017_BycCx8qex <ref id='S1-F2'>2</ref>
insucc bib re match:  2017_HJ0UKP9ge <ref id='S2-E2'>2</ref>
insucc bib re match:  2017_HJ0UKP9ge <ref id='S2-E1'>1</ref>
insucc bib re match:  2017_HkcdHtqlx <ref id='S3-E5'>5</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS4'>5.4</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS2'>5.2</ref>
insucc bib re match:  2017_Hyq4yhile <ref id='S5-SS3'>5.3</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E7'>7</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E8'>8</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E5'>5</ref>
insucc bib re match:  2017_SJU4ayYgl <ref id='S2-E6'>6</ref>
insucc bib re match:  2017_SkxKPDv5xl <ref id='S2-E2'>2</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F2'>2</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S2-F1'>1</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F3'>3</ref>
insucc bib re match:  2017_SyCSsUDee <ref id='S3-F2'>2</ref>
insucc bib re 

In [None]:
len(leade)

In [499]:
found_in_paper_not_arxiv, len(bib_seq_not_found), bib_seq_not_found[0:5]

(198,
 193,
 [('2017_BJbD_Pqlg', "<ref id='bib-bib1'>2014</ref>"),
  ('2017_HJ1kmv9xx', "<ref id='bib-bib23'>2016</ref>"),
  ('2017_HkpbnH9lx', "<ref id='bib-bib46'>46</ref>"),
  ('2017_HkpbnH9lx', "<ref id='bib-bib34'>34</ref>"),
  ('2017_HkpbnH9lx', "<ref id='bib-bib22'>22</ref>")])

In [405]:
392-198

194

In [368]:
nk = "2017_HkpbnH9lx"
iclr_arxiv_map[nk]["arxivId"]#, paper_bbl_dict[nk]

'1605.08803v3'

In [369]:
found_in_paper_not_arxiv

198

In [383]:
print("AutoencodersandGenerativeAdversarialNetworksforImbalancedSequenceClassification" in paper_global_id_dict)
print("AutoencodersandGenerativeAdversarialNetworksforImbalancedSequenceClassification".lower() in paper_global_id_dict)

True
False


In [382]:
#WRONG!!! DONT DO THIS AGAIN

# caps_titels_list = list(paper_global_id_dict.keys())

# for paper_caps in caps_titels_list:
#     if paper_caps.lower() in paper_global_id_dict:
#         del paper_global_id_dict[paper_caps]

In [384]:
len(paper_global_id_dict)

1713

In [370]:
leaderboard_refs["1702.08811v3"]

{}

In [372]:
del leaderboard_refs["1702.08811v3"]

In [413]:
graph_edges

defaultdict(set,
            {40517: {2895, 45773, 283317, 385142, 490188, 779070},
             419660: {111966, 396737, 861194, 888571, 1082720, 1147022},
             1121619: {117412,
              171477,
              190284,
              309772,
              513240,
              596867,
              907099,
              1039218,
              1140292},
             748140: {2895, 930686, 1044012},
             323641: {154986, 539819, 1107219},
             298789: {408575, 772825, 1082252},
             116985: {81294,
              88529,
              143084,
              416925,
              574288,
              809115,
              920951,
              928211,
              985741,
              1014228},
             57209: {38666, 502887, 545116, 635748, 838523},
             825280: {190284, 730027, 760983},
             106387: {404528, 406679, 416925, 678460, 774827, 818442},
             187357: {2895,
              88529,
              88735,
              

In [411]:
len(iclr_yearwise_graph_info), len(reverse_iclr_yearwise_graph_info)

(1718, 1711)

In [415]:
reverse_iclr_yearwise_graph_info[1121619], leaderboard_refs["2017_B1E7Pwqgl"]

('2017_B1E7Pwqgl',
 {'refs': {'ldb': ["<ref id='bib-bib62'>62</ref>",
    "<ref id='bib-bib19'>19</ref>",
    "<ref id='bib-bib58'>58</ref>",
    "<ref id='bib-bib63'>63</ref>",
    "<ref id='bib-bib59'>59</ref>",
    "<ref id='bib-bib21'>21</ref>",
    "<ref id='bib-bib60'>60</ref>",
    "<ref id='bib-bib29'>29</ref>",
    "<ref id='bib-bib30'>30</ref>",
    "<ref id='bib-bib25'>25</ref>",
    "<ref id='bib-bib44'>44</ref>"]},
  'count': 15})

In [434]:
#calculate ideal number of nodes after step 1

stage1_pot_nodes = []

for k in leaderboard_refs:
    if leaderboard_refs[k]["count"] > 0:
        stage1_pot_nodes.append(k)
print(len(stage1_pot_nodes))

419


In [441]:
len(set(fipna_pid))

57

In [443]:
fipna_pid[0:4]

['2017_BJbD_Pqlg', '2017_BycCx8qex', '2017_HJ1kmv9xx', '2017_HkcdHtqlx']

In [449]:
iclr_arxiv_map["2017_BycCx8qex"]["arxivId"], leaderboard_refs["2017_BycCx8qex"], paper_bbl_dict["2017_BycCx8qex"]

('1703.04474v1',
 {'refs': {'ldb': ["<ref id='S1-F2'>2</ref>"]}, 'count': 1},
 {'andor2016globally': {'text': '\\bibitem[{Andor et~al.(2016)Andor, Alberti, Weiss, Severyn, Presta, Ganchev, Petrov, and Collins}]{andor2016globally} Daniel Andor, Chris Alberti, David Weiss, Aliaksei Severyn, Alessandro Presta, Kuzman Ganchev, Slav Petrov, and Michael Collins. 2016. \\newblock Globally normalized transition-based neural networks. \\newblock In {\\em Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics\\/}. pages 2442--2452.',
   'arxivids': [],
   'seq_id': 160522},
  'attardi2009reverse': {'text': "\\bibitem[{Attardi and Dell'Orletta(2009)}]{attardi2009reverse} Giuseppe Attardi and Felice Dell'Orletta. 2009. \\newblock Reverse revision and linear tree combination for dependency parsing. \\newblock In {\\em Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics,

In [500]:
print(len(graph_edges))

347


In [407]:
ec = 0

for x in graph_edges:
    ec += len(graph_edges[x])
print(ec)

2053


In [408]:
uniq_nodes = set()
for k, v in graph_edges.items():
#     print(k, v)
#     break
    uniq_nodes.add(k)
    for i in v:
        uniq_nodes.add(i)
uniq_nodes = set(uniq_nodes)

In [409]:
len(uniq_nodes)

1271

In [410]:
len(paper_global_id_dict)

1205268

In [377]:
list(paper_global_id_dict.keys())[0]

'onepositiveandtwonegativeresultsforderivedcategoriesofalgebraicstacks'

# Segment 2

In [501]:
found_in_paper_not_arxiv_p2 = []

for k,v in full_text_refs_dict.items():
    
    title = iclr_arxiv_map[k]["title"].lower()
    unaccented_title = unidecode.unidecode(title)
    clean_title = re.sub('[\W_]', '', unaccented_title)
    
    if clean_title in paper_global_id_dict:
        self_global_id = paper_global_id_dict[clean_title]
    else:
        paper_global_id_dict[clean_title] = len(paper_global_id_dict)
        self_global_id = paper_global_id_dict[clean_title]
    
    iclr_yearwise_graph_info[k] = self_global_id
    reverse_iclr_yearwise_graph_info[self_global_id] = k
    
    for i in v:
        m = re.search("\\\cite[ptyear\[\]\*]*?\{([^\{]*)\}", i)
        cits = []
        if m:
            p = m.group(1)
            cits = p.split(",")
        if k in paper_bbl_dict:
            for cit_key in cits:
                if cit_key in paper_bbl_dict[k]:
                    if "seq_id" in paper_bbl_dict[k][cit_key]:
                        graph_edges[self_global_id].add(paper_bbl_dict[k][cit_key]["seq_id"])
                    else:
                        found_in_paper_not_arxiv_p2.append(paper_bbl_dict[k][cit_key]["text"])
    if not self_global_id in graph_edges:
        print(k)

2017_B1YfAfcgl
2017_BJYwwY9ll
2017_BJh6Ztuxl
2017_By5e2L9gl
2017_HJSCGD9ex
2017_HJpfMIFll
2017_HkNRsU5ge
2017_Hkg4TI9xl
2017_HkwoSDPgg
2017_S13wCE9xx
2017_r1BJLw9ex
2017_r1G4z8cge
2017_r1y1aawlg
2017_rkFBJv9gg
2017_rky3QW9le
2017_ryuxYmvel
2018_B1ZZTfZAW
2018_B1jscMbAW
2018_B1l8BtlCb
2018_B1spAqUp-
2018_B1ydPgTpW
2018_BJvWjcgAZ
2018_BkUp6GZRW
2018_ByOnmlWC-
2018_H113pWZRb
2018_HkXWCMbRW
2018_Hksj2WWAW
2018_HyRVBzap-
2018_HyUNwulC-
2018_HytSvlWRZ
2018_S1GUgxgCW
2018_S1XolQbRW
2018_S1m6h21Cb
2018_SJCq_fZ0Z
2018_SJyEH91A-
2018_SyqShMZRb
2018_rJFOptp6Z
2018_rJTutzbA-
2018_rJg4YGWRb
2018_rk6cfpRjZ
2018_rkA1f3NpZ
2018_rkYTTf-AZ
2018_rkYgAJWCZ
2018_rkZvSe-RZ
2018_ry6-G_66b
2018_ryTp3f-0-
2018_ryazCMbR-
2019_B1G5ViAqFm
2019_BJe-Sn0ctm
2019_BkG5SjR5YQ
2019_BkG8sjR5Km
2019_BkfbpsAcF7
2019_Bkg2viA5FQ
2019_BklAEsR5t7
2019_BklKFo09YX
2019_ByMHvs0cFQ
2019_H1e572A5tQ
2019_H1gTEj09FX
2019_H1ldNoC9tX
2019_HJxfm2CqKm
2019_Hk4dFjR5K7
2019_HkgHk3RctX
2019_HylKJhCcKm
2019_Hyx4knR9Ym
2019_Hyxsl2AqKm
2019_Sk

In [464]:
full_text_refs_dict["2017_ryuxYmvel"]

['\n   local patterns of tokens.\n \\item\n   2-layer 1D CNN with LSTM \\citep{lstm} sequence reduction. This mo',
 ' of\n   local patterns of tokens.\n \\item\n   2-layer 1D CNN with LSTM \\citep{lstm} sequence reduction. Thi',
 'of\n   local patterns of tokens.\n \\item\n   2-layer 1D CNN with LSTM \\citep{lstm} sequence reduction. This m']

In [465]:
m = re.search("\\\cite[ptyear\[\]\*]*?\{([^\{]*)\}", full_text_refs_dict["2017_ryuxYmvel"][0])
m, m.group(1)


(<re.Match object; span=(65, 77), match='\\citep{lstm}'>, 'lstm')

In [466]:
"lstm" in paper_bbl_dict["2017_ryuxYmvel"]

True

In [468]:
iclr_yearwise_graph_info["2017_ryuxYmvel"], 436123 in graph_edges

(436123, False)

In [463]:
paper_bbl_dict["2017_B1MRcPclx"]

{'tensorflow': {'text': '\\bibitem[Abadi et~al.(2016)Abadi, Agarwal, Barham, Brevdo, Chen, Citro, Corrado, Davis, Dean, Devin, et~al.]{tensorflow} Mart{\\i}n Abadi, Ashish Agarwal, Paul Barham, Eugene Brevdo, Zhifeng Chen, Craig Citro, Greg~S Corrado, Andy Davis, Jeffrey Dean, Matthieu Devin, et~al. \\newblock Tensorflow: Large-scale machine learning on heterogeneous distributed systems. \\newblock \\emph{arXiv preprint arXiv:1603.04467}, 2016.',
  'arxivids': [],
  'bib-seq': 1,
  'seq_id': 842029},
 'strnn': {'text': '\\bibitem[Balduzzi and Ghifary(2016)]{strnn} David Balduzzi and Muhammad Ghifary. \\newblock Strongly-typed recurrent neural networks. \\newblock In \\emph{ICML}, 2016.',
  'arxivids': [],
  'bib-seq': 2,
  'seq_id': 1008429},
 'bordes2016learning': {'text': '\\bibitem[Bordes and Weston(2016)]{bordes2016learning} Antoine Bordes and Jason Weston. \\newblock Learning end-to-end goal-oriented dialog. \\newblock \\emph{arXiv preprint arXiv:1605.07683}, 2016.',
  'arxivids':

In [432]:
len(set(found_in_paper_not_arxiv_p2)), len(found_in_paper_not_arxiv_p2)

(988, 3761)

In [502]:
print(len(graph_edges))

1313


In [503]:
ec = 0

for x in graph_edges:
    ec += len(graph_edges[x])
print(ec)

6631


In [505]:
uniq_nodes = set()
for k, v in graph_edges.items():
#     print(k, v)
#     break
    uniq_nodes.add(k)
    for i in v:
        uniq_nodes.add(i)
uniq_nodes = set(uniq_nodes)

In [506]:
len(uniq_nodes)

3641

# P1

In [473]:
yw_ec = {str(y): [] for y in range(2017, 2021)}
paper_nodes = {str(y): 0 for y in range(2017, 2021)}

for x in graph_edges:
    year = reverse_iclr_yearwise_graph_info[x][0:4]
    yw_ec[year] += graph_edges[x]
    paper_nodes[year] += 1
    
print([(y, len(yw_ec[y]), len(set(yw_ec[y]))) for y in yw_ec])
print(paper_nodes)

[('2017', 695, 376), ('2018', 1171, 603), ('2019', 1949, 1044), ('2020', 2816, 1503)]
{'2017': 162, '2018': 239, '2019': 389, '2020': 523}


### {'2017': 695, '2018': 1171, '2019': 1949, '2020': 2816}

# P2 

In [425]:
unique_references_set = set()

for y in yw_ec:
    for rrr in yw_ec[y]:
        unique_references_set.add(rrr)
print(len(unique_references_set))

2546


In [469]:
len(graph_edges)

1313

In [496]:
len(set(graph_edges.keys()).intersection(unique_references_set))

242

In [492]:
len(graph_edges)

1337

## New papers to download

In [509]:
new_papers_to_download = unique_references_set.difference(set(graph_edges.keys()))

In [510]:
len(new_papers_to_download), len(new_papers_to_download)+218

(2328, 2546)

In [519]:
map_new_papers_to_download_to_arxivId = {}
hacky_list_of_dict = list(paper_global_id_dict.items())

for mid in new_papers_to_download:
    potential_paper = hacky_list_of_dict[mid]
    if mid == potential_paper[1]:
        map_new_papers_to_download_to_arxivId[mid] = [potential_paper[0], paper_id_titles[potential_paper[0]]]
    else:
        print("Not found: ", mid)

In [521]:
list(map_new_papers_to_download_to_arxivId.items())[0:2]

[(1179651, ['searchingformobilenetv3', ['a_1905.02244v5']]),
 (1179653,
  ['mixmatchaholisticapproachtosemisupervisedlearning', ['a_1905.02249v2']])]

In [522]:
with open("map_new_papers_to_download_to_arxiv.pkl", "wb") as f:
    pickle.dump(map_new_papers_to_download_to_arxivId, f)

In [None]:
global_graph_nodes = list(graph_edges.keys())
global_graph_nodes_itercopy = list(graph_edges.keys())
cc = 1
cc_list = defaultdict(set)
idx = 0


for k in global_graph_nodes_itercopy:
    cc_list[idx].add(k)
    for neighs in graph_edges[k]:
        cc_list.add(neighs)

In [476]:
def DFSUtil(temp, v, visited):   
    # Mark the current vertex as visited 
    visited[v] = True

    # Store the vertex to list 
    temp.append(v) 

    # Repeat for all vertices adjacent 
    # to this vertex v 
    for i in graph_edges[v]: 
        if visited[i] == False: 
            temp = DFSUtil(temp, i, visited) 
    return temp 

In [484]:
def connectedComponents(): 
    visited = {}
    cc = [] 
    for i in graph_edges: 
        visited[i] = False
        for j in graph_edges[i]:
            visited[j] = False
    
    for v in graph_edges:
        if visited[v] == False: 
            cc.append(DFSUtil([], v, visited)) 
    print(cc)

In [485]:
connectedComponents()

RuntimeError: dictionary changed size during iteration

In [263]:
for k,v in full_text_refs_dict.items():
    print(k)
    break

2017_B1-q5Pqxl


In [147]:
iclr_arxiv_map["2017_B1-Hhnslg"]["title"]

'Prototypical Networks for Few-shot Learning'

In [142]:
m = re.search("bib-bib([0-9][0-9]?)", leaderboard_refs["2017_B1-Hhnslg"]['refs']['ldb'][0])

In [144]:
m.group(1)

'17'

In [118]:
leaderboard_refs["2017_B1-Hhnslg"]

{'refs': {'ldb': ["<ref id='bib-bib17'>2016</ref>",
   "<ref id='bib-bib1'>2013</ref>",
   "<ref id='bib-bib23'>2016</ref>",
   "<ref id='bib-bib29'>2016</ref>",
   "<ref id='bib-bib2'>2015</ref>",
   "<ref id='bib-bib22'>2017</ref>",
   "<ref id='bib-bib6'>2017</ref>"],
  'abl': []},
 'count': 12}

In [119]:
paper_bbl_dict["2017_B1-Hhnslg"].keys()

dict_keys(['akata2013label', 'akata2015evaluation', 'lei2015predicting', 'banerjee2005clustering', 'bellet2013survey', 'edwards2017towards', 'elhoseiny2013write', 'goldberger2004neighbourhood', 'hochreiter1997long', 'ioffe2015batch', 'kingma2014adam', 'kingma2013auto', 'koch2015siamese', 'krizhevsky2012imagenet', 'kulis2012metric', 'lake2011one', 'liao2016', 'maaten2008visualizing', 'mensink2013distance', 'miller2000learning', 'min2009deep', 'ravi2017meta', 'reed2016learning', 'rezende2014stochastic', 'rippel2015metric', 'russakovsky2015imagenet', 'salakhutdinov2007learning', 'szegedy2015going', 'vinyals2016matching', 'weinberger2005distance', 'welinder2010caltech'])

In [121]:
paper_bbl_dict["2017_B1-Hhnslg"]["welinder2010caltech"]

{'text': '\\bibitem[Welinder et~al.(2010)Welinder, Branson, Mita, Wah, Schroff, Belongie, and Perona]{welinder2010caltech} P.~Welinder, S.~Branson, T.~Mita, C.~Wah, F.~Schroff, S.~Belongie, and P.~Perona. \\newblock {Caltech-UCSD Birds 200}. \\newblock Technical Report CNS-TR-2010-001, California Institute of Technology, 2010.',
 'arxivids': [],
 'bib-seq': 31}

In [123]:
*

SyntaxError: invalid syntax (<ipython-input-123-b462aee1c6c4>, line 1)

In [None]:
graph_edges = defaultdict(set)

In [None]:
for k in leaderboard_refs:

In [230]:
for k,v in paper_id_titles.items():
    print(k, v)
    break

Onepositiveandtwonegativeresultsforderivedcategoriesofalgebraicstacks ['e_1405.1888']
