In [2]:
import json
import os
import pickle
import re
from collections import defaultdict
from pprint import pprint

### Read the paper JSON files for all years

In [3]:
def read_papers_json(year, base_dir="./science-parse/output"):
    """Load the parsed paper records for one or more years.

    Each year ``y`` is read from ``<base_dir>/<y>/<y>_ICLR``; every non-blank
    line of that file is decoded as one JSON document.

    Args:
        year: A single year (int) or an iterable of years.
        base_dir: Directory that contains the per-year sub-directories.
            Defaults to the location the notebook has always read from.

    Returns:
        A ``defaultdict(list)`` mapping each year to the list of decoded
        records, in file order. A year whose file holds no records gets no key.

    Raises:
        OSError: If a year's file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    paper_data = defaultdict(list)

    # isinstance (instead of an exact type comparison) also accepts int subclasses.
    if isinstance(year, int):
        year = [year]

    for y in year:
        file_path = "{}/{}/{}_ICLR".format(base_dir, y, y)
        # Explicit encoding: do not depend on the platform's default codec.
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # A blank (e.g. trailing) line is not a record; json.loads would raise on it.
                if not line.strip():
                    continue
                paper_data[y].append(json.loads(line))

    return paper_data

In [4]:
# Load the parsed paper records for the years 2017 through 2020.
paper_data = read_papers_json(year=[2017, 2018, 2019, 2020])

### Count ICLR references

In [5]:
def construct_iclr_references_feature(paper_data):
    """Count, for every paper, how many of its references name ICLR as venue.

    Args:
        paper_data: Mapping of year -> list of paper records. A record is used
            when it has a "name" entry and a "metadata" entry that contains
            "references"; each reference may carry a "venue" string.

    Returns:
        dict mapping "<year>_<name up to the first '.'>" to the number of
        references whose venue contains "iclr" or "learning representation"
        (case-insensitive). Records without a name, without
        metadata/references, or that raise while being processed get no entry.

    Side effects:
        Prints one diagnostic line per skipped record and a final summary with
        the total number of references and of references with a venue.
    """
    iclr_ref_feature = {}
    total_refs = 0
    total_venues_present = 0

    for year_key, papers in paper_data.items():
        for p in papers:
            try:
                if "name" not in p:
                    print("Name field missing : ", year_key)
                    continue

                pid = str(year_key) + "_" + p["name"].split(".")[0]
                if "metadata" not in p or "references" not in p["metadata"]:
                    print("Metadata/References not found for: ", pid)
                    continue

                references = p["metadata"]["references"]
                total_refs += len(references)

                iclr_refs_count = 0
                for r in references:
                    if "venue" in r and r["venue"]:
                        total_venues_present += 1
                        venue = r["venue"].lower()
                        # "learning representation" also matches the full
                        # "international conference on learning representations".
                        if "iclr" in venue or "learning representation" in venue:
                            iclr_refs_count += 1

                iclr_ref_feature[pid] = iclr_refs_count
            except Exception as ex:
                # Best effort: report the malformed record and keep going. The
                # name is looked up defensively so this handler cannot raise
                # itself (the record may not be a dict, or the name not a str).
                name = p.get("name") if isinstance(p, dict) else None
                print("Exception occurred for: {!r} (year {}): {}".format(name, year_key, ex))

    print("Total refs: {} and total venues in refs: {}".format(total_refs, total_venues_present))
    return iclr_ref_feature

In [6]:
# Build the per-paper ICLR reference counts from the loaded records.
feature_dict = construct_iclr_references_feature(paper_data)

Total refs: 172288 and total venues in refs: 149205


In [7]:
# Fraction of references that carry a venue. Both figures are copied by hand
# from the "Total refs ... total venues in refs" summary line printed above,
# so they must be updated manually if the input data changes.
149205/172288

0.8660208488112927

In [8]:
# Display the mapping of paper id ("<year>_<name>") to its ICLR reference count.
feature_dict

{'2019_rylV-2C9KQ': 2,
 '2018_HJYQLb-RW': 2,
 '2020_B1esygHFwS': 0,
 '2020_rkehoAVtvS': 0,
 '2018_SJ60SbW0b': 0,
 '2020_SJxIkkSKwB': 2,
 '2020_BylEqnVFDB': 0,
 '2020_SJlh8CEYDB': 1,
 '2019_SkGNrnC9FQ': 0,
 '2020_HygHbTVYPB': 0,
 '2019_SJx5kn0cK7': 3,
 '2020_SkxLFaNKwB': 0,
 '2020_S1xnXRVFwH': 7,
 '2020_rJgzzJHtDB': 12,
 '2020_SJx4O34YvS': 4,
 '2019_Hyffti0ctQ': 7,
 '2020_Syl5o2EFPB': 3,
 '2019_HyztsoC5Y7': 1,
 '2019_SkMwpiR9Y7': 0,
 '2017_Sy6iJDqlx': 0,
 '2020_BJxg_hVtwH': 8,
 '2020_B1gdkxHFDH': 0,
 '2019_SylPMnR9Ym': 10,
 '2018_Bya8fGWAZ': 0,
 '2020_BkevoJSYPB': 4,
 '2017_SJJKxrsgl': 0,
 '2020_rJecSyHtDS': 0,
 '2019_ryxxCiRqYX': 1,
 '2020_HJxWl0NKPB': 1,
 '2020_r1e_FpNFDr': 1,
 '2020_SJeY-1BKDS': 1,
 '2020_BJgcwh4FwS': 2,
 '2019_S1gDCiCqtQ': 0,
 '2019_rkMhusC5Y7': 1,
 '2018_ryZERzWCZ': 0,
 '2017_H1kjdOYlx': 1,
 '2020_SyxjVRVKDB': 2,
 '2020_BkgNqkHFPr': 4,
 '2020_BJlzm64tDH': 2,
 '2020_H1lOUeSFvB': 0,
 '2019_SyxAb30cY7': 0,
 '2020_SJe9qT4YPr': 1,
 '2020_BJlLdhNFPr': 3,
 '2019_ByeZ5jC5Y

In [10]:
# Persist the feature mapping. The output directory is created first so the
# cell also works when "feature_dict/" does not exist yet (open() in "wb" mode
# does not create missing parent directories).
output_path = "feature_dict/iclr_refs_count.pkl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "wb") as f:
    pickle.dump(feature_dict, f)