### Load the Data

In [1]:
import ast
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Display configuration for the whole notebook.
# Print NumPy arrays in full instead of summarizing them with "...".
# sys.maxsize is the value NumPy documents for this; passing NaN as the
# threshold is rejected with a ValueError by current NumPy releases.
np.set_printoptions(threshold=sys.maxsize)
# Let pandas show up to 1000 characters per cell so long values are not cut off.
pd.options.display.max_colwidth = 1000

In [2]:
%%time

# Load the tab-separated citation dump. The file is read without a header
# row, so the four column labels are attached after parsing.
records = pd.read_csv(
    'new-citations.tsv',
    header=None,
    sep='\t',
)
records.columns = [
    'recID',
    'oldRef',
    'newRef',
    'metaData',
]

# Report how many rows were loaded.
print('Total Records: {}'.format(records.shape[0]))

Total Records: 30230224
CPU times: user 2min 33s, sys: 7.37 s, total: 2min 40s
Wall time: 2min 41s


In [3]:
%%time

# Keep only the unattributed records, i.e. the rows whose newRef equals 0.
# `un_attributed` is the boolean row mask; `unattributed` is the selected subset.
un_attributed = records['newRef'].eq(0)
unattributed = records.loc[un_attributed]

# The row count is reused by later cells for progress and summary messages.
total_unattributed = unattributed.shape[0]
print('Total number of unattirbuted: {}'.format(total_unattributed))

Total number of unattirbuted: 8485730
CPU times: user 713 ms, sys: 200 ms, total: 913 ms
Wall time: 917 ms


### Parse the metaData into a DataFrame

In [None]:
%%time

# Tally, over the unattributed records, how many carry each kind of reference
# field. A record adds to every field it has, so the per-field counters are
# independent of one another. Records whose metadata has no (or an empty)
# 'reference' entry are counted under other_count instead.
pi_count = 0     # records with 'publication_info'
rn_count = 0     # records with 'report_numbers'
doi_count = 0    # records with 'dois'
misc_count = 0   # records with 'misc'
arxiv_count = 0  # records with 'arxiv_eprint'
isbn_count = 0   # records with 'isbn'
other_count = 0  # records without usable 'reference' info

# Only the metaData column is read, so iterate over it directly. iterrows()
# would build a full Series object for every row just to pull out one value,
# which is needlessly slow on a frame with millions of rows. The position `i`
# is the same 0-based running index as before.
for i, raw_meta in enumerate(unattributed['metaData']):

    # metaData is parsed with ast.literal_eval and then queried with .get(),
    # so each value is expected to be the text of a Python dict literal.
    meta_data = ast.literal_eval(raw_meta)

    reference_info = meta_data.get('reference')

    if reference_info:

        if reference_info.get('publication_info'):
            pi_count += 1

        if reference_info.get('report_numbers'):
            rn_count += 1

        if reference_info.get('dois'):
            doi_count += 1

        if reference_info.get('misc'):
            misc_count += 1

        if reference_info.get('arxiv_eprint'):
            arxiv_count += 1

        if reference_info.get('isbn'):
            isbn_count += 1

    else:
        other_count += 1

    # Progress report on every 1000th record (i is 0-based, so show i + 1).
    if i % 1000 == 0:
        print('Completed {} / {} - Counts: P{}, R{}, D{}, M{}, A{}, I{}, O{}'
              .format(i + 1, total_unattributed, pi_count, rn_count, doi_count,
                      misc_count, arxiv_count, isbn_count, other_count))

In [9]:
# Summarize the tallies from the counting loop: one count per reference
# field, plus the records that had no reference info ("Other").
summary_template = 'Total records: {} - Final Counts: Publication Info: {}, Report Number: {}, DOIs: {}, Misc: {}, Arxiv: {}, ISBN: {}, Other: {}'
summary_values = (
    total_unattributed,
    pi_count,
    rn_count,
    doi_count,
    misc_count,
    arxiv_count,
    isbn_count,
    other_count,
)
print(summary_template.format(*summary_values))

Total records: 8485730 - Final Counts: Publication Info: 6166111, Report Number: 97277, DOIs: 284393, Misc: 3881793, Arxiv: 100716, ISBN: 7612, Other: 685
