In [1]:
import sys

import numpy as np
import pandas as pd

# Print numpy arrays in full instead of summarising them with "...".
# sys.maxsize is the value numpy documents for untruncated output; the previous
# threshold=np.nan is rejected with a ValueError by current numpy releases.
np.set_printoptions(threshold=sys.maxsize)

# Allow up to 1000 characters per cell when pandas displays a column.
pd.options.display.max_colwidth = 1000

In [2]:
%%time

# Load the new citation records from a tab-separated file. header=None makes
# pandas treat the first line as data, so the column names are set by hand.
records = pd.read_csv('new-citations.tsv', delimiter='\t', header=None)
records.columns = ['recID', 'oldRef', 'newRef', 'metaData']

# Report how many rows were loaded.
print('Total Records: {}'.format(records.shape[0]))

Total Records: 30230224
CPU times: user 2min 32s, sys: 7.62 s, total: 2min 39s
Wall time: 2min 40s


In [11]:
# Load the Labs (new) citation counts. The file is read with header=None, so
# the first line is treated as data and the columns are named explicitly.
# The count column is called 'Citation_Difference' so that it lines up with the
# identically named Legacy column when the two frames are subtracted later.
labs = pd.read_csv('new-citation-counts.tsv', delimiter='\t', header=None)
labs.columns = ['recID', 'Citation_Difference']

# Key the counts by record ID so frame arithmetic aligns on recID.
labs_indexed = labs.set_index(keys='recID')

In [12]:
# Load the Legacy (old) citation counts from a tab-separated file.
# NOTE(review): unlike the Labs read, no header=None is passed here, so pandas
# consumes the first line of the file as a header before the names below
# replace it. If this file has no header row, its first record is silently
# dropped -- confirm against the file.
legacy = pd.read_csv('old-citation-counts.tsv', delimiter='\t')
legacy.columns = ['recID', 'Citation_Difference']

# Key the counts by record ID so frame arithmetic aligns on recID.
legacy_indexed = legacy.set_index(keys='recID')

In [13]:
# Labs count minus Legacy count for every record ID. pandas aligns the two
# frames on their recID index and on the shared 'Citation_Difference' column;
# a recID present in only one of the frames produces NaN.
diff = labs_indexed.sub(legacy_indexed)

In [39]:
# Select (rather than drop) the rows whose difference is NaN, i.e. the records
# for which no Labs-minus-Legacy value could be computed.
is_nan = diff['Citation_Difference'].isnull()
NaN_entries = diff.loc[is_nan]

# Report how many such unmatched records there are.
print('{} articles are differently indexed between the legacy and labs records and have no correspondence'.format(NaN_entries.shape[0]))

6032 articles are differently indexed between the legacy and labs records and have no correspondence


In [15]:
# Keep only the records whose difference is an actual number (not NaN).
is_valid = diff['Citation_Difference'].notnull()
valid_diff = diff.loc[is_valid]

In [16]:
# Articles that lost citations: the Labs-minus-Legacy difference is negative.
lost = valid_diff.Citation_Difference < 0

# Sort ascending so the largest losses (most negative values) come first.
# sort_values returns a new frame, so lost_citations is not a view of valid_diff.
lost_citations = valid_diff[lost].sort_values('Citation_Difference')

# Reduce the column, not the whole frame: DataFrame.min() returns a Series and
# its multi-line repr (including "dtype: float64") would leak into the message.
print('Total {} articles lost citations in the Labs algorithm, with maximum lost count being {}'.format(
    len(lost_citations), lost_citations.Citation_Difference.min()))

Total 46256 articles lost citations in the Labs algorithm, with maximum lost count being Citation_Difference   -1023.0
dtype: float64


In [17]:
# Articles that gained citations: the Labs-minus-Legacy difference is positive.
gained = valid_diff.Citation_Difference > 0

# Sort descending so the largest gains come first.
# sort_values returns a new frame, so gained_citations is not a view of valid_diff.
gained_citations = valid_diff[gained].sort_values('Citation_Difference', ascending=False)

# Reduce the column, not the whole frame: DataFrame.max() returns a Series and
# its multi-line repr (including "dtype: float64") would leak into the message.
print('Total {} articles gained citations in the Labs algorithm, with maximum gained count being {}'.format(
    len(gained_citations), gained_citations.Citation_Difference.max()))

Total 19416 articles gained citations in the Labs algorithm, with maximum gained count being Citation_Difference    408.0
dtype: float64


In [35]:
# Magnitude of the citation change per record, largest change first.
abs_diff = valid_diff.abs().sort_values(by='Citation_Difference', ascending=False)

# Drop the records whose count did not change at all.
non_zero_diff = abs_diff['Citation_Difference'].ne(0)
abs_diff = abs_diff.loc[non_zero_diff]

In [37]:
def _with_counts(differences):
    """Return a copy of `differences` extended with the raw citation counts.

    Parameters
    ----------
    differences : pandas.DataFrame
        Frame indexed by recID (e.g. lost_citations, gained_citations, abs_diff).

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns, 'Legacy_Count' and 'Labs_Count',
        looked up by recID in legacy_indexed and labs_indexed. The input frame
        is left untouched, so re-running this cell gives the same result.
    """
    extended = differences.copy()
    # .loc replaces the removed .ix indexer. Selecting the column by name yields
    # a Series, so a one-column DataFrame is no longer assigned as a column.
    extended['Legacy_Count'] = legacy_indexed.loc[extended.index, 'Citation_Difference']
    extended['Labs_Count'] = labs_indexed.loc[extended.index, 'Citation_Difference']
    return extended


# Extended frames: citation difference plus the Legacy and Labs counts it came from.
lost_citations_ext = _with_counts(lost_citations)
gained_citations_ext = _with_counts(gained_citations)
abs_diff_ext = _with_counts(abs_diff)

In [38]:
# Save each extended citation-difference frame to its own CSV file.
# The output directory 'Corrected Citation Differences' must already exist.
lost_citations_ext.to_csv('Corrected Citation Differences/lost_citations.csv')
gained_citations_ext.to_csv('Corrected Citation Differences/gained_citations.csv')
# Previously lost_citations_ext was written a second time here and the
# absolute-difference frame was never saved.
abs_diff_ext.to_csv('Corrected Citation Differences/abs_diff.csv')

Unnamed: 0_level_0,Citation_Difference,Legacy_Count,Labs_Count
recID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
299778,1023.0,1405,382
1390184,856.0,863,7
181166,682.0,1644,962
87997,580.0,846,266
54961,563.0,599,36
83610,408.0,50,458
673261,400.0,252,652
279039,384.0,772,388
673262,380.0,438,58
1245018,350.0,361,11
