In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Print NumPy arrays in full instead of summarising long ones with "...".
# threshold=np.nan is rejected by current NumPy releases; sys.maxsize is the
# documented value for an untruncated representation.
np.set_printoptions(threshold=sys.maxsize)

# Allow up to 1000 characters per cell before pandas truncates displayed text.
pd.options.display.max_colwidth = 1000

In [2]:
%%time

# Load the per-citation rows of the new data set. The file is parsed with
# header=None (first line treated as data), so the four column labels are
# attached after parsing.
record_columns = ['recID', 'oldRef', 'newRef', 'metaData']
records = pd.read_csv('new-citations.tsv', header=None, sep='\t')
records.columns = record_columns

print('Total Records: {}'.format(records.shape[0]))

Total Records: 30230224
CPU times: user 2min 32s, sys: 7.62 s, total: 2min 39s
Wall time: 2min 40s


In [11]:
# Load the Labs (new) per-record citation counts; the file is parsed with
# header=None, so labels are attached afterwards.
# The count column is deliberately labelled 'Citation_Difference': the later
# Labs - Legacy subtraction aligns on column labels, so both tables must
# share the same one.
labs_columns = ['recID', 'Citation_Difference']
labs = pd.read_csv('new-citation-counts.tsv', header=None, sep='\t')
labs.columns = labs_columns

# Same table keyed by record ID, for index-aligned arithmetic and lookups
labs_indexed = labs.set_index(keys='recID')

In [12]:
# Load the Legacy (old) per-record citation counts.
# NOTE(review): unlike the Labs read, header=None is not passed here, so the
# first line of the file is consumed as a header and then overwritten by the
# labels below. Confirm the legacy file really starts with a header row;
# otherwise its first record is silently dropped.
legacy_columns = ['recID', 'Citation_Difference']
legacy = pd.read_csv('old-citation-counts.tsv', sep='\t')
legacy.columns = legacy_columns

# Same table keyed by record ID, for index-aligned arithmetic and lookups
legacy_indexed = legacy.set_index(keys='recID')

In [13]:
# Labs count minus Legacy count for every record ID. The subtraction aligns
# on the recID index; IDs found in only one of the two tables yield NaN.
diff = labs_indexed.sub(legacy_indexed)

In [39]:
# Collect the records whose difference could not be computed (NaN after the
# index-aligned subtraction).
is_nan = diff['Citation_Difference'].isna()
NaN_entries = diff.loc[is_nan]

nan_message = '{} articles are differently indexed between the legacy and labs records and have no correspondence'
print(nan_message.format(NaN_entries.shape[0]))

6032 articles are differently indexed between the legacy and labs records and have no correspondence


In [15]:
# Keep only the records for which a numeric difference exists
is_valid = diff['Citation_Difference'].notna()
valid_diff = diff.loc[is_valid]

In [16]:
# Articles that have fewer citations in Labs than in Legacy (negative difference)
lost = valid_diff.Citation_Difference < 0
lost_citations = valid_diff[lost]

# Report the largest drop as a scalar. Calling .min() on the whole frame
# returns a one-element Series, whose repr (column label and dtype line)
# would otherwise be embedded in the message.
max_lost = lost_citations['Citation_Difference'].min()
print('Total {} articles lost citations in the Labs algorithm, with maximum lost count being {}'.format(
    len(lost_citations), max_lost))

# Order from the biggest loss (most negative) to the smallest
lost_citations = lost_citations.sort_values('Citation_Difference')

Total 46256 articles lost citations in the Labs algorithm, with maximum lost count being Citation_Difference   -1023.0
dtype: float64


In [17]:
# Articles that have more citations in Labs than in Legacy (positive difference)
gained = valid_diff.Citation_Difference > 0
gained_citations = valid_diff[gained]

# Report the largest gain as a scalar. Calling .max() on the whole frame
# returns a one-element Series, whose repr (column label and dtype line)
# would otherwise be embedded in the message.
max_gained = gained_citations['Citation_Difference'].max()
print('Total {} articles gained citations in the Labs algorithm, with maximum gained count being {}'.format(
    len(gained_citations), max_gained))

# Order from the biggest gain to the smallest
gained_citations = gained_citations.sort_values('Citation_Difference', ascending=False)

Total 19416 articles gained citations in the Labs algorithm, with maximum gained count being Citation_Difference    408.0
dtype: float64


In [35]:
# Magnitude of every valid difference, ordered from largest to smallest
abs_diff = valid_diff.abs().sort_values(by='Citation_Difference', ascending=False)

# Discard the records whose Labs and Legacy counts agree
non_zero_diff = abs_diff['Citation_Difference'].ne(0)
abs_diff = abs_diff.loc[non_zero_diff]

In [37]:
def _with_counts(diff_frame):
    """Return a copy of ``diff_frame`` with the raw counts attached.

    Adds two columns, looked up by the frame's record-ID index:
    ``Legacy_Count`` from ``legacy_indexed`` and ``Labs_Count`` from
    ``labs_indexed``. The input frame is not modified.
    """
    # Copy first: a plain assignment would only alias the input, so adding
    # columns would mutate it (and can trigger SettingWithCopyWarning when
    # the input is itself a filtered slice).
    extended = diff_frame.copy()
    # Explicit single-column lookups replace the removed `.ix` indexer.
    extended['Legacy_Count'] = legacy_indexed['Citation_Difference'].reindex(extended.index)
    extended['Labs_Count'] = labs_indexed['Citation_Difference'].reindex(extended.index)
    return extended


# Extended DataFrames: the difference plus the Legacy and Labs counts it was
# computed from. lost_citations, gained_citations and abs_diff are left as-is.
lost_citations_ext = _with_counts(lost_citations)
gained_citations_ext = _with_counts(gained_citations)
abs_diff_ext = _with_counts(abs_diff)

In [40]:
# Save the citation difference DataFrames to CSV files.
# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a run on a fresh checkout fails here.
output_dir = 'Corrected Citation Differences'
os.makedirs(output_dir, exist_ok=True)

lost_citations_ext.to_csv(os.path.join(output_dir, 'lost_citations.csv'))
gained_citations_ext.to_csv(os.path.join(output_dir, 'gained_citations.csv'))
abs_diff_ext.to_csv(os.path.join(output_dir, 'overall_difference.csv'))

### Some useful statistics

In [51]:
# Headline numbers: how many distinct records appear in each count table, how
# many distinct citing records exist, and how many records are unchanged.

# Count distinct record IDs. `legacy` and `labs` still carry the default
# positional index (only the *_indexed copies are keyed by recID), so counting
# unique index values would merely count rows.
num_cited_legacy = len(legacy['recID'].unique())
num_cited_labs = len(labs['recID'].unique())
num_unique_articles = len(records['recID'].unique())
print('Total number of unique Cited articles as per legacy: {}'.format(num_cited_legacy))
print('Total number of unique Cited articles as per Labs: {}'.format(num_cited_labs))
print('Difference in Number between the Legacy and Labs Records: {}'.format(num_cited_legacy - num_cited_labs))
print('\nTotal Number of Unique Articles which cite other papers: {}\n'.format(num_unique_articles))

# Count the zero differences directly; looking up the 0 bucket of
# value_counts() raises KeyError when no record has a zero difference.
num_zero_diff = (valid_diff['Citation_Difference'] == 0).sum()
zero_diff = num_zero_diff / num_cited_legacy * 100
print('Number of Cited articles with zero difference: {}'.format(num_zero_diff))
print('Percentage of Cited articles with zero difference: {0:0.2f} %'.format(zero_diff))

Total number of unique Cited articles as per legacy: 760203
Total number of unique Cited articles as per Labs: 759481
Difference in Number between the Legacy and Labs Records: 722

Total Number of Unique Articles which cite other papers: 942155

Number of Cited articles with zero difference: 691154
Percentage of Cited articles with zero difference: 90.92 %


In [52]:
# Percentage of records (relative to the Legacy total) per absolute difference
abs_diff_counts = abs_diff.Citation_Difference.value_counts()
print('Percent (%) of Citation Counts Difference in Labs')

# Select the buckets by their difference value rather than by integer slices.
# On this float-valued index an integer slice is label-based in some pandas
# versions and positional in others, and it also depends on value_counts()
# happening to list the buckets in 1, 2, 3, ... order.
is_at_most_5 = abs_diff_counts.index <= 5
print(abs_diff_counts[is_at_most_5].sort_index() / num_cited_legacy * 100)

# Everything with an absolute difference strictly greater than 5
num_grt5 = abs_diff_counts[abs_diff_counts.index > 5].sum()
percent_grt5 = num_grt5 / num_cited_legacy * 100
print('\nTotal {} records with difference greater than 5 - Percentage: {} %\n'.format(num_grt5, percent_grt5))

Percent (%) of Citation Counts Difference in Labs
1.0    6.652828
2.0    0.992498
3.0    0.330307
4.0    0.168508
5.0    0.100499
Name: Citation_Difference, dtype: float64

Total 2996 records with difference greater than 5 - Percentage: 0.3941052587269453 %



In [53]:
# Percentage of records (relative to the Legacy total) per number of lost citations
lost_citations_count = lost_citations.Citation_Difference.value_counts()
lost_percent_diff = lost_citations_count / num_cited_legacy * 100
print('Percent (%) of Citation Counts Lost in Labs')

# Select the -1 ... -5 buckets by value. Integer slices such as [:-5] are
# label-based on this float-valued index in some pandas versions and
# positional in others; positionally they would mean "all but the last five".
is_at_most_5_lost = lost_percent_diff.index >= -5
print(lost_percent_diff[is_at_most_5_lost].sort_index(ascending=False))

# Everything that lost strictly more than 5 citations
lost_grt5 = lost_citations_count[lost_citations_count.index < -5].sum()
lost_percent_grt5 = lost_grt5 / num_cited_legacy * 100
print('\nTotal {} records with lost citations greater than 5 - Percentage: {} %\n'.format(lost_grt5, lost_percent_grt5))

Percent (%) of Citation Counts Lost in Labs
-1.0    4.818450
-2.0    0.701655
-3.0    0.201788
-4.0    0.091949
-5.0    0.050381
Name: Citation_Difference, dtype: float64

Total 1676 records with lost citations greater than 5 - Percentage: 0.22046742777915898 %



In [54]:
# Percentage of records (relative to the Legacy total) per number of gained citations
gained_citations_count = gained_citations.Citation_Difference.value_counts()
gained_percent_diff = gained_citations_count / num_cited_legacy * 100
print('Percent (%) of Citation Counts Gained in Labs')

# Select the 1 ... 5 buckets by value. Integer slices on this float-valued
# index are label-based in some pandas versions and positional in others, and
# they rely on value_counts() listing the buckets in 1, 2, 3, ... order.
is_at_most_5_gained = gained_percent_diff.index <= 5
print(gained_percent_diff[is_at_most_5_gained].sort_index())

# Everything that gained strictly more than 5 citations
gained_grt5 = gained_citations_count[gained_citations_count.index > 5].sum()
gained_percent_grt5 = gained_grt5 / num_cited_legacy * 100
print('\nTotal {} records with gained citations greater than 5 - Percentage: {} %\n'.format(gained_grt5, gained_percent_grt5))

Percent (%) of Citation Counts Gained in Labs
1.0    1.834378
2.0    0.290843
3.0    0.128518
4.0    0.076558
5.0    0.050118
Name: Citation_Difference, dtype: float64

Total 1320 records with gained citations greater than 5 - Percentage: 0.17363783094778631 %

