In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 1000

In [2]:
# Read the Labs citation counts
labs = pd.read_csv('new-citation-counts-NEWAUG2018.tsv', sep='\t', header=None)
labs.columns = ['recID', 'Citation_Difference']
# Citation_Difference is actually citation count but is used here for ease later when subtracting

# Index by recIDs
labs_indexed = labs.set_index('recID')
len(labs_indexed)

769457

In [3]:
# Read the legacy citation counts
legacy = pd.read_csv('old-citation-counts-AUG2018.csv', sep=',')
legacy.columns = ['recID', 'Citation_Difference', 'Timestamp']
# Citation_Difference is actually citation count but is used here for ease later when subtracting
legacy = legacy.drop(['Timestamp'], axis='columns')

# Index by recrod IDs
legacy_indexed = legacy.set_index('recID')
len(legacy_indexed)

776108

In [4]:
# Read the db citation counts
db = pd.read_csv('literature_count_NEWAUG2018.csv', sep=',')
db.columns = ['recID', 'Citation_Difference']
# Citation_Difference is actually citation count but is used here for ease later when subtracting

# Index by recIDs
db_indexed = db.set_index('recID')

len(db_indexed)

1286536

In [5]:
# Load the core records list
core_list = pd.read_csv('inspire_core_list_NEWAUG2018.txt', sep='\n', header=None)

### Difference between Labs and DB

In [23]:
diff_labs_db = db_indexed - labs_indexed

In [24]:
diff_labs_db_valid = diff_labs_db[~diff_labs_db.Citation_Difference.isna()]
diff_labs_db_valid = diff_labs_db_valid.sort_values('Citation_Difference')

In [25]:
diff_labs_db_valid_zero_diff = diff_labs_db_valid[diff_labs_db_valid.Citation_Difference == 0]
print('{} records have 0 difference'.format(len(diff_labs_db_valid_zero_diff)))

694354 records have 0 difference


In [28]:
diff_labs_db_valid_abs_diff = diff_labs_db_valid[diff_labs_db_valid.Citation_Difference != 0]
diff_labs_db_valid_abs_diff = np.abs(diff_labs_db_valid)
diff_labs_db_valid_abs_diff = diff_labs_db_valid_abs_diff.sort_values('Citation_Difference', ascending=False)
print('{} records have some difference'.format(len(diff_labs_db_valid_abs_diff)))

769387 records have some difference


In [30]:
diff_labs_db_valid_lost_citations  = diff_labs_db_valid[diff_labs_db_valid.Citation_Difference < 0]
diff_labs_db_valid_lost_citations = diff_labs_db_valid_lost_citations.sort_values('Citation_Difference')
print('{} records have lost citations'.format(len(diff_labs_db_valid_lost_citations)))

51653 records have lost citations


In [31]:
diff_labs_db_valid_gained_citations  = diff_labs_db_valid[diff_labs_db_valid.Citation_Difference > 0]
diff_labs_db_valid_gained_citations = diff_labs_db_valid_gained_citations.sort_values('Citation_Difference', ascending=False)
print('{} records have gained citations'.format(len(diff_labs_db_valid_gained_citations)))

23380 records have gained citations


In [32]:
# Get the extended data frames
# Initialize extended DataFrames for the "invalid" differences
labs_db_abs_diff_ext = diff_labs_db_valid_abs_diff
labs_db_lost_citations_ext = diff_labs_db_valid_lost_citations
labs_db_gained_citations_ext = diff_labs_db_valid_gained_citations

# For absolute difference
labs_db_abs_diff_ext['DB_Count'] = db_indexed.loc[diff_labs_db_valid_abs_diff.index]
labs_db_abs_diff_ext['Labs_Count'] = labs_indexed.loc[diff_labs_db_valid_abs_diff.index]

# For lost citations
labs_db_lost_citations_ext['DB_Count'] = db_indexed.loc[diff_labs_db_valid_lost_citations.index]
labs_db_lost_citations_ext['Labs_Count'] = labs_indexed.loc[diff_labs_db_valid_lost_citations.index]

# For gained citations
labs_db_gained_citations_ext['DB_Count'] = db_indexed.loc[diff_labs_db_valid_gained_citations.index]
labs_db_gained_citations_ext['Labs_Count'] = labs_indexed.loc[diff_labs_db_valid_gained_citations.index]

##### Statistics

In [33]:
# Indicate Number of articles with zero differences in citation counts
total_unique_cited_articles = len(diff_labs_db_valid)
print('Union of the above two - Total unique cited articles: {}'.format(total_unique_cited_articles))

num_zero_diff = diff_labs_db_valid.Citation_Difference.value_counts()[0]
percent_zero_diff = num_zero_diff / total_unique_cited_articles * 100
print('Number of Cited articles with zero difference: {}'.format(num_zero_diff))
print('Percentage of Cited articles with zero difference: {0:0.2f} %'.format(percent_zero_diff))

Union of the above two - Total unique cited articles: 769387
Number of Cited articles with zero difference: 694354
Percentage of Cited articles with zero difference: 90.25 %


In [34]:
# Percentage Differences for absolute differences
abs_diff_counts = labs_db_abs_diff_ext.Citation_Difference.value_counts()
print('Total Number of article with changes in citation counts in Labs: {}'.format(len(labs_db_abs_diff_ext)))

print('\nNumber of articles with changes in Citation Counts in Labs')
print(abs_diff_counts[:5])

print('\nPercent (%) of Citation Counts Difference in Labs')
print(abs_diff_counts[:5] / total_unique_cited_articles * 100)

num_grt5 = sum(abs_diff_counts[6:])
percent_grt5 = num_grt5 / total_unique_cited_articles * 100
print('\nTotal {} records with difference greater than 5 - Percentage: {} %\n'.format(num_grt5, percent_grt5))

Total Number of article with changes in citation counts in Labs: 769387

Number of articles with changes in Citation Counts in Labs
0.0    694354
1.0     55896
2.0      9836
3.0      3544
4.0      1703
5.0       946
Name: Citation_Difference, dtype: int64

Percent (%) of Citation Counts Difference in Labs
0.0    90.247691
1.0     7.265004
2.0     1.278420
3.0     0.460626
4.0     0.221345
5.0     0.122955
Name: Citation_Difference, dtype: float64

Total 3108 records with difference greater than 5 - Percentage: 0.4039579561391082 %



In [35]:
# Percentage Differences for lost citations
lost_citations_count = labs_db_lost_citations_ext.Citation_Difference.value_counts()
print('Total Number of article with loss in citation counts in Labs: {}'.format(len(labs_db_lost_citations_ext)))

print('\nNumber of articles losing Citation Counts in Labs')
print(lost_citations_count[:-5])

lost_percent_diff = lost_citations_count/ total_unique_cited_articles * 100
print('\nPercent (%) of Citation Counts Lost in Labs')
print(lost_percent_diff[:-5])

lost_grt5 = sum(lost_citations_count[-6:])
lost_percent_grt5 = lost_grt5 / total_unique_cited_articles * 100
print('\nTotal {} records with lost citations greater than 5 - Percentage: {} %\n'.format(lost_grt5, lost_percent_grt5))

Total Number of article with loss in citation counts in Labs: 51653

Number of articles losing Citation Counts in Labs
-1.0    38165
-2.0     7301
-3.0     2574
-4.0     1190
-5.0      668
Name: Citation_Difference, dtype: int64

Percent (%) of Citation Counts Lost in Labs
-1.0    4.960443
-2.0    0.948937
-3.0    0.334552
-4.0    0.154669
-5.0    0.086822
Name: Citation_Difference, dtype: float64

Total 1755 records with lost citations greater than 5 - Percentage: 0.22810367214418753 %



In [36]:
# Percentage Differences for gained citations
gained_citations_count = labs_db_gained_citations_ext.Citation_Difference.value_counts()
print('Total Number of article with gain in citation counts in Labs: {}'.format(len(labs_db_gained_citations_ext)))

print('\nNumber of articles gaining Citation Counts in Labs')
print(gained_citations_count[:5])

gained_percent_diff = gained_citations_count/ total_unique_cited_articles * 100
print('\nPercent (%) of Citation Counts Gained in Labs')
print(gained_percent_diff[:5])


gained_grt5 = sum(gained_citations_count[6:])
gained_percent_grt5 = gained_grt5 / total_unique_cited_articles * 100
print('\nTotal {} records with gained citations greater than 5 - Percentage: {} %\n'.format(gained_grt5, gained_percent_grt5))

Total Number of article with gain in citation counts in Labs: 23380

Number of articles gaining Citation Counts in Labs
1.0    17731
2.0     2535
3.0      970
4.0      513
5.0      278
Name: Citation_Difference, dtype: int64

Percent (%) of Citation Counts Gained in Labs
1.0    2.304562
2.0    0.329483
3.0    0.126074
4.0    0.066676
5.0    0.036133
Name: Citation_Difference, dtype: float64

Total 1353 records with gained citations greater than 5 - Percentage: 0.17585428399492065 %



##### For Core articles

In [55]:
# For lost core records (in DB)
labs_db_core_articles_with_loss = np.intersect1d(diff_labs_db_valid_lost_citations.index, core_list[0].unique())
labs_db_core_articles_with_loss = labs_db_lost_citations_ext.loc[labs_db_core_articles_with_loss].sort_values('Citation_Difference')
len(labs_db_core_articles_with_loss)

31564

In [57]:
# For gained core records (in DB)
labs_db_core_articles_with_gain = np.intersect1d(diff_labs_db_valid_gained_citations.index, core_list[0].unique())
labs_db_core_articles_with_gain = labs_db_gained_citations_ext.loc[labs_db_core_articles_with_gain].sort_values('Citation_Difference', ascending=False)
len(labs_db_core_articles_with_gain)

14981

### Difference between Legacy and DB

In [48]:
diff_legacy_db = db_indexed - legacy_indexed

In [49]:
diff_legacy_db_valid = diff_legacy_db[~diff_legacy_db.Citation_Difference.isna()]
diff_legacy_db_valid = diff_legacy_db_valid.sort_values('Citation_Difference')

In [50]:
diff_legacy_db_valid_zero_diff = diff_legacy_db_valid[diff_legacy_db_valid.Citation_Difference == 0]
print('{} records have 0 difference'.format(len(diff_legacy_db_valid_zero_diff)))

692683 records have 0 difference


In [51]:
diff_legacy_db_valid_abs_diff = diff_legacy_db_valid[diff_legacy_db_valid.Citation_Difference != 0]
diff_legacy_db_valid_abs_diff = np.abs(diff_legacy_db_valid)
diff_legacy_db_valid_abs_diff = diff_legacy_db_valid_abs_diff.sort_values('Citation_Difference', ascending=False)
print('{} records have some difference'.format(len(diff_legacy_db_valid_abs_diff)))

775936 records have some difference


In [52]:
diff_legacy_db_valid_lost_citations  = diff_legacy_db_valid[diff_legacy_db_valid.Citation_Difference < 0]
diff_legacy_db_valid_lost_citations = diff_legacy_db_valid_lost_citations.sort_values('Citation_Difference')
print('{} records have lost citations'.format(len(diff_legacy_db_valid_lost_citations)))

68525 records have lost citations


In [53]:
diff_legacy_db_valid_gained_citations  = diff_legacy_db_valid[diff_legacy_db_valid.Citation_Difference > 0]
diff_legacy_db_valid_gained_citations = diff_legacy_db_valid_gained_citations.sort_values('Citation_Difference', ascending=False)
print('{} records have gained citations'.format(len(diff_legacy_db_valid_gained_citations)))

14728 records have gained citations


In [54]:
# Get the extended data frames
legacy_db_abs_diff_ext = diff_legacy_db_valid_abs_diff
legacy_db_lost_citations_ext = diff_legacy_db_valid_lost_citations
legacy_db_gained_citations_ext = diff_legacy_db_valid_gained_citations

# For absolute difference
legacy_db_abs_diff_ext['DB_Count'] = db_indexed.loc[diff_legacy_db_valid_abs_diff.index]
legacy_db_abs_diff_ext['Legacy_Count'] = labs_indexed.loc[diff_legacy_db_valid_abs_diff.index]

# For lost citations
legacy_db_lost_citations_ext['DB_Count'] = db_indexed.loc[diff_legacy_db_valid_lost_citations.index]
legacy_db_lost_citations_ext['Legacy_Count'] = legacy_indexed.loc[diff_legacy_db_valid_lost_citations.index]

# For gained citations
legacy_db_gained_citations_ext['DB_Count'] = db_indexed.loc[diff_legacy_db_valid_gained_citations.index]
legacy_db_gained_citations_ext['Legacy_Count'] = legacy_indexed.loc[diff_legacy_db_valid_gained_citations.index]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


##### Statistics

In [55]:
# Indicate Number of articles with zero differences in citation counts
total_unique_cited_articles = len(diff_legacy_db_valid)
print('Total unique cited articles: {}'.format(total_unique_cited_articles))

num_zero_diff = diff_legacy_db_valid.Citation_Difference.value_counts()[0]
percent_zero_diff = num_zero_diff / total_unique_cited_articles * 100
print('Number of Cited articles with zero difference: {}'.format(num_zero_diff))
print('Percentage of Cited articles with zero difference: {0:0.2f} %'.format(percent_zero_diff))

Total unique cited articles: 775936
Number of Cited articles with zero difference: 692683
Percentage of Cited articles with zero difference: 89.27 %


In [56]:
# Percentage Differences for absolute differences
abs_diff_counts = legacy_db_abs_diff_ext.Citation_Difference.value_counts()
print('Total Number of article with changes in citation counts in Labs: {}'.format(len(legacy_db_abs_diff_ext)))

print('\nNumber of articles with changes in Citation Counts in Labs')
print(abs_diff_counts[:5])

print('\nPercent (%) of Citation Counts Difference in Labs')
print(abs_diff_counts[:5] / total_unique_cited_articles * 100)

num_grt5 = sum(abs_diff_counts[6:])
percent_grt5 = num_grt5 / total_unique_cited_articles * 100
print('\nTotal {} records with difference greater than 5 - Percentage: {} %\n'.format(num_grt5, percent_grt5))

Total Number of article with changes in citation counts in Labs: 775936

Number of articles with changes in Citation Counts in Labs
0.0    692683
1.0     62950
2.0     11122
3.0      3696
4.0      1728
5.0       967
Name: Citation_Difference, dtype: int64

Percent (%) of Citation Counts Difference in Labs
0.0    89.270636
1.0     8.112782
2.0     1.433366
3.0     0.476328
4.0     0.222699
5.0     0.124624
Name: Citation_Difference, dtype: float64

Total 2790 records with difference greater than 5 - Percentage: 0.3595657373804025 %



In [57]:
# Percentage Differences for lost citations
lost_citations_count = legacy_db_lost_citations_ext.Citation_Difference.value_counts()
print('Total Number of article with loss in citation counts in Labs: {}'.format(len(legacy_db_lost_citations_ext)))

print('\nNumber of articles losing Citation Counts in Labs')
print(lost_citations_count[:-5])

lost_percent_diff = lost_citations_count/ total_unique_cited_articles * 100
print('\nPercent (%) of Citation Counts Lost in Labs')
print(lost_percent_diff[:-5])

lost_grt5 = sum(lost_citations_count[-6:])
lost_percent_grt5 = lost_grt5 / total_unique_cited_articles * 100
print('\nTotal {} records with lost citations greater than 5 - Percentage: {} %\n'.format(lost_grt5, lost_percent_grt5))

Total Number of article with loss in citation counts in Labs: 68525

Number of articles losing Citation Counts in Labs
-1.0    50893
-2.0     9730
-3.0     3192
-4.0     1481
-5.0      850
Name: Citation_Difference, dtype: int64

Percent (%) of Citation Counts Lost in Labs
-1.0    6.558917
-2.0    1.253969
-3.0    0.411374
-4.0    0.190866
-5.0    0.109545
Name: Citation_Difference, dtype: float64

Total 2379 records with lost citations greater than 5 - Percentage: 0.30659745133619265 %



In [58]:
# Percentage Differences for gained citations
gained_citations_count = legacy_db_gained_citations_ext.Citation_Difference.value_counts()
print('Total Number of article with gain in citation counts in Labs: {}'.format(len(legacy_db_gained_citations_ext)))

print('\nNumber of articles gaining Citation Counts in Labs')
print(gained_citations_count[:5])

gained_percent_diff = gained_citations_count/ total_unique_cited_articles * 100
print('\nPercent (%) of Citation Counts Gained in Labs')
print(gained_percent_diff[:5])


gained_grt5 = sum(gained_citations_count[6:])
gained_percent_grt5 = gained_grt5 / total_unique_cited_articles * 100
print('\nTotal {} records with gained citations greater than 5 - Percentage: {} %\n'.format(gained_grt5, gained_percent_grt5))

Total Number of article with gain in citation counts in Labs: 14728

Number of articles gaining Citation Counts in Labs
1.0    12057
2.0     1392
3.0      504
4.0      247
5.0      117
Name: Citation_Difference, dtype: int64

Percent (%) of Citation Counts Gained in Labs
1.0    1.553865
2.0    0.179396
3.0    0.064954
4.0    0.031833
5.0    0.015079
Name: Citation_Difference, dtype: float64

Total 411 records with gained citations greater than 5 - Percentage: 0.05296828604420983 %



##### For Core articles

In [65]:
# For lost core records (in DB)
legacy_db_core_articles_with_loss = np.intersect1d(diff_legacy_db_valid_lost_citations.index, core_list[0].unique())
legacy_db_core_articles_with_loss = legacy_db_lost_citations_ext.loc[legacy_db_core_articles_with_loss].sort_values('Citation_Difference')
len(legacy_db_core_articles_with_loss)

40935

In [66]:
# For gained core records (in DB)
legacy_db_core_articles_with_gain = np.intersect1d(diff_legacy_db_valid_gained_citations.index, core_list[0].unique())
legacy_db_core_articles_with_gain = legacy_db_gained_citations_ext.loc[legacy_db_core_articles_with_gain].sort_values('Citation_Difference', ascending=False)
len(legacy_db_core_articles_with_gain)

11512