In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Let displayed cells show up to 1000 characters per column value
pd.set_option('display.max_colwidth', 1000)

In [12]:
# Labs citation counts: tab-separated file read without a header row.
# The count column is named 'Citation_Difference' (although it holds the plain
# citation count) so that it lines up with the same-named column of the other
# sources when the frames are subtracted later.
labs = pd.read_csv(
    'new-citation-counts-NEWAUG2018.tsv',
    sep='\t',
    header=None,
)
labs.columns = ['recID', 'Citation_Difference']

# Key the table on the record id
labs_indexed = labs.set_index(keys='recID')
labs_indexed.shape[0]

769457

In [13]:
# Legacy citation counts: comma-separated file with three columns, read with
# pandas' default header handling and then relabelled with the names below.
# The count column is named 'Citation_Difference' (although it holds the plain
# citation count) so that it lines up with the other sources when subtracting.
legacy_raw = pd.read_csv('old-citation-counts-AUG2018.csv', sep=',')
legacy_raw.columns = ['recID', 'Citation_Difference', 'Timestamp']

# The timestamp is not needed for the comparison
legacy = legacy_raw.drop(columns=['Timestamp'])

# Key the table on the record id
legacy_indexed = legacy.set_index(keys='recID')
legacy_indexed.shape[0]

776108

In [14]:
# DB citation counts: comma-separated file with two columns, read with pandas'
# default header handling and then relabelled with the names below.
# The count column is named 'Citation_Difference' (although it holds the plain
# citation count) so that it lines up with the other sources when subtracting.
db = pd.read_csv(
    'literature_count_NEWAUG2018.csv',
    sep=',',
)
db.columns = ['recID', 'Citation_Difference']

# Key the table on the record id
db_indexed = db.set_index(keys='recID')

db_indexed.shape[0]

1286536

In [39]:
# Load the core records list (read without a header row; the ids end up in column 0).
# The original call passed sep='\n', which recent pandas releases reject with a
# ValueError because a line terminator cannot be used as the field separator.
# With the default separator a one-value-per-line file still parses into the same
# single column labelled 0.
# NOTE(review): assumes no line of the file contains a comma -- confirm.
core_list = pd.read_csv('inspire_core_list_NEWAUG2018.txt', header=None)

### Difference between Labs and DB

In [15]:
# DB count minus Labs count, aligned on recID (rows are matched by index label)
diff_labs_db = db_indexed.sub(labs_indexed)

In [22]:
# Keep only the rows where a difference could actually be computed (non-NaN),
# ordered from the most negative difference upwards.
diff_labs_db_valid = (
    diff_labs_db
    .loc[diff_labs_db['Citation_Difference'].notna()]
    .sort_values(by='Citation_Difference')
)

In [26]:
# Records whose DB and Labs counts agree exactly
diff_labs_db_valid_zero_diff = diff_labs_db_valid.loc[
    diff_labs_db_valid['Citation_Difference'].eq(0)
]
print('{} records have 0 difference'.format(diff_labs_db_valid_zero_diff.shape[0]))

694354 records have 0 difference


In [33]:
# Negative difference: the DB count is lower than the Labs count.
# Sorted ascending, so the largest losses come first.
diff_labs_db_valid_lost_citations = (
    diff_labs_db_valid
    .loc[diff_labs_db_valid['Citation_Difference'].lt(0)]
    .sort_values(by='Citation_Difference')
)
print('{} records have lost citations'.format(diff_labs_db_valid_lost_citations.shape[0]))

51653 records have lost citations


In [35]:
# Positive difference: the DB count is higher than the Labs count.
# Sorted descending, so the largest gains come first.
diff_labs_db_valid_gained_citations = (
    diff_labs_db_valid
    .loc[diff_labs_db_valid['Citation_Difference'].gt(0)]
    .sort_values(by='Citation_Difference', ascending=False)
)
print('{} records have gained citations'.format(diff_labs_db_valid_gained_citations.shape[0]))

23380 records have gained citations


In [52]:
# Build the extended DataFrames: each valid (non-NaN) difference alongside the
# DB count and the Labs count of the same record.
#
# .assign() returns new frames, so the diff_labs_db_valid_* frames are left
# untouched. (A plain `ext = diff` only creates a second name for the same
# object, and adding columns through it would modify the upstream frame too.)
# The count column is selected explicitly so that a Series, rather than a
# one-column DataFrame, is stored in each new column.

# For lost citations
labs_db_lost_citations_ext = diff_labs_db_valid_lost_citations.assign(
    DB_Count=db_indexed.loc[diff_labs_db_valid_lost_citations.index, 'Citation_Difference'],
    Labs_Count=labs_indexed.loc[diff_labs_db_valid_lost_citations.index, 'Citation_Difference'],
)

# For gained citations
labs_db_gained_citations_ext = diff_labs_db_valid_gained_citations.assign(
    DB_Count=db_indexed.loc[diff_labs_db_valid_gained_citations.index, 'Citation_Difference'],
    Labs_Count=labs_indexed.loc[diff_labs_db_valid_gained_citations.index, 'Citation_Difference'],
)

In [55]:
# Core records (ids taken from column 0 of core_list) whose DB count is below
# the Labs count, largest loss first.
labs_db_core_lost_ids = np.intersect1d(
    diff_labs_db_valid_lost_citations.index,
    core_list[0].unique(),
)
labs_db_core_articles_with_loss = (
    labs_db_lost_citations_ext
    .loc[labs_db_core_lost_ids]
    .sort_values(by='Citation_Difference')
)
labs_db_core_articles_with_loss.shape[0]

31564

In [57]:
# Core records (ids taken from column 0 of core_list) whose DB count is above
# the Labs count, largest gain first.
labs_db_core_gained_ids = np.intersect1d(
    diff_labs_db_valid_gained_citations.index,
    core_list[0].unique(),
)
labs_db_core_articles_with_gain = (
    labs_db_gained_citations_ext
    .loc[labs_db_core_gained_ids]
    .sort_values(by='Citation_Difference', ascending=False)
)
labs_db_core_articles_with_gain.shape[0]

14981

### Difference between Legacy and DB

In [59]:
# DB count minus Legacy count, aligned on recID (rows are matched by index label)
diff_legacy_db = db_indexed.sub(legacy_indexed)

In [60]:
# Keep only the rows where a difference could actually be computed (non-NaN),
# ordered from the most negative difference upwards.
diff_legacy_db_valid = (
    diff_legacy_db
    .loc[diff_legacy_db['Citation_Difference'].notna()]
    .sort_values(by='Citation_Difference')
)

In [61]:
# Records whose DB and Legacy counts agree exactly
diff_legacy_db_valid_zero_diff = diff_legacy_db_valid.loc[
    diff_legacy_db_valid['Citation_Difference'].eq(0)
]
print('{} records have 0 difference'.format(diff_legacy_db_valid_zero_diff.shape[0]))

692683 records have 0 difference


In [62]:
# Negative difference: the DB count is lower than the Legacy count.
# Sorted ascending, so the largest losses come first.
diff_legacy_db_valid_lost_citations = (
    diff_legacy_db_valid
    .loc[diff_legacy_db_valid['Citation_Difference'].lt(0)]
    .sort_values(by='Citation_Difference')
)
print('{} records have lost citations'.format(diff_legacy_db_valid_lost_citations.shape[0]))

68525 records have lost citations


In [63]:
# Positive difference: the DB count is higher than the Legacy count.
# Sorted descending, so the largest gains come first.
diff_legacy_db_valid_gained_citations = (
    diff_legacy_db_valid
    .loc[diff_legacy_db_valid['Citation_Difference'].gt(0)]
    .sort_values(by='Citation_Difference', ascending=False)
)
print('{} records have gained citations'.format(diff_legacy_db_valid_gained_citations.shape[0]))

14728 records have gained citations


In [64]:
# Build the extended DataFrames: each valid (non-NaN) difference alongside the
# DB count and the Legacy count of the same record.
#
# .assign() returns new frames, so the diff_legacy_db_valid_* frames are left
# untouched. (A plain `ext = diff` only creates a second name for the same
# object, and adding columns through it would modify the upstream frame too.)
# The count column is selected explicitly so that a Series, rather than a
# one-column DataFrame, is stored in each new column.

# For lost citations
legacy_db_lost_citations_ext = diff_legacy_db_valid_lost_citations.assign(
    DB_Count=db_indexed.loc[diff_legacy_db_valid_lost_citations.index, 'Citation_Difference'],
    Legacy_Count=legacy_indexed.loc[diff_legacy_db_valid_lost_citations.index, 'Citation_Difference'],
)

# For gained citations
legacy_db_gained_citations_ext = diff_legacy_db_valid_gained_citations.assign(
    DB_Count=db_indexed.loc[diff_legacy_db_valid_gained_citations.index, 'Citation_Difference'],
    Legacy_Count=legacy_indexed.loc[diff_legacy_db_valid_gained_citations.index, 'Citation_Difference'],
)

In [65]:
# Core records (ids taken from column 0 of core_list) whose DB count is below
# the Legacy count, largest loss first.
legacy_db_core_lost_ids = np.intersect1d(
    diff_legacy_db_valid_lost_citations.index,
    core_list[0].unique(),
)
legacy_db_core_articles_with_loss = (
    legacy_db_lost_citations_ext
    .loc[legacy_db_core_lost_ids]
    .sort_values(by='Citation_Difference')
)
legacy_db_core_articles_with_loss.shape[0]

40935

In [66]:
# Core records (ids taken from column 0 of core_list) whose DB count is above
# the Legacy count, largest gain first.
legacy_db_core_gained_ids = np.intersect1d(
    diff_legacy_db_valid_gained_citations.index,
    core_list[0].unique(),
)
legacy_db_core_articles_with_gain = (
    legacy_db_gained_citations_ext
    .loc[legacy_db_core_gained_ids]
    .sort_values(by='Citation_Difference', ascending=False)
)
legacy_db_core_articles_with_gain.shape[0]

11512