In [98]:
import numpy as np
import pandas as pd
import os

pd.options.display.max_colwidth = 1000

In [99]:
csv_save_dir = 'output_csvs_new'
if not os.path.exists(csv_save_dir):
    os.makedirs(csv_save_dir)

In [3]:
%%time

# Read information for the new records
records = pd.read_csv('new-citations-rerun.tsv', sep='\t', header=None)
records.columns = ['recID', 'oldRef', 'newRef', 'metaData']

print('Total Records: {}'.format(len(records)))

Total Records: 30230936
CPU times: user 2min 39s, sys: 8.49 s, total: 2min 47s
Wall time: 2min 50s


In [100]:
# Read the new citation counts
labs = pd.read_csv('new-citation-counts-rerun.tsv', sep='\t', header=None)
labs.columns = ['recID', 'Citation_Difference']

# Index by recIDs
labs_indexed = labs.set_index('recID')

In [101]:
# Read the citation counts before update
labs_old = pd.read_csv('new-citation-counts.tsv', sep='\t', header=None)
labs_old.columns = ['recID', 'Citation_Difference']

# Index by recIDs
labs_indexed_old = labs_old.set_index('recID')

In [102]:
# Read the legacy citation counts
legacy = pd.read_csv('old-citation-counts.tsv', sep='\t')
legacy.columns = ['recID', 'Citation_Difference']

# Index by recrod IDs
legacy_indexed = legacy.set_index('recID')

In [103]:
# Compute the difference in citation counts between Labs and Legacy
diff = labs_indexed - legacy_indexed
diff_old = labs_indexed_old - legacy_indexed

In [104]:
# Filter out the NaNs
is_nan = diff.Citation_Difference.isna()
NaN_entries = diff[is_nan]

print('{} articles are differently indexed between the legacy and labs records and have no correspondence'.format(len(NaN_entries)))

11434 articles are differently indexed between the legacy and labs records and have no correspondence


In [105]:
# Filter out the NaNs (old)
is_nan_old = diff_old.Citation_Difference.isna()
NaN_entries_old = diff_old[is_nan_old]

print('{} articles are differently indexed between the legacy and labs (old run) records and have no correspondence'.format(len(NaN_entries_old)))

6032 articles are differently indexed between the legacy and labs (old run) records and have no correspondence


In [106]:
# Filter the valid entries
is_valid = ~diff.Citation_Difference.isna()
valid_diff = diff[is_valid]

is_valid_old = ~diff_old.Citation_Difference.isna()
valid_diff_old = diff_old[is_valid_old]

In [107]:
# Get the articles which have lost citations
lost = valid_diff.Citation_Difference < 0
lost_citations = valid_diff[lost]
print('Total {} articles lost citations in the Labs algorithm, with maximum lost count being {}'.format(
    len(lost_citations), lost_citations.min()))

# Sort the list
lost_citations = lost_citations.sort_values('Citation_Difference')

Total 50426 articles lost citations in the Labs algorithm, with maximum lost count being Citation_Difference   -1148.0
dtype: float64


In [108]:
# Get the articles which have lost citations (old)
lost_old = valid_diff_old.Citation_Difference < 0
lost_citations_old = valid_diff_old[lost_old]
print('Total {} articles lost citations in the old Labs algorithm, with maximum lost count being {}'.format(
    len(lost_citations_old), lost_citations_old.min()))

# Sort the list
lost_citations_old = lost_citations_old.sort_values('Citation_Difference')

Total 46256 articles lost citations in the old Labs algorithm, with maximum lost count being Citation_Difference   -1023.0
dtype: float64


In [109]:
# Get the articles which have gained citations
gained = valid_diff.Citation_Difference > 0
gained_citations = valid_diff[gained]
print('Total {} articles gained citations in the Labs algorithm, with maximum gained count being {}'.format(
    len(gained_citations), gained_citations.max()))

# Sort the list
gained_citations = gained_citations.sort_values('Citation_Difference', ascending=False)

Total 17141 articles gained citations in the Labs algorithm, with maximum gained count being Citation_Difference    267.0
dtype: float64


In [110]:
# Get the articles which have gained citations (old)
gained_old = valid_diff_old.Citation_Difference > 0
gained_citations_old = valid_diff_old[gained_old]
print('Total {} articles gained citations in the old Labs algorithm, with maximum gained count being {}'.format(
    len(gained_citations_old), gained_citations_old.max()))

# Sort the list
gained_citations_old = gained_citations_old.sort_values('Citation_Difference', ascending=False)

Total 19416 articles gained citations in the old Labs algorithm, with maximum gained count being Citation_Difference    408.0
dtype: float64


In [111]:
# Get the counts for absolute differences in citation numbers
abs_diff = np.abs(valid_diff)
abs_diff = abs_diff.sort_values('Citation_Difference', ascending=False)

# Filter out the zero difference entries
non_zero_diff = abs_diff.Citation_Difference != 0
abs_diff = abs_diff[non_zero_diff]

In [112]:
# Get the counts for absolute differences in citation numbers (old)
abs_diff_old = np.abs(valid_diff_old)
abs_diff_old = abs_diff_old.sort_values('Citation_Difference', ascending=False)

# Filter out the zero difference entries
non_zero_diff_old = abs_diff_old.Citation_Difference != 0
abs_diff_old = abs_diff_old[non_zero_diff_old]

In [113]:
# Initialize extended DataFrames for the differences
lost_citations_ext = lost_citations
gained_citations_ext = gained_citations
abs_diff_ext = abs_diff

# For lost citations
lost_citations_ext['Legacy_Count'] = legacy_indexed.loc[lost_citations.index]
lost_citations_ext['Labs_Count'] = labs_indexed.loc[lost_citations.index]

# For gained citations
gained_citations_ext['Legacy_Count'] = legacy_indexed.loc[gained_citations.index]
gained_citations_ext['Labs_Count'] = labs_indexed.loc[gained_citations.index]

# For absolute difference in citation counts
abs_diff_ext['Legacy_Count'] = legacy_indexed.loc[abs_diff.index]
abs_diff_ext['Labs_Count'] = labs_indexed.loc[abs_diff.index]

In [114]:
# Initialize extended DataFrames for the differences (old)
lost_citations_ext_old = lost_citations_old
gained_citations_ext_old = gained_citations_old
abs_diff_ext_old = abs_diff_old

# For lost citations
lost_citations_ext_old['Legacy_Count'] = legacy_indexed.loc[lost_citations_old.index]
lost_citations_ext_old['Labs_Count'] = labs_indexed_old.loc[lost_citations_old.index]

# For gained citations
gained_citations_ext_old['Legacy_Count'] = legacy_indexed.loc[gained_citations_old.index]
gained_citations_ext_old['Labs_Count'] = labs_indexed_old.loc[gained_citations_old.index]

# For absolute difference in citation counts
abs_diff_ext_old['Legacy_Count'] = legacy_indexed.loc[abs_diff_old.index]
abs_diff_ext_old['Labs_Count'] = labs_indexed_old.loc[abs_diff_old.index]

In [115]:
# Save the citation difference DataFrames to CSV Files
lost_citations_ext.to_csv(os.path.join(csv_save_dir, 'lost_citations.csv'))
gained_citations_ext.to_csv(os.path.join(csv_save_dir, 'gained_citations.csv'))
abs_diff_ext.to_csv(os.path.join(csv_save_dir, 'overall_difference.csv'))

### Some useful statistics

In [116]:
# Indicate Number of articles with zero differences in citation counts
num_cited_legacy = len(legacy.index.unique())
num_cited_labs = len(labs.index.unique())
num_unique_articles = len(records.recID.unique())
print('Total number of unique Cited articles as per legacy: {}'.format(num_cited_legacy))
print('Total number of unique Cited articles as per Labs: {}'.format(num_cited_labs))
print('Difference in Number between the Legacy and Labs Records: {}'.format(num_cited_legacy - num_cited_labs))
print('\nTotal Number of Unique Articles which cite other papers: {}\n'.format(num_unique_articles))

num_zero_diff = valid_diff.Citation_Difference.value_counts()[0]
zero_diff = num_zero_diff / num_cited_legacy * 100
print('Number of Cited articles with zero difference: {}'.format(num_zero_diff))
print('Percentage of Cited articles with zero difference: {0:0.2f} %'.format(zero_diff))

Total number of unique Cited articles as per legacy: 760203
Total number of unique Cited articles as per Labs: 753443
Difference in Number between the Legacy and Labs Records: 6760

Total Number of Unique Articles which cite other papers: 942172

Number of Cited articles with zero difference: 683539
Percentage of Cited articles with zero difference: 89.92 %


In [117]:
# Indicate Number of articles with zero differences in citation counts (old)
num_cited_legacy_old = len(legacy.index.unique())
num_cited_labs_old = len(labs_old.index.unique())
num_unique_articles_old = len(records.recID.unique())
print('Total number of unique Cited articles as per legacy: {}'.format(num_cited_legacy_old))
print('Total number of unique Cited articles as per Labs: {}'.format(num_cited_labs_old))
print('Difference in Number between the Legacy and Labs Records: {}'.format(num_cited_legacy_old - num_cited_labs_old))
print('\nTotal Number of Unique Articles which cite other papers: {}\n'.format(num_unique_articles_old))

num_zero_diff_old = valid_diff_old.Citation_Difference.value_counts()[0]
zero_diff_old = num_zero_diff_old / num_cited_legacy_old * 100
print('Number of Cited articles with zero difference: {}'.format(num_zero_diff_old))
print('Percentage of Cited articles with zero difference: {0:0.2f} %'.format(zero_diff_old))

Total number of unique Cited articles as per legacy: 760203
Total number of unique Cited articles as per Labs: 759481
Difference in Number between the Legacy and Labs Records: 722

Total Number of Unique Articles which cite other papers: 942172

Number of Cited articles with zero difference: 691154
Percentage of Cited articles with zero difference: 90.92 %


In [118]:
# Percentage Differences for absolute differences
abs_diff_counts = abs_diff.Citation_Difference.value_counts()
print('Percent (%) of Citation Counts Difference in Labs')
print(abs_diff_counts[:5] / num_cited_legacy * 100)

num_grt5 = sum(abs_diff_counts[6:])
percent_grt5 = num_grt5 / num_cited_legacy * 100
print('\nTotal {} records with difference greater than 5 - Percentage: {} %\n'.format(num_grt5, percent_grt5))

Percent (%) of Citation Counts Difference in Labs
1.0    7.017994
2.0    1.024989
3.0    0.321493
4.0    0.149960
5.0    0.084451
Name: Citation_Difference, dtype: float64

Total 2198 records with difference greater than 5 - Percentage: 0.28913329729032905 %



In [119]:
# Percentage Differences for absolute differences (old)
abs_diff_counts_old = abs_diff_old.Citation_Difference.value_counts()
print('Percent (%) of Citation Counts Difference in Labs')
print(abs_diff_counts_old[:5] / num_cited_legacy_old * 100)

num_grt5_old = sum(abs_diff_counts_old[6:])
percent_grt5_old = num_grt5_old / num_cited_legacy_old * 100
print('\nTotal {} records with difference greater than 5 - Percentage: {} %\n'.format(num_grt5_old, percent_grt5_old))

Percent (%) of Citation Counts Difference in Labs
1.0    6.652828
2.0    0.992498
3.0    0.330307
4.0    0.168508
5.0    0.100499
Name: Citation_Difference, dtype: float64

Total 2996 records with difference greater than 5 - Percentage: 0.3941052587269453 %



In [120]:
# Percentage Differences for lost citations
lost_citations_count = lost_citations.Citation_Difference.value_counts()
lost_percent_diff = lost_citations_count/ num_cited_legacy * 100
print('Percent (%) of Citation Counts Lost in Labs')
print(lost_percent_diff[:-5])

lost_grt5 = sum(lost_citations_count[-6:])
lost_percent_grt5 = lost_grt5 / num_cited_legacy * 100
print('\nTotal {} records with lost citations greater than 5 - Percentage: {} %\n'.format(lost_grt5, lost_percent_grt5))

Percent (%) of Citation Counts Lost in Labs
-1.0    5.254912
-2.0    0.782160
-3.0    0.227439
-4.0    0.101552
-5.0    0.055906
Name: Citation_Difference, dtype: float64

Total 1606 records with lost citations greater than 5 - Percentage: 0.21125936098647335 %



In [121]:
# Percentage Differences for lost citations (old)
lost_citations_count_old = lost_citations_old.Citation_Difference.value_counts()
lost_percent_diff_old = lost_citations_count_old/ num_cited_legacy_old * 100
print('Percent (%) of Citation Counts Lost in Labs')
print(lost_percent_diff_old[:-5])

lost_grt5_old = sum(lost_citations_count_old[-6:])
lost_percent_grt5_old = lost_grt5_old / num_cited_legacy_old * 100
print('\nTotal {} records with lost citations greater than 5 - Percentage: {} %\n'.format(lost_grt5_old, lost_percent_grt5_old))

Percent (%) of Citation Counts Lost in Labs
-1.0    4.818450
-2.0    0.701655
-3.0    0.201788
-4.0    0.091949
-5.0    0.050381
Name: Citation_Difference, dtype: float64

Total 1676 records with lost citations greater than 5 - Percentage: 0.22046742777915898 %



In [122]:
# Percentage Differences for gained citations
gained_citations_count = gained_citations.Citation_Difference.value_counts()
gained_percent_diff = gained_citations_count/ num_cited_legacy * 100
print('Percent (%) of Citation Counts Gained in Labs')
print(gained_percent_diff[:5])

gained_grt5 = sum(gained_citations_count[6:])
gained_percent_grt5 = gained_grt5 / num_cited_legacy * 100
print('\nTotal {} records with gained citations greater than 5 - Percentage: {} %\n'.format(gained_grt5, gained_percent_grt5))

Percent (%) of Citation Counts Gained in Labs
1.0    1.763082
2.0    0.242830
3.0    0.094054
4.0    0.048408
5.0    0.028545
Name: Citation_Difference, dtype: float64

Total 592 records with gained citations greater than 5 - Percentage: 0.07787393630385568 %



In [123]:
# Percentage Differences for gained citations (old)
gained_citations_count_old = gained_citations_old.Citation_Difference.value_counts()
gained_percent_diff_old = gained_citations_count_old/ num_cited_legacy_old * 100
print('Percent (%) of Citation Counts Gained in Labs')
print(gained_percent_diff_old[:5])

gained_grt5_old = sum(gained_citations_count_old[6:])
gained_percent_grt5_old = gained_grt5_old / num_cited_legacy_old * 100
print('\nTotal {} records with gained citations greater than 5 - Percentage: {} %\n'.format(gained_grt5_old, gained_percent_grt5_old))

Percent (%) of Citation Counts Gained in Labs
1.0    1.834378
2.0    0.290843
3.0    0.128518
4.0    0.076558
5.0    0.050118
Name: Citation_Difference, dtype: float64

Total 1320 records with gained citations greater than 5 - Percentage: 0.17363783094778631 %



### Further checking functionality

In [124]:
# Get all the records in both oldRef and newRef which are unique
old_unique = pd.DataFrame(records.oldRef.unique())
new_unique = pd.DataFrame(records.newRef.unique())

In [125]:
# Get the lists of records which are in Labs but not Legacy and vice-versa
old_unique_diff = np.isin(old_unique, new_unique, assume_unique=True)
new_unique_diff = np.isin(new_unique, old_unique, assume_unique=True)

labs_only = new_unique[~new_unique_diff]
legacy_only = old_unique[~old_unique_diff]

In [126]:
# Import the records as CSV
legacy_only.to_csv(os.path.join(csv_save_dir, 'legacy_only.csv'), index=False)
labs_only.to_csv(os.path.join(csv_save_dir, 'labs_only.csv'), index=False)