### About
This notebook retrieves citation counts and other metrics from Dimensions. If the data was loaded from WoS, you can choose to use the WoS citation counts ('TC' or 'Z9') instead and skip this step. Dimensions tends to have better coverage of citation counts.

In [14]:
from datetime import datetime as dt

# Announce this pipeline step and record when it started; t_start is kept
# at module level so the elapsed run time can be reported later.
banner = '------------------------------------------------------'
print(banner, 'Step 3:  Add citation data', sep='\n')
t_start = dt.now()
print(t_start, banner, sep='\n')

------------------------------------------------------
Step 3:  Add citation data
2018-02-16 14:02:20.901055
------------------------------------------------------


In [15]:
from config import Config as c

# import dependencies
import pandas as pd
import numpy as np
import os
import pickle
from bs4 import BeautifulSoup as bs
# for querying the Dimensions Metrics API
import requests
import json

# inputs
# datapath = c.datapath
dois_pkl = c.dois_pkl
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point dois_pkl at a file you trust.
# The context manager guarantees the file handle is closed after loading.
with open(dois_pkl, 'rb') as f:
    dois = pickle.load(f)
print('Adding citation data for ',len(dois),' articles')

Adding citation data for  7710  articles


In [16]:
# Paths come from the project config; the working dataset is read with its
# first CSV column used as the row index.
filepaths_pkl = c.filepaths_pkl
working_data = c.working_data
df = pd.read_csv(filepath_or_buffer=working_data, index_col=0)

In [17]:
dim_data_p = 'data/dimensions_metrics.json'
dim_failures_p = 'data/dimensions_failures.json'


def _load_json_cache(path, empty):
    """Read a JSON cache file, (re)creating it from ``empty`` when unusable.

    Parameters
    ----------
    path : str
        Location of the JSON cache file.
    empty : dict or list
        Value to start from, and to write to ``path``, when no usable cache
        exists. The loaded JSON must be of the same container type.

    Returns
    -------
    tuple
        ``(data, found)``; ``found`` is False when a new cache file was written.
    """
    try:
        with open(path, 'rb') as f:
            data = json.loads(f.read())
        if not isinstance(data, type(empty)):
            raise ValueError('unexpected JSON structure in ' + path)
        return data, True
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError: invalid JSON
        # (json.JSONDecodeError is a ValueError) or the wrong container type.
        # Anything else, e.g. KeyboardInterrupt, is deliberately not caught.
        cache_dir = os.path.dirname(path)
        if cache_dir:
            # Allow a first run in a checkout that has no cache directory yet.
            os.makedirs(cache_dir, exist_ok=True)
        with open(path, 'w') as f:
            json.dump(empty, f)
        return empty, False


print('Reading Dimensions data from cache')
dim_data, cache_found = _load_json_cache(dim_data_p, {})
if not cache_found:
    print('Cache not found.')
    print('Creating new cache.')
# DOIs that already have cached metrics.
successes = list(dim_data)

# DOIs recorded as failed by earlier runs.
failures = _load_json_cache(dim_failures_p, [])[0]

Reading Dimensions data from cache


In [18]:
#define api call
def get_dimensions(doi, dim_data, timeout=30):
    """Return the Dimensions metrics for ``doi``, using ``dim_data`` as a cache.

    Parameters
    ----------
    doi : str
        DOI to look up; it is appended verbatim to the API base URL.
    dim_data : dict
        Cache mapping DOI -> previously fetched metrics. It is updated in
        place, but only after a successful fetch.
    timeout : float, optional
        Seconds to wait for the API before ``requests`` gives up, so one
        stalled connection cannot hang a long batch run.

    Returns
    -------
    tuple
        ``(metrics, dim_data)`` where ``metrics`` is the decoded JSON payload
        (or the cached value) and ``dim_data`` is the same cache object.

    Raises
    ------
    requests.exceptions.RequestException
        On network failure, timeout, or an HTTP error status.
    Exception
        Whatever ``Response.json()`` raises when the body is not valid JSON.
    """
    if doi in dim_data:
        return dim_data[doi], dim_data

    base= 'http://metrics-api.dimensions.ai/doi/'
    r = requests.get(base+doi, timeout=timeout)
    # Fail before caching so an HTTP error body is never stored as metrics.
    r.raise_for_status()
    out = r.json()
    dim_data[doi] = out
    return out, dim_data

In [19]:
# example
# get_dimensions('10.1007/s00401-012-1028-y',{})

In [20]:
# d holds one list per Dimensions metric. Every DOI contributes exactly one
# entry to every list (NaN on failure), so position k in each list lines up
# with position k in dois.

d = {'field_citation_ratio': [],
 'highly_cited_1': [],
 'highly_cited_10': [],
 'highly_cited_5': [],
 'recent_citations': [],
 'relative_citation_ratio': [],
 'times_cited': []}

print('Retrieving data from Dimension Metrics API')
for i, doi in enumerate(dois):
    if i%100==0:
        print('Data retrieved for: ',i,'/',len(dois),' articles')
    try:
        doi_dim_data, dim_data = get_dimensions(doi, dim_data)
        # Build the complete row before touching d: if any metric is missing,
        # nothing has been appended yet and the lists keep equal lengths.
        row = [doi_dim_data[dim] for dim in d]
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops a long retrieval run.
        print('Error. Recording data for {} as NaN.'.format(doi))
        row = [np.nan] * len(d)
        if doi not in failures:
            failures.append(doi)
    for dim, value in zip(d, row):
        d[dim].append(value)

Retrieving data from Dimension Metrics API
Data retrieved for:  0 / 7710  articles
Data retrieved for:  100 / 7710  articles
Data retrieved for:  200 / 7710  articles
Data retrieved for:  300 / 7710  articles
Data retrieved for:  400 / 7710  articles
Data retrieved for:  500 / 7710  articles
Data retrieved for:  600 / 7710  articles
Data retrieved for:  700 / 7710  articles
Data retrieved for:  800 / 7710  articles
Data retrieved for:  900 / 7710  articles
Data retrieved for:  1000 / 7710  articles
Data retrieved for:  1100 / 7710  articles
Data retrieved for:  1200 / 7710  articles
Data retrieved for:  1300 / 7710  articles
Data retrieved for:  1400 / 7710  articles
Data retrieved for:  1500 / 7710  articles
Data retrieved for:  1600 / 7710  articles
Data retrieved for:  1700 / 7710  articles
Data retrieved for:  1800 / 7710  articles
Data retrieved for:  1900 / 7710  articles
Data retrieved for:  2000 / 7710  articles
Error. Recording data for 10.1523/JNEUROSCI.3579-13.014 as NaN.
Da

In [21]:
# Number of DOIs for which no complete set of metrics could be recorded.
n_failures = len(failures)
print('Total no of failures: ', n_failures)

Total no of failures:  45


In [22]:
# Persist both caches so that the next run can skip DOIs already looked up.
print('Writing Dimensions data to file.')
for cache_path, payload in ((dim_data_p, dim_data), (dim_failures_p, failures)):
    with open(cache_path, 'w+') as f:
        json.dump(payload, f)

Writing Dimensions data to file.


Construct dataframe

In [23]:
import pandas as pd

# Add the identifying columns (the DOI and a resolver link built from it)
# to the metric lists, then turn the whole dict into a dataframe.
doi_url_prefix = r'http://dx.doi.org/'
links = [doi_url_prefix + doi for doi in dois]
d.update({'DOI': dois, 'Link': links})
df_dims = pd.DataFrame(data=d)

In [24]:
# If the working data already contains columns from an earlier Dimensions
# merge, drop them so that the merge below does not create '_x'/'_y'
# duplicates. 'DOI' and 'Link' are included because df_dims carries them too.
# errors='ignore' drops whichever of these columns exist and skips the rest,
# so a partially populated dataset is still cleaned.
dims_columns = ['field_citation_ratio','highly_cited_1',
                'highly_cited_10','highly_cited_5','recent_citations',
                'relative_citation_ratio','times_cited',
                'DOI','Link']
df = df.drop(dims_columns, axis=1, errors='ignore')

In [25]:
print('Combining Dimensions citation metrics with existing dataset')
# Left join keeps every row of the working data; rows match where the
# working data's 'DI' value equals the Dimensions 'DOI' value.
rows_before = len(df.index)
df = pd.merge(df, df_dims, how='left', left_on='DI', right_on='DOI', copy=False)
rows_after = len(df.index)
row_count_changed = rows_after != rows_before
if row_count_changed:
    print('Warning - merging Dimensions data is adding/losing rows')

Combining Dimensions citation metrics with existing dataset


In [26]:
# Select which count becomes the 'Citations' column. Options:
#   TC           - WoS Core Collection
#   Z9           - all WoS data
#   times_cited  - Dimensions citation count (which seems to be best)
# Other Dimensions columns (RCR, FCR, etc.) can be used here as well.

citation_source = df.times_cited
df['Citations'] = citation_source

In [27]:
# Save the raw Dimensions table on its own, then overwrite the working
# dataset with the merged result.
df_dims.to_csv(path_or_buf='data/Dimensions_data.csv')
df.to_csv(path_or_buf=c.working_data)

In [28]:
# Elapsed wall-clock time since t_start was recorded.
elapsed = dt.now() - t_start
print("Done in ", elapsed)

Done in  0:00:04.900712
