In [1]:
import os
import requests
import json
import csv
import pandas as pd
from time import sleep
import numpy as np
from pybliometrics.scopus import CitationOverview, AuthorRetrieval, AbstractRetrieval
from pybliometrics.scopus.utils import config
config['Authentication'] ['APIKey'] = "127f5bf5de20d338f686704a9a328b86"
print(config['Authentication'] ['APIKey'])

127f5bf5de20d338f686704a9a328b86


## constants
publication_cutoff_year refers to the year where publications during and after will be counted

start_year refers to the first year that the publication count will be calculated (first year to be graphed)

end_year refers to the last year that the publication will be calculated (last year to be graphed)


In [2]:
INPUT_PATH_TO_CSV = "authorlist comparisons/scival_outputs/scival_ids_authorlist_official.csv"
OUTPUT_PATH_TO_CSV = "sv_authorlist_publications_updated4.csv"
OUTPUT_PATH_TO_CLEANED_CSV = "scival_authorlist_citations_nodupes.csv"

PUBLICATION_CUTOFF_YEAR = 0 #only publications DURING and AFTER this year will be counted
START_YEAR = 2018
END_YEAR = 2023

In [3]:
YEARS_INCLUDED = END_YEAR - START_YEAR + 1 #year range count, inclusive

# prepare columns of csv output
# output_cols = ["Name", "Scopus ID", "Document Count", "Cited by", "Cited by 5 years", "Publications", "Coauthors", "Affiliation"]
output_cols = ["Name", "Scopus ID", "Document Count", "Cited by", "Coauthors", "Affiliation"]

### helper functions

In [4]:
def get_all_citations(documents):
    '''return yearly citation data for all documents for the author'''
    yearly_citations_data = []

    #scopus ID is everything after 2nd hyphen for the eid
    scopus_id_list = [document.eid.split("-",2)[2] for document in documents]
    print(scopus_id_list)

    pages = int(len(documents)/25)
    
    if pages == 0:
        print(f"gathering citations")
        co = CitationOverview(scopus_id_list, start=START_YEAR, end=END_YEAR)
        yearly_citations_data = co.cc
    elif len(documents)%25 == 0:
        # urls = []
        for page in range(pages):
            print(f"gathering citations for page {page}/{pages-1}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
    else:
        # urls = []
        for page in range(pages+1):
            print(f"gathering citations for page {page}/{pages}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
        
    return yearly_citations_data

In [5]:
def remove_duplicate_pubs(out_arr):
    eids = []
    new_arr = []
    for i in range(len(out_arr)):
        current_eid = out_arr[i][2]
        if current_eid in eids:
            pass
        else:
            new_arr.append(out_arr[i])
            eids.append(current_eid)
    print("all publications searched")
    return new_arr, eids

# def remove_duplicate_pubs(out_dict):
#     eids = []
#     for i in range(len(out_dict[output_cols[0]])):
#         current_eid = out_dict["eid"][i]
#         if current_eid in eids:
#             #remove everything in that row
#             for key in out_dict.keys():
#                 # print("duplicate found")
#                 out_dict[key].pop(i)
#         else:
#             eids.append(current_eid)
#     print("all publications searched")
#     return out_dict, eids

In [92]:
def get_document_coauthors(document, author_id, ids_names_dict, coauthor_dict, publications=[], pub_dict={}, author_name =""):
    '''document: pybliometrics document object
    author_id: id of author to look for coauthors with
    ids_names_dict: dictionary where key is id and value is author name
    coauthor_dict: dictionary of the document's coauthors'''
    # publications.append(document.title)
    #for the SMS method

    #get all author IDs related to paper
    coauthors_list = document.author_ids.split(";")
    # if author_id in coauthors_list: 
    #     coauthors_list.remove(author_id)
    #filter for coauthors on list
    matching_coauthors = list(set(coauthors_list).intersection(ids_names_dict.keys())) 
    
    for id in matching_coauthors:
        coauthor_name = ids_names_dict[id]

        if (document.title not in pub_dict.keys()) or (coauthor_name not in pub_dict[document.title]): 
            pub_dict.setdefault(document.title,[]).append(coauthor_name)

        if id == author_id: 
            continue #do not add the author to their own coauthor dict
        
        try:
            coauthor_dict[coauthor_name] = coauthor_dict[coauthor_name] + 1
        except KeyError:
            coauthor_dict[coauthor_name] = 1
    return coauthor_dict,pub_dict

In [93]:
def fill_article_rows(author_details:tuple, documents, dict):
    '''author_details are (lastname, firstname, author ID)'''
    for document in documents:
        dict[output_cols[0]].append(document.title) #title
        dict[output_cols[1]].append(document.eid) #eid
        dict[output_cols[2]].append(document.coverDate.split("-")[0]) #publication date
        dict[output_cols[3]].append(document.author_names) #author names
        dict[output_cols[4]].append(author_details[0]) #last name
        dict[output_cols[5]].append(author_details[1]) #first name
        dict[output_cols[6]].append(author_details[2]) #author ID

## start here
1. read the input array that has a column containing author IDs

In [94]:
member_arr = []
with open(INPUT_PATH_TO_CSV, "r") as f:
    r = csv.reader(f)
    for row in r:
        if row[0] != "":
            member_arr.append(row)

# skip header rows
member_arr = member_arr[1:]
print(member_arr[0])
print(len(member_arr))

['Murphy', 'Timothy H.', '7401632487']
46


2. filter for only the IDs
- member_arr_filtered is a 2D array; each row is a new member and each col is that member's information.

In [95]:
#get members who have ids filled
id_col_idx = 2
lastname_col_idx = 0
firstname_col_idx = 1
print("first row:", member_arr[0])
try:
    member_arr_filtered = [row for row in member_arr if row[id_col_idx] != "" and int(row[id_col_idx])]
except ValueError:
    raise ValueError("make sure the index of the column ID from member array is valid!")
print("number of authors in list:",len(member_arr_filtered))

#subj search in query: 320 

first row: ['Murphy', 'Timothy H.', '7401632487']
number of authors in list: 46


In [96]:
member_ids = [row[id_col_idx] for row in member_arr]
member_names = [f"{member_arr_filtered[i][firstname_col_idx]} {member_arr_filtered[i][lastname_col_idx]}" for i in range(len(member_ids))]
name_id_dict = dict(zip(member_ids, member_names))
print(name_id_dict)

{'7401632487': 'Timothy H. Murphy', '55491054000': 'Annie Vogel Ciernia', '7006717711': 'Brian A. MacVicar', '24279161600': 'Fidel Vila-Rodriguez', '7005627810': 'Shernaz X. Bamji', '7101895712': 'Lara A. Boyd', '7004159655': 'Paul Pavlidis', '7005375626': 'Martin McKeown', '7004896118': 'A. Jon Stoessl', '6603629883': 'P. A. Cripton', '28167863900': 'Jason S. Snyder', '7006262846': 'Wolfram G. Tetzlaff', '7401658611': 'Anthony G. Phillips', '7005018303': 'Catharine A. Winstanley', '8908521300': 'YuTian Wang', '7003907622': 'Jeremy Keith Seamans', '35408131500': 'Terrance P. Snutch', '16407421900': 'Ian Mackenzie', '7102737545': 'Lynn A. Raymond', '55554765000': 'Kurt Haas', '37036981000': 'Mark S. Cembrowski', '7401756510': 'Fabio Rossi', '55889287500': 'Angela Jane I. Roskams', '7007032941': 'Catharine Rankin', '7402800449': 'Michael D. Gordon', '7103067039': 'Leonid Sigal', '11241026700': 'Z. Jane Wang', '6701751588': 'Peyman Servati', '7004079043': 'Liisa A. M. Galea', '7004549374'

3. get all the documents from the list of author IDs

In [103]:
output_dict = {f"{col_name}":[] for col_name in output_cols}
all_citations = [] #all citation per year information will be stored here
pub_authors = {}
orig_authors = {}

for member in member_arr_filtered:
    sleep(1)
    
    #create variables of interest
    lastname = member[lastname_col_idx]
    firstname = member[firstname_col_idx]
    scopus_id = member[id_col_idx]
    
    author_name = f"{firstname} {lastname}"
    coauthor_count = {}

    print(f"looking up {firstname} {lastname} ({scopus_id})...")
    ar = AuthorRetrieval(scopus_id)
    # author_name = f"{ar.given_name} {ar.surname}"
    documents = ar.get_documents()

    #get documents that are older than PUBLICATION CUTOFF YEAR
    adjusted_docs = [document for document in documents if int(document.coverDate.split("-")[0]) >= PUBLICATION_CUTOFF_YEAR]
    print(f"found {len(adjusted_docs)} publications between {PUBLICATION_CUTOFF_YEAR} and now")

    if len(adjusted_docs) == 0:
        print("no documents found in year range")
        continue
    else:
        coauthor_dict = {}
        for document in documents:
            #SMS no duplicates coauthor method
            if (document.title not in pub_authors.keys()) or (author_name not in pub_authors[document.title]): 
                pub_authors.setdefault(document.title,[]).append(author_name)
            
            # #SMS coauthor method
            # pub_authors.setdefault(document.title,[]).append(author_name)

            coauthor_dict, orig_authors = get_document_coauthors(document, scopus_id, name_id_dict, coauthor_dict, pub_dict = orig_authors, author_name=author_name)
        
        # output_cols = ["Name", "Scopus ID", "Document Count", "Cited by", "Coauthors", "Affiliation"]
        output_dict[output_cols[0]].append(author_name)
        output_dict[output_cols[1]].append(scopus_id)
        output_dict[output_cols[2]].append(ar.document_count)
        output_dict[output_cols[3]].append(ar.citation_count)
        output_dict[output_cols[4]].append(coauthor_dict)
        output_dict[output_cols[5]].append(ar.affiliation_current[0].preferred_name)
        
        # get all citations data
        # citations_data = get_all_citations(adjusted_docs)
        # author_citations = [[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in citations_data]
        # all_citations = all_citations + author_citations #all citations keeps track of all the citations in the whole document
        
        # author_details = (lastname, firstname, mem_id)
        # fill_article_rows(author_details, adjusted_docs, output_dict)
        print("done\n")


looking up Timothy H. Murphy (7401632487)...
found 185 publications between 0 and now
done

looking up Annie Vogel Ciernia (55491054000)...
found 28 publications between 0 and now
done

looking up Brian A. MacVicar (7006717711)...
found 167 publications between 0 and now
done

looking up Fidel Vila-Rodriguez (24279161600)...
found 161 publications between 0 and now
done

looking up Shernaz X. Bamji (7005627810)...
found 44 publications between 0 and now
done

looking up Lara A. Boyd (7101895712)...
found 217 publications between 0 and now
done

looking up Paul Pavlidis (7004159655)...
found 158 publications between 0 and now
done

looking up Martin McKeown (7005375626)...
found 260 publications between 0 and now
done

looking up A. Jon Stoessl (7004896118)...
found 305 publications between 0 and now
done

looking up P. A. Cripton (6603629883)...
found 179 publications between 0 and now
done

looking up Jason S. Snyder (28167863900)...
found 37 publications between 0 and now
done

looki

In [104]:
collabs_dict={}
for key in pub_authors:
    for author in pub_authors[key]:
        for coauthor in pub_authors[key]:
            if coauthor is not author:
                if author not in collabs_dict.keys() or coauthor not in collabs_dict[author].keys():
                    collabs_dict.setdefault(author,{})[coauthor]=1
                else:
                    collabs_dict[author][coauthor]+=1

coauthored_pubs = [
    title
    for title, authors in pub_authors.items() 
    if len(authors) > 1]

coauthored_authors = [
    authors
    for title, authors in pub_authors.items() 
    if len(authors) > 1]

coauthored_df = pd.DataFrame({"Title": coauthored_pubs, "SMS Coauthors": coauthored_authors})

print(coauthored_df.head())

coauthored_df.to_csv("sv_SMS_pubs_nodupes.csv",index=False)

                                               Title  \
0  Chronic multiscale resolution of mouse brain n...   
1  Water-Reaching Platform for Longitudinal Asses...   
2  Multiscale imaging informs translational mouse...   
3  Towards a Visualizable, De-identified Syntheti...   
4  Altered cortical processing of sensory input i...   

                           SMS Coauthors  
0   [Timothy H. Murphy, Lynn A. Raymond]  
1   [Timothy H. Murphy, Lynn A. Raymond]  
2  [Timothy H. Murphy, Jeffrey M. LeDue]  
3      [Timothy H. Murphy, Helge Rhodin]  
4   [Timothy H. Murphy, Lynn A. Raymond]  


In [105]:
collabs_orig_dict={}
for key in orig_authors:
    for author in orig_authors[key]:
        for coauthor in orig_authors[key]:
            if coauthor is not author:
                if author not in collabs_orig_dict.keys() or coauthor not in collabs_orig_dict[author].keys():
                    collabs_orig_dict.setdefault(author,{})[coauthor]=1
                else:
                    collabs_orig_dict[author][coauthor]+=1

coauthored_orig_pubs = [
    title
    for title, authors in orig_authors.items() 
    if len(authors) > 1]

coauthored_orig_authors = [
    authors
    for title, authors in orig_authors.items() 
    if len(authors) > 1]

coauthored_orig_df = pd.DataFrame({"Title": coauthored_orig_pubs, "Direct Coauthors": coauthored_orig_authors})

print(coauthored_orig_df.head())

coauthored_orig_df.to_csv("sv_orig_pubs.csv",index=False)

                                               Title  \
0  Chronic multiscale resolution of mouse brain n...   
1  Water-Reaching Platform for Longitudinal Asses...   
2  Multiscale imaging informs translational mouse...   
3  Towards a Visualizable, De-identified Syntheti...   
4  Altered cortical processing of sensory input i...   

                        Direct Coauthors  
0   [Lynn A. Raymond, Timothy H. Murphy]  
1   [Lynn A. Raymond, Timothy H. Murphy]  
2  [Jeffrey M. LeDue, Timothy H. Murphy]  
3      [Helge Rhodin, Timothy H. Murphy]  
4   [Lynn A. Raymond, Timothy H. Murphy]  


In [106]:
print([title for title in coauthored_pubs if title not in coauthored_orig_pubs]) #SMS-original
print([title for title in coauthored_orig_pubs if title not in coauthored_pubs]) #original-SMS
# SMS is not capturing because..... Pannexin-1 opening in neuronal edema causes cell death but also leads to protection via increased microglia contacts 
# SMS captures very basic titles that may have multiple authors

[]
['Pannexin-1 opening in neuronal edema causes cell death but also leads to protection via increased microglia contacts']


In [81]:
# Write to rows dataframe
coauthors_new = [{}] * len(output_dict['Name'])

for author, coauthors in collabs_dict.items():
    try:
        author_idx = output_dict['Name'].index(author)
        coauthors_new[author_idx] = coauthors
    except ValueError:
        pass

# for author, coauthors in coauthors_new.items():
#     if coauthors == 0:
#         coauthors_new[author].update({author: {}})

print(coauthors_new)
output_dict["SMS Coauthors"] = coauthors_new

[{'Lynn A. Raymond': 17, 'Jeffrey M. LeDue': 25, 'Helge Rhodin': 5, 'Brian A. MacVicar': 2, 'YuTian Wang': 4, 'Wolfram G. Tetzlaff': 2, 'Craig E. Brown': 6, 'Andy Y. Shih': 9, 'Terrance P. Snutch': 1}, {}, {'Timothy H. Murphy': 2, 'Jeffrey M. LeDue': 10, 'Silke Appel-Cresswell': 1, 'Terrance P. Snutch': 5, 'Leigh Anne Swayne': 1, 'Shernaz X. Bamji': 1, 'YuTian Wang': 2, 'Anthony G. Phillips': 1}, {'Sophia Frangou': 5, 'Ian Mackenzie': 1, 'Jason S. Snyder': 2, 'Silke Appel-Cresswell': 1, 'Z. Jane Wang': 1}, {'Brian A. MacVicar': 1, 'YuTian Wang': 1, 'Kurt Haas': 3, 'Lynn A. Raymond': 2, 'Paul Pavlidis': 1, 'Catharine Rankin': 1, 'Terrance P. Snutch': 1, 'Anthony G. Phillips': 1, 'Ian Mackenzie': 1}, {'Liisa A. M. Galea': 1, 'Martin McKeown': 2, 'A. Jon Stoessl': 1, 'Silke Appel-Cresswell': 1, 'Vesna Sossi': 1, 'Todd Stephen Woodward': 2}, {'Shernaz X. Bamji': 1, 'Kurt Haas': 3, 'Catharine Rankin': 3, 'Jeffrey M. LeDue': 1}, {'Lara A. Boyd': 2, 'A. Jon Stoessl': 16, 'Silke Appel-Cresswel

4. format the citations information to append to dictionary, then export as csv

In [82]:

df = pd.DataFrame(output_dict)
df.to_csv(OUTPUT_PATH_TO_CSV)

In [None]:
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")
    
citations_array = np.array(all_citations)
citations_array = citations_array.T


title: 4195
eid: 4195
publication year: 4195
authors: 4195
last name: 4195
first name: 4195
member id: 4195


In [None]:

# check that all the columns have the same length
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

# add citations of each year to the citations dictionary
for year in range(YEARS_INCLUDED):
    year_col = START_YEAR + year
    output_dict[str(year_col)] = citations_array[year].tolist()

df = pd.DataFrame(output_dict)
df.to_csv(OUTPUT_PATH_TO_CSV)

title: 4195
eid: 4195
publication year: 4195
authors: 4195
last name: 4195
first name: 4195
member id: 4195
2018: 4195
2019: 4195
2020: 4195
2021: 4195
2022: 4195
2023: 4195


4. remove duplicates

In [47]:
member_arr_filled = []
with open(OUTPUT_PATH_TO_CSV, "r") as f:
    r = csv.reader(f)
    for row in r:
        if row[0] != "":
            member_arr_filled.append(row)

# skip header rows
citations_header = member_arr_filled[0]
member_arr_filled = member_arr_filled[1:]

print("initial number of documents:",len(member_arr_filled))

# remove and store duplicates
no_dupes_arr, eid = remove_duplicate_pubs(member_arr_filled)

print("number of documents after duplicates are removed:",len(no_dupes_arr))

with open(OUTPUT_PATH_TO_CLEANED_CSV, 'w+', newline='') as file:
    mywriter = csv.writer(file, delimiter=',')
    mywriter.writerow(citations_header)
    mywriter.writerows(no_dupes_arr)

initial number of documents: 4194
all publications searched
number of documents after duplicates are removed: 3361


In [36]:
identifier = "7401632487"
ar = AuthorRetrieval(identifier)

In [38]:
print(ar.citation_count)

author_dict = {}
author_dict["Citation Count"] =ar.citation_count
print(author_dict)



14731
{'Citation Count': 14731}


In [50]:
pubs = ar.get_documents()

In [53]:
print(pubs[0].author_ids.split(";"))

['57217703996', '57204943506', '57211750582', '57211956459', '7401632487']


In [40]:
documents = ar.get_coauthors()