In [2]:
import os
import requests
import json
import csv
import pandas as pd
from time import sleep
import numpy as np
from pybliometrics.scopus import CitationOverview, AuthorRetrieval, AbstractRetrieval
from pybliometrics.scopus.utils import config
config['Authentication'] ['APIKey'] = "127f5bf5de20d338f686704a9a328b86"
print(config['Authentication'] ['APIKey'])

127f5bf5de20d338f686704a9a328b86


## constants
publication_cutoff_year refers to the year where publications during and after will be counted

start_year refers to the first year that the publication count will be calculated (first year to be graphed)

end_year refers to the last year that the publication will be calculated (last year to be graphed)


In [3]:
INPUT_PATH_TO_CSV = "scival_outputs/author_lookup_output.csv"
OUTPUT_PATH_TO_CSV = "scival_dmcbh_citations.csv"
OUTPUT_PATH_TO_CLEANED_CSV = "scival_dmcbh_citations_nodupes.csv"

PUBLICATION_CUTOFF_YEAR = 0 #only publications DURING and AFTER this year will be counted
START_YEAR = 2018
END_YEAR = 2023

In [4]:
YEARS_INCLUDED = END_YEAR - START_YEAR + 1 #year range count, inclusive

# prepare columns of csv output
output_cols = ["Name", "Scopus ID", "Document Count", "Cited by", "Cited by 5 years", "Publications", "Coauthors", "Affiliation"]

### helper functions

In [5]:
def get_all_citations(documents):
    '''return yearly citation data for all documents for the author'''
    yearly_citations_data = []

    #scopus ID is everything after 2nd hyphen for the eid
    scopus_id_list = [document.eid.split("-",2)[2] for document in documents]
    print(scopus_id_list)

    pages = int(len(documents)/25)
    
    if pages == 0:
        print(f"gathering citations")
        co = CitationOverview(scopus_id_list, start=START_YEAR, end=END_YEAR)
        yearly_citations_data = co.cc
    elif len(documents)%25 == 0:
        # urls = []
        for page in range(pages):
            print(f"gathering citations for page {page}/{pages-1}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
    else:
        # urls = []
        for page in range(pages+1):
            print(f"gathering citations for page {page}/{pages}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
        
    return yearly_citations_data

In [6]:
def remove_duplicate_pubs(out_arr):
    eids = []
    new_arr = []
    for i in range(len(out_arr)):
        current_eid = out_arr[i][2]
        if current_eid in eids:
            pass
        else:
            new_arr.append(out_arr[i])
            eids.append(current_eid)
    print("all publications searched")
    return new_arr, eids

# def remove_duplicate_pubs(out_dict):
#     eids = []
#     for i in range(len(out_dict[output_cols[0]])):
#         current_eid = out_dict["eid"][i]
#         if current_eid in eids:
#             #remove everything in that row
#             for key in out_dict.keys():
#                 # print("duplicate found")
#                 out_dict[key].pop(i)
#         else:
#             eids.append(current_eid)
#     print("all publications searched")
#     return out_dict, eids

In [7]:
def get_document_coauthors(document, author_id, ids_names_dict, coauthor_dict, publications=[]):
    '''document: pybliometrics document object
    author_id: id of author to look for coauthors with
    ids_names_dict: dictionary where key is id and value is author name
    coauthor_dict: dictionary of the document's coauthors'''
    # publications.append(document.title)
    coauthors_list = document.author_ids.split(";")
    if author_id in coauthors_list: coauthors_list.remove(author_id)
    matching_coauthors = list(set(coauthors_list).intersection(ids_names_dict.keys()))
    for id in matching_coauthors:
        coauthor_name = ids_names_dict[id]
        try:
            coauthor_dict[coauthor_name] = coauthor_dict[id] + 1
        except KeyError:
            coauthor_dict[coauthor_name] = 1
    return coauthor_dict

In [8]:
def fill_article_rows(author_details:tuple, documents, dict):
    '''author_details are (lastname, firstname, author ID)'''
    for document in documents:
        dict[output_cols[0]].append(document.title) #title
        dict[output_cols[1]].append(document.eid) #eid
        dict[output_cols[2]].append(document.coverDate.split("-")[0]) #publication date
        dict[output_cols[3]].append(document.author_names) #author names
        dict[output_cols[4]].append(author_details[0]) #last name
        dict[output_cols[5]].append(author_details[1]) #first name
        dict[output_cols[6]].append(author_details[2]) #author ID

## start here
1. read the input array that has a column containing author IDs

In [12]:
member_arr = []
with open(INPUT_PATH_TO_CSV, "r") as f:
    r = csv.reader(f)
    for row in r:
        if row[0] != "":
            member_arr.append(row)

# skip header rows
member_arr = member_arr[1:]
print(len(member_arr))

128


2. filter for only the IDs
- member_arr_filtered is a 2D array; each row is a new member and each col is that member's information.

In [14]:
#get members who have ids filled
id_col_idx = 3
lastname_col_idx = 0
firstname_col_idx = 1
print("first row:", member_arr[0])
try:
    member_arr_filtered = [row for row in member_arr if row[id_col_idx] != "" and int(row[id_col_idx])]
except ValueError:
    raise ValueError("make sure the index of the column ID from member array is valid!")
print("first row:", member_arr_filtered[0])
print("number of authors in list:",len(member_arr_filtered))

#subj search in query: 320 

first row: ['Abd-Elrahman', 'Khaled', 'University of Ottawa', '36653563400', '0000-0001-9724-1975', 'Learning Memory & Dementias']
first row: ['Abd-Elrahman', 'Khaled', 'University of Ottawa', '36653563400', '0000-0001-9724-1975', 'Learning Memory & Dementias']
number of authors in list: 128


In [15]:
member_ids = [row[id_col_idx] for row in member_arr]
member_names = [f"{member_arr_filtered[i][firstname_col_idx]} {member_arr_filtered[i][lastname_col_idx]}" for i in range(len(member_ids))]
name_id_dict = dict(zip(member_ids, member_names))
print(name_id_dict)

{'36653563400': 'Khaled Abd-Elrahman', '7201584872': 'Douglas Allan', '7004622834': 'Vanessa Auld', '7402093250': 'Jehannine Austin', '6506673405': 'Shelina Babul', '7005627810': 'Shernaz Bamji', '7402080727': 'Phil Barker', '57020811700': 'Steven Barnes', '35474239200': 'Alasdair Barr', '57026954300': 'Jason Barton', '7102979717': 'Clare Beasley', '7101895712': 'Lara Boyd', '16169124200': 'Neil Cashman', '37036981000': 'Mark Cembrowski', '56521302100': 'Trisha Chakrabarty', '55491054000': 'Annie Ciernia', '56009858300': 'Luke Clark', '7201507943': 'Ann-Marie Craig', '36237882900': 'Silke Cresswell', '6603629883': 'Peter Cripton', '35576627400': 'Max Cynader', '7004268181': "Ryan D'Arcy", '7102727063': 'Adele Diamond', '7005866700': 'Doris Doudet', '23004179900': 'Lauren Emberson', '8042510100': 'Thalia Field', '6701799130': 'Stan Floresco', '7004625596': 'Dean Foti', '7004549374': 'Sophia Frangou', '7004079043': 'Liisa Galea', '6603734160': 'Debbie Giaschi', '57191035886': 'Julien Gib

3. get all the documents from the list of author IDs

In [17]:

output_dict = {f"{col_name}":[] for col_name in output_cols}
all_citations = [] #all citation per year information will be stored here

names = []
ids = []
coauthors = []

for member in member_arr_filtered:
    sleep(1)
    #create variables of interest
    
    
    lastname = member[lastname_col_idx]
    firstname = member[firstname_col_idx]
    scopus_id = member[id_col_idx]
    
    author_name = f"{firstname} {lastname}"
    coauthor_count = {}

    print(f"looking up {firstname} {lastname} ({scopus_id})...")
    documents = AuthorRetrieval(scopus_id).get_documents()

    #get documents that are older than PUBLICATION CUTOFF YEAR
    adjusted_docs = [document for document in documents if int(document.coverDate.split("-")[0]) >= PUBLICATION_CUTOFF_YEAR]
    print(f"found {len(adjusted_docs)} publications between {PUBLICATION_CUTOFF_YEAR} and now")

    if len(adjusted_docs) == 0:
        print("no documents found in year range")
        continue
    else:
        coauthor_dict = {}
        for document in documents:
            coauthor_dict = get_document_coauthors(document, scopus_id, name_id_dict, coauthor_dict)
        names.append(author_name)
        ids.append(scopus_id)
        coauthors.append(coauthor_dict)
        # get all citations data
        # citations_data = get_all_citations(adjusted_docs)
        # author_citations = [[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in citations_data]
        # all_citations = all_citations + author_citations #all citations keeps track of all the citations in the whole document
        
        # author_details = (lastname, firstname, mem_id)
        # fill_article_rows(author_details, adjusted_docs, output_dict)
        print("done\n")


looking up Khaled Abd-Elrahman (36653563400)...
found 34 publications between 0 and now
done

looking up Douglas Allan (7201584872)...
found 44 publications between 0 and now
done

looking up Vanessa Auld (7004622834)...
found 48 publications between 0 and now
done

looking up Jehannine Austin (7402093250)...
found 152 publications between 0 and now
done

looking up Shelina Babul (6506673405)...
found 53 publications between 0 and now
done

looking up Shernaz Bamji (7005627810)...
found 44 publications between 0 and now
done

looking up Phil Barker (7402080727)...
found 119 publications between 0 and now
done

looking up Steven Barnes (57020811700)...
found 29 publications between 0 and now
done

looking up Alasdair Barr (35474239200)...
found 218 publications between 0 and now
done

looking up Jason Barton (57026954300)...
found 2 publications between 0 and now
done

looking up Clare Beasley (7102979717)...
found 49 publications between 0 and now
done

looking up Lara Boyd (7101895712

4. format the citations information to append to dictionary, then export as csv

In [70]:
out_dict = {
    "Name":names,
    "Scopus ID":ids,
    "Coauthors":coauthors,
}

df = pd.DataFrame(out_dict)
df.to_csv(OUTPUT_PATH_TO_CSV)

In [37]:
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")
    
citations_array = np.array(all_citations)
citations_array = citations_array.T


title: 4195
eid: 4195
publication year: 4195
authors: 4195
last name: 4195
first name: 4195
member id: 4195


In [44]:

# check that all the columns have the same length
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

# add citations of each year to the citations dictionary
for year in range(YEARS_INCLUDED):
    year_col = START_YEAR + year
    output_dict[str(year_col)] = citations_array[year].tolist()

df = pd.DataFrame(output_dict)
df.to_csv(OUTPUT_PATH_TO_CSV)

title: 4195
eid: 4195
publication year: 4195
authors: 4195
last name: 4195
first name: 4195
member id: 4195
2018: 4195
2019: 4195
2020: 4195
2021: 4195
2022: 4195
2023: 4195


4. remove duplicates

In [47]:
member_arr_filled = []
with open(OUTPUT_PATH_TO_CSV, "r") as f:
    r = csv.reader(f)
    for row in r:
        if row[0] != "":
            member_arr_filled.append(row)

# skip header rows
citations_header = member_arr_filled[0]
member_arr_filled = member_arr_filled[1:]

print("initial number of documents:",len(member_arr_filled))

# remove and store duplicates
no_dupes_arr, eid = remove_duplicate_pubs(member_arr_filled)

print("number of documents after duplicates are removed:",len(no_dupes_arr))

with open(OUTPUT_PATH_TO_CLEANED_CSV, 'w+', newline='') as file:
    mywriter = csv.writer(file, delimiter=',')
    mywriter.writerow(citations_header)
    mywriter.writerows(no_dupes_arr)

initial number of documents: 4194
all publications searched
number of documents after duplicates are removed: 3361


In [36]:
identifier = "7401632487"
ar = AuthorRetrieval(identifier)

In [38]:
print(ar.citation_count)

author_dict = {}
author_dict["Citation Count"] =ar.citation_count
print(author_dict)



14731
{'Citation Count': 14731}


In [50]:
pubs = ar.get_documents()

In [53]:
print(pubs[0].author_ids.split(";"))

['57217703996', '57204943506', '57211750582', '57211956459', '7401632487']


In [40]:
documents = ar.get_coauthors()