In [27]:
import os
import requests
import json
import csv
import pandas as pd
from time import sleep
import numpy as np
from pybliometrics.scopus import CitationOverview, AuthorRetrieval, AbstractRetrieval
from pybliometrics.scopus.utils import config
config['Authentication']['APIKey'] = "2e5922c95a0e4f1fb73cede25eca779a"

## constants
publication_cutoff_year refers to the year where publications during and after will be counted

start_year refers to the first year that the publication count will be calculated (first year to be graphed)

end_year refers to the last year that the publication will be calculated (last year to be graphed)


In [43]:
INPUT_PATH_TO_CSV = "in/manual_affils_check_test.csv"
OUTPUT_PATH_TO_CSV = "out/citations_per_year_PSYT_1.csv"
OUTPUT_PATH_TO_CLEANED_CSV = "out/citations_per_year_PSYT_1_cleaned.csv"

PUBLICATION_CUTOFF_YEAR = 2013 #only publications DURING and AFTER this year will be counted
START_YEAR = 2018
END_YEAR = 2023

In [30]:
YEARS_INCLUDED = END_YEAR - START_YEAR + 1 #year range count, inclusive

# prepare columns of csv output
output_cols = ["title", "eid", "publication year", "authors", "last name", "first name", "member id"]

### helper functions

In [31]:
def get_all_citations(documents):
    '''return yearly citation data for all documents in a 2d array with rows of documents and cols of years'''
    yearly_citations_data = []

    #scopus ID is everything after 2nd hyphen for the eid
    scopus_id_list = [document.eid.split("-",2)[2] for document in documents]
    print(scopus_id_list)

    pages = int(len(documents)/25)
    
    if pages == 0:
        print(f"gathering citations")
        co = CitationOverview(scopus_id_list, start=START_YEAR, end=END_YEAR)
        yearly_citations_data = co.cc
    elif len(documents)%25 == 0:
        # urls = []
        for page in range(pages):
            print(f"gathering citations for page {page}/{pages-1}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
    else:
        # urls = []
        for page in range(pages+1):
            print(f"gathering citations for page {page}/{pages}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
        
    return yearly_citations_data

In [32]:
def remove_duplicate_pubs(out_arr):
    eids = []
    new_arr = []
    for i in range(len(out_arr)):
        current_eid = out_arr[i][2]
        if current_eid in eids:
            pass
        else:
            new_arr.append(out_arr[i])
            eids.append(current_eid)
    print("all publications searched")
    return new_arr, eids

# def remove_duplicate_pubs(out_dict):
#     eids = []
#     for i in range(len(out_dict[output_cols[0]])):
#         current_eid = out_dict["eid"][i]
#         if current_eid in eids:
#             #remove everything in that row
#             for key in out_dict.keys():
#                 # print("duplicate found")
#                 out_dict[key].pop(i)
#         else:
#             eids.append(current_eid)
#     print("all publications searched")
#     return out_dict, eids

In [33]:
def fill_article_rows(author_details:tuple, documents, dict):
    '''author_details are (lastname, firstname, author ID)'''
    for document in documents:
        dict[output_cols[0]].append(document.title) #title
        dict[output_cols[1]].append(document.eid) #eid
        dict[output_cols[2]].append(document.coverDate.split("-")[0]) #publication date
        dict[output_cols[3]].append(document.author_names) #author names
        dict[output_cols[4]].append(author_details[0]) #last name
        dict[output_cols[5]].append(author_details[1]) #first name
        dict[output_cols[6]].append(author_details[2]) #author ID

## start here
1. read the input array that has a column containing author IDs

In [34]:
member_arr = []
with open(INPUT_PATH_TO_CSV, "r") as f:
    r = csv.reader(f)
    for row in r:
        if row[0] != "":
            member_arr.append(row)

# skip header rows
member_arr = member_arr[1:]

2. filter for only the IDs
- member_arr_filtered is a 2D array; each row is a new member and each col is that member's information.

In [35]:
#get members who have ids filled
id_col_idx = 4
try:
    member_arr_filtered = [row for row in member_arr if row[id_col_idx] != "" and int(row[id_col_idx])]
except ValueError:
    raise ValueError("make sure the index of the column ID from member array is valid!")
print("first row:", member_arr_filtered[0])
print("number of authors in list:",len(member_arr_filtered))

#subj search in query: 320 

first row: ['21', 'Anderson', 'Cameron M.', 'The University of British Columbia', '55453388200', '', 'NEUR (3); MEDI (1)', '']
number of authors in list: 292


3. get all the documents from the list of author IDs

In [36]:
output_dict = {f"{col_name}":[] for col_name in output_cols}
all_citations = [] #all citation per year information will be stored here

for member in member_arr_filtered:
    sleep(1)
    lastname = member[1]
    firstname = member[2]
    mem_id = member[4]

    print(f"looking up {firstname} {lastname} ({mem_id})...")
    documents = AuthorRetrieval(mem_id).get_documents()

    #get documents that are older than PUBLICATION CUTOFF YEAR
    adjusted_docs = [document for document in documents if int(document.coverDate.split("-")[0]) >= PUBLICATION_CUTOFF_YEAR]
    print(f"found {len(adjusted_docs)} publications between {PUBLICATION_CUTOFF_YEAR} and now")

    if len(adjusted_docs) == 0:
        print("no documents found in year range")
        continue
    else:
        # get all citations data
        citations_data = get_all_citations(adjusted_docs)
        author_citations = [[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in citations_data]
        all_citations = all_citations + author_citations #all citations keeps track of all the citations in the whole document
        
        author_details = (lastname, firstname, mem_id)
        fill_article_rows(author_details, adjusted_docs, output_dict)
        print("done\n")


looking up Cameron M. Anderson (55453388200)...
found 0 publications between 2013 and now
no documents found in year range
looking up Jehannine C. Austin (7402093250)...
found 109 publications between 2013 and now
['85165664158', '85163799375', '85158872458', '85149406763', '85146083826', '85144146495', '85143210306', '85145751835', '85144022950', '85144019969', '85140252179', '85139203605', '85163880733', '85147875845', '85144526088', '85142273608', '85145716829', '85138282295', '85135147933', '85169617133', '85164561012', '85158871020', '85139714752', '85138060246', '85143318375', '85137532569', '85138017104', '85133414189', '85128980507', '85128304366', '85133919950', '85133819389', '85133776304', '85113754815', '85124818055', '85109343983', '85122270341', '85119968774', '85119040837', '85111375843', '85109113826', '85110709413', '85115760152', '85115223347', '85114738893', '85096708795', '85103962124', '85103888269', '85100290313', '85099191536', '85094198818', '85094187274', '8509

4. format the citations information to append to dictionary, then export as csv

In [37]:
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")
    
citations_array = np.array(all_citations)
citations_array = citations_array.T


title: 4195
eid: 4195
publication year: 4195
authors: 4195
last name: 4195
first name: 4195
member id: 4195


In [44]:

# check that all the columns have the same length
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

# add citations of each year to the citations dictionary
for year in range(YEARS_INCLUDED):
    year_col = START_YEAR + year
    output_dict[str(year_col)] = citations_array[year].tolist()

df = pd.DataFrame(output_dict)
df.to_csv(OUTPUT_PATH_TO_CSV)

title: 4195
eid: 4195
publication year: 4195
authors: 4195
last name: 4195
first name: 4195
member id: 4195
2018: 4195
2019: 4195
2020: 4195
2021: 4195
2022: 4195
2023: 4195


4. remove duplicates

In [47]:
member_arr_filled = []
with open(OUTPUT_PATH_TO_CSV, "r") as f:
    r = csv.reader(f)
    for row in r:
        if row[0] != "":
            member_arr_filled.append(row)

# skip header rows
citations_header = member_arr_filled[0]
member_arr_filled = member_arr_filled[1:]

print("initial number of documents:",len(member_arr_filled))

# remove and store duplicates
no_dupes_arr, eid = remove_duplicate_pubs(member_arr_filled)

print("number of documents after duplicates are removed:",len(no_dupes_arr))

with open(OUTPUT_PATH_TO_CLEANED_CSV, 'w+', newline='') as file:
    mywriter = csv.writer(file, delimiter=',')
    mywriter.writerow(citations_header)
    mywriter.writerows(no_dupes_arr)

initial number of documents: 4194
all publications searched
number of documents after duplicates are removed: 3361
