In [16]:
import os
import requests
import json
import csv
import pandas as pd
from time import sleep
import numpy as np
from pybliometrics.scopus import CitationOverview, AuthorRetrieval, AbstractRetrieval

In [33]:
# CSV roster of members to look up, read by the loading cell below.
PATH_TO_INPUT_CSV = "in/PSYT_vancouver.csv"
PUBLICATION_CUTOFF_YEAR = 2013 # only publications dated DURING or AFTER this year are kept
# First and last year of the citation window passed to CitationOverview (start=/end=).
START_YEAR = 2018
END_YEAR = 2023
YEARS_INCLUDED = END_YEAR - START_YEAR + 1 # number of years in the window, both ends inclusive

In [25]:
# Load the member roster, keeping only rows whose first column is non-empty.
member_arr = []
# newline="" lets the csv module do its own line-ending handling, as its
# documentation recommends for files passed to csv.reader.
with open(PATH_TO_INPUT_CSV, "r", newline="") as f:
    r = csv.reader(f)
    for row in r:
        # csv.reader yields an empty list for a blank line, so check the row
        # itself before indexing into it.
        if row and row[0] != "":
            member_arr.append(row)

# skip header row
member_arr = member_arr[1:]

In [26]:
# Column names of the per-publication output table.
# fill_article_rows() addresses them by position (0-6), so the order matters.
output_cols = ["title", "eid", "publication year", "authors", "last name", "first name", "member id"]

In [21]:
def get_all_citations(documents, batch_size=25):
    '''Return yearly citation data for all documents, one row per document.

    The Scopus IDs of the documents are sent to CitationOverview in batches of
    at most `batch_size` identifiers, and the `cc` lists of the individual
    responses are concatenated in document order.

    documents  -- sequence of objects with an `eid` attribute of the form
                  "<a>-<b>-<scopus id>" (everything after the 2nd hyphen is
                  used as the Scopus ID)
    batch_size -- maximum number of identifiers per CitationOverview request
                  (default 25, the value this notebook has always used)

    Returns a list with one entry per document; an empty list when
    `documents` is empty (no request is made in that case).
    Raises ValueError if `batch_size` is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    #scopus ID is everything after 2nd hyphen for the eid
    scopus_id_list = [document.eid.split("-",2)[2] for document in documents]
    if not scopus_id_list:
        # nothing to look up; avoid sending a request without identifiers
        return []
    print(scopus_id_list)

    batches = [
        scopus_id_list[first:first + batch_size]
        for first in range(0, len(scopus_id_list), batch_size)
    ]
    last_page = len(batches) - 1

    yearly_citations_data = []
    for page, ids_in_page in enumerate(batches):
        if last_page == 0:
            print("gathering citations")
        else:
            print(f"gathering citations for page {page}/{last_page}")
        co = CitationOverview(ids_in_page, start=START_YEAR, end=END_YEAR)
        yearly_citations_data.extend(co.cc)

    return yearly_citations_data

In [35]:
# Keep only the members that have a value in column 4
# (the value later passed to AuthorRetrieval as the author ID).
member_arr_filtered = list(filter(lambda member_row: member_row[4] != "", member_arr))
# preview the first kept row, then the number of kept rows
print(member_arr_filtered[0], len(member_arr_filtered), sep="\n")

['17', 'Allen', 'Katie', "BC\u200b Children\u200b'\u200b\u200bs Hospital", '57858657800', '', 'MEDI (1); PSYC (1)', '']
281


In [14]:
def remove_duplicate_pubs(out_arr, eid_index=2):
    '''Drop rows whose EID was already seen, keeping the first occurrence.

    out_arr   -- sequence of rows (lists/tuples); each row holds its EID at
                 position `eid_index`
    eid_index -- position of the EID inside a row (default 2, the position
                 this notebook has always used)

    Returns (new_arr, eids): the rows that survived, in their original order,
    and the list of their EIDs in the same order. EIDs must be hashable.
    '''
    seen_eids = set()   # constant-time membership test instead of scanning a list
    eids = []           # same EIDs, in first-seen order, returned to the caller
    new_arr = []
    for i in range(len(out_arr)):
        row = out_arr[i]
        current_eid = row[eid_index]
        if current_eid in seen_eids:
            continue
        seen_eids.add(current_eid)
        eids.append(current_eid)
        new_arr.append(row)
    print("all publications searched")
    return new_arr, eids

# def remove_duplicate_pubs(out_dict):
#     eids = []
#     for i in range(len(out_dict[output_cols[0]])):
#         current_eid = out_dict["eid"][i]
#         if current_eid in eids:
#             #remove everything in that row
#             for key in out_dict.keys():
#                 # print("duplicate found")
#                 out_dict[key].pop(i)
#         else:
#             eids.append(current_eid)
#     print("all publications searched")
#     return out_dict, eids

In [15]:
def fill_article_rows(author_details:tuple, documents, dict):
    '''Append one entry per document to each column list stored in `dict`.

    author_details is (lastname, firstname, author ID). `dict` maps the names
    in output_cols to lists; for every document the values are appended in
    output_cols order: title, eid, publication year, author names, last name,
    first name, author ID. The dict is modified in place; nothing is returned.
    '''
    for doc in documents:
        row_values = (
            doc.title,
            doc.eid,
            doc.coverDate.split("-")[0],  # text before the first hyphen of the cover date
            doc.author_names,
            author_details[0],
            author_details[1],
            author_details[2],
        )
        # the position in row_values is the position of the column in output_cols
        for position, value in enumerate(row_values):
            dict[output_cols[position]].append(value)

In [None]:
# One list per output column; fill_article_rows() appends one entry per publication.
output_dict = {col_name: [] for col_name in output_cols}
# One row of yearly citation counts per publication, in the same order as the
# entries of output_dict.
all_citations = []

for member in member_arr_filtered:
    sleep(1)  # pause between authors (presumably to stay within API rate limits - confirm)
    lastname = member[1]
    firstname = member[2]
    mem_id = member[4]

    print(f"looking up {firstname} {lastname} ({mem_id})...")
    # get_documents() can return None instead of an empty list when the author
    # has no publications; normalise so the filtering below never iterates None.
    documents = AuthorRetrieval(mem_id).get_documents() or []

    # keep documents published in or after PUBLICATION_CUTOFF_YEAR
    adjusted_docs = [document for document in documents if int(document.coverDate.split("-")[0]) >= PUBLICATION_CUTOFF_YEAR]
    print(f"found {len(adjusted_docs)} publications between {PUBLICATION_CUTOFF_YEAR} and now")

    if len(adjusted_docs) == 0:
        print("no documents found in year range")
        continue

    # get all citations data: one entry per document, each indexable by year
    # offset; element [1] of a yearly entry is taken as the citation count
    citations_data = get_all_citations(adjusted_docs)
    author_citations = [[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in citations_data]
    # all_citations keeps track of the citations of every publication found so far
    all_citations.extend(author_citations)

    author_details = (lastname, firstname, mem_id)
    fill_article_rows(author_details, adjusted_docs, output_dict)
    print("done\n")


In [None]:
# report how many entries each output column currently holds
for col_name, col_values in output_dict.items():
    print(f"{col_name}: {len(col_values)}")

# all_citations has one row per publication; transpose it so that each row of
# citations_array holds one year of the citation window
citations_array = np.transpose(np.array(all_citations))


In [None]:

# check that all the columns have the same length
for col_name, col_values in output_dict.items():
    print(f"{col_name}: {len(col_values)}")

# add one column per year of the citation window, named after the year
for year_col in range(START_YEAR, START_YEAR + YEARS_INCLUDED):
    print(year_col)
    yearly_counts = citations_array[year_col - START_YEAR]
    output_dict[str(year_col)] = yearly_counts.tolist()

# check again now that the year columns were added
for col_name, col_values in output_dict.items():
    print(f"{col_name}: {len(col_values)}")

# the DataFrame index is written as the first CSV column
df = pd.DataFrame(output_dict)
df.to_csv("citations_per_year_psychiatry.csv")

In [None]:
# column lengths before de-duplication
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

# output_dict is a dict of columns, while remove_duplicate_pubs() indexes its
# argument as a list of rows (out_arr[i][2]) and fails on a dict. Drop repeated
# publications on the "eid" column instead, keeping the first occurrence.
deduplicated = pd.DataFrame(output_dict).drop_duplicates(subset="eid", keep="first")
cleaned_output_dict = deduplicated.to_dict(orient="list")
eids = cleaned_output_dict["eid"]

# column lengths after de-duplication
for key in cleaned_output_dict:
    print(f"{key}: {len(cleaned_output_dict[key])}")

# rebuilding the frame from the dict gives it a fresh 0..n-1 index
df = pd.DataFrame(cleaned_output_dict)
df.to_csv("cleaned_citations_per_year_psychiatry.csv")

In [None]:
# Read the per-publication CSV back as plain rows of strings.
member_arr_filled = []
with open("citations_per_year_psychiatry.csv", "r", newline="") as f:
    r = csv.reader(f)
    # Take the header before filtering. The file is written by
    # DataFrame.to_csv with its index column, whose header cell is empty, so
    # the `row[0] != ""` filter below would discard the header and the first
    # data row would then be mistaken for it (and dropped).
    citations_header = next(r, [])
    for row in r:
        # csv.reader yields an empty list for a blank line
        if row and row[0] != "":
            member_arr_filled.append(row)

In [None]:
# preview the first row read back from the CSV, then the number of rows
print(member_arr_filled[0], len(member_arr_filled), sep="\n")

In [None]:
# drop rows that repeat an already-seen EID; `eid` receives the list of kept EIDs
no_dupes_arr, eid = remove_duplicate_pubs(out_arr=member_arr_filled)
# preview the first surviving row, then the number of surviving rows
print(no_dupes_arr[0], len(no_dupes_arr), sep="\n")

In [None]:
# Write the de-duplicated rows to the final CSV.
# NOTE(review): only the data rows are written - no header row is emitted.
with open('PSYT_citations_cleaned.csv', 'w+', newline='') as file:
    csv.writer(file, delimiter=',').writerows(no_dupes_arr)