In [18]:
import os
import requests
import json
import csv
import pandas as pd
from time import sleep
from dotenv import load_dotenv
load_dotenv()
import numpy as np

In [45]:
from pybliometrics.scopus import CitationOverview, AuthorRetrieval, AbstractRetrieval


#to confirm API key used
from pybliometrics.scopus import config
#print(config['Authentication']['APIKey']) 
config['Authentication']['APIKey'] = "2e5922c95a0e4f1fb73cede25eca779a"
print(config['Authentication']['APIKey'])

2e5922c95a0e4f1fb73cede25eca779a


In [20]:
PATH_TO_INPUT_CSV = "in/filtered_psych_data.csv"
PUBLICATION_CUTOFF_YEAR = 2013 #only publications DURING and AFTER this year will be counted
START_YEAR = 2018
END_YEAR = 2023
YEARS_INCLUDED = END_YEAR - START_YEAR + 1 #year range count, inclusive

In [21]:
member_arr = []
with open(PATH_TO_INPUT_CSV, "r") as f:
    r = csv.reader(f)
    for row in r:
        if row[0] != "":
            member_arr.append(row)

# skip header rows
member_arr = member_arr[1:]

In [46]:
# prepare columns 
output_cols = ["title", "eid", "publication year", "authors", "last name", "first name", "member id"]

titles = []
eids = []
publication_years = []
articles_authors = []

member_lastnames = []
member_firstnames = []
member_ids = []
member_irps = []

all_citations = []

In [23]:
def get_all_citations(documents):
    '''return yearly citation data for all documents in a 2d array with rows of documents and cols of years'''
    yearly_citations_data = []

    #scopus ID is everything after 2nd hyphen for the eid
    scopus_id_list = [document.eid.split("-",2)[2] for document in documents]
    print(scopus_id_list)

    pages = int(len(documents)/25)
    
    if pages == 0:
        print(f"gathering citations")
        co = CitationOverview(scopus_id_list, start=START_YEAR, end=END_YEAR)
        yearly_citations_data = co.cc
    elif len(documents)%25 == 0:
        # urls = []
        for page in range(pages):
            print(f"gathering citations for page {page}/{pages-1}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
    else:
        # urls = []
        for page in range(pages+1):
            print(f"gathering citations for page {page}/{pages}")
            list_per_page = scopus_id_list[(25*page):(25*(page+1))]
            co = CitationOverview(list_per_page, start=START_YEAR, end=END_YEAR)
            yearly_citations_data = yearly_citations_data + co.cc
            # urls = urls + co.url
        
    return yearly_citations_data

no irp distinction

In [24]:
member_arr_filtered = [row for row in member_arr if row[3] != ""]
print(member_arr_filtered)
print(len(member_arr_filtered))

[['12', 'Allen', 'Katie', "BC\u200b Children\u200b'\u200b\u200bs Hospital", '57858657800', '', 'MEDI (1); PSYC (1)', ''], ['16', 'Anderson', 'Cameron M.', 'The University of British Columbia', '55453388200', '', 'NEUR (3); MEDI (1)', ''], ['19', 'Austin', 'Jehannine C.', 'University of British Columbia, Faculty of Medicine', '7402093250', '0000-0003-0338-7055', 'MEDI (264); BIOC (47); NEUR (37)', ''], ['21', 'Aydin', 'Cristina M.', 'The University of British Columbia', '15130612300', '', 'MEDI (4); BIOC (1)', ''], ['22', 'Azarbar', 'Ataa', 'The University of British Columbia', '57193737040', '', 'MEDI (2); PSYC (1); NEUR (1)', ''], ['23', 'Azim', 'Hassan F.A.', 'The University of British Columbia', '6701805069', '', 'MEDI (35); PSYC (25); SOCI (2)', ''], ['24', 'Baer', 'Susan', 'The University of British Columbia', '7005865403', '', 'PHYS (23); MEDI (15); CHEM (12)', ''], ['25', 'Bailey', 'Anthony J.', 'University of British Columbia, Faculty of Medicine', '7402596134', '', 'MEDI (129)

In [42]:
def remove_duplicate_pubs(out_dict):
    eids = []
    for i in range(len(out_dict[output_cols[0]])):
        current_eid = out_dict[output_cols[1]][i]
        if current_eid in eids:
            #remove everything in that row
            for key in out_dict.keys():
                print("duplicate found")
                out_dict[key].pop(i)
        else:
            eids.append(current_eid)
    print("all publications searched")
    return out_dict, eids
        

In [26]:
def fill_article_rows(author_details:tuple, documents, dict):
    for document in documents:
        dict[output_cols[0]].append(document.title) #title
        dict[output_cols[1]].append(document.eid) #eid
        dict[output_cols[2]].append(document.coverDate.split("-")[0]) #publication date
        dict[output_cols[3]].append(document.author_names) #author names
        dict[output_cols[4]].append(author_details[0]) #last name
        dict[output_cols[5]].append(author_details[1]) #first name
        dict[output_cols[6]].append(author_details[2]) #author ID

In [44]:
output_dict = {f"{col_name}":[] for col_name in output_cols}

for member in member_arr_filtered:
    sleep(1)
    lastname = member[1]
    firstname = member[2]
    mem_id = member[4]

    print(f"looking up {firstname} {lastname} ({mem_id})...")
    documents = AuthorRetrieval(mem_id).get_documents()

    #get documents that are older than PUBLICATION CUTOFF YEAR
    adjusted_docs = [document for document in documents if int(document.coverDate.split("-")[0]) >= PUBLICATION_CUTOFF_YEAR]
    print(f"found {len(adjusted_docs)} publications between {PUBLICATION_CUTOFF_YEAR} and now")

    if len(adjusted_docs) == 0:
        print("no documents found in year range")
        continue
    else:
        # get all citations data
        author_details = (lastname, firstname, mem_id)
        
        citations_data = get_all_citations(adjusted_docs)
        author_citations = [[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in citations_data]
        all_citations = all_citations + author_citations #all citations keeps track of all the citations in the whole document
        
        fill_article_rows(author_details, adjusted_docs, output_dict)
        print("done\n")


looking up Katie Allen (57858657800)...
found 1 publications between 2013 and now
['85136569317']
gathering citations


Scopus403Error: Requestor configuration settings insufficient for access to this resource.

In [39]:
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")
del output_dict["citations"]
df = pd.DataFrame(output_dict)
df.to_csv("only_documents.csv")

title: 332
eid: 332
publication year: 332
authors: 332
last name: 332
first name: 332
member id: 332


KeyError: 'citations'

In [43]:
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

output_dict, eids = remove_duplicate_pubs(output_dict)

for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

print(eids)

title: 332
eid: 332
publication year: 332
authors: 332
last name: 332
first name: 332
member id: 332
all publications searched
title: 332
eid: 332
publication year: 332
authors: 332
last name: 332
first name: 332
member id: 332
['2-s2.0-85136569317', '2-s2.0-85165664158', '2-s2.0-85163799375', '2-s2.0-85158872458', '2-s2.0-85149406763', '2-s2.0-85146083826', '2-s2.0-85144146495', '2-s2.0-85143210306', '2-s2.0-85145751835', '2-s2.0-85144022950', '2-s2.0-85144019969', '2-s2.0-85140252179', '2-s2.0-85139203605', '2-s2.0-85163880733', '2-s2.0-85147875845', '2-s2.0-85144526088', '2-s2.0-85142273608', '2-s2.0-85145716829', '2-s2.0-85138282295', '2-s2.0-85135147933', '2-s2.0-85169617133', '2-s2.0-85164561012', '2-s2.0-85158871020', '2-s2.0-85139714752', '2-s2.0-85138060246', '2-s2.0-85143318375', '2-s2.0-85137532569', '2-s2.0-85138017104', '2-s2.0-85133414189', '2-s2.0-85128980507', '2-s2.0-85128304366', '2-s2.0-85133919950', '2-s2.0-85133819389', '2-s2.0-85133776304', '2-s2.0-85113754815', '

In [None]:
citations_array = np.array(all_citations)
citations_array = citations_array.T

# check that all the columns have the same length
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

# add citations of each year to the citations dictionary
for year in range(YEARS_INCLUDED):
    year_col = START_YEAR + year
    print(year_col)
    output_dict[str(year_col)] = citations_array[year].tolist()

# check that all the columns have the same length
for key in output_dict:
    print(f"{key}: {len(output_dict[key])}")

df = pd.DataFrame(output_dict)
df.to_csv("citations_per_year_psychiatry.csv")

yes irp distinction

In [None]:
def fill_article_rows(lastname, firstname, mem_id, mem_irp, documents):
    for document in documents:
        titles.append(document.title)
        eids.append(document.eid)
        articles_authors.append(document.author_names)
        publication_years.append(document.coverDate.split("-")[0])
        member_lastnames.append(lastname)
        member_firstnames.append(firstname)
        member_ids.append(mem_id)
        member_irps.append(mem_irp)

In [None]:
titles = []
eids = []
# scopus_urls = []
api_urls = []
publication_years = []
articles_authors = []

member_lastnames = []
member_firstnames = []
member_ids = []
member_irps = []

all_citations = []


for member in member_arr:
    sleep(1)
    lastname = member[0]
    firstname = member[1]
    mem_id = member[3]
    mem_irp = member[5]

    print(f"looking up {firstname} {lastname} ({mem_id}) in {mem_irp} IRP...")

    documents = AuthorRetrieval(mem_id).get_documents()

    #get documents that are older than PUBLICATION CUTOFF YEAR
    adjusted_docs = [document for document in documents if int(document.coverDate.split("-")[0]) >= PUBLICATION_CUTOFF_YEAR]

    print(f"found {len(adjusted_docs)} publications between {PUBLICATION_CUTOFF_YEAR} and now")
    
    if len(adjusted_docs) == 0:
        print("no documents found in year range")
        pass
    else:
        citations_data = get_all_citations(adjusted_docs, all_citations)
        author_citations = [[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in citations_data]
        all_citations = all_citations + author_citations

        fill_article_rows(lastname, firstname, mem_id, mem_irp, adjusted_docs)
        print("done\n")

In [None]:
print(api_urls)
#  https://api.elsevier.com/content/abstract/scopus_id/

In [None]:
csv_dict = {
    "title": titles,
    "eid": eids,
    "publication year": publication_years,
    "authors": articles_authors,
    
    "last name": member_lastnames,
    "first name": member_firstnames,
    "member id": member_ids, 
}

citations_array = np.array(all_citations)
citations_array = citations_array.T

for key in csv_dict:
    print(f"{key}: {len(csv_dict[key])}")

for year in range(YEARS_INCLUDED):
    year_col = START_YEAR + year
    print(year_col)
    csv_dict[str(year_col)] = citations_array[year].tolist()

for key in csv_dict:
    print(f"{key}: {len(csv_dict[key])}")

df = pd.DataFrame(csv_dict)
df.to_csv("citations_per_year_psychiatry.csv")

In [None]:
documents = AuthorRetrieval(36653563400).get_documents()
docs_eids = AuthorRetrieval(36653563400).get_document_eids()
# for document in documents:
    

In [None]:
#print(docs_eids)
for document in documents:
    print(document.coverDate)

adjusted_docs = [document for document in documents if int(document.coverDate.split("-")[0]) >= PUBLICATION_CUTOFF_YEAR]
print(adjusted_docs)
scopus_id_list = [eid.split("-",2)[2] for eid in docs_eids]
print(scopus_id_list)

In [None]:
print()

In [None]:
co = CitationOverview(scopus_id_list[0:25], start=2018, end=2023)

In [None]:
article_yeardata = co.cc

In [None]:
co2 = CitationOverview(scopus_id_list[25:], start=2018, end=2023)
print(co2.cc)

In [None]:
all_citations = co.cc + co2.cc
#print(len(documents))
#print(documents[25:50])
#print(len(documents[25:50]))
print(all_citations[0])
print(all_citations[0][5][1])
print(YEARS_INCLUDED)
print([[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in all_citations])

print(len([[[citation[year][1]] for year in range(YEARS_INCLUDED)] for citation in all_citations]))

In [None]:
citation_test = np.array([[citation[year][1] for year in range(YEARS_INCLUDED)] for citation in all_citations])
print(citation_test.T)
dict = {"a":0}
dict[f"{START_YEAR}"] = citation_test[0].tolist()
print(dict)

In [None]:
API_KEY = os.getenv("OFFICIAL_API_KEY")
API_COUNT = 25
header = {
    "X-ELS-APIKey": API_KEY,
}
api_url = "https://api.elsevier.com/content/search/scopus?"

def get_api_json(query: str, start=0):
    params = {
        "query":query,
    }
    try:
        response = requests.get(url=api_url, params=params, headers=header)
        response.raise_for_status
        data = json.loads(response.text)
    except requests.exceptions.HTTPError as e:
        print("HTTPS error: ", e)
    except requests.exceptions.RetryError as e:
        print("Max retries exceeded: ", e)
    return data

In [None]:
query = f"au-id(36653563400)"
data = get_api_json(query, start=0)
print(data)

In [None]:
query = f"au-id(36653563400)"
data = get_api_json(query, start=0)
print(data)

In [None]:
data_str = json.dumps(data["search-results"]["entry"][0], indent=4)
print(data_str)