# UM Dimensions.ai Publications and References

Uses the Dimensions Analytics API to collect publications authored at the University of Mississippi, and then collects the references cited in those publications for analysis.

Additional cells at the bottom allow the user to look up a particular journal (using a Dimensions journal ID number) and to calculate citation ages for each reference. 

A Dimensions Analytics API key is required. The scripts also use the Dimcli package, which has extensive documentation here: https://digital-science.github.io/dimcli/

In [None]:
import json
import os
import sys
import time
import requests
import datetime
import dimcli
from dimcli.utils import *
from tqdm.notebook import tqdm
import pandas as pd
import plotly.express as px
# Outside Google Colab, make the plotly JS dependencies local (needed by HTML exports).
running_in_colab = 'google.colab' in sys.modules
if not running_in_colab:
    from plotly.offline import init_notebook_mode
    init_notebook_mode(connected=True)

# Record the run date in the notebook output.
last_run = datetime.date.today().strftime('%b %d, %Y')
print(f"==\nCHANGELOG\nThis notebook was last run on {last_run}\n==")
print("==\nLogging in..")
# Authentication details: https://digital-science.github.io/dimcli/getting-started.html#authentication
ENDPOINT = "https://app.dimensions.ai"
KEY = ""  # Dimensions Analytics API key (blank in this copy of the notebook)
dimcli.login(key=KEY, endpoint=ENDPOINT)
dsl = dimcli.Dsl()

## Get UM-authored publications for a date range

In [None]:
GRIDID = 'grid.251313.7' #@param {type:"string"}
# University of Mississippi organizational identifier (excludes UMMC)

# input() returns text. Convert to int so that max() compares the years
# numerically (string comparison is lexicographic) and so that a non-numeric
# entry fails here with a ValueError instead of reaching the DSL query.
YEAR_START = int(input("Enter the year at which to start the search: ")) #@param {type: "slider", min: 1950, max: 2024}
YEAR_END = int(input("Enter the year at which to end the search: ")) #@param {type: "slider", min: 1950, max: 2024}
# Never let the range end before it starts.
YEAR_END = max(YEAR_END, YEAR_START)

In [None]:
### Initial lookup: all UM publications for the specified time period ###
# Build the DSL text from the organisation ID and year range chosen above,
# then hand it to dsl.query_iterative.
um_publications_query = f"""

    search publications
        where research_orgs.id = "{GRIDID}"
        and year in [{YEAR_START}:{YEAR_END}]
        return publications[id+authors+reference_ids+year+funders+funding_section]

"""
publications = dsl.query_iterative(um_publications_query)

In [None]:
# This caused major headaches in testing: NaN reference IDs will stop the next query in its tracks,
# so rows without a reference must be dropped before the IDs are sent to the API.
# One row per (UM publication, reference ID) pair.
pubs_and_citations = publications.as_dataframe().explode("reference_ids")

# Only require the fields that later cells actually read. Dropping rows with a NaN in
# *any* column would also discard every publication that simply has no funders or
# funding_section data, silently losing its references.
required_columns = [
    col for col in ("reference_ids", "authors", "year")
    if col in pubs_and_citations.columns
]
df = pubs_and_citations.dropna(axis=0, subset=required_columns)

# Prefix the UM-side fields so they stay distinguishable after merging with the cited publications.
citations = df.rename(columns={"year": "UM_year", "id": "UM_publication_ID", "authors": "UM_pub_authors"})

In [None]:
# Print a summary of the exploded publications table (one row per publication/reference pair).
pubs_and_citations.info()

## Get references for UM-authored publications

In [None]:
### Start the references query using the list of publications previously generated  ###

# get a list of citation IDs
pubids = list(set(citations['reference_ids']))  # use set() to remove duplicates


#
# DSL query - change the return statement to extract different metadata of interest
query_template = """search publications
                    where id in {}
                    return publications[id+doi+journal+year+publisher+type+issn]
                    limit 1000"""


#
# loop through all references-publications IDs in chunks and query Dimensions
print(f"===\nExtracting publications data for {len(pubids)} citations...")
results = []
BATCHSIZE = 400
VERBOSE = False # set to True to see extraction logs

for chunk in tqdm(list(chunks_of(pubids, BATCHSIZE))):
    # json.dumps renders the chunk as a double-quoted list for the "id in" clause
    query = query_template.format(json.dumps(chunk))
    data = dsl.query(query, verbose=VERBOSE)
    results += data.publications
    time.sleep(0.5)  # short pause between consecutive requests

#
# load the cited-publication records into a dataframe
pubs_cited = pd.DataFrame(results)
print("===\nCited Publications found: ", len(pubs_cited))


#
# transform the 'journal' column because it contains nested data
# Records without a journal expand to a stray column named 0; it only exists when at
# least one record lacks a journal, so drop it without failing when it is absent.
temp = (
    pubs_cited['journal']
    .apply(pd.Series)
    .rename(columns={"id": "journal.id", "title": "journal.title"})
    .drop(columns=[0], errors="ignore")
)
pubs_cited = pd.concat([pubs_cited.drop(['journal'], axis=1), temp], axis=1).sort_values('type')

# attach the citing UM publication details to every cited publication
pubs_cited = pubs_cited.merge(citations, left_on='id', right_on='reference_ids')

In [None]:
# Preview the first rows of the merged cited-publication / UM-publication table.
pubs_cited.head(5)

## Add citation ages to the dataframe

In [None]:
# A citation age needs both years, so discard rows where either one is missing,
# then renumber the rows 0..n-1 without keeping the old index as a column
# (gaps in the index would cause issues in later cells).
pubs_cited = (
    pubs_cited
    .dropna(subset=['year', 'UM_year'])
    .reset_index(drop=True)
)

# Check the new dataframe
pubs_cited.info()
pubs_cited

In [None]:
# Citation age = year of the citing UM publication minus the year of the cited work.
# Computed column-wise so the two years are always taken from the same row; this does
# not depend on the index being an unbroken 0..n-1 sequence, unlike a counter used as
# an index label. astype(int) raises if a year is still missing.
pubs_cited['citation_age'] = pubs_cited["UM_year"].astype(int) - pubs_cited["year"].astype(int)

# Keep the plain list of ages available under its original name
citation_ages = pubs_cited['citation_age'].tolist()
pubs_cited

## More info about journals and publishers

In [None]:
# Summary statistics for the journal IDs of the cited publications.
pubs_cited['journal.id'].describe()

In [None]:
# Count reference rows per (journal title, publisher) pair.
# reset_index(name=...) names the counts column explicitly: pandas < 2.0 calls it 0 and
# pandas >= 2.0 calls it 'count', so renaming from 0 does not work on every version.
journals = (
    pubs_cited.value_counts(['journal.title', 'publisher'])
    .reset_index(name='citations')
    .rename(columns={'journal.title': 'title'})
)
journals.index.name = 'index'

#preview
journals.head(10)

In [None]:
# (rows, columns) of the journals table; each row is one distinct journal title / publisher pair.
journals.shape

In [None]:
# Bar chart of the first 50 rows of the journals table, coloured by publisher.
top_journals = journals.head(50)
chart_title = f"Top 50 journals cited by {GRIDID} (Time span {YEAR_START}-{YEAR_END})"
px.bar(
    top_journals,
    x="title",
    y="citations",
    color="publisher",
    height=900,
    title=chart_title,
)

## Journal Lookup - Has Anyone Cited This Journal?

Search the journal IDs of the references collected for the specified timeframe. If any of them match the journal being looked up, list the UM-affiliated authors of the publications that cited it.

In [None]:
JOUR_ID_LOOKUP = "jour.1050134"

# NOTE(review): cited_count counts reference rows (one per UM publication / cited work
# pair), so a publication citing the journal several times is counted several times.
cited_count = 0
um_authors = []  # alternating entries: author name, then the affiliation text they provided
author_count = 0
seen_names = set()  # names already listed, so each author is reported once

# The citing publications' authors live in "UM_pub_authors": the "authors" column was
# renamed before the merge, so pubs_cited has no "authors" column.
for journal_id, all_authors in zip(pubs_cited["journal.id"], pubs_cited["UM_pub_authors"]):
    if journal_id != JOUR_ID_LOOKUP:  # check if the journal appears in the overall citations list
        continue
    cited_count += 1

    if not isinstance(all_authors, list):  # no author data for this publication
        continue

    for author in all_authors:
        # An author can have several affiliations, or none, and an affiliation may lack
        # an id, so look for the UM identifier among all of them.
        affiliations = author.get("affiliations") or []
        if not any(affiliation.get("id") == GRIDID for affiliation in affiliations):
            continue

        um_author = f'{author.get("first_name", "")} {author.get("last_name", "")}'.strip()
        if um_author in seen_names:  # check if this author is already on the list
            continue
        seen_names.add(um_author)
        um_authors.append(um_author)  # if not, add them
        um_authors.append(author.get("raw_affiliation", ""))  # and any departmental information they provided
        author_count += 1

if cited_count > 0:
    print(f'Journal was cited in {cited_count} publications by {author_count} UM authors:',"\n")
    for entry in um_authors:
        print(entry, '\n')
else:
    print(f'This journal (journal id {JOUR_ID_LOOKUP}) was not cited by UM authors.')

## Export the data to Excel

In [None]:
def _confirm_and_export(frame, filename, *prompt_parts):
    """Ask whether to export ``frame`` and write it to ``filename`` on a yes.

    Prints ``prompt_parts`` as a single print() call, reads one line of input,
    and saves the Excel file only when the reply is one of "Y", "y", "yes" or
    "Yes". Returns the raw reply.
    """
    print(*prompt_parts)
    reply = input()
    if reply in ("Y", "y", "yes", "Yes"):
        print("Saving Excel file...")
        frame.to_excel(filename)
        print("File saved.", "\n")
    else:
        print("No Excel file will be generated.", "\n")
    return reply


# Export 1: the UM publications table (one row per publication/reference pair)
excel_UM_pubs = _confirm_and_export(
    pubs_and_citations,
    f"UM_pubs_{YEAR_START}_to_{YEAR_END}.xlsx",
    f'Do you want to generate an Excel file of all UM publications for {YEAR_START} to {YEAR_END}? Y/N',
)

# Export 2: the cited references, including citation ages
excel_UM_refs = _confirm_and_export(
    pubs_cited,
    f"References_in_UM_pubs_{YEAR_START}_to_{YEAR_END}.xlsx",
    f'Do you want to generate an Excel file of all references & citation ages in UM publications for {YEAR_START} to {YEAR_END}? Y/N',
    "\n",
    "WARNING: This may be a very large file",
)