In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import requests
from urllib.parse import urlencode, quote_plus
import numpy as np
import sys
from dotenv import find_dotenv, load_dotenv
import os
import pandas as pd

import nltk
import ssl

# Make the unverified SSL context the default for stdlib HTTPS connections,
# presumably so that the NLTK downloads below succeed on machines where
# certificate verification fails.
# NOTE(review): this replaces the process-wide default and is never restored —
# confirm that this is acceptable for the rest of the session.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# NLTK resources (presumably required by the TextAnalysis module — TODO confirm).
for _resource in ('punkt', 'wordnet'):
    nltk.download(_resource)
# Kept as the last expression so the cell still shows the call's return value.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sayemkamal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sayemkamal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sayemkamal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# **Citing this code**
This code is the second version of an expertise-finding tool developed by Volz et al. 2023 (https://ui.adsabs.harvard.edu/abs/2023AAS...24210207V/abstract).<br>
It uses the NASA ADS API to query for articles (refereed or not) in the "Astronomy" database (cite ADS).
Please cite "Helfenbein et al. 2023 (in prep)" and refer to the README file in the GitHub repository.

**Directory set up**<br>
The file *stopwords.txt* is utilized to create meaningful N-grams. Make sure to provide an accurate path in the following cell.<br> Also, the path will be used by the code in other instances to identify other useful files.

In [3]:
# Directory that contains stopwords.txt; it is also added to sys.path below
# (presumably so the project modules imported later can be found — TODO confirm).
# Edit this value to match your local checkout.
path_stop = '/Users/sayemkamal/NASA_Project/ReviewerExtractor/codeV3/'
stop_file = 'stopwords.txt'
# os.path.join adds the separator only when it is missing, so the result stays
# valid even if path_stop is edited without a trailing slash.
stop_dir = os.path.join(path_stop, stop_file)
# Guard against duplicate sys.path entries when this cell is re-run.
if path_stop not in sys.path:
    sys.path.append(path_stop)



In [5]:
# Read the ADS API token from the environment instead of hardcoding it in the
# notebook. Either export ADS_API_TOKEN before starting Jupyter, or put a line
# `ADS_API_TOKEN=<your token>` in a .env file that is kept out of version control.
# NOTE(review): a token was previously committed in this cell; it should be
# revoked and regenerated from the ADS API page.
load_dotenv(find_dotenv())  # loads variables from a .env file when one is found
token = os.environ.get('ADS_API_TOKEN')
if not token:
    # Fail here with a clear message rather than later inside the ADS query.
    raise RuntimeError(
        "ADS API token not found: set the ADS_API_TOKEN environment variable "
        "or add it to a .env file."
    )


In [6]:
# For the TextAnalysis module, please refer to Volz et al. 2023
import TextAnalysis as TA
import ADSsearcherpkg as AP

# **Example 1: Searching expertises of a single person based on their name**

The search will focus on papers published by a specific author in the past 15 years independently of the current affiliation:<br>
The format for a single author search is as follows: **"Last, First"**<br>
In the following example we search for Dr. Antonino Cucchiara's expertise. <br>
**Note:** the user can restrict the query to refereed papers ONLY by adding the following keyword before the token keyword:<br>
**refereed="property:refereed"**


In [7]:
# Search by author name only; every other search option keeps its default.
datf = AP.ads_search(
    name="Cucchiara, Antonino",
    token=token,
    stop_dir=stop_dir,
)

I will search for papers matching the following criteria:
author:"^Cucchiara, Antonino"

I am now querying ADS.



In [8]:
# Print the parameters accepted by AP.ads_search together with their defaults.
import inspect

ads_search_signature = inspect.signature(AP.ads_search)
print(ads_search_signature)

(name=None, institution=None, year=None, refereed='property:notrefereed OR property:refereed', token=None, stop_dir=None, second_auth=False)


In [9]:
# Display the data frame. `display` is used because a bare `datf` is only
# rendered by Jupyter when it is the last expression of the cell.
display(datf)
# Length of the 'Title' entry of the first row
# (presumably the number of titles collected for that author — TODO confirm).
print(len(datf.iloc[0]['Title']))
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

139


# Testing Second Author

In [10]:
# Same author as in Example 1, but with the second_auth option switched on.
datf2 = AP.ads_search(
    name="Cucchiara, Antonino",
    token=token,
    stop_dir=stop_dir,
    second_auth=True,
)

I will search for papers matching the following criteria:
author:"Cucchiara, Antonino",2

I am now querying ADS.



In [11]:
# Length of the 'Title' entry of the first row of the second-author result.
len(datf2.iloc[0]['Title'])

127

# **Example 2: Searching expertises of ALL scientists that published as first authors when affiliated to single institution name**

The search will focus on papers and all authors that have published in the past 15 years at a specific institution (academic or otherwise): <br>
The format for a single institution is as follows: **institution="Institution Name"**. <br>
**Caveat**: It is possible that the institution name as entered by the user does not match what has been cataloged in ADS. Therefore, if the final output is empty, make sure to try different versions of the institution name (e.g. Cal Poly Pomona, Cal Poly, California Polytechnic State University) to get the most complete list of authors.

In [None]:
# Institution-only search, restricted to refereed papers.
datf = AP.ads_search(
    institution="Hampton University",
    refereed="property:refereed",
    token=token,
    stop_dir=stop_dir,
)

In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

# **Example 3: Searching a single author publication while affiliated to a specific institution**

The search will focus on papers published by a single author while they are affiliated to a specific institution, in the past 15 years:<br>

The format for a single author and institution is as follows: **name= 'Last, First', institution= 'Institution Name'**.

In [None]:
# Search combining an author name with an institution.
datf = AP.ads_search(
    name='Capper, Daniel',
    institution="University of Southern Mississippi",
    token=token,
    stop_dir=stop_dir,
)

In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

# **Example 4: Searching a single author name within a different time-frame**

The search will focus on papers from one single author that were published in a different time-frame. There are two options for doing so:
   - A single year (e.g. 2010): in this case the code will query ADS for articles published by the specified author from one year before to four years after that year. So searching year='2010' will search articles between 2009 and 2014<br>
   - A year range: in this case the syntax is year='[YEAR TO YEAR]' (e.g. year='[2009 TO 2023]') <br>

The format for a single author name remains the same as before: **name= 'Last, First'**. <br>

Here are two examples:
- Searching for Dr. Pepper's articles between year 1999 and 2004
- Searching for Dr. Pepper's articles between year 2019 and 2023

In [None]:
# Author search with a single-year value for `year`.
datf = AP.ads_search(
    name='Pepper, Joshua',
    year='2000',
    token=token,
    stop_dir=stop_dir,
)

In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

In [None]:
# Author search with an explicit '[YEAR TO YEAR]' range for `year`.
datf = AP.ads_search(
    name='Pepper, Joshua',
    year='[2019 TO 2023]',
    token=token,
    stop_dir=stop_dir,
)

In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

# **Example 5: Searching a single institution name within a specific time-frame**

The search will focus on authors who published as first authors while affiliated with a specific institution in a defined timespan. <br>
The format for an institution name is the same as in Example 2 (**institution="Institution Name"**), and the year is specified with the same options described in Example 4:<br>
   - A single year (e.g. 2010): in this case the code will query ADS for articles published at the specified institution from one year before to four years after that year. So searching year='2010' will search articles between 2009 and 2014<br>
   - A year range: in this case the syntax is year='[YEAR TO YEAR]' (e.g. year='[2009 TO 2023]') <br>

The following is an example:

In [None]:
# Institution search with a single-year value for `year`.
datf = AP.ads_search(
    institution="University of Southern Mississippi",
    year='2000',
    token=token,
    stop_dir=stop_dir,
)

In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

# **Example 6: Searching a single Author, at a specific institution and within a specific time-frame**

The following example combines several of the previous ones in a single search.
Specifically:<br>
   - A single author<br>
   - Affiliated with a single institution<br>
   - In a specific time frame of publications<br>
    
Please refer to the previous examples for the required syntax. <br>
Here is an example:

In [None]:
# Search combining author name, institution and a year range.
datf = AP.ads_search(
    name='Brown, Beth A.',
    institution="Howard university",
    year='[2009 TO 2022]',
    token=token,
    stop_dir=stop_dir,
)

In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

# **Example 7: Searching through a list of institutions**

The search will focus on papers from a list of institutions, so the input is a csv file that has multiple institution names stored in it. This will then find all papers from those institutions (**see CAVEATS in Example 2 above related to Institution Names)**:<br>

The input file has to be a .csv file (e.g. "top10inst.csv"), and must contain at least one column titled **"Current Institution"** or **"Institution"** (the first cell of the column is usually interpreted as such). The file can contain other columns; they will be ignored.<br>
If the file is in a different directory than the one where the code is, include the whole path. <br>

The code will run as in Example 2 above for each institution and append the results at each iteration, providing a final dataframe with all the researchers at all the institutions in the list provided.<br>
**NOTE: at the moment if an institution query returns an empty dataframe the code will ignore it and continue to the following one.**


In [None]:
# Run the file-driven search over the entries of the example CSV file.
datf = AP.run_file_search(
    filename='Fellows_Example.csv',
    token=token,
    stop_dir=stop_dir,
)


In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

# **Example 8: Searching through a list of Authors names**

The search will focus on papers from a list of authors names (similar format as Example 1 above, **'Last, First'**). <br>
The input is a .csv file that has multiple author names stored in it under a column titled **"Name"**. <br>
The ADS search will focus on the period 2003 to 2023.
<br>
If the file is in a different directory than the one where the code is, include the whole path. <br>

The code will then execute the search one name after the other and append each result to the previous one.<br>
In the following example we use, for convenience, the same example file as before, which also contains a list of researchers' names.


In [None]:
# Run the file-driven search over the entries of the example CSV file.
datf = AP.run_file_search(
    filename='Fellows_Example.csv',
    token=token,
    stop_dir=stop_dir,
)


In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")

# Example 9: Searching a list of Authors at Institutions during specific times

In [None]:
# Run the file-driven search over the entries of the example CSV file.
# NOTE(review): the arguments are the same as in Examples 7 and 8; how
# run_file_search selects the search mode is not visible here — presumably it
# depends on the columns present in the CSV file. TODO confirm.
datf = AP.run_file_search(
    filename='Fellows_Example.csv',
    token=token,
    stop_dir=stop_dir,
)

In [None]:
# Display the data frame (Jupyter renders the last expression of a cell):
datf
# To save it as a CSV file, uncomment the following:
#datf.to_csv(path_stop+"output.csv")