# Scopus

In [257]:
import scopus
from scopus import AuthorSearch

scopus.load_api_key()

In [258]:
def search_author_pubs(first, last, affil):
    """Look up an author on Scopus and return the abstracts of their publications.

    The search is first restricted by last name, first name and affiliation.
    If that finds nobody, it is repeated using the name alone. The first
    author in the result list is the one whose publications are returned.

    Args:
        first: The author's first (given) name.
        last: The author's last (family) name.
        affil: Affiliation used to restrict the initial search.

    Returns:
        The value of ``scopus.ScopusAuthor(eid).get_abstracts()`` for the
        first matching author, or ``None`` when neither search finds anyone.
    """
    # Both searches share the same name clause, so build it once; this also
    # guarantees the fallback cannot mix up first and last name.
    name_query = 'AUTHLASTNAME(' + last + ') and AUTHFIRST(' + first + ')'

    s = AuthorSearch(name_query + ' and AFFIL(' + affil + ')', refresh=True)

    if not s.authors:
        print("Narrowing search to name only")
        s = AuthorSearch(name_query, refresh=True)

        # Check the same public attribute that is indexed below, so an empty
        # result can never reach ``s.authors[0]``.
        if not s.authors:
            print("Found no results..")
            return None

    print(str(len(s.authors)) + " author(s) found")
    sAuthor = scopus.ScopusAuthor(s.authors[0].eid)

    return sAuthor.get_abstracts()

In [170]:
# Example run: fetch the publications of one author, given first name,
# last name and affiliation.
pubs = search_author_pubs("Ehab", "Abouheif", "McGill")

1 author(s) found


In [171]:
import json

# NOTE(review): `test` is not defined in any visible cell; from the attributes
# used here it is presumably a Scopus abstract object — confirm before running.

# vars() returns the object's live attribute dictionary, so work on a shallow
# copy; otherwise the assignments below would overwrite attributes on `test`.
raw = dict(vars(test))
raw['xml'] = None
# Replace the author / affiliation entries with plain strings.
raw['_authors'] = [x.given_name + " " + x.surname for x in test.authors]
raw['_affiliations'] = [x.affilname for x in test.affiliations]
json.dumps(raw)

In [6]:
# Search Scopus for a single paper by title.
# NOTE(review): ScopusSearch is not imported in the visible cells (only
# AuthorSearch is) — confirm where it comes from. count=20 presumably caps the
# number of results; verify against the library.
s = ScopusSearch('TITLE( Fuzzy logic approach for layered architecture cognitive radio systems )', count=20)

In [7]:
vars(s)

{'_EIDS': ['2-s2.0-85049989127'],
 'query': 'TITLE( Fuzzy logic approach for layered architecture cognitive radio systems )'}

In [8]:
# Load one abstract by identifier.
# NOTE(review): ScopusAbstract is not imported in the visible cells — confirm
# where it comes from.
ab = ScopusAbstract('2-s2.0-0035934588')

# PubMed

In [301]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import requests
import json
import numpy as np
import time
import urllib3
import datetime

# Shared HTTP session, reused (and replaced on failure) by get_page_content.
_SESSION = requests.Session()

# Requests go out with certificate verification switched off, so silence the
# warning urllib3 would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Proxy endpoint and credentials.
proxy_host = "proxy.crawlera.com"
proxy_port = "8010"
proxy_auth = ":" # Make sure to include ':' at the end

# Same credentials/host/port for both schemes.
_proxy_fields = (proxy_auth, proxy_host, proxy_port)
proxies = {
    "https": "https://{}@{}:{}/".format(*_proxy_fields),
    "http": "http://{}@{}:{}/".format(*_proxy_fields),
}

In [302]:
def get_page_content(url, retry=0):
    """Fetch ``url`` through the configured proxy, retrying on non-200 replies.

    Up to five attempts are made in total. After a failed attempt the function
    waits ten seconds and replaces the shared session before trying again.

    Args:
        url: Address to request.
        retry: Number of attempts already used; callers normally leave this
            at its default of 0.

    Returns:
        The raw response body when a request answers with status 200,
        otherwise ``None`` once the attempts are exhausted.
    """
    global _SESSION

    max_attempts = 5

    # Iterating over the remaining attempts guarantees the counter advances,
    # so a permanently failing URL ends with None instead of retrying forever.
    for attempt in range(retry, max_attempts):
        response = _SESSION.get(url, proxies=proxies, verify=False)

        if response.status_code == 200:
            return response.content

        if attempt + 1 >= max_attempts:
            # No further attempt will follow, so there is nothing to wait for.
            print(str(response.status_code) + " Code, giving up")
            break

        print(str(response.status_code) + " Code, waiting 10s before retrying")
        time.sleep(10)
        # Retry with a fresh session rather than the one that just failed.
        _SESSION = requests.Session()

    return None

In [316]:
def search_pubmed(query):
    """Run a PubMed search and return the ids of the matching records.

    The query is sent to the NCBI E-utilities ``esearch`` endpoint, asking for
    JSON output and at most 1000 ids.

    Args:
        query: Search term, e.g. ``"(Jane Doe[Author]) AND Yale[Affiliation]"``.

    Returns:
        The list found under ``esearchresult -> idlist`` in the response, or
        an empty list when the page could not be fetched or the response does
        not have that shape.
    """
    # Local import: nothing else in the notebook needs it.
    from urllib.parse import quote

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=1000&term="
    end_url = "&field=title"

    # Percent-encode the whole term, not just spaces, so characters such as
    # '&', '#' or '+' cannot be misread as URL syntax. Spaces still become %20.
    full_url = base_url + quote(query) + end_url

    content = get_page_content(full_url)

    if content is None:
        return []

    try:
        return json.loads(content)['esearchresult']['idlist']
    except (KeyError, TypeError, ValueError):
        # Missing keys, a non-object payload or invalid JSON all mean
        # "no usable result".
        return []

In [317]:
def get_pubmed_pub(pubid):
    """Fetch the esummary record of one PubMed id.

    Args:
        pubid: PubMed id as a string.

    Returns:
        The decoded JSON response, or an empty list when the page could not
        be fetched.
    """
    summary_endpoint = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&db=pubmed&id="
    body = get_page_content(summary_endpoint + pubid)

    # Nothing came back after all retries: signal failure with an empty list.
    if body is None:
        return []

    return json.loads(body)

In [318]:
def get_author_pubs(name, affilation):
    """Collect the PubMed summaries of an author at a given affiliation.

    Args:
        name: Author name used for the ``[Author]`` field.
        affilation: Affiliation used for the ``[Affiliation]`` field.

    Returns:
        A list with one ``get_pubmed_pub`` result per id found by the search,
        in the order the search returned them.
    """
    term = "(" + name + "[Author]) AND " + affilation + "[Affiliation]"
    return [get_pubmed_pub(found_id) for found_id in search_pubmed(term)]

In [319]:
# Example run: fetch the PubMed summaries for one author / affiliation pair.
x = get_author_pubs("Jeremy Beaulieu", "Yale University")

['26062690', '25944476', '25903435', '25407924', '24889934', '24870037', '24151998', '24118264', '24026822', '23676760', '23565668', '22988083', '22907523', '22834738', '21436104', '20707851']


In [361]:
from pprint import pprint

# The retrieval date is the same for every record, so compute it once.
date = datetime.datetime.today().strftime('%Y-%m-%d')

# One flat record per successfully fetched publication.
pub_records = []
for pub in x:
    # get_pubmed_pub returns an empty list when the request failed; skip those.
    if not pub:
        continue

    result = pub['result']
    # The esummary result lists the returned ids under 'uids' and stores each
    # summary under its own id; look the summary up by id instead of relying
    # on the position of the dictionary values.
    stripped_pub = result[result['uids'][0]]

    title = stripped_pub['title']
    authors = " and ".join([y['name'] for y in stripped_pub['authors']])
    # The first four characters of 'pubdate' are taken as the year.
    year = int(stripped_pub['pubdate'][0:4])
    journal = stripped_pub['fulljournalname']
    raw = json.dumps(pub)
    cited_by = stripped_pub.get('pmcrefcount')

    pub_records.append({
        'title': title,
        'authors': authors,
        'year': year,
        'journal': journal,
        'raw': raw,
        'cited_by': cited_by,
        'date': date,
    })
    


In [356]:
# NOTE(review): this displays whatever value the last iteration of the
# publication loop left in `cited_by`; it is not tied to a specific record.
cited_by

1

# Combining Data Sets
Combine the researcher lists with and without Google Scholar IDs into a single table for PubMed, Scopus, and WoS scraping.

In [228]:
# Researchers without a Google Scholar ID; the first CSV column becomes the index.
no_scholar_id = pd.read_csv("turk_grouped.csv", index_col=0)

In [229]:
# Researchers that already have a Google Scholar ID.
scholar_id = pd.read_excel("NESCent Google Scholar IDs.xlsx")

In [254]:
no_scholar_id.shape

(180, 16)

In [255]:
scholar_id.shape

(441, 7)

In [252]:
# Columns of the combined researcher table, in output order.
combined_columns = ["NameLast", "NameFirst", "institution_name", "profession_role", "dept_current"]

# Researchers with a Google Scholar ID already use the target column names,
# so their rows can be taken over as plain records.
records = scholar_id[combined_columns].to_dict("records")

# Researchers without an ID carry a single "Input.name" field: everything up
# to the last space is the first name, the final token is the last name.
for _, row in no_scholar_id.iterrows():
    *first_names, last_name = row["Input.name"].split(" ")
    records.append({
        "NameFirst": " ".join(first_names),
        "NameLast": last_name,
        "institution_name": row["Input.university"],
        "profession_role": row["Input.discipline"],
        "dept_current": row["Input.department"],
    })

# Build the frame once from all records. Appending row by row copies the whole
# frame on every iteration, and DataFrame.append no longer exists in pandas 2.
combined_df = pd.DataFrame(records, columns=combined_columns)

In [256]:
# Write the combined researcher table to all_researchers.csv (relative path).
combined_df.to_csv("all_researchers.csv")