In [None]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import requests
import json
import numpy as np
import time
import urllib3
import datetime

# Module-wide HTTP session; get_page_content reads it and replaces it after a failed request.
_SESSION = requests.session()

# get_page_content sends requests with verify=False, so silence urllib3's
# InsecureRequestWarning instead of printing it for every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Proxy endpoint (Crawlera host) through which every request is routed.
proxy_host = "proxy.crawlera.com"
proxy_port = "8010"
proxy_auth = ":" # Make sure to include ':' at the end

# Mapping handed to requests as `proxies=`: one proxy URL per scheme, with the
# credentials placed before the '@'.
proxies = dict(
    https="https://{}@{}:{}/".format(proxy_auth, proxy_host, proxy_port),
    http="http://{}@{}:{}/".format(proxy_auth, proxy_host, proxy_port),
)

In [None]:
# Per-author input table (first CSV column becomes the index). The main loop
# reads these columns from each row: "unconfirmed_publications",
# "Google Scholar Middle Initial", "Input.name", "Input.university",
# "Input.department" and "Input.discipline".
nescent_df = pd.read_csv("turk_grouped_with_middle_initial_only.csv", index_col=0)

In [None]:
# Table of publications already collected (first CSV column becomes the index).
# The main loop filters it by its "initials" column and compares titles from its
# "publication" column; its column layout is also reused for the output frame.
# NOTE(review): presumably these are Google Scholar results (the main loop prints
# "Already in GS" on a match) -- confirm.
scraped_df = pd.read_csv("NESCent_No_ID.csv", index_col=0)

In [None]:
def get_page_content(url, retry=0):
    """Download ``url`` through the configured proxy, retrying on non-200 replies.

    Parameters
    ----------
    url : str
        Address to fetch.
    retry : int, optional
        Number of attempts already used up. At most ``5 - retry`` requests are
        made, so ``retry=5`` returns ``None`` without touching the network.

    Returns
    -------
    bytes or None
        The raw response body of the first 200 response, or ``None`` once the
        attempt budget is exhausted.

    Notes
    -----
    The previous recursive version called itself without passing ``retry + 1``,
    so the limit was never reached and a permanently failing URL recursed
    without bound. The loop below enforces the limit.
    """
    global _SESSION

    for _attempt in range(retry, 5):
        # NOTE(review): verify=False disables TLS certificate verification.
        response = _SESSION.get(url, proxies=proxies, verify=False)

        if response.status_code == 200:
            return response.content

        print(str(response.status_code) + " Code, waiting 10s before retrying")
        time.sleep(10)
        # Start from a fresh session (new connections/cookies) for the next try.
        _SESSION = requests.Session()

    return None

In [None]:
def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) between sequences ``a`` and ``b``."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [None]:
def search_pubmed(query):
    """Search PubMed by title and return the matching PubMed ids.

    Parameters
    ----------
    query : str
        Title text to look for (sent with ``field=title``; at most 3 hits are
        requested via ``retmax=3``).

    Returns
    -------
    list
        The ``esearchresult.idlist`` entries from the response, or an empty
        list when the download failed or the body is not the expected JSON.
    """
    # Local import so the block is self-contained; the file's top-level imports
    # do not include urllib.parse.
    from urllib.parse import quote

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=3&term="
    end_url = "&field=title"

    # Percent-encode the whole term: titles may contain '&', '#', '+', '?' etc.,
    # which would otherwise be read as URL syntax and truncate the query.
    full_url = base_url + quote(query, safe="") + end_url

    content = get_page_content(full_url)
    if content is None:
        return []

    try:
        return json.loads(content)['esearchresult']['idlist']
    except (ValueError, KeyError, TypeError):
        # ValueError: body was not JSON; KeyError/TypeError: unexpected shape.
        return []

In [None]:
def get_pubmed_pub(pubid):
    """Fetch the PubMed esummary record for ``pubid``.

    Returns the decoded JSON document, or an empty list when the page could
    not be downloaded.
    """
    raw = get_page_content(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&db=pubmed&id=" + pubid
    )

    # An empty list is the "download failed" sentinel.
    return [] if raw is None else json.loads(raw)

In [None]:
def search_scopus(query):
    """Run a Scopus title search for ``query`` and return the search object's ``_EIDS``.

    The search is issued as ``Title(<query>)`` with ``refresh=True`` and
    ``count=10``.
    """
    title_query = 'Title(' + query + ')'
    return scopus.ScopusSearch(title_query, refresh=True, count=10)._EIDS

In [None]:
def get_scopus_pub(pubid):
    """Build a ``ScopusAbstract`` for ``pubid``.

    Returns the abstract object, or ``None`` when constructing it raises an
    ordinary exception (lookup is best-effort).
    """
    try:
        return ScopusAbstract(pubid)
    except Exception:
        # Deliberately not a bare ``except:``: KeyboardInterrupt / SystemExit
        # must still propagate so a long scraping run can be stopped.
        return None

In [None]:
def check_pubmed(base_row, pub):
    """Look up publication title ``pub`` on PubMed and build an output row for it.

    Parameters
    ----------
    base_row : list
        Leading cells of the output row; it is copied, never modified.
    pub : str
        Publication title to search for.

    Returns
    -------
    list or None
        ``base_row`` followed by: title, authors joined with " and ", the year
        (first four characters of ``pubdate`` as int), source, the raw summary
        as a JSON string, today's date (YYYY-MM-DD), ``pmcrefcount`` and the
        label "Pub Med". ``None`` when no candidate's title similarity exceeds
        the module-level ``threshold``.
    """
    print("\t\tChecking PubMed..")

    pubmed_info = None
    pubmed_id = None

    for pub_id in search_pubmed(pub):
        pubmed_json = get_pubmed_pub(pub_id)
        try:
            candidate_title = pubmed_json["result"][pub_id]["title"]
        except (KeyError, TypeError, IndexError):
            # get_pubmed_pub returns [] when the download failed, and a summary
            # may lack the id or its title; skip such candidates instead of
            # aborting the whole run.
            continue

        if similar(candidate_title, pub) > threshold:
            pubmed_info = pubmed_json
            pubmed_id = pub_id
            break

    if pubmed_info is None:
        return None

    print("\t\tFound new publication")
    record = pubmed_info["result"][pubmed_id]

    new_row = base_row[:]
    new_row.append(record["title"])
    new_row.append(" and ".join([z['name'] for z in record["authors"]]))
    new_row.append(int(record['pubdate'][0:4]))
    new_row.append(record["source"])
    new_row.append(json.dumps(pubmed_info))
    new_row.append(datetime.datetime.today().strftime('%Y-%m-%d'))
    new_row.append(record["pmcrefcount"])
    new_row.append("Pub Med")

    return new_row
        

In [None]:
# NOTE(review): search_scopus and get_scopus_pub are defined in earlier cells but
# only look up `scopus` / `ScopusAbstract` when they are called, so this cell has
# to run before either of them is invoked.
import scopus
from scopus import ScopusAbstract
# presumably loads the Scopus API credentials -- TODO confirm against the
# installed scopus package
scopus.load_api_key()

def check_scopus(base_row, pub):
    """Look up publication title ``pub`` on Scopus and build an output row for it.

    Parameters
    ----------
    base_row : list
        Leading cells of the output row; it is copied, never modified.
    pub : str
        Publication title to search for.

    Returns
    -------
    list or None
        ``base_row`` followed by: title, author names joined with " and ", the
        year (first four characters of ``coverDate`` as int), publication name,
        a JSON dump of the abstract object, today's date (YYYY-MM-DD),
        ``citedby_count`` and the label "Scopus". ``None`` when no candidate's
        title similarity exceeds the module-level ``threshold``.
    """
    print("\t\tChecking Scopus..")

    scopus_info = None

    for pub_id in search_scopus(pub):
        try:
            candidate = get_scopus_pub(pub_id)
        except Exception:
            # Best-effort lookup; KeyboardInterrupt is intentionally not caught.
            continue

        if candidate is None:
            continue

        candidate_title = candidate.title
        if not isinstance(candidate_title, str):
            # similar() needs two sequences; skip abstracts without a title.
            continue

        if similar(candidate_title, pub) > threshold:
            scopus_info = candidate
            break

    if scopus_info is None:
        return None

    print("\t\tFound new publication")

    # The previous code iterated over the characters of the first author's
    # given_name and indexed each with ['name'], which always raised TypeError.
    # assumes every author object exposes given_name / surname -- TODO confirm
    # against the installed scopus package
    author_names = []
    for author in scopus_info.authors or []:
        name_parts = [getattr(author, "given_name", None), getattr(author, "surname", None)]
        full_name = " ".join(part for part in name_parts if part)
        if full_name:
            author_names.append(full_name)

    new_row = base_row[:]
    new_row.append(scopus_info.title)
    new_row.append(" and ".join(author_names))
    new_row.append(int(scopus_info.coverDate[0:4]))
    new_row.append(scopus_info.publicationName)
    # The abstract is a plain object, not JSON data; default=str stores its
    # str() form instead of raising TypeError.
    new_row.append(json.dumps(scopus_info, default=str))
    new_row.append(datetime.datetime.today().strftime('%Y-%m-%d'))
    new_row.append(scopus_info.citedby_count)
    new_row.append("Scopus")

    return new_row

In [None]:
from IPython.display import clear_output


In [None]:
from tqdm import tqdm_notebook as tqdm

In [None]:
# Minimum SequenceMatcher ratio for two titles to count as the same publication.
# Also read as a module-level global by check_pubmed / check_scopus.
threshold = .9

# Output frame with the same columns as scraped_df; one row per newly found publication.
new_scraped_df = pd.DataFrame(columns=scraped_df.columns)
# To resume from an earlier run instead of starting empty:
# new_scraped_df = pd.read_csv("pub_med_cv.csv", index_col=0)

for i, row in nescent_df.iterrows():
    clear_output()
    print("Author " + str(i) + " of " + str(nescent_df.shape[0]))

    unconfirmed_pubs = row.unconfirmed_publications
    initials = row["Google Scholar Middle Initial"]

    # Titles already scraped for this author. Non-string cells (NaN) are dropped
    # because similar() cannot compare them.
    scholar_pubs = [
        title
        for title in scraped_df.loc[scraped_df["initials"] == initials, "publication"]
        if isinstance(title, str)
    ]

    name = row["Input.name"].replace('-', ' ')
    base_row = [name, initials, row["Input.university"], row["Input.department"], row["Input.discipline"]]

    # A missing cell is read back by pandas as float NaN; only real strings are split.
    if isinstance(unconfirmed_pubs, str):
        unconfirmed_pubs_split = set(unconfirmed_pubs.split(', '))
        for j, pub in enumerate(unconfirmed_pubs_split):
            print("\tPublication " + str(j) + " of " + str(len(unconfirmed_pubs_split)))

            # Skip titles that are already present in the scraped data.
            if any(similar(scholar_pub, pub) > threshold for scholar_pub in scholar_pubs):
                print("\t\tAlready in GS")
                continue

            # Search the other sources: PubMed first, Scopus as fallback.
            new_row = check_pubmed(base_row, pub)
            if new_row is None:
                new_row = check_scopus(base_row, pub)

            # Append only a row produced for *this* publication. Previously the
            # check ran even for already-known titles, so a stale new_row from
            # the preceding publication was appended again (or NameError was
            # raised when no lookup had run yet).
            if new_row is not None:
                new_scraped_df.loc[new_scraped_df.shape[0]] = new_row

    # Checkpoint after every author so an interrupted run keeps its results.
    new_scraped_df.to_csv("pub_med_cv.csv")


                