#### Researchers for Pilot Study

Numbers in parentheses are Semantic Scholar author IDs.

Ben Gurion University Psychology Department
- Yoav Kessler (5093570)
- Almog Simchon (5526046)
- Andrea Berger (39434094)
- Florina Uzefovsky (3983316)
- Nachshon Meiran (3240089)

Harvard Cognition, Brain, and Behavior (CBB) group
- Steven Pinker (2693903)
- Daniel L. Schacter (10129966)
- Tomer D. Ullman (37774552)
- Samuel J. Gershman (1831199)
- Joshua D. Greene (21112145)

Max Planck Institute for Human Cognitive and Brain Sciences (heads of independent groups)
- Falk Eippert (2983957)
- Veronika Engert (2701472)
- Roland G Benoit (2523627)
- Lars Meyer (145243956)
- Stephanie Theves (4663606)

MRC Cognition and Brain Sciences Unit (Programme Leaders)
- Michael Anderson (145540877)
- Duncan Astle (2339620)
- Kate Baker (32328650)
- John Duncan (145822452)
- Tim Dalgleish (2193978)

In [213]:
import os  # for creating the output directory
import time  # for pausing between rate-limited API requests
from datetime import datetime  # for dealing with dates

import numpy as np
import pandas as pd
import requests # for accessing API

In [226]:
# HTTP headers sent with every Semantic Scholar API request.
# The API key is issued by request; this one allows for 100 requests / second.
headers = {
    'x-api-key': '***REMOVED***',
}

In [227]:
def _get_with_retry(url, max_retries=5):
    """
    GET `url` using the module-level `headers`, retrying on HTTP 429.

    While the response status is 429, wait one second and repeat the request,
    at most `max_retries` times. The last response is returned whatever its
    status, so callers still have to check it.
    """
    response = requests.get(url, headers=headers)
    for _ in range(max_retries):
        if response.status_code != 429:
            break
        print("Waiting 1 second for access to the API...")
        time.sleep(1)
        response = requests.get(url, headers=headers)
    return response


def _publication_date(paper):
    """
    Return the publication date of a paper JSON record as a datetime.

    Uses 'publicationDate' (YYYY-MM-DD) when present, otherwise falls back to
    January 1st of 'year'. Returns None when neither is available.
    """
    if paper.get('publicationDate') is not None:
        return datetime.strptime(paper['publicationDate'], '%Y-%m-%d')
    if paper.get('year') is not None:
        return datetime.strptime(str(paper['year']), '%Y')
    return None


def get_career(authorId, name):
    """
    Given authorId, retrieve paperId, title, authors, embedding, references, citations,
    and publication date for each publication.

    Parameters:
        authorId: Semantic Scholar author id, interpolated into the author-papers URL.
        name: scholar name, stored in the 'scholar_name' column of every row.

    Output: dataframe with one row per publication. On top of the columns returned
    by the author-papers endpoint it has 'scholar_name', 'authors', 'embedding',
    'date', 'references' and 'citations'. Papers whose detail request returns 404
    keep None in the last five columns; 'embedding' and 'date' are also None when
    the API does not provide them.

    Only a single request with limit=1000 is made for the publication list; no
    further pages are fetched.

    Raises:
        requests.HTTPError: if the author request, or a paper request other than a
            404, still fails after the rate-limit retries.
    """
    # list of publications (+title) as dataframe
    author_url = "https://api.semanticscholar.org/graph/v1/author/%s/papers?limit=1000" % authorId
    author_response = _get_with_retry(author_url)
    # Fail with the HTTP error itself rather than a KeyError on the missing 'data' key.
    author_response.raise_for_status()
    pubs_df = pd.DataFrame(author_response.json()['data'])

    n_pubs = len(pubs_df.index)
    print("Estimated time to retrieve "+str(n_pubs)+" publications: "+str(1*n_pubs/100)+" minutes")
    pubs_df['scholar_name'] = name

    # Per-paper values are collected in plain lists and attached as columns at the
    # end: writing into the rows yielded by DataFrame.iterrows() is not guaranteed
    # to modify the dataframe.
    columns = ['authors', 'embedding', 'date', 'references', 'citations']
    collected = {column: [] for column in columns}
    paper_url = "https://api.semanticscholar.org/graph/v1/paper/%s?fields=year,authors,publicationDate,embedding,references,citations"

    # loop through publications, getting embedding for each
    # (.get covers an author with no publications, where there is no 'paperId' column)
    for paper_id in pubs_df.get('paperId', []):
        details = dict.fromkeys(columns)  # stays all-None if the paper is not found
        paper_response = _get_with_retry(paper_url % paper_id)
        if paper_response.status_code != 404:
            paper_response.raise_for_status()
            paper = paper_response.json()
            embedding = paper.get('embedding')
            details = {
                'authors': paper.get('authors'),
                # the embedding object itself may be null for some papers
                'embedding': embedding['vector'] if embedding else None,
                'date': _publication_date(paper),
                'references': paper.get('references'),
                'citations': paper.get('citations'),
            }
        for column in columns:
            collected[column].append(details[column])

    for column in columns:
        # dtype=object keeps lists / datetime objects as single cell values.
        pubs_df[column] = pd.Series(collected[column], index=pubs_df.index, dtype=object)
    return pubs_df

In [228]:
def add_references_vec(pubs_df):
    """
    Add a 'references_vec' column listing the paperIds referenced by each publication.

    Each cell of the 'references' column is expected to be a list of dicts with a
    'paperId' key; entries whose 'paperId' is None are dropped. Rows whose
    'references' cell is not a list (e.g. None for a paper that could not be
    retrieved) get an empty list instead of raising.

    The dataframe is modified in place and also returned.
    """
    pubs_df['references_vec'] = [
        [ref['paperId'] for ref in refs if ref['paperId'] is not None]
        if isinstance(refs, (list, tuple)) else []
        for refs in pubs_df['references']
    ]
    return pubs_df

In [240]:
# Scholars included in the pilot study, grouped by institution / research group.
# Structure: {group name: {scholar name: author id}}.
# Each id is passed to get_career() as `authorId`, i.e. it is the id used in the
# Semantic Scholar author endpoint URL.
pilot_scholars = {
    'Ben Gurion University Psychology Department': {
        'Yoav Kessler': 5093570,
        'Almog Simchon': 5526046,
        'Andrea Berger': 39434094,
        'Florina Uzefovsky': 3983316,
        'Nachshon Meiran': 3240089
    },
    'Harvard CBB Group': {
        'Steven Pinker': 2693903,
        'Daniel L Schacter': 10129966,
        'Tomer D Ullman': 37774552,
        'Samuel J Gershman': 1831199,
        'Joshua D Greene': 21112145
    },
    'Max Planck Institute for Human Cognitive and Brain Sciences': {
        'Falk Eippert': 2983957,
        'Veronika Engert': 2701472,
        'Roland G Benoit': 2523627,
        'Lars Meyer': 145243956,
        'Stephanie Theves': 4663606
    },
    'MRC Cognition and Brain Sciences Unit': {
        'Michael Anderson': 145540877,
        'Duncan Astle': 2339620,
        'Kate Baker': 32328650,
        'John Duncan': 145822452,
        'Tim Dalgleish': 2193978
    }
}

In [None]:
# Build one CSV per pilot scholar: fetch the career dataframe, expand the
# embedding and reference vectors into one column per element, add citation
# and author counts, and write the result to data/<scholar>_career.csv.

# Make sure the output directory exists before spending time on API requests.
os.makedirs('data', exist_ok=True)

for group, scholars in pilot_scholars.items():
    for scholar, author_id in scholars.items():
        d = get_career(author_id, scholar)
        d = add_references_vec(d)
        d['group'] = group
        d['scholar_name'] = scholar
        d['authorId'] = author_id
        # Prefix shared by the expanded column names and the output file name
        prefix = scholar.replace(" ", "_").lower()
        # Expand vectors to multiple columns. Column names are numbered from 1 and
        # sized from the data, so a scholar with no embeddings (or a different
        # embedding length) does not cause a length mismatch.
        embeddings = d.embedding.apply(pd.Series)
        embeddings.columns = [(prefix + '_embedding_{}'.format(x)) for x in range(1, len(embeddings.columns) + 1)]
        references = d.references_vec.apply(pd.Series)
        references.columns = [(prefix + '_reference_{}'.format(x)) for x in range(1, len(references.columns) + 1)]
        # Count citations and authors (entries without an id are not counted;
        # a missing list counts as zero)
        d['n_citations'] = [
            sum(1 for x in cited if x['paperId'] is not None) if isinstance(cited, (list, tuple)) else 0
            for cited in d['citations']
        ]
        d['n_authors'] = [
            sum(1 for x in authors if x['authorId'] is not None) if isinstance(authors, (list, tuple)) else 0
            for authors in d['authors']
        ]
        d = pd.concat([d, embeddings, references], axis = 1)
        # Save dataset to csv
        d.to_csv('data/' + prefix + '_career.csv')