In [None]:
#To improve, add key word cross validation! 
# To improve, add papers that cite the original paper to create the author corpus! 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Bio import Entrez
from Bio import Medline

%matplotlib inline

In [4]:
Entrez.email = "ajwright@gmail.com"
Entrez.api_key = "86ac8038bfc913213f007df2803127ebc908" 

In [5]:
def user_entered_info():
    """Stores user-provided scientist name and affiliation.
    
    Arguments:
    none
    
    Returns:
    name - str; scientist name in the format "firstname middleinit lastname"
    affiliation - str; scientist's institutional affiliation
    """
    print("Type the answer to each question then press return.  If you do not know the answer, just hit return.")
    first_name = input("What is the first name of the scientist of interest? ")
    middle_name = input("What is the middle initial of the scientist of interest? ")
    try:
        middle_initial = middle_name[0]
    except IndexError:
        middle_initial = ""
    last_name = input("What is the last name of the scientist of interest? ")
    affliation = input("What is the affiliation of the scientist of interest? No abbreviations, please. ")
    name = "{0} {1} {2}".format(first_name.lower(), middle_initial.lower(), last_name.lower())
    return name, affliation

In [6]:
name, affiliation = user_entered_info()

Type the answer to each question then press return.  If you do not know the answer, just hit return.
What is the first name of the scientist of interest? Carolyn
What is the middle initial of the scientist of interest? G
What is the last name of the scientist of interest? Rasmussen
What is the affliation of the scientist of interest? No abbreviations please. University of California Riverside


In [7]:
print(name, affiliation)

carolyn g rasmussen University of California Riverside


In [8]:
# Given a scientist's name and optional affliation, retrive a list of their papers from pubmed

# package as a dictionary?
def get_scientist_papers(name, affiliation=None):
    """Search PubMed for papers whose author list and affiliation list contain the provided author name and
    affiliation. 
    
    Arguments:
    name - str; complete scientist name in the format "lastname, firstname middleinitial"
    affiliation (optional) - str
    
    Returns:
    ids - list; list of paper IDs
    webenv - str; used to reference cached NCBI search session in future efetch queries
    query_key - str; used to reference cached NCBI search session in future efetch queries
    """
    if affiliation == None:
        handle = Entrez.esearch(db='pubmed', term=name, retmax=200, usehistory="y")
    else: 
        terms = "{} AND {}".format(name, affiliation)
        handle = Entrez.esearch(db='pubmed', term=terms, retmax=200, usehistory="y")
    record = Entrez.read(handle)
    ids = record['IdList']
    webenv = record["WebEnv"]
    query_key = record["QueryKey"]
    return ids, webenv, query_key

In [9]:
id_list, webenv, query_key = get_scientist_papers(name, affiliation)

In [10]:
# Allow the user to select up to 3 papers by the scientist

# need to return the number of references as well! 

# To do:  write a new function that returns a citation and use it in this function 

def user_selected_papers(id_list, webenv, query_key):
    """Allows the user to select up to 3 papers authored by a scientist of interest. Uses NCBI cached search
    history.
    
    Arguments:  
    id_list - list; paper ids
    webenv - str; used to reference cached NCBI search session in efetch queries
    query_key - str; used to reference cached NCBI search session in efetch queries
    
    Returns 
    select_list - list; paper ids of user selected papers
    """ 
    from Bio import Medline
    print("Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.")
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype='medline', retmode='text', webenv=webenv, query_key=query_key)
    records = Medline.parse(handle)
    for index, record in enumerate(records, 1):
        print("{}. {} {}. {}. {}. ({})".format(index, record.get("TI", "?"), record.get("AU", "?"), record.get("JT", "?"),
                                         record.get("DP", "?"), record.get("PMID", "?")))
    paper_num = input("Which papers would you like to select? ")
    paper_num = paper_num.split(',')
    #print(paper_num)
    while paper_num == ['']:
        paper_num = input("No papers selected.  Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.")
        paper_num = paper_num.split(',')
    while len(paper_num) > 4:
        paper_num = input("Too many papers selected.  Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.")
        paper_num = paper_num.split(',')
    select_list = []
    for num in paper_num:
        select_list.append(id_list[int(num)-1])
    return select_list

In [11]:
chosen_papers = user_selected_papers(id_list, webenv, query_key)

Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.
1. Cell-Based Model of the Generation and Maintenance of the Shape and Structure of the Multilayered Shoot Apical Meristem of Arabidopsis thaliana. ['Banwarth-Kuhn M', 'Nematbakhsh A', 'Rodriguez KW', 'Snipes S', 'Rasmussen CG', 'Reddy GV', 'Alber M']. Bulletin of mathematical biology. 2019 Aug. (30552627)
2. A plane choice: coordinating timing and orientation of cell division during plant development. ['Facette MR', 'Rasmussen CG', 'Van Norman JM']. Current opinion in plant biology. 2019 Feb. (30261337)
3. Predicting Division Planes of Three-Dimensional Cells by Soap-Film Minimization. ['Martinez P', 'Allsman LA', 'Brakke KA', 'Hoyt C', 'Hayes J', 'Liang H', 'Neher W', 'Rui Y', 'Roberts AM', 'Moradifam A', 'Goldstein B', 'Anderson CT', 'Rasmussen CG']. The Plant cell. 2018 Oct. (30150312)
4. The Microtubule-Associated Protein IQ67 DOMAIN5 Modulates Microtubule Dynamics and Pavement 

In [12]:
print(chosen_papers)

['30150312', '29146775', '28202734']


In [13]:
# Currently not used 
def get_citedin_refs(paper_id):
    """Returns paper ids corresponding to all the papers that cite a given paper.

    Arguments:
    paper_id - str; unique paper id
    
    Returns:
    ref_ids - list of strs; paper ids of the references
    """
    pubmed_refs = []
    ref_ids = []
    pub_records = Entrez.read(Entrez.elink(dbfrom="pubmed", id=paper_id))
    for entry in pub_records[0]["LinkSetDb"]:
        #print(entry)
        if entry["LinkName"] == 'pubmed_pubmed_citedin':
            pubmed_refs = entry["Link"]
    for ref in pubmed_refs:
        ref_ids.append(ref['Id'])      
    return ref_ids            

In [14]:
# Currently not used
def get_paper_refs(paper_id):
    """Returns paper ids corresponding to all the papers cited by a given paper.

    Arguments:
    paper_id - str; unique paper id
    
    Returns:
    ref_ids - list of strs; paper ids of the references
    """
    pubmed_refs = []
    ref_ids = []
    pub_records = Entrez.read(Entrez.elink(dbfrom="pubmed", id=paper_id))
    for entry in pub_records[0]["LinkSetDb"]:
        #print(entry)
        if entry["LinkName"] == 'pubmed_pubmed_refs':
            pubmed_refs = entry["Link"]
    for ref in pubmed_refs:
        ref_ids.append(ref['Id'])      
    return ref_ids 

In [None]:
print(len(get_citedin_refs("28202734")))
print(len(get_paper_refs("28202734")))

In [15]:
# Currently not used  
def compile_refs(paper_list):
    """Takes a list of paper IDs and returns a list of all the references they contain.  Alerts the user if the there
    are no references available for any of the papers.  
    
    Arguments:
    paper_list - list; paper IDs (str)
    
    Returns:
    all_ref_list - list; paper IDs (str)
    """
    all_ref_list = []
    for paper in paper_list:
        ref_list = get_paper_refs(paper)
        print("For paper {}, {} references were retrieved.".format(paper, len(ref_list)))
        all_ref_list.extend(ref_list)  
    return all_ref_list

In [16]:
# replaces the original get_paper_refs and compile_refs functions.  Uses epost and efetch to similify the process

def compile_refs_and_citedin(paper_list):
    """Takes a list of paper IDs and returns a list of the IDs for papers referenced in the original paper list.  The 
    reference list may be incomplete since PubMed does not provide references for all papers.  If concerned, run
    the related function 'compile_refs' to identify which papers are missing references.
    
    Arguments:
    paper_list - list; paper IDs (str)
    
    Returns:
    pubmed_refs - list; paper IDs (str)
    """
    str_paper_list = ",".join(paper_list)
    search_results = Entrez.read(Entrez.epost("pubmed", id=str_paper_list))
    query_key = search_results["QueryKey"]
    webenv = search_results["WebEnv"]
    pub_records = Entrez.read(Entrez.elink(dbfrom="pubmed", WebEnv=webenv, query_key=query_key))
    ref_ids = []
    for entry in pub_records[0]["LinkSetDb"]:
        if entry["LinkName"] == 'pubmed_pubmed_refs':
            refs = entry["Link"]
            try:
                for ref in refs:
                    ref_ids.append(ref['Id'])
            except UnboundLocalError:
                print("No references found")        
        if entry["LinkName"] == 'pubmed_pubmed_citedin':
            cites = entry["Link"]
            try:
                for cite in cites:
                    ref_ids.append(cite['Id'])
            except UnboundLocalError:
                print("No cited in found")
    print("Length ref_ids list", len(ref_ids))
    print("Length ref_ids set", len(set(ref_ids)))
    return list(set(ref_ids))

In [None]:
# if user is not happy with the number of references retrieved - need to have some way to restart the selection process. 

In [18]:
def get_first_last_authors(paper_id):
    """Given a paper, returns the first and last authors of the paper.
    
    Arguments:
    paper_id - str; paper ids
    
    Returns:
    authors - list of strs; list of full names of the first and last authors of the provided paper id
    """
    handle = Entrez.efetch(db='pubmed', id=paper_id, rettype='medline', retmode="text", retmax=200)
    record = Medline.read(handle)
    authors = record.get("FAU", "?")
    first_last_authors = [authors[0], authors[-1]]
    print(first_last_authors)
    return first_last_authors
    

In [19]:
def author_formatting(author_list):
    """Changes the formatting of author name strings to give the best PubMed search results. 
    
    Arguments:
    author_list - list of str; list of author names
    
    Returns:
    formatted_author_list - list of str; alphabetized list of author names formatted "lastname, firstname"
    """
    formatted_author_list = []
    for author in author_list:
        #print(author)
        last_name = author.split(',')[0]
        first_name = author.split(',')[1].lstrip()
        try:
            if first_name[1] == " ":
                try:
                    first_name = first_name[0] + first_name[-1]
                except:
                    first_name = first_name[0]
        except:            
            first_name = first_name[0]
        new_name = "{}, {}".format(last_name, first_name)
        #print(new_name)
        formatted_author_list.append(new_name)
    return sorted(formatted_author_list)

In [None]:
author_formatting(['Ambrose, J Christian'])

In [20]:
def remove_duplicates(sorted_list):
    """Takes a sorted list of authors and if 2 authors have the same last name and first initial, removes the name
    with only the first initial leaving behind the one that has the full first name. 
    
    Arguments:
    sorted_list - list; alphabetized list of authors in the format "lastname, firstname middleinit" OR 
    "lastname, firstinit middleinit"
    
    Returns:
    author_no_dup_list - list; alphabetized list of authors with duplicates removed 
    """
    author_no_dup_list = []
    most_recent_author = " , "
    for i, author in enumerate(sorted_list):
        if author.split(',')[0] == most_recent_author.split(',')[0]:
            if author.split(',')[1].lstrip()[0] != most_recent_author.split(',')[1].lstrip()[0]:
                author_no_dup_list.append(author)
                #print("(1) Compared {} to {} and appended {}".format(author, most_recent_author, author))
            else:
                #print("(2) Compared {} to {} and deleted {} and appended {}".format(author, most_recent_author, author_no_dup_list[-1], author)) 
                del author_no_dup_list[-1]
                author_no_dup_list.append(author)                       
        else:
            author_no_dup_list.append(author)
            #print("(3) Compared {} to {} and appended {}".format(author, most_recent_author, author))
        most_recent_author = author
    return author_no_dup_list 

In [21]:
# create a list of scientists who wrote the paper(s) referenced by or who cited the user selected scientist 

def create_master_biologist_list(paper_list):
    """Searches PubMed for all the papers cited by or that cites a paper on the paper list.  Returns a list
    of the first and last authors of those papers (duplicates removed).
    
    Arguments:
    paper_list - list; paper IDs (str)
    
    Returns: 
    biologist_master_list - list; biologist names (str)
    """
    ref_citedin_ids = compile_refs_and_citedin(paper_list)
    first_last_author_list = []   
    for paper in ref_citedin_ids:
        print("Getting the authors of paper {}.".format(paper))
        authors = get_first_last_authors(paper)
        first_last_author_list.extend(authors) 
    set_f_l_author_list = list(set(first_last_author_list)) 
    print(len(set_f_l_author_list))
    format_f_l_author_list = author_formatting(set_f_l_author_list)
    print(len(format_f_l_author_list))
    no_dup_f_l_author_list = remove_duplicates(format_f_l_author_list)
    print(len(no_dup_f_l_author_list))
    return no_dup_f_l_author_list

In [22]:
master_biologist_list = create_master_biologist_list(chosen_papers)

Length ref_ids list 168
Lenght ref_ids set 167
Getting the authors of paper 22045917.
['Van Damme, Daniel', 'Russinova, Eugenia']
Getting the authors of paper 11044722.
['Hellens, R', 'Klee, H']
Getting the authors of paper 18583534.
['Paredez, Alexander R', 'Somerville, Chris R']
Getting the authors of paper 16179950.
['Thery, Manuel', 'Bornens, Michel']
Getting the authors of paper 19825595.
['Ambrose, J Christian', 'Cyr, Richard']
Getting the authors of paper 22645544.
['Weizbauer, Renate', 'Schulz, Burkhard']
Getting the authors of paper 30742612.
['Moukhtar, Julien', 'Andrey, Philippe']
Getting the authors of paper 24258577.
['Lang Selker, J M', 'Green, P B']
Getting the authors of paper 22301654.
['Wadsworth, Patricia', 'Wadsworth, Patricia']
Getting the authors of paper 15155883.
['Sedbrook, John C', 'Somerville, Chris R']
Getting the authors of paper 30804009.
['Rui, Yue', 'Anderson, Charles T']
Getting the authors of paper 29394250.
['Chakrabortty, Bandan', 'Mulder, Bela M']
G

['Yuen, Christen Y L', 'Masson, Patrick H']
Getting the authors of paper 3867671.
['Gunning, B E', 'Wick, S M']
Getting the authors of paper 23299369.
['Peterson, Kylee M', 'Torii, Keiko U']
Getting the authors of paper 2391003.
['Becraft, P W', 'Freeling, M']
Getting the authors of paper 11738380.
['Migliaccio, F', 'Piconese, S']
Getting the authors of paper 17964159.
['Walker, Keely L', 'Smith, Laurie G']
Getting the authors of paper 2324196.
['Flanders, D J', 'Lloyd, C W']
Getting the authors of paper 27930326.
['Willis, Lisa', 'Jonsson, Henrik']
Getting the authors of paper 29317523.
['Szymanski, Dan', 'Sakamoto, Wataru']
Getting the authors of paper 27426272.
['Van Norman, Jaimie M', 'Van Norman, Jaimie M']
Getting the authors of paper 18932023.
['Sakaguchi, Jun', 'Fukuda, Hiroo']
Getting the authors of paper 11732054.
['Baskin, T I', 'Baskin, T I']
Getting the authors of paper 30987599.
['Jaquez-Gutierrez, Marybel', 'Moreno, Vicente']
Getting the authors of paper 7664733.
['Hemer

In [24]:
def create_biologist_paper_dict(biologist_list):
    """Takes a list of biologists and looks up the IDs of all the papers they authored in PubMed.  Returns
    a dictionary where the biologist's name is the key and the value is a list of their paper IDs. 
    
    Arguments:
    biologist_list - list; biologist names (str)
    
    Returns:
    biologist_paper_dict - dict; keys are biologist names (str) and the values are a list of IDs of the papers 
    authored by the biologist
    """
    biologist_papers_dict = {}
    for biologist in biologist_list[0:5]:
        biologist_nocomma = biologist.replace(',', '')
        print("Getting papers authored by {}.".format(biologist_nocomma))
        papers = get_scientist_papers(biologist_nocomma)[0]
        biologist_papers_dict[biologist] = papers
    zero_papers = []
    has_papers = []
    total = 0
    for key, value in biologist_papers_dict.items():
        if len(value) == 0:
            zero_papers.append(key)
        else:
            has_papers.append(key)
        total += len(value)
    print("total papers:", total)
    print("Zero papers were retrieved for the following authors:", zero_papers)
    print("More than one paper was retreieved for the following authors:", has_papers)
    return biologist_papers_dict


In [25]:
the_biologist_paper_dict = create_biologist_paper_dict(master_biologist_list)

Getting papers authored by Abrash Emily B.
Getting papers authored by Ambrose Jn.
Getting papers authored by Anderson Charles T.
Getting papers authored by Andrey Philippe.
Getting papers authored by Arganda-Carreras Ignacio.
Getting papers authored by Asada Tetsuhiro.
Getting papers authored by Azimzadeh Juliette.
Getting papers authored by Barton Mn.
Getting papers authored by Baskin TI.
Getting papers authored by Bassel George W.
Getting papers authored by Bassham Diane C.
Getting papers authored by Beauzamy Lena.
Getting papers authored by Becraft PW.
Getting papers authored by Beeckman Tom.
Getting papers authored by Beemster Gerrit T S.
Getting papers authored by Bellaiche Yohanns.
Getting papers authored by Bellinger Marschal.
Getting papers authored by Benfey Philip N.
Getting papers authored by Bent AF.
Getting papers authored by Bergmann Dominique C.
Getting papers authored by Besson Sebastien.
Getting papers authored by Bezanilla Magdalena.
Getting papers authored by Blesch 

Getting papers authored by Somerville Chris R.
Getting papers authored by Sozzani Rosangela.
Getting papers authored by Spinner Lara.
Getting papers authored by Stockle Dorothee.
Getting papers authored by Sugimoto Keiko.
Getting papers authored by Sylvester AW.
Getting papers authored by Szymanski Dan.
Getting papers authored by Takada Shinobu.
Getting papers authored by Thery Manuel.
Getting papers authored by Thitamadee Siripong.
Getting papers authored by Tomancak Pavel.
Getting papers authored by Torii Keiko U.
Getting papers authored by Van Damme Daniel.
Getting papers authored by Van Norman Jaimie M.
Getting papers authored by Vofely Roza V.
Getting papers authored by Voxeur Aline.
Getting papers authored by Wachsman Guy.
Getting papers authored by Wadsworth Patricia.
Getting papers authored by Walbot Virginia.
Getting papers authored by Walker Keely L.
Getting papers authored by Wang H.
Getting papers authored by Wasteneys Geoffrey O.
Getting papers authored by Weijers Dolf.
Ge

In [30]:
def get_and_compile_refs(paper_list):
    """Takes a list of paper IDs and returns a list of the IDs of the papers referenced by papers in the original list.
    The reference list may be incomplete since PubMed does not provide references for all papers.  If concerned, run
    the related function 'compile_refs' to identify which papers are missing references.
    
    Arguments:
    paper_list - list, paper IDs (str)
    
    Returns:
    pubmed_refs - list, paper IDs 
    """
    str_paper_list = ",".join(paper_list)
    search_results = Entrez.read(Entrez.epost("pubmed", id=str_paper_list))
    query_key = search_results["QueryKey"]
    webenv = search_results["WebEnv"]
    pub_records = Entrez.read(Entrez.elink(dbfrom="pubmed", WebEnv=webenv, query_key=query_key))
    for entry in pub_records[0]["LinkSetDb"]:
        if entry["LinkName"] == 'pubmed_pubmed_refs':
            pubmed_refs = entry["Link"]
    ref_ids = []
    try:
        for ref in pubmed_refs:
            ref_ids.append(ref['Id'])
    except UnboundLocalError:
        print("No references found")
    return ref_ids

In [31]:
def create_biologist_cited_papers_dict(biologist_paper_dict):
    """Takes a dictionary of biologists and the papers they wrote, looks up each paper in PubMed and returns a new
    dictionary containing the biologist and a list of all the papers they cite(reference) within the papers they wrote. 
    
    Arguments:
    biologist_paper_dict - dict; keys are biologist's names (str) and the values are a list of IDs of the papers 
    authored by the biologist
    
    Returns:
    biologist_cited_papers_dict - dict; keys are biologist's names (str) and the values are a list of IDS of papers
    cited by the biologist 
    """
    biologist_cited_papers_dict = {}
    for key, value in biologist_paper_dict.items():
        print("Looking up papers cited by ", key)
        try:
            paper_list = get_and_compile_refs(value)
            biologist_cited_papers_dict[key] = paper_list
        except:
            biologist_cited_papers_dict[key] = []
    return biologist_cited_papers_dict


In [32]:
the_biologist_cited_papers_dict = create_biologist_cited_papers_dict(the_biologist_paper_dict)

Looking up papers cited by  Abrash, Emily B
Looking up papers cited by  Ambrose, Jn
Looking up papers cited by  Anderson, Charles T
Looking up papers cited by  Andrey, Philippe
Looking up papers cited by  Arganda-Carreras, Ignacio
Looking up papers cited by  Asada, Tetsuhiro
Looking up papers cited by  Azimzadeh, Juliette
Looking up papers cited by  Barton, Mn
No references found
Looking up papers cited by  Baskin, TI
Looking up papers cited by  Bassel, George W
Looking up papers cited by  Bassham, Diane C
Looking up papers cited by  Beauzamy, Lena
Looking up papers cited by  Becraft, PW
Looking up papers cited by  Beeckman, Tom
Looking up papers cited by  Beemster, Gerrit T S
Looking up papers cited by  Bellaiche, Yohanns
Looking up papers cited by  Bellinger, Marschal
Looking up papers cited by  Benfey, Philip N
Looking up papers cited by  Bent, AF
Looking up papers cited by  Bergmann, Dominique C
Looking up papers cited by  Besson, Sebastien
Looking up papers cited by  Bezanilla, Ma

Looking up papers cited by  Shao, Wanchen
Looking up papers cited by  Shapiro, Bruce E
Looking up papers cited by  Shen, Li
Looking up papers cited by  Smertenko, Andrei P
Looking up papers cited by  Smith, Laurie G
Looking up papers cited by  Smith, Richard S
Looking up papers cited by  Somerville, Chris R
Looking up papers cited by  Sozzani, Rosangela
Looking up papers cited by  Spinner, Lara
Looking up papers cited by  Stockle, Dorothee
Looking up papers cited by  Sugimoto, Keiko
Looking up papers cited by  Sylvester, AW
Looking up papers cited by  Szymanski, Dan
Looking up papers cited by  Takada, Shinobu
Looking up papers cited by  Thery, Manuel
Looking up papers cited by  Thitamadee, Siripong
Looking up papers cited by  Tomancak, Pavel
Looking up papers cited by  Torii, Keiko U
Looking up papers cited by  Van Damme, Daniel
Looking up papers cited by  Van Norman, Jaimie M
Looking up papers cited by  Vofely, Roza V
Looking up papers cited by  Voxeur, Aline
Looking up papers cited b

In [38]:
# combine all the cited papers (removing duplicates) to create the feature set aka repo vocabulary

def create_paper_features_list(biologist_cited_papers_dict):
    """Takes a dictionary of cited papers and combines them into a list with no duplicates.
    
    Arguments:
    biologist_cited_papers_dict - dict; keys are biologist's names (str) and the values are a list of paper IDs cited in 
    papers authored by the biologist
    
    Returns:
    paper_features - list; list of all the papers cited by every author in the starting dict (no duplicates)
    """
    paper_features_temp = []
    for value in biologist_cited_papers_dict.values():
        paper_features_temp.extend(value)
    print(len(paper_features_temp))    
    paper_features = list(set(paper_features_temp))
    print(len(paper_features))
    return paper_features
          

In [39]:
the_paper_features_list = create_paper_features_list(the_biologist_cited_papers_dict)

220727
137462


In [43]:
# run every author against the feature set to create a binary vector that contains a 1 if they cited a paper or a 0 if not

def create_binary_feature_vectors(biologist_cited_papers_dict, paper_features_list):
    """Builds a set of feature vectors for each biologist (key) in the biologist_cited_papers_dict by looking to see 
    if each paper in the paper_feature_list is cited by the biologist (a list of cited papers is the value associated 
    with each biologist key).  Adds a one to the feature vector if the paper is cited and a 0 if it is not. 
    
    Arguments:
    biologist_cited_papers_dict - dict; keys are biologist's names (str) and the values are a list of paper IDs cited 
    in papers authored by the biologist 
    paper_features_list - list; list of all the papers cited by every author in the starting dict
    
    Returns:
    all_binary_feature_vectors - list of lists; lists of 1s and 0s that indicate if a biologist cited a paper in the 
    paper_features_list or not
    """
    all_binary_feature_vectors = []
    for key, value in biologist_cited_papers_dict.items():
        biologist_vector = []
        for paper in paper_features_list:
            if paper in value:
                biologist_vector.append(1)
            else:
                biologist_vector.append(0)
        all_binary_feature_vectors.append(biologist_vector)
    return all_binary_feature_vectors

In [44]:
the_binary_feature_vectors = create_binary_feature_vectors(the_biologist_cited_papers_dict, the_paper_features_list)

In [51]:
def create_comparison_binary_vector(paper_list, paper_features_list):
    """Builds a feature vector for the originating set of papers by looking to see if each paper in the paper_
    features_list is cited by any of the originating set of papers. Adds a one to the feature vector if the paper is 
    cited and a 0 if it is not. 
    
    Arguments:
    paper_list - list; list of paper IDs (str) corresponding to the originating set of papers
    paper_features_list - list; list of all the papers cited by every author in the biologist_cited_papers_dict
    
    Returns:
    comparision_binary_vector - list; list of 1s and 0s that indicate if the originating set of papers cited a paper
    in the paper_features_list or not
    """
    comparison_binary_vector = []
    comparison_refs = get_and_compile_refs(chosen_papers)
    for i in paper_features_list:
        if i in comparison_refs:
            comparison_binary_vector.append(1)
        else:
            comparison_binary_vector.append(0)
    return comparison_binary_vector        

In [52]:
the_comparision_binary_vector = create_comparison_binary_vector(chosen_papers, the_paper_features_list)

In [69]:
def create_biologist_finder_df(binary_feature_vectors, paper_features_list, biologist_cited_papers_dict, comparison_vector):
    """Takes the binary feature vectors and comparison binary vector and builds a pandas dataframe that has biologist's names
    as the index and paper IDs as the columns.  Dataframe is filled with 1s and 0s to indicate if a biologist has cited
    the column paper or not. The last row contains the data for the originating set of papers. 
    
    Arguments: 
    binary_feature_vectors - list of lists; lists of 0s and 1s that indicate if a biologist cited a paper in the 
    paper_features_list
    paper_features_list - list; list of all the papers cited by every author in the starting dict
    biologist_cited_papers_dict - dict; keys are biologist names (str) and the values are a list of paper IDS 
    corresponding to papers cited by the biologist
    comparison_vector - list; list of 0s and 1s that indicates if the originating set of papers cited a paper
    in the paper_features_list
    
    Returns:
    final_df - Pandas df; has biologist's names as the index and paper IDs as the columns.  Dataframe is filled with 
    1s and 0s to indicate if a biologist has cited the column paper or not. The last row represents the originating 
    set of papers and is indexed as "comparison" 
    """
    temp_df = pd.DataFrame(binary_feature_vectors, columns=paper_features_list, index=biologist_cited_papers_dict.keys())
    comp_series = pd.Series(comparison_vector).to_frame("comparison").T
    comp_series.columns = temp_df.columns
    final_df = pd.concat([temp_df, comp_series])
    final_df.dropna(axis=0, inplace=True)
    return final_df

In [70]:
bf_df = create_biologist_finder_df(the_binary_feature_vectors, the_paper_features_list, the_biologist_cited_papers_dict, 
                                   the_comparision_binary_vector, name)

In [71]:
bf_df.shape

(238, 137462)

In [73]:
bf_df.head()

Unnamed: 0,18337155,20379956,20974816,15048123,24249263,30178601,20625003,25657044,20926419,23711327,...,23970914,11877373,12239373,24012599,26523777,24921257,23318657,25089249,11430871,12195433
"Abrash, Emily B",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Ambrose, Jn",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Anderson, Charles T",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Andrey, Philippe",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Arganda-Carreras, Ignacio",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
bf_df.tail()

Unnamed: 0,18337155,20379956,20974816,15048123,24249263,30178601,20625003,25657044,20926419,23711327,...,23970914,11877373,12239373,24012599,26523777,24921257,23318657,25089249,11430871,12195433
"Zhu, Chuanmei",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"van Dop, Maritza",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"van der Zaal, BJ",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"von Wangenheim, Daniel",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
comparision,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
from fastparquet import write

write('Rasmussen_ver3_df.parq',bf_df)

In [None]:
# Find out which scientists are closest to the original scientist.

from sklearn.metrics import jaccard_similarity_score
from scipy.stats import pearsonr

In [77]:
def create_similarity_scores_df(biologist_finder_df):
    """
    """
    from sklearn.metrics import jaccard_similarity_score
    from scipy.stats import pearsonr
    sim_score = {}
    for i in range(len(biologist_finder_df)):
        score = pearsonr(biologist_finder_df.iloc[-1,:], biologist_finder_df.iloc[i, :])
        sim_score.update({i: score[0]})
    sim_df = pd.Series(sim_score).to_frame("similarity")
    sim_df["Scientist"]= biologist_finder_df.index
    sorted_sim_df = sim_df.sort_values('similarity', ascending=False)
    return sorted_sim_df

In [78]:
ss_df = create_similarity_scores_df(bf_df)



In [80]:
ss_df.shape

(238, 2)

In [None]:
top_15_per = int(author_num * .15)
print(top_15_per)
sorted_sim_df.head(top_15_per)


In [None]:
# function to return a list of the X% of scientists on the list

def return_most_sim_biologists(similarity_df, per):
    """
    """
    num_biologists = similarity_df.shape[0]
    top_per = int(num_biologists * per)
    print(similarity_df.head(top_per)
    return similarity_df.head(top_per)

In [None]:
def get_citations(rec_paper_list):
    """
    """
    id_list = ",".join(rec_paper_list)
    search_results = Entrez.read(Entrez.epost("pubmed", id=id_list))
    query_key = search_results["QueryKey"]
    webenv = search_results["WebEnv"]
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype='medline', retmode='text', webenv=webenv, query_key=query_key)
    records = Medline.parse(handle)
    for index, record in enumerate(records, 1):
        print("{}. {} {}. {}. {}. ({})".format(index, record.get("TI", "?"), record.get("AU", "?"), record.get("JT", "?"),
                                         record.get("DP", "?"), record.get("PMID", "?")))

In [None]:
# function to return a list of the shared citations of the X% of scientists on the list

def return_most_cited_papers(biologist_finder_df, most_sim_biologist_df):
    """
    """
    top_per_biologists = list(most_sim_biologist_df.iloc[1:, 1])
    num_top_biologists = len(top_per_biologists)
    most_cited_papers = biologist_finder_df.loc[top_per_biologist, :][biologist_finder_df.loc[top_per_biologist, :] ==1].fillna(0).T
    most_cited_papers['sum'] = most_cited_papers.sum(axis=1)
    most_cited_papers_sort = most_cited_papers.sort_values('sum', ascending=False)
    
    total_num_papers = most_cited_papers_sort.shape[0]
    print("{} papers were cited by at least one of the most similar biologists.".format(total_num_papers))
    25_per_df = top_per_biologist *.25
    print("{} papers were cited at least once by 25% of the most similar biologists".format())
    print("{} papers were cited at least once by 50% of the most similar biologists".format())
    print("{} papers were cited at least once by 75% of the most similar biologists".format())
    print("{} papers were cited at least once by 100% of the most similar biologists".format())
    print("Papers are now sorted by most cited to least cited.")
    num_papers = input("How many papers do you want on the recommended reading list?")
    get_citations(most_cited_papers_sort[iloc[num_papers, :].index.to_list())
    

In [None]:
# function for interaction

In [None]:
top_15per_scientists = list(sorted_sim_df.iloc[1:top_15_per+1, 1])

In [None]:
recommendations_15 = final_df.loc[top_15per_scientists, :][final_df.loc[top_15per_scientists, :]==1].fillna(0).T

In [None]:
recommendations_15.shape

In [None]:
recommendations_15['sum'] = recommendations_15.sum(axis=1)

In [None]:
recommendations_15_sort = recommendations_15.sort_values('sum', ascending=False)

In [None]:
rank = recommendations_15_sort.iloc[30, -1]
rank

In [None]:
def get_citations(rec_paper_list):
    id_list = ",".join(rec_paper_list)
    search_results = Entrez.read(Entrez.epost("pubmed", id=id_list))
    query_key = search_results["QueryKey"]
    webenv = search_results["WebEnv"]
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype='medline', retmode='text', webenv=webenv, query_key=query_key)
    records = Medline.parse(handle)
    for index, record in enumerate(records, 1):
        print("{}. {} {}. {}. {}. ({})".format(index, record.get("TI", "?"), record.get("AU", "?"), record.get("JT", "?"),
                                         record.get("DP", "?"), record.get("PMID", "?")))


get_citations(recommendations_15_sort[recommendations_15_sort["sum"] >= rank].index.to_list())