In [None]:
# To improve, add key word cross validation! 

In [1]:
import pandas as pd
from Bio import Entrez
from Bio import Medline
from fastparquet import write
from scipy.stats import pearsonr

In [2]:
# PubMed credentials
Entrez.email = "ajwright@gmail.com"
Entrez.api_key = "86ac8038bfc913213f007df2803127ebc908" 

In [3]:
def user_entered_info():
    """Stores user-provided scientist name and affiliation.
    
    Arguments:
    none
    
    Returns:
    name - str; scientist name in the format "firstname middleinit lastname"
    affiliation - str; scientist's institutional affiliation
    """
    print("Type the answer to each question then press return.  If you do not know the answer, just hit return.")
    first_name = input("What is the first name of the scientist of interest? ")
    middle_name = input("What is the middle initial of the scientist of interest? ")
    try:
        middle_initial = middle_name[0]
    except IndexError:
        middle_initial = ""
    last_name = input("What is the last name of the scientist of interest? ")
    affliation = input("What is the affiliation of the scientist of interest? No abbreviations, please. ")
    name = "{0} {1} {2}".format(first_name.lower(), middle_initial.lower(), last_name.lower())
    return name, affliation

In [6]:
# Obtains name and affiliation of the biologist of interest
name, affiliation = user_entered_info()

Type the answer to each question then press return.  If you do not know the answer, just hit return.
What is the first name of the scientist of interest? Carolyn
What is the middle initial of the scientist of interest? G
What is the last name of the scientist of interest? Rasmussen
What is the affliation of the scientist of interest? No abbreviations please. University of California Riverside


In [7]:
print(name, affiliation)

carolyn g rasmussen University of California Riverside


In [4]:
def get_scientist_papers(name, affiliation=None):
    """Searches PubMed for papers whose author list and affiliation list contain the provided author name and
    affiliation. 
    
    Arguments:
    name - str; complete scientist name in the format "lastname, firstname middleinitial"
    affiliation (optional) - str
    
    Returns:
    ids - list; list of paper IDs
    webenv - str; used to reference cached NCBI search session in future efetch queries
    query_key - str; used to reference cached NCBI search session in future efetch queries
    """
    if affiliation == None:
        handle = Entrez.esearch(db='pubmed', term=name, retmax=200, usehistory="y")
    else: 
        terms = "{} AND {}".format(name, affiliation)
        handle = Entrez.esearch(db='pubmed', term=terms, retmax=200, usehistory="y")
    record = Entrez.read(handle)
    ids = record['IdList']
    webenv = record["WebEnv"]
    query_key = record["QueryKey"]
    return ids, webenv, query_key

In [9]:
# Retrieves papers authored by the biologist of interest
id_list, webenv, query_key = get_scientist_papers(name, affiliation)

In [5]:
def user_selected_papers(id_list, webenv, query_key):
    """Allows the user to select up to 3 papers authored by a scientist of interest. Uses NCBI cached search
    history.
    
    Arguments:  
    id_list - list; paper ids
    webenv - str; used to reference cached NCBI search session in efetch queries
    query_key - str; used to reference cached NCBI search session in efetch queries
    
    Returns 
    select_list - list; paper ids of user selected papers
    """ 
    from Bio import Medline
    print("Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.")
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype='medline', retmode='text', webenv=webenv, query_key=query_key)
    records = Medline.parse(handle)
    for index, record in enumerate(records, 1):
        print("{}. {} {}. {}. {}. ({})".format(index, record.get("TI", "?"), record.get("AU", "?"), record.get("JT", "?"),
                                         record.get("DP", "?"), record.get("PMID", "?")))
    paper_num = input("Which papers would you like to select? ")
    paper_num = paper_num.split(',')
    #print(paper_num)
    while paper_num == ['']:
        paper_num = input("No papers selected.  Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.")
        paper_num = paper_num.split(',')
    while len(paper_num) > 4:
        paper_num = input("Too many papers selected.  Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.")
        paper_num = paper_num.split(',')
    select_list = []
    for num in paper_num:
        select_list.append(id_list[int(num)-1])
    return select_list

In [11]:
# User specifies up to 3 papers authored by the biologist of interest to base BiologyFinder session on
chosen_papers = user_selected_papers(id_list, webenv, query_key)

Please select up to 3 papers by keying in the corresponding number(s). Seperate each number by a comma.
1. Cell-Based Model of the Generation and Maintenance of the Shape and Structure of the Multilayered Shoot Apical Meristem of Arabidopsis thaliana. ['Banwarth-Kuhn M', 'Nematbakhsh A', 'Rodriguez KW', 'Snipes S', 'Rasmussen CG', 'Reddy GV', 'Alber M']. Bulletin of mathematical biology. 2019 Aug. (30552627)
2. A plane choice: coordinating timing and orientation of cell division during plant development. ['Facette MR', 'Rasmussen CG', 'Van Norman JM']. Current opinion in plant biology. 2019 Feb. (30261337)
3. Predicting Division Planes of Three-Dimensional Cells by Soap-Film Minimization. ['Martinez P', 'Allsman LA', 'Brakke KA', 'Hoyt C', 'Hayes J', 'Liang H', 'Neher W', 'Rui Y', 'Roberts AM', 'Moradifam A', 'Goldstein B', 'Anderson CT', 'Rasmussen CG']. The Plant cell. 2018 Oct. (30150312)
4. The Microtubule-Associated Protein IQ67 DOMAIN5 Modulates Microtubule Dynamics and Pavement 

In [12]:
print(chosen_papers)

['30150312', '29146775', '28202734']


In [6]:
def compile_refs_and_citedin(paper_list):
    """Takes a list of paper IDs and returns a list of the IDs for papers referenced in the original paper list.  The 
    reference list may be incomplete since PubMed does not provide references for all papers.  If concerned, run
    the related function 'compile_refs' to identify which papers are missing references.
    
    Arguments:
    paper_list - list; paper IDs (str)
    
    Returns:
    pubmed_refs - list; paper IDs (str)
    """
    str_paper_list = ",".join(paper_list)
    search_results = Entrez.read(Entrez.epost("pubmed", id=str_paper_list))
    query_key = search_results["QueryKey"]
    webenv = search_results["WebEnv"]
    pub_records = Entrez.read(Entrez.elink(dbfrom="pubmed", WebEnv=webenv, query_key=query_key))
    ref_ids = []
    for entry in pub_records[0]["LinkSetDb"]:
        if entry["LinkName"] == 'pubmed_pubmed_refs':
            refs = entry["Link"]
            try:
                for ref in refs:
                    ref_ids.append(ref['Id'])
            except UnboundLocalError:
                print("No references found")        
        if entry["LinkName"] == 'pubmed_pubmed_citedin':
            cites = entry["Link"]
            try:
                for cite in cites:
                    ref_ids.append(cite['Id'])
            except UnboundLocalError:
                print("No cited in found")
    print("Length ref_ids list", len(ref_ids))
    print("Length ref_ids set", len(set(ref_ids)))
    return list(set(ref_ids))

In [43]:
# Ian has provided the ref IDs for the papers of interest, so I do not need to run compile_refs_and_citedin

ref_IDs_8970153 = ['24827152', '24296585', '23812750', '22836995', '26225566', '10731023', '9360927', 
                     '15514006', '18794201', '9021050', '1597853', '26891037', '15457682', '26225771', '12682019', 
                     '12134017', '11698284', '25822022', '22292937', '21461149', '22500169', '26047941', '27189225',
                     '22074367', '27349729', '26974532', '19564922', '24385908', '27350006', '16014943', '27768313', 
                     '28473734', '28067535', '15284245', '19924079', '23270785']
ref_IDs_Tietjen_2016 = ['3016298', '25569520', '23330927', '23812750', '16182382', '18671200', '24152939', '10731023',
                        '21860787', '2944961', '22326358', '22465592', '26152583', '22817991', '25522685',
                        '15081276', '20542300', '23289738', '10616956', '23450245', '18678237', '25493601', '25830320',
                        '17504362', '31634862', '15951145', '23210782']
ref_IDs_Yang_2019 = ['24827152', '23984250', '28714868', '22837004', '25402363', '19897215', '19100719', '22020221',
                     '11514226', '19696893', '7665567', '23255218', '26681773', '24658076', '27977742', '9371822',
                     '25730868', '10611346', '26225566', '22836995', '22814509', '27757411', '26199173', '23541084',
                     '23134979', '22739395', '26614966', '19239360', '9360927', '9287228', '26891037', '26775808', 
                     '24804860', '25287643', '26225771', '29457784', '12682019', '11285236', '23907534', '25375990',
                     '11698284', '23087374', '30031972', '25963564', '21498519', '25014309', '9144290', '12563294', 
                     '23804764', '26974532', '23370291', '26423811', '26690612', '19265012','18525257','23536672',
                     '23517573', '26169416', '26691297', '30481211', '12350140', '28842560', '24722454', '27898737',
                     '15284245', '9360926', '23270785', '24896263']
ref_IDs_Sun_2019 = ['18833270', '21051991', '24107261', '25185360', '25190350', '26577109', '25361696', '26699285',
                    '25533861', '21287791']

ref_IDs_Mediouni_2018 = ['14999114', '21848464', '11533182', '18264102', '19933330', '10721995', '14610207', 
                         '10332737', '10644369', '9641673', '21377881', '22470838', '14998221', '14519027', '22686620',
                         '16251317', '19249948', '22001594', '23009669', '21355238', '26048637', '25679337', '20036771', 
                         '24339367', '23677886', '24760888', '20145735', '12386343', '15040537', '16302461',
                         '1975844', '26035023', '28099856', '28679752', '21357743', '21672195', '10581250', '15797855',
                         '22817991', '26995550', '19493996', '26152583', '12554735', '15947137', '26810656',
                         '12719560']

ref_IDs_Wang_2017 = ['26577109', '21695615', '7603178', '19295331', '19852669', '27349729', '26130226', '20467286']


In [44]:
all_ref_IDs = ref_IDs_8970153 + ref_IDs_Tietjen_2016 + ref_IDs_Yang_2019 + ref_IDs_Sun_2019 + ref_IDs_Mediouni_2018 + ref_IDs_Wang_2017
print(len(all_ref_IDs))
print(len(set(all_ref_IDs)))


195
178


In [45]:
def get_first_last_authors(paper_id):
    """Given a paper, returns the first and last authors of the paper.
    
    Arguments:
    paper_id - str; paper ids
    
    Returns:
    authors - list of strs; list of full names of the first and last authors of the provided paper id
    """
    handle = Entrez.efetch(db='pubmed', id=paper_id, rettype='medline', retmode="text", retmax=200)
    record = Medline.read(handle)
    authors = record.get("FAU", "?")
    first_last_authors = [authors[0], authors[-1]]
    print(first_last_authors)
    return first_last_authors
    

In [46]:
def author_formatting(author_list):
    """Changes the formatting of author name strings to give the best PubMed search results. 
    
    Arguments:
    author_list - list of str; list of author names
    
    Returns:
    formatted_author_list - list of str; alphabetized list of author names formatted "lastname, firstname"
    """
    formatted_author_list = []
    for author in author_list:
        print(author)
        last_name = author.split(',')[0]
        first_name = author.split(',')[1].lstrip()
        try:
            if first_name[1] == " ":
                try:
                    first_name = first_name[0] + first_name[-1]
                except:
                    first_name = first_name[0]
        except:            
            first_name = first_name[0]
        new_name = "{}, {}".format(last_name, first_name)
        print(new_name)
        formatted_author_list.append(new_name)
    return sorted(formatted_author_list)

In [47]:
def remove_duplicates(sorted_list):
    """Takes a sorted list of authors and if 2 authors have the same last name and first initial, removes the name
    with only the first initial leaving behind the one that has the full first name. 
    
    Arguments:
    sorted_list - list; alphabetized list of authors in the format "lastname, firstname middleinit" OR 
    "lastname, firstinit middleinit"
    
    Returns:
    author_no_dup_list - list; alphabetized list of authors with duplicates removed 
    """
    author_no_dup_list = []
    most_recent_author = " , "
    for i, author in enumerate(sorted_list):
        if author.split(',')[0] == most_recent_author.split(',')[0]:
            if author.split(',')[1].lstrip()[0] != most_recent_author.split(',')[1].lstrip()[0]:
                author_no_dup_list.append(author)
                #print("(1) Compared {} to {} and appended {}".format(author, most_recent_author, author))
            else:
                #print("(2) Compared {} to {} and deleted {} and appended {}".format(author, most_recent_author, author_no_dup_list[-1], author)) 
                del author_no_dup_list[-1]
                author_no_dup_list.append(author)                       
        else:
            author_no_dup_list.append(author)
            #print("(3) Compared {} to {} and appended {}".format(author, most_recent_author, author))
        most_recent_author = author
    return author_no_dup_list 

In [21]:
def create_master_biologist_list(paper_list):
    """Searches PubMed for all the papers cited by or that cites a paper on the paper list.  Returns a list
    of the first and last authors of those papers (duplicates removed).
    
    Arguments:
    paper_list - list; paper IDs (str)
    
    Returns: 
    biologist_master_list - list; biologist names (str)
    """
    ref_citedin_ids = compile_refs_and_citedin(paper_list)
    first_last_author_list = []   
    for paper in ref_citedin_ids:
        print("Getting the authors of paper {}.".format(paper))
        authors = get_first_last_authors(paper)
        first_last_author_list.extend(authors) 
    set_f_l_author_list = list(set(first_last_author_list)) 
    print(len(set_f_l_author_list))
    format_f_l_author_list = author_formatting(set_f_l_author_list)
    print(len(format_f_l_author_list))
    no_dup_f_l_author_list = remove_duplicates(format_f_l_author_list)
    print(len(no_dup_f_l_author_list))
    return no_dup_f_l_author_list

In [48]:
def create_master_biologist_list_from_refIDS(ref_IDs):
    first_last_author_list = []   
    for paper in ref_IDs:
        print("Getting the authors of paper {}.".format(paper))
        authors = get_first_last_authors(paper)
        first_last_author_list.extend(authors) 
    set_f_l_author_list = list(set(first_last_author_list)) 
    print(len(set_f_l_author_list))
    format_f_l_author_list = author_formatting(set_f_l_author_list)
    print(len(format_f_l_author_list))
    no_dup_f_l_author_list = remove_duplicates(format_f_l_author_list)
    print(len(no_dup_f_l_author_list))
    return no_dup_f_l_author_list

In [49]:
master_biologist_list = create_master_biologist_list_from_refIDS(set(all_ref_IDs))

Getting the authors of paper 25569520.
['Akkouh, Ouafae', 'Cheung, Randy Chi Fai']
Getting the authors of paper 20145735.
['Lara, Humberto H', 'Rodriguez-Padilla, Cristina']
Getting the authors of paper 19933330.
['Cicala, Claudia', 'Arthos, James']
Getting the authors of paper 20542300.
['Mulholland, Dulcie A', 'Nuzillard, Jean-Marc']
Getting the authors of paper 28067535.
['Wang, Yifei', 'Suo, Zucai']
Getting the authors of paper 15947137.
['Lindenbach, Brett D', 'Rice, Charles M']
Getting the authors of paper 15951145.
['Woradulayapinij, Warunya', 'Wiwat, Chanpen']
Getting the authors of paper 24722454.
['Wei, Datsen George', 'Cihlar, Tomas']
Getting the authors of paper 24760888.
['Abram, Michael E', 'Hughes, Stephen H']
Getting the authors of paper 27350006.
['Tietjen, Ian', 'Andrae-Marobela, Kerstin']
Getting the authors of paper 25402363.
['Archin, Nancie M', 'Margolis, David M']
Getting the authors of paper 31634862.
['Wensing, Annemarie M', 'Richman, Douglas D']
Getting the au

['Abreu, Celina M', 'Gama, Lucio']
Getting the authors of paper 9641673.
['Moore, J P', 'Binley, J']
Getting the authors of paper 26225771.
['Jiang, Guochun', 'Dandekar, Satya']
Getting the authors of paper 9287228.
['Gulick, R M', 'Chodakewitz, J A']
Getting the authors of paper 19897215.
['Avila, Liliana', 'Echeverri, Fernando']
Getting the authors of paper 24296585.
['Archin, Nancie M', 'Margolis, David M']
Getting the authors of paper 23255218.
['Boehm, Daniela', 'Ott, Melanie']
Getting the authors of paper 25190350.
['Wang, Jian', 'Wang, Yuguang']
Getting the authors of paper 14519027.
['Duffalo, Melody L', 'James, Christopher W']
Getting the authors of paper 19239360.
['Archin, Nancie M', 'Margolis, David M']
Getting the authors of paper 11698284.
['Kulkosky, J', 'Pomerantz, R J']
Getting the authors of paper 25679337.
['Liu, Bingrui', 'Staerk, Dan']
Getting the authors of paper 26690612.
['Rasmussen, Thomas Aagaard', 'Sogaard, Ole Schmeltz']
Getting the authors of paper 25014309

In [52]:
def create_biologist_paper_dict(biologist_list):
    """Takes a list of biologists and looks up the IDs of all the papers they authored in PubMed.  Returns
    a dictionary where the biologist's name is the key and the value is a list of their paper IDs. 
    
    Arguments:
    biologist_list - list; biologist names (str)
    
    Returns:
    biologist_paper_dict - dict; keys are biologist names (str) and the values are a list of IDs of the papers 
    authored by the biologist
    """
    biologist_papers_dict = {}
    for biologist in biologist_list:
        biologist_nocomma = biologist.replace(',', '')
        print("Getting papers authored by {}.".format(biologist_nocomma))
        papers = get_scientist_papers(biologist_nocomma)[0]
        biologist_papers_dict[biologist] = papers
    zero_papers = []
    has_papers = []
    total = 0
    for key, value in biologist_papers_dict.items():
        if len(value) == 0:
            zero_papers.append(key)
        else:
            has_papers.append(key)
        total += len(value)
    print("Total papers:", total)
    print("Zero papers were retrieved for the following authors:", zero_papers)
    print("More than one paper was retrieved for the following authors:", has_papers)
    return biologist_papers_dict


In [53]:
the_biologist_paper_dict = create_biologist_paper_dict(master_biologist_list)

Getting papers authored by Abram Michael E.
Getting papers authored by Abreu Celina M.
Getting papers authored by Adachi A.
Getting papers authored by Aditya Suruchi.
Getting papers authored by Aguiar Renato Santana.
Getting papers authored by Aiyegoro Olayinka A.
Getting papers authored by Akkouh Ouafae.
Getting papers authored by Alcami Jose.
Getting papers authored by Allen Todd M.
Getting papers authored by Andersen Raymond J.
Getting papers authored by Andrae-Marobela Kerstin.
Getting papers authored by Appendino G.
Getting papers authored by Archin Nancie M.
Getting papers authored by Arthos James.
Getting papers authored by Asiimwe S.
Getting papers authored by Avila Liliana.
Getting papers authored by Badley Andrew D.
Getting papers authored by Baran Phil S.
Getting papers authored by Barre-Sinoussi Francoise.
Getting papers authored by Beans Elizabeth J.
Getting papers authored by Bedoya Luis M.
Getting papers authored by Bernhard Wendy.
Getting papers authored by Beutler JA.


Getting papers authored by Ren J.
Getting papers authored by Reuse Sophie.
Getting papers authored by Rice Charles M.
Getting papers authored by Richman Douglas D.
Getting papers authored by Rodriguez-Padilla Cristina.
Getting papers authored by Romanelli Frank.
Getting papers authored by Routy Jean-Pierre.
Getting papers authored by Rouzioux Christine.
Getting papers authored by Ruprecht Claudia R.
Getting papers authored by Sadowski Ivan.
Getting papers authored by Sagot-Lerolle Nathalie.
Getting papers authored by Sanchez-Madrid F.
Getting papers authored by Saphire AC.
Getting papers authored by Sato Yuichiro.
Getting papers authored by Schulz TF.
Getting papers authored by Shafer Robert W.
Getting papers authored by Shan Liang.
Getting papers authored by Shirakawa Kotaro.
Getting papers authored by Siliciano Robert F.
Getting papers authored by Skolnik Paul R.
Getting papers authored by Sluis-Cremer Nicolas.
Getting papers authored by Sogaard Ole Schmeltz.
Getting papers authored 

In [54]:
def get_and_compile_refs(paper_list):
    """Takes a list of paper IDs and returns a list of the IDs of the papers referenced by papers in the original list.
    The reference list may be incomplete since PubMed does not provide references for all papers.  If concerned, run
    the related function 'compile_refs' to identify which papers are missing references.
    
    Arguments:
    paper_list - list, paper IDs (str)
    
    Returns:
    pubmed_refs - list, paper IDs 
    """
    str_paper_list = ",".join(paper_list)
    search_results = Entrez.read(Entrez.epost("pubmed", id=str_paper_list))
    query_key = search_results["QueryKey"]
    webenv = search_results["WebEnv"]
    pub_records = Entrez.read(Entrez.elink(dbfrom="pubmed", WebEnv=webenv, query_key=query_key))
    for entry in pub_records[0]["LinkSetDb"]:
        if entry["LinkName"] == 'pubmed_pubmed_refs':
            pubmed_refs = entry["Link"]
    ref_ids = []
    try:
        for ref in pubmed_refs:
            ref_ids.append(ref['Id'])
    except UnboundLocalError:
        print("No references found")
    return ref_ids

In [55]:
def create_biologist_cited_papers_dict(biologist_paper_dict):
    """Takes a dictionary of biologists and the papers they wrote, looks up each paper in PubMed and returns a new
    dictionary containing the biologist and a list of all the papers they cite(reference) within the papers they wrote. 
    
    Arguments:
    biologist_paper_dict - dict; keys are biologist's names (str) and the values are a list of IDs of the papers 
    authored by the biologist
    
    Returns:
    biologist_cited_papers_dict - dict; keys are biologist's names (str) and the values are a list of IDS of papers
    cited by the biologist 
    """
    biologist_cited_papers_dict = {}
    for key, value in biologist_paper_dict.items():
        print("Looking up papers cited by ", key)
        try:
            paper_list = get_and_compile_refs(value)
            biologist_cited_papers_dict[key] = paper_list
        except:
            biologist_cited_papers_dict[key] = []
    return biologist_cited_papers_dict


In [56]:
the_biologist_cited_papers_dict = create_biologist_cited_papers_dict(the_biologist_paper_dict)

Looking up papers cited by  Abram, Michael E
Looking up papers cited by  Abreu, Celina M
Looking up papers cited by  Adachi, A
Looking up papers cited by  Aditya, Suruchi
Looking up papers cited by  Aguiar, Renato Santana
Looking up papers cited by  Aiyegoro, Olayinka A
Looking up papers cited by  Akkouh, Ouafae
Looking up papers cited by  Alcami, Jose
Looking up papers cited by  Allen, Todd M
Looking up papers cited by  Andersen, Raymond J
Looking up papers cited by  Andrae-Marobela, Kerstin
Looking up papers cited by  Appendino, G
Looking up papers cited by  Archin, Nancie M
Looking up papers cited by  Arthos, James
Looking up papers cited by  Asiimwe, S
Looking up papers cited by  Avila, Liliana
No references found
Looking up papers cited by  Badley, Andrew D
Looking up papers cited by  Baran, Phil S
Looking up papers cited by  Barre-Sinoussi, Francoise
Looking up papers cited by  Beans, Elizabeth J
Looking up papers cited by  Bedoya, Luis M
Looking up papers cited by  Bernhard, Wen

Looking up papers cited by  Pieters, Luc
Looking up papers cited by  Planelles, Vicente
Looking up papers cited by  Pomerantz, Roger J
Looking up papers cited by  Poorolajal, J
Looking up papers cited by  Poveda, Eva
Looking up papers cited by  Qu, Xiying
Looking up papers cited by  Rapista, Aprille
Looking up papers cited by  Rasmussen, Thomas Aagaard
Looking up papers cited by  Rausch, Keiko
Looking up papers cited by  Ren, J
Looking up papers cited by  Reuse, Sophie
Looking up papers cited by  Rice, Charles M
Looking up papers cited by  Richman, Douglas D
Looking up papers cited by  Rodriguez-Padilla, Cristina
Looking up papers cited by  Romanelli, Frank
Looking up papers cited by  Routy, Jean-Pierre
Looking up papers cited by  Rouzioux, Christine
Looking up papers cited by  Ruprecht, Claudia R
Looking up papers cited by  Sadowski, Ivan
Looking up papers cited by  Sagot-Lerolle, Nathalie
No references found
Looking up papers cited by  Sanchez-Madrid, F
Looking up papers cited by  Sa

In [57]:
# combine all the cited papers (removing duplicates) to create the feature set aka repo vocabulary

def create_paper_features_list(biologist_cited_papers_dict):
    """Takes a dictionary of cited papers and combines them into a list with no duplicates.
    
    Arguments:
    biologist_cited_papers_dict - dict; keys are biologist's names (str) and the values are a list of paper IDs cited in 
    papers authored by the biologist
    
    Returns:
    paper_features - list; list of all the papers cited by every author in the starting dict (no duplicates)
    """
    paper_features_temp = []
    for value in biologist_cited_papers_dict.values():
        paper_features_temp.extend(value)
    print(len(paper_features_temp))    
    paper_features = list(set(paper_features_temp))
    print(len(paper_features))
    return paper_features
          

In [58]:
the_paper_features_list = create_paper_features_list(the_biologist_cited_papers_dict)

384339
251858


In [59]:
# run every author against the feature set to create a binary vector that contains a 1 if they cited a paper or a 0 if not

def create_binary_feature_vectors(biologist_cited_papers_dict, paper_features_list):
    """Builds a set of feature vectors for each biologist (key) in the biologist_cited_papers_dict by looking to see 
    if each paper in the paper_feature_list is cited by the biologist (a list of cited papers is the value associated 
    with each biologist key).  Adds a one to the feature vector if the paper is cited and a 0 if it is not. 
    
    Arguments:
    biologist_cited_papers_dict - dict; keys are biologist's names (str) and the values are a list of paper IDs cited 
    in papers authored by the biologist 
    paper_features_list - list; list of all the papers cited by every author in the starting dict
    
    Returns:
    all_binary_feature_vectors - list of lists; lists of 1s and 0s that indicate if a biologist cited a paper in the 
    paper_features_list or not
    """
    all_binary_feature_vectors = []
    for key, value in biologist_cited_papers_dict.items():
        biologist_vector = []
        for paper in paper_features_list:
            if paper in value:
                biologist_vector.append(1)
            else:
                biologist_vector.append(0)
        all_binary_feature_vectors.append(biologist_vector)
    return all_binary_feature_vectors

In [60]:
the_binary_feature_vectors = create_binary_feature_vectors(the_biologist_cited_papers_dict, the_paper_features_list)

In [61]:
def create_comparison_binary_vector(paper_list, paper_features_list):
    """Builds a feature vector for the originating set of papers by looking to see if each paper in the paper_
    features_list is cited by any of the originating set of papers. Adds a one to the feature vector if the paper is 
    cited and a 0 if it is not. 
    
    Arguments:
    paper_list - list; list of paper IDs (str) corresponding to the originating set of papers
    paper_features_list - list; list of all the papers cited by every author in the biologist_cited_papers_dict
    
    Returns:
    comparision_binary_vector - list; list of 1s and 0s that indicate if the originating set of papers cited a paper
    in the paper_features_list or not
    """
    comparison_binary_vector = []
    comparison_refs = get_and_compile_refs(chosen_papers)
    for i in paper_features_list:
        if i in comparison_refs:
            comparison_binary_vector.append(1)
        else:
            comparison_binary_vector.append(0)
    return comparison_binary_vector        

In [63]:
def create_comparison_binary_vector_using_refs(ref_IDs, paper_features_list):
    """Builds a feature vector for the originating set of papers by looking to see if each paper in the paper_
    features_list is cited by any of the originating set of papers. Adds a one to the feature vector if the paper is 
    cited and a 0 if it is not. 
    
    Arguments:
    paper_list - list; list of paper IDs (str) corresponding to the originating set of papers
    paper_features_list - list; list of all the papers cited by every author in the biologist_cited_papers_dict
    
    Returns:
    comparision_binary_vector - list; list of 1s and 0s that indicate if the originating set of papers cited a paper
    in the paper_features_list or not
    """
    comparison_binary_vector = []
    for i in paper_features_list:
        if i in ref_IDs:
            comparison_binary_vector.append(1)
        else:
            comparison_binary_vector.append(0)
    return comparison_binary_vector   

In [64]:
the_comparision_binary_vector = create_comparison_binary_vector_using_refs(all_ref_IDs, the_paper_features_list)

In [65]:
def create_biologist_finder_df(binary_feature_vectors, paper_features_list, biologist_cited_papers_dict, comparison_vector):
    """Takes the binary feature vectors and comparison binary vector and builds a pandas dataframe that has biologist's names
    as the index and paper IDs as the columns.  Dataframe is filled with 1s and 0s to indicate if a biologist has cited
    the column paper or not. The last row contains the data for the originating set of papers. 
    
    Arguments: 
    binary_feature_vectors - list of lists; lists of 0s and 1s that indicate if a biologist cited a paper in the 
    paper_features_list
    paper_features_list - list; list of all the papers cited by every author in the starting dict
    biologist_cited_papers_dict - dict; keys are biologist names (str) and the values are a list of paper IDS 
    corresponding to papers cited by the biologist
    comparison_vector - list; list of 0s and 1s that indicates if the originating set of papers cited a paper
    in the paper_features_list
    
    Returns:
    final_df - Pandas df; has biologist's names as the index and paper IDs as the columns.  Dataframe is filled with 
    1s and 0s to indicate if a biologist has cited the column paper or not. The last row represents the originating 
    set of papers and is indexed as "comparison" 
    """
    temp_df = pd.DataFrame(binary_feature_vectors, columns=paper_features_list, index=biologist_cited_papers_dict.keys())
    comp_series = pd.Series(comparison_vector).to_frame("comparison").T
    comp_series.columns = temp_df.columns
    final_df = pd.concat([temp_df, comp_series])
    final_df.dropna(axis=0, inplace=True)
    return final_df

In [67]:
bf_df = create_biologist_finder_df(the_binary_feature_vectors, the_paper_features_list, the_biologist_cited_papers_dict, 
                                   the_comparision_binary_vector)

In [68]:
bf_df.shape

(278, 251858)

In [69]:
bf_df.head()

Unnamed: 0,15184506,27934476,26343504,26316679,8647346,24257436,25450805,8790603,17540168,10679007,...,3584988,22410007,17382883,16935573,27924033,21989829,27325287,22955054,1572977,16567715
"Abram, Michael E",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Abreu, Celina M",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Adachi, A",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Aditya, Suruchi",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Aguiar, Renato Santana",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
bf_df.tail()

Unnamed: 0,15184506,27934476,26343504,26316679,8647346,24257436,25450805,8790603,17540168,10679007,...,3584988,22410007,17382883,16935573,27924033,21989829,27325287,22955054,1572977,16567715
"Zou, Wen",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Zuo, Zhong",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"van Kooyk, Y",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"van der Kooy, Frank",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
comparison,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
from fastparquet import write

write('Ian_6_refs_df.parq',bf_df)

In [5]:
bf_df = pd.read_parquet('Rasmussen_ver3_df.parq', engine='fastparquet')

In [6]:
bf_df.head()

Unnamed: 0_level_0,18337155,20379956,20974816,15048123,24249263,30178601,20625003,25657044,20926419,23711327,...,23970914,11877373,12239373,24012599,26523777,24921257,23318657,25089249,11430871,12195433
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Abrash, Emily B",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Ambrose, Jn",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Anderson, Charles T",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Andrey, Philippe",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Arganda-Carreras, Ignacio",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
# Find out which scientists are closest to the original scientist.
def create_similarity_scores_df(biologist_finder_df):
    """Calculates pearsonr correlation coefficients between the last row of a dataframe and all the remaining rows. 
    Reports the coefficient ("similarity") in a new dataframe with the same row index as the original dataframe.
    
    Arguments:
    biologist_finder_df - pandas dataframe; rows are individual feature vectors and the last row is compared to all 
    other rows
    
    Returns:
    sorted_sim_df - pandas dataframe; 2 columns - "scientist" which is the biologist's name and "similarity" which is
    the pearsonr coefficient between scientist's feature vector and the last row of biologist_finder_df.  Dataframe is 
    sorted based on similarity scores from highest to lowest. 
    """
    sim_score = {}
    for i in range(len(biologist_finder_df)):
        score = pearsonr(biologist_finder_df.iloc[-1, :], biologist_finder_df.iloc[i, :])
        sim_score.update({i: score[0]})
    sim_df = pd.Series(sim_score).to_frame("similarity")
    sim_df["scientist"] = biologist_finder_df.index
    sorted_sim_df = sim_df.sort_values('similarity', ascending=False)
    return sorted_sim_df

In [72]:
ss_df = create_similarity_scores_df(bf_df)



In [73]:
ss_df.shape

(278, 2)

In [74]:
# function to return a list of the X% of scientists on the list

def most_sim_biologists(similarity_df, per):
    """Takes a sorted dataframe and a float representing a percent.  Returns that percentage of the dataframe.
    
    Arguments:
    similarity_df - pandas dataframe; sorted dataframe
    per - float; represents a percentage
    
    Returns:
    similarity_df_per- pandas dataframe; truncated version of starting dataframe containing the user-specified percentage
    of rows
    """
    num_biologists = similarity_df.shape[0]
    top_per = int(num_biologists * per)
    similarity_df_per = similarity_df.head(top_per)
    print(similarity_df_per)
    return similarity_df_per

In [79]:
top_sim_bio_df = most_sim_biologists(ss_df, 0.2)

     similarity                  scientist
277    1.000000                 comparison
272    0.168154             Zhu, Huanzhang
190    0.145142                 Qu, Xiying
54     0.137949    Delagreverie, Heloise M
19     0.137747         Beans, Elizabeth J
218    0.134886             Spivak, Adam M
51     0.122545             Darcis, Gilles
12     0.122386           Archin, Nancie M
110    0.119494                 Ji, Haiyan
36     0.118871            Cary, Daniele C
260    0.110801                Xing, Sifei
176    0.098735        Pandelo Jose, Diego
28     0.097985            Bouchat, Sophie
111    0.097621             Jiang, Guochun
152    0.093277           McKernan, Lisa N
124    0.092959           Laird, Gregory M
186    0.092170         Planelles, Vicente
233    0.091449  Valadao, Ana Luiza Chaves
150    0.087350          Margolis, David M
1      0.085998            Abreu, Celina M
217    0.084136             Spina, Celsa A
195    0.083677              Reuse, Sophie
227    0.08

In [80]:
def get_citations(rec_paper_list):
    """Takes a list of paper ID numbers and return a PubMed reference for each paper on the list. 
    
    Arguments:
    rec_paper_list - list of strs; paper IDs numbers
    
    Returns:
    None
    (prints the references to the screen)
    """
    id_list = ",".join(rec_paper_list)
    search_results = Entrez.read(Entrez.epost("pubmed", id=id_list))
    query_key = search_results["QueryKey"]
    webenv = search_results["WebEnv"]
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype='medline', retmode='text', webenv=webenv, query_key=query_key)
    records = Medline.parse(handle)
    for index, record in enumerate(records, 1):
        print("{}. {} {}. {}. {}. ({})".format(index, record.get("TI", "?"), record.get("AU", "?"), record.get("JT", "?"),
                                         record.get("DP", "?"), record.get("PMID", "?")))

In [81]:
# function to return a list of the shared citations of the X% of scientists on the list

def reading_list(biologist_finder_df, most_sim_bio_df):
    """Takes a dataframe containing the most similar biologists as well as the binary feature vector dataframe. Creates a new
    dataframe with paper IDs as the index and biologists as columns and it contains a 1 if the biologist ever cited
    the paper and a 0 if not.  Citations per paper are summed and then the df is sorted so that the most cited
    papers are at the top.  Prints to the terminal the number of papers cited by 10%, 20%, 30%, etc of the most
    similar biologists.  Prints to the terminal citations for the user-specified number of papers. 
    
    Arguments:
    biologist_finder_df - pandas dataframe; rows are individual biologist feature vectors
    most_sim_bio_df - pandas dataframe; 2 columns - "scientist" which is the biologist's name and "similarity" which is
    the pearsonr coefficient between scientist's feature vector and the last row of biologist_finder_df.  
    
    Returns:
    None
    (prints to the terminal citations for a user-specified number of most cited papers)
    """
    top_per_biologists = list(most_sim_bio_df.iloc[1:, 1])
    num_top_biologists = len(top_per_biologists)
    most_cited_papers = biologist_finder_df.loc[top_per_biologists, :][biologist_finder_df.loc[top_per_biologists, :] ==1].fillna(0).T
    most_cited_papers['sum'] = most_cited_papers.sum(axis=1)
    most_cited_papers_sort = most_cited_papers.sort_values('sum', ascending=False)
    total_num_papers = most_cited_papers_sort.shape[0]
    for i in range(10, 110, 10):
        per_of_top_biol = round(num_top_biologists * i/100)
        num_papers = most_cited_papers_sort[most_cited_papers_sort["sum"] >= per_of_top_biol].shape[0]
        if num_papers != 1:
            print("{} papers were cited at least once by {}% ({}) of the most similar biologists.".format(num_papers, i, per_of_top_biol))
        else:
            print("{} paper was cited at least once by {}% ({}) of the most similar biologists.".format(num_papers, i, per_of_top_biol))   
    num_papers = int(input("How many papers do you want on the recommended reading list? "))
    reading_list_ids = most_cited_papers_sort.head(num_papers).index.to_list()
    get_citations(reading_list_ids) 

In [82]:
reading_list(bf_df, top_sim_bio_df)

1778 papers were cited at least once by 10% (5) of the most similar biologists.
440 papers were cited at least once by 20% (11) of the most similar biologists.
227 papers were cited at least once by 30% (16) of the most similar biologists.
104 papers were cited at least once by 40% (22) of the most similar biologists.
54 papers were cited at least once by 50% (27) of the most similar biologists.
30 papers were cited at least once by 60% (32) of the most similar biologists.
15 papers were cited at least once by 70% (38) of the most similar biologists.
3 papers were cited at least once by 80% (43) of the most similar biologists.
0 papers were cited at least once by 90% (49) of the most similar biologists.
0 papers were cited at least once by 100% (54) of the most similar biologists.
How many papers do you want on the recommended reading list? 104
1. Administration of vorinostat disrupts HIV-1 latency in patients on antiretroviral therapy. ['Archin NM', 'Liberty AL', 'Kashuba AD', 'Choudh

69. Disulfiram reactivates latent HIV-1 in a Bcl-2-transduced primary CD4+ T cell model without inducing global T cell activation. ['Xing S', 'Bullen CK', 'Shroff NS', 'Shan L', 'Yang HC', 'Manucci JL', 'Bhat S', 'Zhang H', 'Margolick JB', 'Quinn TC', 'Margolis DM', 'Siliciano JD', 'Siliciano RF']. Journal of virology. 2011 Jun. (21471244)
70. An inducible transcription factor activates expression of human immunodeficiency virus in T cells. ['Nabel G', 'Baltimore D']. Nature. 1987 Apr 16-22. (3031512)
71. HMBA releases P-TEFb from HEXIM1 and 7SK snRNA via PI3K/Akt and activates HIV transcription. ['Contreras X', 'Barboric M', 'Lenasi T', 'Peterlin BM']. PLoS pathogens. 2007 Oct 12. (17937499)
72. A pilot study assessing the safety and latency-reversing activity of disulfiram in HIV-1-infected adults on antiretroviral therapy. ['Spivak AM', 'Andrade A', 'Eisele E', 'Hoh R', 'Bacchetti P', 'Bumpus NN', 'Emad F', 'Buckheit R 3rd', 'McCance-Katz EF', 'Lai J', 'Kennedy M', 'Chander G', 'Sil