In [73]:
import pandas as pd
import numpy as np

In [74]:
# Load the master researcher table; the CSV's first column becomes the row index.
all_df = pd.read_csv(
    "all_researchers.csv",
    index_col=0,
)

In [75]:
# Load the supplemental workbook using read_excel's defaults.
supplemental_df = pd.read_excel(
    "user_excel.xlsx",
)

# Matching other info to all researchers

In [90]:
# Supplemental columns to copy onto every researcher row.
rows_to_match = ['Profession', 'phd_yr', 'phd_degree', 'phd_institution', 'biologydiscipline1', 'biologydiscipline2', 'nonbiologydiscipline', 'affiliation_country', 'affiliation_state']
# One value list per supplemental column, filled in all_df row order.
columns = [[] for _ in rows_to_match]
count = 0  # NOTE(review): never updated in this cell; kept in case another cell reads it
for index, row in all_df.iterrows():
    # Supplemental rows whose first AND last name both equal this researcher's.
    matched_row = supplemental_df[(supplemental_df['NameFirst']==row.NameFirst) & (supplemental_df['NameLast']==row.NameLast)]
    if len(matched_row) > 0:

        # Special case where author shares same first and last name, manually checked
        if len(matched_row) > 1:
            matched_row = matched_row[matched_row['Institution'] == 'California State University-Fullerton']

        # Use the first remaining match and read each field by column label.
        # (Integer keys on a label-indexed Series are a deprecated positional
        # fallback in pandas, so the lookup is done by name instead.)
        first_match = matched_row[rows_to_match].iloc[0]
        for field, values in zip(rows_to_match, columns):
            values.append(first_match[field])
    else:
        # No supplemental record for this researcher: pad every column with NaN
        # so the lists stay aligned with all_df. (np.NaN was removed in NumPy 2.0.)
        for values in columns:
            values.append(np.nan)
        

In [93]:
# Attach each collected value list to all_df under its supplemental column name.
for column_name, values in zip(rows_to_match, columns):
    all_df[column_name] = values

In [95]:
# Write the augmented researcher table to CSV; the index is written too (to_csv default).
all_df.to_csv(
    "all_researchers_with_supplement.csv",
)

# Creating Weights
## Weights
- overlap in relevant words from dept_current, phd_degree, biologydiscipline1, biologydiscipline2, and nonbiologydiscipline that show up in publication title/abstract
- for the projects from 2009-2015 (see attached spreadsheet with project name, participant names, and meeting dates), overlap in words from ProjectName that show up in publication title/abstract after first meeting date 
- overlap in relevant words that show up in titles/abstracts of co-authors' verified publications
- overlap in relevant words that show up in the Title and Source Title of the same author's entries in the NESCent administrator Web of Science Citation Report spreadsheet
- overlap in relevant words that show up in scraped CV/website 
- overlap in date range of publication based on phd_yr



In [6]:
import pandas as pd
from difflib import SequenceMatcher

def similar(a, b):
    """Return difflib's similarity ratio for the sequences ``a`` and ``b``.

    The result is a float in [0, 1]; 1.0 means the sequences are identical.
    No junk filter is applied (``isjunk=None``).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# Reload the augmented researcher table; the CSV's first column becomes the row index.
researchers_df = pd.read_csv(
    "all_researchers_with_supplement.csv",
    index_col=0,
)

In [30]:
import re

# Build one free-text "relevant terms" string per researcher from the
# department, degree and discipline fields.
relevant = []

for index, row in researchers_df.iterrows():
    term_fields = [row.dept_current, row.phd_degree, row.biologydiscipline1, row.biologydiscipline2, row.nonbiologydiscipline]
    # Drop missing fields before joining. Deleting the text "nan" afterwards
    # would also eat those letters inside real words (e.g. "Finance").
    relevant_rows = " ".join(str(value) for value in term_fields if pd.notna(value))
    # Replace every character that is not an ASCII letter, a digit or
    # whitespace with a space.
    relevant_rows = re.sub(r'[^a-zA-Z\d\s]', ' ', relevant_rows)
    relevant.append(relevant_rows)

researchers_df['relevant_terms'] = relevant

In [33]:
# Load the publication records; the CSV's first column becomes the row index.
publications_df = pd.read_csv(
    "NESCent_ID.csv",
    index_col=0,
)

In [71]:
def _text_similarity(terms, text):
    """Return similar(terms, text), or 0 when text is not a string (e.g. a missing value)."""
    if not isinstance(text, str):
        return 0
    return similar(terms, text)


# Per-publication similarity between the author's relevant terms and the
# publication's title / abstract, in publications_df row order.
title_similarity = []
abstract_similarity = []

last_person = None

for index, row in publications_df.iterrows():
    person = row.NameFirst + " " + row.NameLast
    # Only look the researcher up again when the author differs from the
    # previous row's author.
    if person != last_person:
        matched_row = researchers_df[(researchers_df['NameFirst']==row.NameFirst) & (researchers_df['NameLast']==row.NameLast)]
        if matched_row.empty:
            # Same exception type the bare `.values[0]` would raise, but with
            # a message that names the unmatched author.
            raise IndexError("no researcher record found for publication author " + repr(person))
        rel_terms = matched_row['relevant_terms'].values[0]
        last_person = person

    # Title and abstract are scored the same way; a missing value scores 0.
    title_similarity.append(_text_similarity(rel_terms, row.article_title))
    abstract_similarity.append(_text_similarity(rel_terms, row.abstract))
        

In [74]:
# Store both score lists on publications_df, one column per list.
for column_name, scores in (
    ("title_similarity", title_similarity),
    ("abstract_similarity", abstract_similarity),
):
    publications_df[column_name] = scores

In [78]:
# Preview the scored publications: show only the first rows rather than the whole frame.
publications_df.head(10)

Unnamed: 0,NameLast,NameFirst,UID,institution_name,profession_role,dept_current,ScholarID,google_scholar_page,article_title,year,authors,journal,abstract,cited_by,raw,source,date_collected,title_similarity,abstract_similarity
0,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,The evolution of transcriptional regulation in...,2003.0,Gregory A Wray and Matthew W Hahn and Ehab Abo...,Molecular biology and evolution,Gene expression is central to the genotype-phe...,1055,"{""bib"": {""title"": ""The evolution of transcript...",Google Scholar,2018-08-06,0.325581,0.011331
1,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,A method for testing the assumption of phyloge...,1970.0,Ehab Abouheif,Evolutionary Ecology Research,,350,"{""bib"": {""title"": ""A method for testing the as...",Google Scholar,2018-08-06,0.294872,0.000000
2,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,A comparative analysis of allometry for sexual...,1997.0,Ehab Abouheif and Daphne J Fairbairn,The American Naturalist,Rensch's rule states that sexual size dimorphi...,338,"{""bib"": {""title"": ""A comparative analysis of a...",Google Scholar,2018-08-06,0.264151,0.014953
3,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,Evolution of the gene network underlying wing ...,2002.0,Ehab Abouheif and Gregory A Wray,Science,"Wing polyphenism in ants evolved once, 125 mil...",337,"{""bib"": {""title"": ""Evolution of the gene netwo...",Google Scholar,2018-08-06,0.248175,0.004695
4,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,The role of developmental plasticity in evolut...,2011.0,Armin P Moczek and Sonia Sultan and Susan Fost...,,,308,"{""bib"": {""title"": ""The role of developmental p...",Google Scholar,2018-08-06,0.518519,0.000000
5,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,Draft genome of the globally widespread and in...,2011.0,Christopher D Smith and Aleksey Zimin and Cars...,Proceedings of the National Academy of Sciences,,224,"{""bib"": {""title"": ""Draft genome of the globall...",Google Scholar,2018-08-06,0.276730,0.000000
6,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,Draft genome of the red harvester ant Pogonomy...,2011.0,Chris R Smith and Christopher D Smith and Hugh...,Proceedings of the National Academy of Sciences,,219,"{""bib"": {""title"": ""Draft genome of the red har...",Google Scholar,2018-08-06,0.229008,0.000000
7,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,The genome sequence of the leaf-cutter ant Att...,2011.0,Garret Suen and Clotilde Teiling and Lewyn Li ...,PLoS Genetics,Leaf-cutter ants are one of the most important...,206,"{""bib"": {""title"": ""The genome sequence of the ...",Google Scholar,2018-08-06,0.118919,0.003436
8,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,Homology and developmental genes,1970.0,Ehab Abouheif and Michael Akam and William J D...,Trends in genetics,,191,"{""bib"": {""title"": ""Homology and developmental ...",Google Scholar,2018-08-06,0.500000,0.000000
9,Abouheif,Ehab,1369.0,McGill University,Assistant Professor,,ogYgVLwAAAAJ,https://scholar.google.com/citations?user=ogYg...,Limitations of metazoan 18S rRNA sequence data...,1998.0,Ehab Abouheif and Rafael Zardoya and Axel Meyer,Journal of Molecular Evolution,We document the phylogenetic behavior of the 1...,164,"{""bib"": {""title"": ""Limitations of metazoan 18S...",Google Scholar,2018-08-06,0.135021,0.003788
