In [1]:
import pandas as pd
import re
import numpy as np
import regex

In [2]:
data = pd.read_csv('../ete3/Plt_sci_publications_geo_species_8.26.csv', low_memory=False)

# Get dict of names and genders to reduce calls to GenderAPI

In [3]:
names_dict = data[['First_names_currated', 'inferred_gender']].dropna()

In [4]:
names_dict['First_names_currated'] = names_dict['First_names_currated'].str.strip()

In [5]:
# need to drop duplicates here otherwise issues when merging
names_dict = names_dict.drop_duplicates(subset=['First_names_currated'])

In [6]:
names_dict = names_dict.rename(columns = {'First_names_currated': 'First name'})

# Fix problems

Running the line
    
    re.findall(one_loc_out.iloc[i]['Corresponding_author_last_name']+"+[^;]*", one_loc_out.iloc[i]['Author Full Names'])
    
in case 2.1 made me realize entries 24144, 25071, and 40426 have an extra semicolon in the corresp. author addresses. Fix these manually here.

In [7]:
for i in [24144, 25071, 40426]:
    data.at[i,'Corresponding author Addresses'] = data.loc[i]['Corresponding author Addresses'].replace(';', '')

# Get first names

In [8]:
# if they have no names we don't care
data = data.dropna(subset='Author Full Names')

In [9]:
# gather all names of the form Lastname, AB or Lastname, A.B.
all_initials = []

for i in data.index:
    author_list = data.loc[i]['Author Full Names'].split('; ')
    if len([x for x in author_list if not re.findall(', [A-Z]{1,}$',x) and not re.findall(', [A-Z]. [A-Z].$',x) and not re.findall(', [A-Z].$',x)]) == 0:
        all_initials.append(i)

# remove entries with names found above from the dataset
data = data[~data.index.isin(all_initials)]

In [10]:
# many of these are one author with multiple affiliations

mult_corresp = []

for i in range(len(data['Corresponding author Addresses'])):
    if ';' in data.iloc[i]['Corresponding author Addresses']:
        mult_corresp.append(i)

## Case 1: one author, mult affiliations
Don't need to do anything with these

In [11]:
one_author = []

for i in mult_corresp:
    test_str = data.iloc[i]['Corresponding author Addresses']
    test_str_list = test_str.split(';')
    stripped_list = [re.findall('[^()]+', i)[0].strip() for i in test_str_list]
    
    if len(set(stripped_list)) == 1:
        one_author.append(i)

## Case 2: mult authors

In [12]:
# get only true mult authors
mult_authors = sorted(list(set(mult_corresp)-set(one_author)))

In [13]:
mult_authors_df = data.iloc[mult_authors]

In [14]:
mult_authors_df['Corresponding author Addresses'] = mult_authors_df['Corresponding author Addresses'].str.split(';')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mult_authors_df['Corresponding author Addresses'] = mult_authors_df['Corresponding author Addresses'].str.split(';')


In [15]:
mult_authors_df['Corresponding author geocoords'] = mult_authors_df['Corresponding author geocoords'].str.split(';')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mult_authors_df['Corresponding author geocoords'] = mult_authors_df['Corresponding author geocoords'].str.split(';')


### Subcase 1: Authors all have same location
In this case, there will be only one location listed under corresp author geocoords

In [16]:
one_loc_df = mult_authors_df[mult_authors_df['Corresponding author geocoords'].map(len)==1]

In [17]:
one_loc_df['map_props'] = [1 for i in range(len(one_loc_df))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_loc_df['map_props'] = [1 for i in range(len(one_loc_df))]


In [18]:
one_loc_out = one_loc_df.explode('Corresponding author Addresses')
one_loc_out['map_props'] /= one_loc_out['map_props'].groupby(level=0).transform('count')

In [19]:
one_loc_out['Corresponding_author_last_name'] = one_loc_out['Corresponding author Addresses'].str.split(', ').str[0]

In [20]:
one_loc_out['Corresponding_author_last_name'] = one_loc_out['Corresponding_author_last_name'].str.strip()

In [21]:
# names that just have initials as first names will just have the initials as the first name
one_loc_out["first and last names"]=[re.findall(one_loc_out.iloc[i]['Corresponding_author_last_name']+"+[^;]*", one_loc_out.iloc[i]['Author Full Names']) for i in range(len(one_loc_out))]

In [22]:
one_loc_out = one_loc_out.reset_index()

In [23]:
no_authors = []

for i in one_loc_out.index:
    if len(one_loc_out.loc[i]['first and last names']) == 0:
        no_authors.append(i)
        
no_authors_df = one_loc_out.loc[no_authors]

In [24]:
# need to do the german last name thing
new_last_names = []

for i in list(no_authors_df.index):
    old_last_name = no_authors_df.loc[i]['Corresponding_author_last_name']
    author_full_name = no_authors_df.loc[i]['Author Full Names']
    # if the old last name is one insertion away from something in the author full names, then consider it a match
    new_last_name = regex.findall("("+old_last_name+"){i<=1,d<=1}", author_full_name, overlapped=True)
    # if there are still no matches, ignore for now
    if len(new_last_name) == 0:
        pass    
    else:
        new_last_names.append([i,new_last_name[0]])

In [25]:
# replace last names in old dataframe with new last names (which contain an extra letter)
for i in range(len(new_last_names)):
    no_authors_df.at[new_last_names[i][0],'Corresponding_author_last_name'] = new_last_names[i][1]

In [26]:
no_authors_df["first and last names"]=[re.findall(no_authors_df.loc[i]['Corresponding_author_last_name']+"+[^;]*", no_authors_df.loc[i]['Author Full Names']) for i in no_authors_df.index]

In [27]:
# drop people with no first names at this point
one_loc_out = one_loc_out[one_loc_out["first and last names"].map(len)==1]

In [28]:
one_loc_out["first and last names"] = [i[0].split(', ') for i in one_loc_out["first and last names"]]

In [29]:
one_loc_out["First name"] = [i[-1] for i in one_loc_out["first and last names"]]

In [30]:
one_loc_names = one_loc_out.merge(names_dict, how='left', on='First name')

In [31]:
one_loc_names["Last name"] = [i[0] for i in one_loc_names["first and last names"]]

In [32]:
one_loc_na = one_loc_names[one_loc_names['inferred_gender_y'].isna()].drop_duplicates(subset=['First name'])

In [33]:
one_loc_na['Corresponding_auth_country'] = [i[-1] for i in one_loc_na['Corresponding author locations'].str.split(',')]

In [34]:
# one_loc_na[['First name','Last name','Corresponding_auth_country']].to_csv('one_location.csv', index=False)

In [35]:
one_loc_enriched = pd.read_csv('one_location_enriched.csv')

In [36]:
one_loc_na['inferred_gender'] = list(one_loc_enriched['ga_gender'])

In [37]:
for i in one_loc_na.index:
    one_loc_names.at[i,'inferred_gender_y'] = one_loc_na.at[i,'inferred_gender']

### Subcase 2: All authors have different locations

In [38]:
diff_loc_df = mult_authors_df[mult_authors_df['Corresponding author geocoords'].map(len)==mult_authors_df['Corresponding author Addresses'].map(len)]

In [39]:
diff_loc_df['map_props'] = [1 for i in range(len(diff_loc_df))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diff_loc_df['map_props'] = [1 for i in range(len(diff_loc_df))]


In [40]:
diff_loc_out = diff_loc_df.explode(['Corresponding author Addresses','Corresponding author geocoords'])
diff_loc_out['map_props'] /= diff_loc_out['map_props'].groupby(level=0).transform('count')

In [41]:
diff_loc_out[0:20]

Unnamed: 0,Author Full Names,Article Title,Journal,impact factor (2020),Author Keywords,Keywords Plus,Abstract,Addresses--all authors,Nations--all authors,Corresponding author Addresses,...,GDP_2020,_merge,Locations--all authors,Geocoords--all authors,Corresponding author locations,Corresponding author geocoords,Number of authoring countries,Number of corresponding countries,unique_NCBI,map_props
15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,"Amaral, R (corresponding author), Univ Coimbra...",...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,"Elias, M (corresponding author), Univ Ostrava...",...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(49.8209226, 18.262524300000006)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,"Bernard, MS (corresponding author), Sorbonne U...",...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.703091, -4.0292009)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,"Peters, AF (corresponding author), Bezhin Ros...",...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.726199, -3.985325)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,"D'Archino, R (corresponding author), Natl Inst...",...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(-41.2923814, 174.77874630000005)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,"Lin, SM (corresponding author), Natl Taiwan O...",...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(25.1275997, 121.7391815)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,"Friedl, T (corresponding author), Georg August...",...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548)",1,1,"[44427, 5747, 44431]",0.5
204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,"Wolf, M (corresponding author), Univ Wurzburg...",...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(51.54128040000001, 9.9158035)",1,1,"[44427, 5747, 44431]",0.5
249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,"Gu, HF (corresponding author), Minist Nat Reso...",...,2630000000000.0,both,"Xian, Shaanxi, China; Concarneau, France; Vict...","(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(34.2658138, 108.9540936)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5
249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,"Mertens, KN (corresponding author), IFREMER, ...",...,2630000000000.0,both,"Xian, Shaanxi, China; Concarneau, France; Vict...","(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(47.87283400000001, -3.920734)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5


In [42]:
diff_loc_out['Corresponding_author_last_name'] = diff_loc_out['Corresponding author Addresses'].str.split(', ').str[0]

In [43]:
diff_loc_out['Corresponding_author_last_name'] = diff_loc_out['Corresponding_author_last_name'].str.strip()

In [44]:
# names that just have initials as first names will just have the initials as the first name
diff_loc_out["first and last names"]=[re.findall(diff_loc_out.iloc[i]['Corresponding_author_last_name']+"+[^;]*", diff_loc_out.iloc[i]['Author Full Names']) for i in range(len(diff_loc_out))]

In [45]:
diff_loc_out[0:20]

Unnamed: 0,Author Full Names,Article Title,Journal,impact factor (2020),Author Keywords,Keywords Plus,Abstract,Addresses--all authors,Nations--all authors,Corresponding author Addresses,...,GDP_2020,_merge,Locations--all authors,Geocoords--all authors,Corresponding author locations,Corresponding author geocoords,Number of authoring countries,Number of corresponding countries,unique_NCBI,map_props
15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,"Amaral, R (corresponding author), Univ Coimbra...",...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,"Elias, M (corresponding author), Univ Ostrava...",...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(49.8209226, 18.262524300000006)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,"Bernard, MS (corresponding author), Sorbonne U...",...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.703091, -4.0292009)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,"Peters, AF (corresponding author), Bezhin Ros...",...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.726199, -3.985325)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,"D'Archino, R (corresponding author), Natl Inst...",...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(-41.2923814, 174.77874630000005)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,"Lin, SM (corresponding author), Natl Taiwan O...",...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(25.1275997, 121.7391815)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,"Friedl, T (corresponding author), Georg August...",...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548)",1,1,"[44427, 5747, 44431]",0.5
204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,"Wolf, M (corresponding author), Univ Wurzburg...",...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(51.54128040000001, 9.9158035)",1,1,"[44427, 5747, 44431]",0.5
249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,"Gu, HF (corresponding author), Minist Nat Reso...",...,2630000000000.0,both,"Xian, Shaanxi, China; Concarneau, France; Vict...","(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(34.2658138, 108.9540936)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5
249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,"Mertens, KN (corresponding author), IFREMER, ...",...,2630000000000.0,both,"Xian, Shaanxi, China; Concarneau, France; Vict...","(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(47.87283400000001, -3.920734)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5


In [46]:
diff_loc_out = diff_loc_out.reset_index()

In [47]:
diff_loc_out[0:20]

Unnamed: 0,index,Author Full Names,Article Title,Journal,impact factor (2020),Author Keywords,Keywords Plus,Abstract,Addresses--all authors,Nations--all authors,...,GDP_2020,_merge,Locations--all authors,Geocoords--all authors,Corresponding author locations,Corresponding author geocoords,Number of authoring countries,Number of corresponding countries,unique_NCBI,map_props
0,15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
1,15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(49.8209226, 18.262524300000006)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
2,50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.703091, -4.0292009)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
3,50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.726199, -3.985325)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
4,132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(-41.2923814, 174.77874630000005)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
5,132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(25.1275997, 121.7391815)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
6,204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548)",1,1,"[44427, 5747, 44431]",0.5
7,204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(51.54128040000001, 9.9158035)",1,1,"[44427, 5747, 44431]",0.5
8,249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,...,2630000000000.0,both,"Xian, Shaanxi, China; Concarneau, France; Vict...","(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(34.2658138, 108.9540936)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5
9,249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,...,2630000000000.0,both,"Xian, Shaanxi, China; Concarneau, France; Vict...","(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(47.87283400000001, -3.920734)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5


In [48]:
# find entries that have no corresponding author name extracted
no_authors_diff = []

for i in diff_loc_out.index:
    if len(diff_loc_out.loc[i]['first and last names']) == 0:
        no_authors_diff.append(i)
        
no_authors_diff_df = diff_loc_out.loc[no_authors_diff]

In [49]:
no_authors_diff_df

Unnamed: 0,index,Author Full Names,Article Title,Journal,impact factor (2020),Author Keywords,Keywords Plus,Abstract,Addresses--all authors,Nations--all authors,...,GDP_2020,_merge,Locations--all authors,Geocoords--all authors,Corresponding author locations,Corresponding author geocoords,Number of authoring countries,Number of corresponding countries,unique_NCBI,map_props
252,22643,"Zhou, Zhipeng; Li, Guoliang; Tan, Siyu; Li, Do...",A QTL atlas for grain yield and its component ...,PLANT BREEDING,1.832,candidate genes; grain yield; maize; meta-anal...,ENCODES; GENE; SIZE; ARCHITECTURE; WEIGHT; CEL...,Grain yield and its component traits are essen...,"[Zhou, Zhipeng; Li, Guoliang; Tan, Siyu; Li, D...",China; Germany,...,3850000000000.0,both,"Beijing, Beijing, China; Stuttgart, Germany","(39.904211, 116.407395); (48.7758459, 9.1829321)","Beijing, Beijing, China; Stuttgart, Germany","(48.7758459, 9.1829321)",2,2,[],0.5
328,26421,"Ramireddy, Eswarayya; Galuszka, Petr; Schmuell...",Zn-fortified cereal grains in field-grown barl...,PLANT SIGNALING & BEHAVIOR,1.86,Barley; biofortification; cereals; cytokinin; ...,ARABIDOPSIS; BIOFORTIFICATION,Zinc (Zn) is an essential element in human nut...,"[Ramireddy, Eswarayya; Schmuelling, Thomas] Fr...",Germany; India; Czechia,...,2670000000000.0,both,"Berlin, Germany; Tirupati, Andhra Pradesh, Ind...","(13.6287557, 79.4191795); (49.593778, 17.25087...","Berlin, Germany; Tirupati, Andhra Pradesh, India","(13.6287557, 79.4191795)",3,2,[4513],0.5
345,27321,"Zhang, Wenying; Wang, Jun; Xu, Le; Wang, Aiai;...",Drought stress responses in maize are diminish...,PLANT SIGNALING & BEHAVIOR,1.86,Piriformospora indica; maize; growth promotion...,SALT TOLERANCE; GROWTH; ARABIDOPSIS; EXPRESSIO...,"As an endophytic fungus of Sebacinales, Pirifo...","[Zhang, Wenying; Wang, Jun; Xu, Le; Wang, Aiai...",China; Germany,...,3850000000000.0,both,"Jingzhou, Hubei, China; Beijing, Beijing, Chin...","(30.33478989999999, 112.24068999999999); (39.9...","Jingzhou, Hubei, China; Jena, Germany","(50.927054, 11.5892372)",2,2,"[297313, 65672]",0.5
574,37032,"Zulqarnain; Silva, I. A.; van Melis, J.; Sfair...",Phylogenetic interactions among lianas in a so...,SOUTH AFRICAN JOURNAL OF BOTANY,2.315,Seasonal forest; Climbing traits; Phylogenetic...,BROAD-LEAVED FOREST; SPATIAL AUTOCORRELATION; ...,Biotic interactions play an important role in ...,"[Zulqarnain; van Melis, J.] Univ Campinas UNIC...",Brazil; Pakistan,...,300000000000.0,both,"Campinas, SP, Brazil; Recife, PE, Brazil; Bann...","(-22.9099384, -47.06263320000001); (-8.0577401...","Campinas, SP, Brazil; Bannu, KP, Pakistan","(-22.9099384, -47.06263320000001)",2,2,[332119],0.5
909,58127,"Chilczuk, Tomasz; Steinborn, Carmen; Breinling...",Hapalindoles from the Cyanobacterium Hapalosip...,PLANTA MEDICA,3.11,cyanobacteria; Hapalosiphon; hapalindoles; imm...,AMBIGUINE ISONITRILES; NATURAL-PRODUCTS; INDOL...,Novel immunomodulating agents are currently so...,"[Chilczuk, Tomasz; Breinlinger, Steffen; Niede...",Germany,...,3850000000000.0,both,"Halle, Germany; Freiburg, Germany; Berlin, Ger...","(47.99900770000001, 7.842104299999999); (51.49...","Freiburg, Germany; Berlin, Germany","(47.99900770000001, 7.842104299999999)",1,1,"[1117, 102234, 1892263]",0.5
912,58178,"Spiess, Deborah; Abegg, Vanessa Fabienne; Chau...",Placental Passage of Humulone and Protopine in...,PLANTA MEDICA,3.11,humulone; protopine; Humulus lupulus; Eschscho...,ESCHSCHOLZIA-CALIFORNICA; METABOLISM; TRANSPOR...,The placental passage of humulone and protopin...,"[Spiess, Deborah; Duong, Elisa; Simoes-Wuest, ...",Switzerland,...,752000000000.0,both,"Zurich, Switzerland; Basel, Switzerland","(47.37688660000001, 8.541694); (47.5595986, 7....","Basel, Switzerland; Zurich, Switzerland","(47.5595986, 7.5885761)",1,1,[],0.5
935,58939,"Harms, Henrik; Kloeckner, Anna; Schroer, Jan; ...",Antimicrobial Dialkylresorcins from Marine-Der...,PLANTA MEDICA,3.11,Zobellia galactanivorans; antibiotics; dialkyl...,BIOSYNTHESIS; SUBSTANCES; STEMPHOL,Zobellia galactanivorans has been reported as ...,"[Harms, Henrik; Schroer, Jan; Kehraus, Stefan;...",Germany,...,3850000000000.0,both,"Bonn, Germany; Giessen, Germany","(50.58405120000001, 8.678403099999999); (50.73...","Bonn, Germany; Giessen, Germany","(50.58405120000001, 8.678403099999999)",1,1,"[63186, 245170]",0.5
936,58939,"Harms, Henrik; Kloeckner, Anna; Schroer, Jan; ...",Antimicrobial Dialkylresorcins from Marine-Der...,PLANTA MEDICA,3.11,Zobellia galactanivorans; antibiotics; dialkyl...,BIOSYNTHESIS; SUBSTANCES; STEMPHOL,Zobellia galactanivorans has been reported as ...,"[Harms, Henrik; Schroer, Jan; Kehraus, Stefan;...",Germany,...,3850000000000.0,both,"Bonn, Germany; Giessen, Germany","(50.58405120000001, 8.678403099999999); (50.73...","Bonn, Germany; Giessen, Germany","(50.73743, 7.0982068)",1,1,"[63186, 245170]",0.5
997,61929,"Suessenbacher, Iris; Menghini, Damian; Scherze...",Cryptic chlorophyll breakdown in non-senescent...,PHOTOSYNTHESIS RESEARCH,3.12,Chlorophyll breakdown; Chlorophyll turnover; P...,LEAF SENESCENCE; STAY-GREEN; PROTEIN-DEGRADATI...,Chlorophyll (Chl) breakdown is a diagnostic vi...,"[Suessenbacher, Iris; Scherzer, Gerhard; Erhar...",Austria; Switzerland,...,752000000000.0,both,"Innsbruck, Austria; Zurich, Switzerland","(47.2692124, 11.4041024); (47.37688660000001, ...","Innsbruck, Austria; Zurich, Switzerland","(47.2692124, 11.4041024)",2,2,[3702],0.5
1255,73404,"Penzel, Martin; Kroeling, Christian",Thinning efficacy of metamitron on young 'RoHo...,SCIENTIA HORTICULTURAE,3.57,6-Benzyladenine; Chemical thinning; Chlorophyl...,FRUIT-SET; DELICIOUS APPLES; CHEMICAL THINNER;...,To achieve a high quantity of premium class fr...,"[Penzel, Martin] Leibniz Inst Agr Engn & Bioec...",Germany,...,3850000000000.0,both,"Potsdam, Germany; Dresden, Germany","(51.0504088, 13.7372621); (52.3905689, 13.0644...","Potsdam, Germany; Dresden, Germany","(52.3905689, 13.0644729)",1,1,"[3750, 23211]",0.5


In [50]:
# need to do the german last name thing
new_last_names_diff = []

for i in list(no_authors_diff_df.index):
    old_last_name = no_authors_diff_df.loc[i]['Corresponding_author_last_name']
    author_full_name = no_authors_diff_df.loc[i]['Author Full Names']
    # if the old last name is one insertion away from something in the author full names, then consider it a match
    new_last_name = regex.findall("("+old_last_name+"){i<=1,d<=1}", author_full_name, overlapped=True)
    # if there are still no matches, ignore for now
    if len(new_last_name) == 0:
        pass    
    else:
        new_last_names_diff.append([i,new_last_name[0]])

In [51]:
# replace last names in old dataframe with new last names (which contain an extra letter)
for i in range(len(new_last_names_diff)):
    no_authors_diff_df.at[new_last_names_diff[i][0],'Corresponding_author_last_name'] = new_last_names_diff[i][1]

In [52]:
no_authors_diff_df["first and last names"]=[re.findall(no_authors_diff_df.loc[i]['Corresponding_author_last_name']+"+[^;]*", no_authors_diff_df.loc[i]['Author Full Names']) for i in no_authors_diff_df.index]

In [53]:
# this is the problem

In [54]:
# drop people with no first names at this point
diff_loc_out = diff_loc_out[diff_loc_out["first and last names"].map(len)==1]

In [55]:
diff_loc_out[0:20]

Unnamed: 0,index,Author Full Names,Article Title,Journal,impact factor (2020),Author Keywords,Keywords Plus,Abstract,Addresses--all authors,Nations--all authors,...,GDP_2020,_merge,Locations--all authors,Geocoords--all authors,Corresponding author locations,Corresponding author geocoords,Number of authoring countries,Number of corresponding countries,unique_NCBI,map_props
0,15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
1,15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,...,245000000000.0,both,"Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(49.8209226, 18.262524300000006)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5
2,50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.703091, -4.0292009)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
3,50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,...,2630000000000.0,both,"Roscoff, France; Oban, Argyll, Scotland; Banch...","(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.726199, -3.985325)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5
4,132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(-41.2923814, 174.77874630000005)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
5,132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,...,,both,"Wellington, New Zealand; Keelung, Taiwan; Chap...","(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(25.1275997, 121.7391815)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5
6,204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548)",1,1,"[44427, 5747, 44431]",0.5
7,204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,...,3850000000000.0,both,"Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(51.54128040000001, 9.9158035)",1,1,"[44427, 5747, 44431]",0.5
9,249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,...,2630000000000.0,both,"Xian, Shaanxi, China; Concarneau, France; Vict...","(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(47.87283400000001, -3.920734)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5
11,251,"Luo, Zhaohe; Lim, Zhen Fei; Mertens, Kenneth N...",Morpho-molecular diversity and phylogeny of By...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,benthic dinoflagellate; biogeography; Bysmatru...,TIDAL POOL DINOFLAGELLATE; SP. NOV. DINOPHYCEA...,The dinoflagellate genus Bysmatrum encompasses...,"[Luo, Zhaohe; Li, Xintian; Gu, Haifeng] SOA, I...",China; Malaysia; France; Belgium; United States,...,2630000000000.0,both,"Xian, Shaanxi, China; Bachok, Kelantan, Malays...","(30.627977, -96.3344068); (34.2658138, 108.954...","Xian, Shaanxi, China; Concarneau, France","(47.87283400000001, -3.920734)",5,2,"[2077185, 2077196, 990649, 990648, 2910]",0.5


In [56]:
diff_loc_out["first and last names"] = [i[0].split(', ') for i in diff_loc_out["first and last names"]]

In [57]:
diff_loc_out["First name"] = [i[-1] for i in diff_loc_out["first and last names"]]

In [58]:
diff_loc_out["Last name"] = [i[0] for i in diff_loc_out["first and last names"]]

In [59]:
diff_loc_names = diff_loc_out.merge(names_dict, how='left', on='First name')

In [60]:
diff_loc_na = diff_loc_names[diff_loc_names['inferred_gender_y'].isna()].drop_duplicates(subset=['First name'])

In [61]:
diff_loc_na['Corresponding_auth_country'] = [i[-1] for i in diff_loc_na['Corresponding author locations'].str.split(',')]

In [62]:
# diff_loc_na[['First name','Last name','Corresponding_auth_country']].to_csv('diff_location.csv', index=False)

In [63]:
diff_loc_enriched = pd.read_csv('diff_location_enriched.csv')

In [67]:
diff_loc_na['inferred_gender'] = list(diff_loc_enriched['ga_gender'])

In [68]:
# for i in diff_loc_na.index:
#     diff_loc_names.at[i,'inferred_gender_y'] = diff_loc_na.at[i,'inferred_gender']

In [118]:
diff_loc_na[['index', 'First name', 'inferred_gender']]

Unnamed: 0,index,First name,inferred_gender
3,50,Akira F.,female
8,249,Kenneth Neil,male
11,541,Daekyung,male
39,3174,Hang,
40,3174,Komiljon Sh,male
...,...,...,...
8503,288960,Junlae,female
8505,289045,Zhishu,
8506,289045,Jinao,female
8507,289058,Suin,female


# Merge back with original data

In [165]:
diff_loc_out = diff_loc_df.explode(['Corresponding author Addresses','Corresponding author geocoords'])
diff_loc_out['map_props'] /= diff_loc_out['map_props'].groupby(level=0).transform('count')

In [166]:
diff_loc_out['Corresponding_author_last_name'] = diff_loc_out['Corresponding author Addresses'].str.split(', ').str[0]

In [167]:
diff_loc_out["first and last names"]=[re.findall(diff_loc_out.iloc[i]['Corresponding_author_last_name']+"+[^;]*", diff_loc_out.iloc[i]['Author Full Names']) for i in range(len(diff_loc_out))]

In [168]:
diff_loc_out = diff_loc_out.reset_index()

In [169]:
# find entries that have no corresponding author name extracted
no_authors_diff = []

for i in diff_loc_out.index:
    if len(diff_loc_out.loc[i]['first and last names']) == 0:
        no_authors_diff.append(i)
        
no_authors_diff_df = diff_loc_out.loc[no_authors_diff]

In [170]:
# need to do the german last name thing
new_last_names_diff = []

for i in list(no_authors_diff_df.index):
    old_last_name = no_authors_diff_df.loc[i]['Corresponding_author_last_name']
    author_full_name = no_authors_diff_df.loc[i]['Author Full Names']
    # if the old last name is one insertion away from something in the author full names, then consider it a match
    new_last_name = regex.findall("("+old_last_name+"){i<=1,d<=1}", author_full_name, overlapped=True)
    # if there are still no matches, ignore for now
    if len(new_last_name) == 0:
        pass    
    else:
        new_last_names_diff.append([i,new_last_name[0]])

In [171]:
# replace last names in old dataframe with new last names (which contain an extra letter)
for i in range(len(new_last_names_diff)):
    no_authors_diff_df.at[new_last_names_diff[i][0],'Corresponding_author_last_name'] = new_last_names_diff[i][1]

In [172]:
no_authors_diff_df["first and last names"]=[re.findall(no_authors_diff_df.loc[i]['Corresponding_author_last_name']+"+[^;]*", no_authors_diff_df.loc[i]['Author Full Names']) for i in no_authors_diff_df.index]

In [173]:
mult_author_df = diff_loc_out[diff_loc_out["first and last names"].map(len) != 1]

In [174]:
mult_author_df['Corresponding_author_last_name'] = mult_author_df['Corresponding author Addresses'].str.findall(r'^[^,]*,..').str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mult_author_df['Corresponding_author_last_name'] = mult_author_df['Corresponding author Addresses'].str.findall(r'^[^,]*,..').str[0]


In [175]:
mult_author_df['Corresponding_author_last_name'] = mult_author_df['Corresponding_author_last_name'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mult_author_df['Corresponding_author_last_name'] = mult_author_df['Corresponding_author_last_name'].str.strip()


In [176]:
# use previous cell to find first and last names of corresponding authors
author_matches = []

for i in mult_author_df.index:
    mylist = mult_author_df.loc[i]['first and last names']
    r = re.compile(str(mult_author_df.loc[i]['Corresponding_author_last_name'])+".*")
    newlist = list(filter(r.match, mylist))
    author_matches.append(newlist)
    
mult_author_df['first and last names'] = author_matches

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mult_author_df['first and last names'] = author_matches


In [177]:
for i in mult_author_df.index:
    diff_loc_out.at[i,'first and last names'] = mult_author_df.at[i,'first and last names']

In [178]:
for i in diff_loc_out["first and last names"]:
    try:
        diff_loc_out.at[i,"first and last names"] = i[0].split(', ')
    except:
        pass

In [179]:
First_name = []

for i in diff_loc_out["first and last names"]:
    try:
        First_name.append(i[-1])
    except:
        First_name.append('None')
        
diff_loc_out["First name"] = First_name

In [180]:
Last_name = []

for i in diff_loc_out["first and last names"]:
    try:
        Last_name.append(i[0])
    except:
        Last_name.append('None')
        
diff_loc_out["Last name"] = Last_name

In [121]:
diff_loc_out[0:15][['Author Full Names', 'inferred_gender']]

Unnamed: 0,Author Full Names,inferred_gender
15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",female
15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",female
50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...",female
50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...",female
132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...",female
132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...",female
204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",male
204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",male
249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...",female
249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...",female


In [144]:
merged_diff_loc = diff_loc_out.merge(diff_loc_na, how = 'inner', on = ['index', 'First name'])

  merged_diff_loc = diff_loc_out.merge(diff_loc_na, how = 'inner', on = ['index', 'First name'])


In [None]:
# strategy 2: make new dictionary

In [147]:
diff_loc_enriched = pd.read_csv('diff_location_enriched.csv')

In [148]:
one_loc_enriched = pd.read_csv('one_location_enriched.csv')

In [152]:
names_dict_diff = diff_loc_enriched[['First name', 'ga_gender']]

In [154]:
names_dict_diff = names_dict_diff.rename(columns = {'ga_gender': 'inferred_gender'})

In [156]:
names_dict_one = one_loc_enriched[['First name', 'ga_gender']]

In [157]:
names_dict_one = names_dict_one.rename(columns = {'ga_gender': 'inferred_gender'})

In [159]:
names_dict_final = pd.concat([names_dict, names_dict_diff, names_dict_one])

In [161]:
diff_loc_names_final = diff_loc_out.merge(names_dict_final, how='left', on='First name')

In [163]:
diff_loc_names_final[0:15]

Unnamed: 0,index,Author Full Names,Article Title,Journal,impact factor (2020),Author Keywords,Keywords Plus,Abstract,Addresses--all authors,Nations--all authors,...,Geocoords--all authors,Corresponding author locations,Corresponding author geocoords,Number of authoring countries,Number of corresponding countries,unique_NCBI,map_props,First name,Last name,inferred_gender_y
0,15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,...,"(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(40.20331450000001, -8.4102573)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5,"Amaral, Raquel","Amaral, Raquel",
1,15,"Amaral, Raquel; Sevcikova, Tereza; Elias, Mare...",CharaciopsisBorzi belongs to the Eustigmatophy...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Characiopsis; Eustigmataceae group; Eustigmato...,ALGAL CLASS; PHYLOGENY; ULTRASTRUCTURE; NOMENC...,"Characiopsis, established by Borzi in 1895, is...","[Amaral, Raquel; Santos, Lilia M. A.] Univ Coi...",Portugal; Czechia,...,"(40.20331450000001, -8.4102573); (49.8209226, ...","Coimbra, Portugal; Ostrava, Czechia","(49.8209226, 18.262524300000006)",2,2,"[425071, 2713033, 1431827, 5747, 44429, 271303...",0.5,"Elias, Marek","Elias, Marek",
2,50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,...,"(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.703091, -4.0292009)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5,"Bernard, Miriam S.","Bernard, Miriam S.",
3,50,"Bernard, Miriam S.; Strittmatter, Martina; Mur...","Diversity, biogeography and host specificity o...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,5'COI; barcoding; biogeography; endophytes; ho...,ALGA CAULERPA-TAXIFOLIA; NUCLEAR RIBOSOMAL DNA...,Endophytic filamentous brown algae are known t...,"[Bernard, Miriam S.; Heesch, Svenja; Leblanc, ...",France; Scotland; South Korea,...,"(37.41379999999999, 127.5183); (48.703091, -4....","Roscoff, France; Santec, France","(48.726199, -3.985325)",3,1,"[90893, 64910, 64922, 64929, 309358, 172714, 6...",0.5,"Peters, Akira F.","Peters, Akira F.",
4,132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,...,"(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(-41.2923814, 174.77874630000005)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5,"D'Archino, Roberta","D'Archino, Roberta",
5,132,"D'Archino, Roberta; Lin, Showe-Mei; Gabrielson...","Why one species in New Zealand, Pugetia delica...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,DNA sequencing; endemic genera; large-subunit ...,SUBUNIT RDNA; GIGARTINACEAE RHODOPHYTA; MAXIMU...,"Blade-forming red algae occur worldwide and, p...","[D'Archino, Roberta] Natl Inst Water & Atmosph...",New Zealand; Taiwan; United States,...,"(-41.2923814, 174.77874630000005); (25.1275997...","Wellington, New Zealand; Keelung, Taiwan","(25.1275997, 121.7391815)",3,2,"[217477, 1789642, 1789644, 1789643, 31430]",0.5,"Lin, Showe-Mei","Lin, Showe-Mei",
6,204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,...,"(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(49.79130439999999, 9.9533548)",1,1,"[44427, 5747, 44431]",0.5,"Friedl, Thomas","Friedl, Thomas",
7,204,"Kryvenda, Anastasiia; Rybalka, Nataliya; Wolf,...",Species distinctions among closely related str...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Eustigmatophyceae; Eustigmatos; fatty acids; i...,CHAIN ALKYL DIOLS; GREEN-ALGAE; SECONDARY STRU...,There is an increasing interest in the Eustigm...,"[Kryvenda, Anastasiia; Rybalka, Nataliya; Frie...",Germany,...,"(49.79130439999999, 9.9533548); (51.5412804000...","Gottingen, Germany; Wurzburg, Germany","(51.54128040000001, 9.9158035)",1,1,"[44427, 5747, 44431]",0.5,"Wolf, Matthias","Wolf, Matthias",
8,249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,...,"(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(34.2658138, 108.9540936)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5,"Gu, Haifeng","Gu, Haifeng",
9,249,"Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Eli...","Morphology, ultrastructure and molecular phylo...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Bysmatrum; cyst; dinoflagellate; eyespot; Peri...,FRESH-WATER DINOFLAGELLATE; RDNA-BASED PHYLOGE...,The dinoflagellate order Peridiniales encompas...,"[Luo, Zhaohe; Gu, Li; Gu, Haifeng] Minist Nat ...",China; France; Canada; Indonesia,...,"(-6.2250138, 106.9004472); (34.2658138, 108.95...","Xian, Shaanxi, China; Concarneau, France","(47.87283400000001, -3.920734)",4,2,"[54902, 2867, 66803, 991128, 2137288, 1691972,...",0.5,"Mertens, Kenneth Neil","Mertens, Kenneth Neil",


In [184]:
diff_loc_out['First name'][0]

'Amaral, Raquel'

In [None]:
# old merging

In [71]:
data = pd.read_csv('../ete3/Plt_sci_publications_geo_species_8.26.csv', low_memory=False)

In [72]:
data.loc[249]['Corresponding_author_last_name']

'Gu'

In [73]:
data.loc[249]['Corresponding author Addresses']

'Gu, HF (corresponding author), Minist Nat Resources, Inst Oceanog 3, Xiamen 361005, Peoples R China.; Mertens, KN (corresponding author), IFREMER, LER BO, Stn Biol Marine, Pl Croix,BP40537, F-29185 Concarneau, France.'

In [74]:
data.loc[249]['Author Full Names']

'Luo, Zhaohe; Mertens, Kenneth Neil; Nezan, Elizabeth; Gu, Li; Pospelova, Vera; Thoha, Hikmah; Gu, Haifeng'

In [75]:
one_loc_names_merged = (one_loc_names.groupby(['index'])
    .agg({'inferred_gender_y': lambda x: x.tolist()}))

In [76]:
one_loc_names_merged.loc[0]['inferred_gender_y']

['male', 'female']

In [77]:
diff_loc_names_merged = (diff_loc_names.groupby(['index'])
    .agg({'inferred_gender_y': lambda x: x.tolist()}))

In [78]:
diff_loc_names_merged

Unnamed: 0_level_0,inferred_gender_y
index,Unnamed: 1_level_1
15,"[female, male]"
50,"[female, female]"
132,"[female, female]"
204,"[male, male]"
249,[male]
...,...
289754,[female]
289763,"[male, male]"
290266,"[male, male]"
290469,"[male, male]"


In [79]:
for i in one_loc_names_merged.index:
    data.at[i,'inferred_gender'] = one_loc_names_merged.at[i,'inferred_gender_y']

In [80]:
for i in diff_loc_names_merged.index:
    data.at[i,'inferred_gender'] = diff_loc_names_merged.at[i,'inferred_gender_y']

In [81]:
# data.to_csv('data_full_gender.csv', index = False)

In [82]:
# data.to_pickle('data_full_gender.pkl')

In [83]:
data['inferred_gender'].loc[0]

['male', 'female']

### Subcase 3: Some authors have different locations. Drop these
Some authors may correspond to more than one location and or/ multiple authors may correspond to one location

In [84]:
# get everything not in subcases 1 or 2
step1 = mult_authors_df[~mult_authors_df.isin(one_loc_df)].dropna(subset=['Author Full Names'])
mult_loc_df = step1[~step1.isin(diff_loc_df)].dropna(subset=['Author Full Names'])

In [85]:
# probably drop these (around 4k)
mult_loc_df

Unnamed: 0,Author Full Names,Article Title,Journal,impact factor (2020),Author Keywords,Keywords Plus,Abstract,Addresses--all authors,Nations--all authors,Corresponding author Addresses,...,Per_capita_income_2020,GDP_2020,_merge,Locations--all authors,Geocoords--all authors,Corresponding author locations,Corresponding author geocoords,Number of authoring countries,Number of corresponding countries,unique_NCBI
568,"Pena, Viviana; Belanger, David; Gagnon, Patric...","Lithothamnion (Hapalidiales, Rhodophyta) in th...",EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Coralline red algae; cox1; DNA barcoding; dist...,RHODOLITH BEDS CORALLINALES; NORTH-ATLANTIC; P...,Coralline red algae in the non-geniculate gene...,"[Pena, Viviana] Univ A Coruna, Fac Ciencias, B...",Spain; Canada; United States; France; Norway; ...,"[Pena, V (corresponding author), Univ A Coruna...",...,63027.679530,2.090000e+13,both,"La Coruna, Galicia, Spain; St John, NF, Canada...","(30.2240897, -92.0198427); (35.91319960000001,...","La Coruna, Galicia, Spain; Chapel Hill, NC, Un...","[(35.91319960000001, -79.05584449999998), (43...",9.0,2.0,"[48605, 48598, 48601]"
694,"Schwartz, Nicole; Dobretsov, Sergey; Rohde, Sv...",Comparison of antifouling properties of native...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Antifouling; chemical defence; invasive alga; ...,ALGA FUCUS-VESICULOSUS; LARVAL SETTLEMENT; MAR...,The invasiveness of algal species can be facil...,"[Schwartz, Nicole; Rohde, Sven; Schupp, Peter ...",Germany; Oman,"[Schwartz, N (corresponding author), Carl von ...",...,31117.757710,7.397139e+10,both,"Wilhelmshaven, Germany; Muscat, Oman","(23.5880307, 58.38287170000001); (53.532340299...","Wilhelmshaven, Germany; Muscat, Oman","[(23.5880307, 58.38287170000001), (53.5323402...",2.0,2.0,"[74468, 74090, 2856, 10212, 10205, 3015, 536]"
703,"Shan, Tifeng; Pang, Shaojun; Wang, Xuemei; Li,...",Genetic analysis of a recently established Und...,EUROPEAN JOURNAL OF PHYCOLOGY,0.874,Genetic diversity; invasive seaweed; kelp; mic...,BROWN ALGA; R-PACKAGE; INVASION; PHAEOPHYCEAE;...,"Undaria pinnatifida, a kelp species native to ...","[Shan, Tifeng; Pang, Shaojun; Wang, Xuemei; Li...",China; Germany; England; Japan,"[Shan, TF, Pang, SJ (corresponding author), C...",...,54844.545970,3.850000e+12,both,"Qingdao, Shandong, China; Beijing, Beijing, Ch...","(34.9755668, 138.3826773); (36.0662299, 120.38...","Qingdao, Shandong, China; Bremen, Germany","[(36.0662299, 120.38298999999999), (53.079296...",4.0,2.0,"[172714, 74381]"
1791,"Acuna, Rafael; Fliesswasser, Stella; Ackermann...",Phylogenetic relationships and generic re-arra...,TAXON,0.980,Blumenbachia; Caiophora; morphology; Loasa; pl...,SUBFAM. LOASOIDEAE; DNA-SEQUENCES; CORNALES; P...,"Loasaceae, a mostly American group, is one of ...","[Acuna, Rafael; Fliesswasser, Stella; Ackerman...",Germany; Costa Rica; Chile,"[Acuna, R, Weigend, M (corresponding author),...",...,41608.025350,2.450000e+11,both,"Bonn, Germany; Koblenz, Germany; Berlin, Germa...","(-33.4488897, -70.66926550000002); (50.3569429...","Bonn, Germany; San Pedro, Costa Rica","[(50.73743, 7.0982068), (9.9301517, -84.05110...",3.0,2.0,"[37831, 193479, 187910, 37822, 37823, 2570285,..."
1908,"Boo, Ga Hun; Tu Van Nguyen; Kim, Jung Yeon; Le...",A revised classification of the Gelidiellaceae...,TAXON,0.980,Gelidiales; Gelidiella; Madagascar; multigene ...,ACEROSA FORSSKAL FELDMANN; MARINE-ALGAE; GELID...,The agarophyte family Gelidiellaceae currently...,"[Boo, Ga Hun] Univ Calif Berkeley, Univ Herbar...",United States; South Korea; Vietnam; France; S...,"[Boo, GH (corresponding author), Univ Calif Be...",...,,,both,"Berkeley, CA, United States; Taejon, Chungnam,...","(10.8230989, 106.6296638); (36.7871202, 127.12...","Berkeley, CA, United States; Taejon, Chungnam,...","[(36.7871202, 127.1205669), (37.8715226000000...",6.0,2.0,"[564942, 1819763, 2026934, 417196, 28866, 2026..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289741,"Thioune, El-Hadji; Strickler, Susan; Gallagher...",Temperature Impacts the Response ofCoffea cane...,TROPICAL PLANT BIOLOGY,1.512,Coffee; Drought; Water deficit; Combined water...,CLIMATE-CHANGE; DROUGHT TOLERANCE; COFFEA-CANE...,Climate change is expected to result in more f...,"[Thioune, El-Hadji; Gallagher, Thomas; Osborne...",Ireland; France; United States; Switzerland,"[Osborne, B (corresponding author), Univ Coll ...",...,46991.182030,2.630000e+12,both,"Glasnevin Hill, Dublin, Ireland; Rouen, France...","(42.4439614, -76.5018807); (46.2043907, 6.1431...","Glasnevin Hill, Dublin, Ireland; Tours, France","[(47.394144, 0.68484), (53.3742486, -6.268633...",4.0,2.0,[13442]
289755,"Cao, Qinghe; Li, Ang; Chen, Jinyang; Sun, Yu; ...",Transcriptome Sequencing of the Sweet Potato P...,TROPICAL PLANT BIOLOGY,1.512,Ipomoea trifida; Transcriptome; RNA-seq; Droug...,GERM PLASM; SYNTHETIC HEXAPLOIDS; IMPROVEMENT;...,Ipomoea trifida (H.B.K.) G. Don. is the closes...,"[Cao, Qinghe; Li, Ang; Chen, Jinyang; Tang, Ju...",China,"[Cao, QH (corresponding author), Minist Agr, K...",...,17189.281790,1.470000e+13,both,"Xuzhou, Jiangsu, China; Tianjin, Tianjin, China","(34.2043999, 117.28577); (39.0850999, 117.19937)","Xuzhou, Jiangsu, China; Tianjin, Tianjin, China","[(34.2043999, 117.28577), (39.0850999, 117.19...",1.0,1.0,[4120]
290047,"Poudel, Bindu; Abdalla, Osama A.; Liu, Qingchu...",Field distribution and disease incidence of to...,TROPICAL PLANT PATHOLOGY,1.512,Disease incidence; Field distribution; Groundn...,GROUNDNUT-RINGSPOT-VIRUS; INFECTING TOMATO; WI...,Tomato chlorotic spot tospovirus is a species ...,"[Poudel, Bindu; Abdalla, Osama A.; Liu, Qingch...",United States; Egypt,"[Poudel, B, Zhang, SA (corresponding author),...",...,63027.679530,2.090000e+13,both,"Homestead, FL, United States; Yuma, AZ, United...","(25.4687224, -80.4775569); (26.7617322, -81.43...","Homestead, FL, United States; Yuma, AZ, United...","[(25.4687224, -80.4775569), (32.6926512, -114...",2.0,1.0,[]
290051,"Ramos-Gonzalez, Pedro Luis; da Costa-Rodrigues...",First genome sequence of an isolate of hibiscu...,TROPICAL PLANT PATHOLOGY,1.512,Betacarmovirus; Hibiscus rosa-sinensis; High-t...,IDENTIFICATION; PURIFICATION; EXPRESSION; CARM...,"For the first time, the near-complete genome s...","[Ramos-Gonzalez, Pedro Luis; da Costa-Rodrigue...",Brazil,"[Ramos-Gonzalez, PL, Freitas-Astua, J (corres...",...,14834.193700,1.450000e+12,both,"Vila Mariana, SP, Brazil; Piracicaba, SP, Braz...","(-12.6718752, -39.1039317); (-22.7342864, -47....","Vila Mariana, SP, Brazil; Cruz Das Almas, BA, ...","[(-12.6718752, -39.1039317), (-23.589702, -46...",1.0,1.0,[]


In [86]:
data = data[~data.index.isin(mult_loc_df.index)]

In [87]:
# data.to_pickle('data_full_gender.pkl')