# Data Retrieval Scripts
I combined data from several sources + fixed some rows manually. Here are links to the sources:

- UCI Name-Gender Dataset: https://archive.ics.uci.edu/ml/datasets/Gender+by+Name
    - Downloaded the dataset and matched the names I had in my own dataset. If match found, prob = 1.
- Chinese-English Names-Genders Corpus: https://raw.githubusercontent.com/wainshine/Chinese-Names-Corpus/master/English_Names_Corpus/English_Cn_Name_Corpus_Gender（48W）.txt

- Behind the Name Downloadable Dataset: https://www.behindthename.com/api/

- Genderize.io: https://genderize.io/
    - Querying names one at a time. There is a daily limit to appx 1500 queries per IP-address.

In [66]:
!pip install genderize
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [67]:
import pandas as pd
from tqdm import tqdm
from unidecode import unidecode
from genderize import Genderize # Gender API

In [190]:
# Import the df
names_genders = pd.read_csv('data/names_genders.csv', sep = ',')

# Sort to have the NAs for updating ready
## This sort is performed every time when the data is read in
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                          ascending = [0,0,1], 
                          na_position= 'first').reset_index(drop = True)
print(f"Number of names uncategorized: {names_genders['gender'].isnull().sum()}")
names_genders.head()

Number of names uncategorized: 8825


Unnamed: 0,first_name,alph_value,gender,prob
0,JanikVasily,J,,
1,JannFrederik,J,,
2,Jannatun,J,,
3,Jannes,J,,
4,Janniele,J,,


# Match the data with external data

In [186]:
def search_gender_from_data(names_genders, ext_data, ext_firstname, ext_gender):
    """Match the names with an external dataset and retrieve gender
    Args:
        ext_data (pd.DataFrame): external dataset to be used for matching
        ext_firstname (str): name of the 'first_name' column in the external dataset
        ext_gender (str): name of the 'gender' column in the external dataset
    """
    
    # Search for names from the UCI name data set
    for i in tqdm(range(len(names_genders))):
        
        # If gender is already marked, ignore the row
        if names_genders.loc[i, 'gender'] == 'M' or names_genders.loc[i, 'gender'] == 'F':
                pass
        else:
            # Extract the name and letter
            firstname = names_genders.loc[i, 'first_name']

            # Search in a subset of the externalm dataset
            idx = ext_data[ext_data[ext_firstname] == firstname].index

            # If no index found, no name -> do nothing
            if len(idx) == 0:
                pass
            
            else:
                idx = idx.values[0]
                print(firstname, ext_data.loc[idx, ext_gender][0], 1)
                names_genders.loc[i, 'gender'] = ext_data.loc[idx, ext_gender][0] # get the gender
                names_genders.loc[i, 'prob'] = 1 # set prob to 1

### Use the Chinese data to fetch names

In [None]:
# Import the data (see below for data retrieval)
chinese_names = pd.read_csv('chinese_names.csv')

# Run the function
search_gender_from_data(names_genders, chinese_names, 'name_eng', 'gender')

# Sort the data so that no-retrieved data is first
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
# Update the names-genders csv
names_genders.to_csv('data/names_genders.csv', index = False)

# Remove the Chinese names tables
! rm chinese_names.csv

### Use the UCI data

In [None]:
# Uncomment the below code
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# Import the data (see below for data retrieval)
uci_names = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00591/name_gender_dataset.csv')

# Run the function
search_gender_from_data(names_genders, uci_names, 'Name', 'Gender')

# Sort the data so that no-retrieved data is first
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
# Update the names-genders csv
names_genders.to_csv('data/names_genders.csv', index = False)

### Use the Behind the Name data

In [187]:
btn_names = pd.read_table('data/btn_givennames.txt')

# Do the data cleaning
btn_names['name'] = btn_names['name'].apply(unidecode)
btn_names = btn_names[btn_names['name'].str.contains('[A-Za-z]', na=False)].reset_index(drop = True)
btn_names = btn_names[~(btn_names['name'].str.len() < 4)].reset_index(drop = True)
btn_names['name'] = btn_names['name'].str.replace(' ', '')
btn_names['gender'] = btn_names['gender'].str.upper()
btn_names

Unnamed: 0,name,gender
0,Aabraham,M
1,Aada,F
2,Aadan,M
3,Aadolf,M
4,Aafje,F
...,...,...
24273,Zydre,F
24274,Zydrunas,M
24275,Zygfryd,M
24276,Zygmunt,M


In [188]:
# Run the function
search_gender_from_data(names_genders, btn_names, 'name', 'gender')


  1%|▎                                    | 201/21336 [00:00<00:20, 1043.77it/s]

Jaouad M 1
Jarmo M 1
Jaroslava F 1
Jayadev M 1
Jayanta M 1
Jayashri F 1
Jedrzej M 1
Jeppe M 1
Jernej M 1
Jevgenija F 1
Jiang M 1
Jianhong M 1


  4%|█▍                                   | 827/21336 [00:00<00:16, 1226.48it/s]

Joeri M 1
Jonatas M 1
Joona M 1
Joonas M 1
Jordao M 1
JoseAntonio M 1
JoseMaria M 1
Jouko M 1
JuanJose M 1
JuanManuel M 1
JuanPablo M 1
Juhan M 1
Juhana M 1
Jurgita F 1
Jurica M 1
Jussi M 1
Kahina F 1
Kallistos M 1
Kambiz M 1
Kanchana F 1


  6%|██                                  | 1200/21336 [00:01<00:16, 1231.35it/s]

Kanstantsin M 1
Karlijn F 1
Karolos M 1
Katsuhito M 1
Katsuro M 1
Kazuhiko M 1
Kejsi F 1
Ketil M 1
Khadijeh F 1
Khwaja M 1
Kimmo M 1
Kirils M 1
Kishori F 1
Kivanc M 1
Klavdiya F 1
Kleitos M 1
Klemen M 1
Klemens M 1
Koenraad M 1
Kolos M 1


  7%|██▍                                 | 1450/21336 [00:01<00:16, 1238.60it/s]

Kostyantyn M 1
Kouji M 1
Kristiyan M 1
Krste M 1
Kshitij M 1
Kuzman M 1
Lassi M 1
Laurentiu M 1
Leifur M 1
Leontios M 1
Lesya F 1
Lieven M 1


  9%|███                                 | 1826/21336 [00:01<00:15, 1236.11it/s]

Liubov F 1
Liudmila F 1
Liudmyla F 1
Liviu M 1
Liwen M 1
Ljerka F 1
Lodewijk M 1
Lojze M 1
Lorand M 1
Lovro M 1
Lubos M 1
Ludek M 1
Ludivine F 1
Luminita F 1
Lutfi M 1
Lutfiye F 1
Lutgarde F 1
Lykourgos M 1
Maarit F 1
Maartje F 1
Madhukar M 1
Madita F 1
Magdy M 1
Magomed M 1
Maimunah F 1
Maksims M 1
Malati F 1
Mamman M 1
MariaCristina F 1
MariadelMar F 1
Marijn M 1
Marnix M 1


 10%|███▋                                | 2202/21336 [00:01<00:15, 1236.44it/s]

Martim M 1
Masahiko M 1
Masoumeh F 1
Mateu M 1
Matevz M 1
Matjaz M 1
Matko M 1
Meadhbh F 1
Medhat M 1
Mesud M 1
Metod M 1
Midhat M 1
MiguelAngel M 1
Mihkel M 1
Miloslav M 1
Minko M 1
Minoo F 1
Mirco M 1
Mitja M 1


 12%|████▎                               | 2574/21336 [00:02<00:15, 1235.92it/s]

Mogens M 1
Mojdeh F 1
Momir M 1
Moray M 1
Morvarid F 1
Mozhgan F 1
Murugan M 1
Muthoni F 1
Mykhailo M 1
Myrthe F 1
Nadiia F 1
Naoise M 1
Nazgul F 1
Necla F 1
Neofytos M 1


 14%|████▉                               | 2949/21336 [00:02<00:14, 1234.83it/s]

Nezihe F 1
Nigora F 1
Nigul M 1
Nilima F 1
Nitzan M 1
Nolwenn F 1
Noureddin M 1
Nurettin M 1
Odhran M 1
Oivind M 1
Olegs M 1
Oleksandra F 1
Orsolya F 1
Osvald M 1
Othmane M 1
Otokar M 1
Oualid M 1
Oumarou M 1
Oystein M 1
Oyvind M 1
Ozgur M 1
Palina F 1
Pallab M 1
Pascaline F 1
Paskal M 1
Pauliina F 1
Pavol M 1
Pelayo M 1
Pencho M 1


 16%|█████▌                              | 3326/21336 [00:02<00:14, 1244.36it/s]

Pentti M 1
Pepijn M 1
Pertti M 1
Petteri M 1
Petya M 1
Pierpaolo M 1
Pierrick M 1
Pirouz M 1
Placid M 1
Polona F 1
Pradip M 1
Prasanna M 1
Prasenjit M 1
Pritha F 1
Przemek M 1
Przemyslaw M 1
Putri F 1
Qinglong M 1


 17%|██████▏                             | 3702/21336 [00:03<00:14, 1240.27it/s]

Radboud M 1
Radim M 1
Radko M 1
Radomil M 1
Radoslaw M 1
Raimondas M 1
Rajib M 1
Rajmund M 1
Rasel M 1
Rasoul M 1
Ratnam M 1
Reinoud M 1
Reinout M 1
Renars M 1
Renske F 1
Resit M 1
Riikka F 1
Rinat M 1
Rodica F 1
Rogier M 1
Romeu M 1
Roope M 1
Rositsa F 1
Rozenn F 1
Rudiger M 1


 19%|██████▉                             | 4077/21336 [00:03<00:13, 1242.21it/s]

Rukiye F 1
Rustem M 1
Ryoichi M 1
Sadegh M 1
Saeid M 1
Saiful M 1
Sakine F 1
Salme F 1
Samuli M 1
Sanjib M 1
Sanjin M 1
Santeri M 1
Sasko M 1
Sebnem F 1


 21%|███████▌                            | 4455/21336 [00:03<00:13, 1250.61it/s]

Semen M 1
Senol M 1
Seong M 1
Seppo M 1
Sergejs M 1
Serhii M 1
Serif M 1
Sevil F 1
Shahrokh M 1
Shankara M 1
Sherzod M 1


 23%|████████▏                           | 4834/21336 [00:03<00:13, 1249.19it/s]

Shinsuke M 1
Shirou M 1
Shohreh F 1
Shouta M 1
Shresth M 1
Siarhei M 1
Sibusiso M 1
Sietse M 1
Silviu M 1
Simge F 1
Sindre M 1
Sindri M 1
Sixte M 1
Sixtine F 1


 24%|████████▊                           | 5209/21336 [00:04<00:13, 1239.82it/s]

Sjaak M 1
Sjoerd M 1
Slawomir M 1
Somayeh F 1
Sondre M 1
Souma M 1
Spela F 1
Stanka F 1
Steinn M 1
Stien F 1
Stijn M 1
Stine F 1
Subramanian M 1
Sudheer M 1
Sukhwinder M 1
Sumati F 1
Sunan M 1


 26%|█████████▍                          | 5584/21336 [00:04<00:12, 1241.34it/s]

Svante M 1
Svein M 1
Sveinn M 1
Svetla F 1
Sviatlana F 1
Svyatoslav M 1
Sylwester M 1
Syuzanna F 1
Szczepan M 1
Szilveszter M 1
Taavet M 1
Tadej M 1
Takehiko M 1
Tancredi M 1
Tanel M 1
Taneli M 1
Tayeb M 1
Teemu M 1
Telmo M 1
Teppo M 1
Terhi F 1
Terje M 1
Tetyana F 1
Tevfik M 1
Tharindu M 1
Theophanes M 1
Thibaud M 1
Thiemo M 1
Thorarinn M 1
Thorben M 1
Thore M 1
Thorir M 1


 28%|██████████                          | 5960/21336 [00:04<00:12, 1236.67it/s]

Tiberiu M 1
Torbjorn M 1
Torgeir M 1
Tornike M 1
Traian M 1
Troels M 1
Truls M 1
Tugba F 1
Tugce F 1
Tuomo M 1
Turgay M 1
Tuukka M 1
Tuuli F 1
Tymofiy M 1
Uduak M 1
Ulisse M 1
Vahit M 1
Valentyn M 1
Valeriu M 1
Valtteri M 1
Varghese M 1
Varuna M 1
Vasant M 1
Vaska M 1
Veerle F 1
Veljko M 1


 30%|██████████▋                         | 6333/21336 [00:05<00:12, 1230.44it/s]

Venkata M 1
Vetle M 1
Vicenc M 1
Vihtori M 1
Viivi F 1
Viktoriia F 1
Vilde F 1
Vilhelm M 1
Viliam M 1
Viljem M 1
Vilmar M 1
Virgile M 1
Vitalik M 1
Vitezslav M 1
Vladan M 1
Vojtech M 1
Volkmar M 1
Vugar M 1
Wadud M 1
Wahida F 1
Werther M 1
Willemijn F 1


 34%|████████████▏                       | 7214/21336 [00:05<00:11, 1246.30it/s]

Yahui F 1
Yakup M 1
Yating F 1
Yauhen M 1
Yawen F 1
Yelyzaveta F 1
Yeong F 1
Yevgen M 1
Yevhen M 1
Yevheniya F 1
Yiftach M 1
Yijun M 1


 36%|████████████▊                       | 7591/21336 [00:06<00:11, 1247.88it/s]

Yngve M 1
Yngvi M 1
Ysbrand M 1


 37%|█████████████▍                      | 7968/21336 [00:06<00:10, 1247.65it/s]

Yudai M 1
Zbynek M 1
Zedong M 1
Zhaleh F 1


 40%|██████████████▎                     | 8472/21336 [00:06<00:10, 1251.95it/s]

Zhihao M 1
Ziaeddin M 1
Ziynet F 1
Zohreh F 1
Zsigmond M 1
Zsombor M 1
Zurab M 1


 44%|███████████████▊                    | 9361/21336 [00:07<00:09, 1262.70it/s]

Harkaitz M 1
Hideaki M 1
Hlynur M 1
Hormoz M 1
Ildar M 1
Iulian M 1
Ivaylo M 1
Iztok M 1
Jagadish M 1
Harshad M 1
Horatiu M 1
Iacopo M 1
Ignas M 1
Ignasi M 1
Ilshat M 1
Indrajit M 1
Ionut M 1


 46%|████████████████▍                   | 9739/21336 [00:07<00:09, 1239.92it/s]

Hauke M 1
Ionel M 1
Jagannath M 1
Hippolyte F 1
Hirohito M 1
Hasse M 1
Hermanni M 1
Husnu M 1
Hibiki M 1
Horia M 1
Huang M 1
Iciar F 1
Iveta F 1
Idoia F 1
Intan F 1
Ionela F 1
Heidrun F 1
Hulya F 1
Ingunn F 1
Jagna F 1
Iiris F 1
Henrike F 1


100%|███████████████████████████████████| 21336/21336 [00:08<00:00, 2650.38it/s]


In [189]:

# Sort the data so that no-retrieved data is first
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
# Update the names-genders csv
names_genders.to_csv('data/names_genders.csv', index = False)

# Pull the genders from Genderize.io API

In [35]:
# A helper function to update data from APIs, etc

def update_names_table(names_genders, limit = 500):
    
    try: 
        # For-loop querying the genderize.io API
        for i in tqdm(range(limit)):

            # Extract the name
            first_name = names_genders.loc[i, 'first_name'] # first name

            # Check if the name has already been checked
            ## Query only if the name hasn't been checked already
            if names_genders.loc[i, 'prob'] >= 0 and names_genders.loc[i, 'prob'] <= 1:
                pass

            else:
                # Try querying the Genderize.io API
                gender_info = Genderize().get([first_name])
                names_genders.loc[i, 'gender'] = gender_info[0]['gender']
                names_genders.loc[i,'prob'] = gender_info[0]['probability']

    except:
        print(f'Iteration nr {i}')
        print('Limit likely exceeded.')

     # Sort the data so that no-retrieved data is first
    names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
    # Write to csv once data is pulled
    names_genders.to_csv('data/names_genders.csv', index = False)

In [36]:
# Run the function that querys names from Genderize.io
update_names_table(names_genders, 200)

 94%|██████████████████████████████████████▎  | 187/200 [03:51<00:16,  1.24s/it]

Iteration nr 187
Limit likely exceeded.





In [117]:
# A sloppy solution for the Chinese Names data:

# Clone the Github repo
!git clone https://github.com/wainshine/Chinese-Names-Corpus

# Import the data
chinese_names = pd.read_table('./Chinese-Names-Corpus/English_Names_Corpus/English_Cn_Name_Corpus_Gender（48W）.txt',
                             sep = '|',
                            names=['name_chi', 'name_eng', 'gender']
                         )

# Remove rows that do not include alphabetical letters
chinese_names = chinese_names[chinese_names['name_eng'].str.contains('[A-Za-z]', na=False)].reset_index(drop = True)
chinese_names = chinese_names[~(chinese_names['name_eng'].str.len() < 4)].reset_index(drop = True)
chinese_names['name_eng'] = chinese_names['name_eng'].str.replace(' ', '')

# Drop duplicates
chinese_names = chinese_names.drop_duplicates('name_eng').reset_index(drop = True)

# # Select only the English version and gender
chinese_names = chinese_names[['name_eng', 'gender']]

# Save to csv
chinese_names.to_csv('chinese_names.csv', index = False)

# Remove the directory, as it is not needed anymore
!rm -rf Chinese-Names-Corpus

Cloning into 'Chinese-Names-Corpus'...
remote: Enumerating objects: 266, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 266 (delta 26), reused 44 (delta 25), pack-reused 213[K
Receiving objects: 100% (266/266), 35.93 MiB | 7.47 MiB/s, done.
Resolving deltas: 100% (141/141), done.
