# Data Retrieval Scripts
I combined data from several sources + fixed some rows manually. Here are links to the sources:

- UCI Name-Gender Dataset: https://archive.ics.uci.edu/ml/datasets/Gender+by+Name
    - Downloaded the dataset and matched the names I had in my own dataset. If match found, prob = 1.
- Chinese-English Names-Genders Corpus: https://raw.githubusercontent.com/wainshine/Chinese-Names-Corpus/master/English_Names_Corpus/English_Cn_Name_Corpus_Gender（48W）.txt

- Genderize.io: https://genderize.io/
    - Querying names one at a time. There is a daily limit to appx 1500 queries per IP-address.

In [66]:
!pip install genderize
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [67]:
import pandas as pd
from tqdm import tqdm
from unidecode import unidecode
from genderize import Genderize # Gender API

In [171]:
# Import the df
names_genders = pd.read_csv('data/names_genders.csv', sep = ',')

# Sort to have the NAs for updating ready
## This sort is performed every time when the data is read in
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                          ascending = [0,0,1], 
                          na_position= 'first').reset_index(drop = True)
print(f"Number of names uncategorized: {names_genders['gender'].isnull().sum()}")
names_genders.head()

Number of names uncategorized: 9189


Unnamed: 0,first_name,alph_value,gender,prob
0,JanikVasily,J,,
1,JannFrederik,J,,
2,Jannatun,J,,
3,Jannes,J,,
4,Janniele,J,,


# Match the data with external data

In [172]:
def search_gender_from_data(names_genders, ext_data, ext_firstname, ext_gender):
    """Match the names with an external dataset and retrieve gender
    Args:
        ext_data (pd.DataFrame): external dataset to be used for matching
        ext_firstname (str): name of the 'first_name' column in the external dataset
        ext_gender (str): name of the 'gender' column in the external dataset
    """
    
    # Search for names from the UCI name data set
    for i in tqdm(range(len(names_genders))):
        
        # If gender is already marked, ignore the row
        if names_genders.loc[i, 'gender'] == 'M' or names_genders.loc[i, 'gender'] == 'F':
                pass
        else:
            # Extract the name and letter
            firstname = names_genders.loc[i, 'first_name']

            # Search in a subset of the externalm dataset
            idx = ext_data[ext_data[ext_firstname] == firstname].index

            # If no index found, no name -> do nothing
            if len(idx) == 0:
                pass
            
            else:
                idx = idx.values[0]
              #  print(firstname, ext_data.loc[idx, ext_gender], 1)
                names_genders.loc[i, 'gender'] = ext_data.loc[idx, ext_gender] # get the gender
                names_genders.loc[i, 'prob'] = 1 # set prob to 1

### Use the Chinese data to fetch names

In [None]:
# Import the data (see below for data retrieval)
chinese_names = pd.read_csv('chinese_names.csv')

# Run the function
search_gender_from_data(names_genders, chinese_names, 'name_eng', 'gender')

# Sort the data so that no-retrieved data is first
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
# Update the names-genders csv
names_genders.to_csv('data/names_genders.csv', index = False)

# Remove the Chinese names tables
! rm chinese_names.csv

### Use the UCI data

In [169]:
# Uncomment the below code
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# Import the data (see below for data retrieval)
uci_names = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00591/name_gender_dataset.csv')

# Run the function
search_gender_from_data(names_genders, uci_names, 'Name', 'Gender')

# Sort the data so that no-retrieved data is first
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
# Update the names-genders csv
names_genders.to_csv('data/names_genders.csv', index = False)

  0%|                                       | 45/21336 [00:00<01:36, 221.39it/s]

Jaromir M 1
Jaroslaw M 1


  3%|█▏                                    | 678/21336 [00:03<01:33, 221.64it/s]

Jorgen M 1
Jozsef M 1


  4%|█▌                                    | 883/21336 [00:04<01:33, 219.55it/s]

Jurrien M 1


  5%|█▊                                   | 1019/21336 [00:04<01:31, 221.79it/s]

Karoly M 1
Katarina F 1


  6%|██▏                                  | 1295/21336 [00:05<01:30, 222.26it/s]

Kubra F 1


  7%|██▌                                  | 1456/21336 [00:06<01:29, 221.21it/s]

Laurene F 1


  7%|██▋                                  | 1569/21336 [00:07<01:30, 218.48it/s]

Lidia F 1


  8%|███                                  | 1770/21336 [00:08<01:29, 217.61it/s]

Lubomira F 1


  9%|███▎                                 | 1880/21336 [00:08<01:28, 218.73it/s]

Mairead F 1


 10%|███▌                                 | 2041/21336 [00:09<01:27, 219.50it/s]

Matej M 1
Matias M 1
Matyas M 1


 10%|███▊                                 | 2201/21336 [00:10<01:27, 219.93it/s]

Mickael M 1
Mieczyslaw M 1
Mikolaj M 1


 11%|████                                 | 2362/21336 [00:10<01:25, 222.32it/s]

Miroslaw M 1


 12%|████▎                                | 2454/21336 [00:11<01:25, 221.84it/s]

Monika F 1


 13%|████▋                                | 2704/21336 [00:12<01:25, 218.25it/s]

Nathanaelle F 1
Nazim M 1
Nebojsa M 1


 13%|████▉                                | 2838/21336 [00:12<01:24, 219.69it/s]

Noelie F 1
Noemie F 1


 14%|█████                                | 2907/21336 [00:13<01:23, 221.56it/s]

Nuria F 1


 14%|█████▏                               | 2976/21336 [00:13<01:23, 220.60it/s]

Otavio M 1
Paivi F 1


 15%|█████▌                               | 3228/21336 [00:14<01:21, 221.52it/s]

Plinio M 1


 17%|██████▏                              | 3572/21336 [00:16<01:20, 220.02it/s]

Rafal M 1
Raiza F 1


 17%|██████▎                              | 3641/21336 [00:16<01:19, 221.99it/s]

Ramunas M 1
Raphaelle F 1


 17%|██████▍                              | 3710/21336 [00:16<01:19, 220.77it/s]

Rejean M 1


 18%|██████▊                              | 3894/21336 [00:17<01:19, 220.22it/s]

Ruairi M 1
Ruben M 1


 20%|███████▍                             | 4262/21336 [00:19<01:17, 219.45it/s]

Savas M 1


 20%|███████▌                             | 4328/21336 [00:19<01:17, 218.13it/s]

Senen M 1


 23%|████████▋                            | 4993/21336 [00:22<01:13, 221.91it/s]

Simao M 1


 24%|████████▊                            | 5108/21336 [00:23<01:13, 221.21it/s]

Songul F 1


 24%|█████████                            | 5223/21336 [00:23<01:12, 222.75it/s]

Stefane F 1


 25%|█████████▎                           | 5337/21336 [00:24<01:13, 217.84it/s]

Sumeyra F 1


 27%|██████████                           | 5767/21336 [00:26<01:10, 220.43it/s]

Thorbjorn M 1


 27%|██████████                           | 5834/21336 [00:26<01:10, 218.51it/s]

Timea F 1


 29%|██████████▌                          | 6082/21336 [00:27<01:10, 217.87it/s]

Vache M 1


 29%|██████████▊                          | 6220/21336 [00:28<01:07, 222.49it/s]

Viktoria F 1
Vitor M 1


 31%|███████████▎                         | 6534/21336 [00:29<01:08, 217.05it/s]

Wlodzimierz M 1


 38%|█████████████▉                       | 8030/21336 [00:36<01:00, 220.50it/s]

Zalan M 1


 45%|████████████████▍                    | 9498/21336 [00:43<00:53, 221.19it/s]

Inigo M 1


 45%|████████████████▋                    | 9590/21336 [00:43<00:53, 220.43it/s]

Inaki M 1


 45%|████████████████▋                    | 9636/21336 [00:43<00:53, 219.57it/s]

Havard M 1


 46%|█████████████████                    | 9863/21336 [00:44<00:52, 217.23it/s]

Ildiko F 1


100%|████████████████████████████████████| 21336/21336 [00:45<00:00, 471.81it/s]


# Pull the genders from Genderize.io API

In [35]:
# A helper function to update data from APIs, etc

def update_names_table(names_genders, limit = 500):
    
    try: 
        # For-loop querying the genderize.io API
        for i in tqdm(range(limit)):

            # Extract the name
            first_name = names_genders.loc[i, 'first_name'] # first name

            # Check if the name has already been checked
            ## Query only if the name hasn't been checked already
            if names_genders.loc[i, 'prob'] >= 0 and names_genders.loc[i, 'prob'] <= 1:
                pass

            else:
                # Try querying the Genderize.io API
                gender_info = Genderize().get([first_name])
                names_genders.loc[i, 'gender'] = gender_info[0]['gender']
                names_genders.loc[i,'prob'] = gender_info[0]['probability']

    except:
        print(f'Iteration nr {i}')
        print('Limit likely exceeded.')

     # Sort the data so that no-retrieved data is first
    names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
    # Write to csv once data is pulled
    names_genders.to_csv('data/names_genders.csv', index = False)

In [36]:
# Run the function that querys names from Genderize.io
update_names_table(names_genders, 200)

 94%|██████████████████████████████████████▎  | 187/200 [03:51<00:16,  1.24s/it]

Iteration nr 187
Limit likely exceeded.





In [117]:
# A sloppy solution for the Chinese Names data:

# Clone the Github repo
!git clone https://github.com/wainshine/Chinese-Names-Corpus

# Import the data
chinese_names = pd.read_table('./Chinese-Names-Corpus/English_Names_Corpus/English_Cn_Name_Corpus_Gender（48W）.txt',
                             sep = '|',
                            names=['name_chi', 'name_eng', 'gender']
                         )

# Remove rows that do not include alphabetical letters
chinese_names = chinese_names[chinese_names['name_eng'].str.contains('[A-Za-z]', na=False)].reset_index(drop = True)
chinese_names = chinese_names[~(chinese_names['name_eng'].str.len() < 4)].reset_index(drop = True)
chinese_names['name_eng'] = chinese_names['name_eng'].str.replace(' ', '')

# Drop duplicates
chinese_names = chinese_names.drop_duplicates('name_eng').reset_index(drop = True)

# # Select only the English version and gender
chinese_names = chinese_names[['name_eng', 'gender']]

# Save to csv
chinese_names.to_csv('chinese_names.csv', index = False)

# Remove the directory, as it is not needed anymore
!rm -rf Chinese-Names-Corpus

Cloning into 'Chinese-Names-Corpus'...
remote: Enumerating objects: 266, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 266 (delta 26), reused 44 (delta 25), pack-reused 213[K
Receiving objects: 100% (266/266), 35.93 MiB | 7.47 MiB/s, done.
Resolving deltas: 100% (141/141), done.
