# Data Retrieval Scripts
I combined data from several sources and fixed some rows manually. The sources are listed below:

- UCI Name-Gender Dataset: https://archive.ics.uci.edu/ml/datasets/Gender+by+Name
    - Downloaded the dataset and matched it against the names in my own dataset. If a match was found, prob was set to 1.

- Name lists from GitHub repos:
    - Chinese-English Names-Genders Corpus: https://raw.githubusercontent.com/wainshine/Chinese-Names-Corpus/master/English_Names_Corpus/English_Cn_Name_Corpus_Gender（48W）.txt
    - Indian names list: https://github.com/laxmimerit/indian-names-dataset
    - Arabic names list: https://github.com/zakahmad/ArabicNameGenderFinder

- Behind the Name Downloadable Dataset: https://www.behindthename.com/api/

- Genderize.io: https://genderize.io/
    - Names are queried one at a time. There is a daily limit of approx. 1000 queries per IP address.
    - See also: https://github.com/jholtmann/genderize_csv

In [None]:
# Install the third-party packages imported below (IPython shell commands)
!pip install genderize
!pip install unidecode

In [1]:
import pandas as pd
from tqdm import tqdm
from unidecode import unidecode
from genderize import Genderize # Gender API

In [8]:
# Load the names table and order it so that rows still lacking a gender come first
## The same ordering is re-applied every time the file is read in
names_genders = (
    pd.read_csv('data/names_genders.csv', sep=',')
    .sort_values(
        by=['gender', 'prob', 'first_name'],
        ascending=[0, 0, 1],
        na_position='first',
    )
    .reset_index(drop=True)
)
print(f"Number of names uncategorized: {names_genders['gender'].isna().sum()}")
names_genders.head(10)

Number of names uncategorized: 3967


Unnamed: 0,first_name,alph_value,gender,prob
0,Timyoas,T,,
1,TingAn,T,,
2,TingChun,T,,
3,TingHan,T,,
4,TingKuo,T,,
5,TingWai,T,,
6,TingWu,T,,
7,TingYu,T,,
8,Tingdi,T,,
9,Tingfung,T,,


# Match the data with external data

In [None]:
def search_gender_from_data(names_genders, ext_data, ext_firstname, ext_gender):
    """Fill in missing genders by matching first names against an external dataset.

    Names are compared case-insensitively. For every row of `names_genders`
    whose 'gender' is not already 'M' or 'F' and whose name is found in
    `ext_data`, 'gender' is set to the first character of the external gender
    value and 'prob' is set to 1. Each match is printed.

    `names_genders` is updated in place; `ext_data` is left untouched.
    Nothing is returned.

    Args:
        names_genders (pd.DataFrame): table with 'first_name', 'gender' and
            'prob' columns. Rows are addressed with `.loc[0..n-1]`, so the
            frame must carry a default 0..n-1 index (e.g. after
            `reset_index(drop=True)`).
        ext_data (pd.DataFrame): external dataset to be used for matching
        ext_firstname (str): name of the 'first_name' column in the external dataset
        ext_gender (str): name of the 'gender' column in the external dataset
    """
    # Lowercase into a separate Series so the caller's frame is not modified
    ext_names_lower = ext_data[ext_firstname].str.lower()

    # Build the name -> gender lookup once instead of scanning the external
    # frame for every row. When a name occurs several times, the first
    # occurrence wins.
    gender_by_name = {}
    for ext_name, ext_value in zip(ext_names_lower, ext_data[ext_gender]):
        if isinstance(ext_name, str) and ext_name not in gender_by_name:
            gender_by_name[ext_name] = ext_value

    for i in tqdm(range(len(names_genders))):

        # If gender is already marked, ignore the row
        if names_genders.loc[i, 'gender'] in ('M', 'F'):
            continue

        # Missing (NaN) or non-text names cannot be matched
        raw_name = names_genders.loc[i, 'first_name']
        if not isinstance(raw_name, str):
            continue
        firstname = raw_name.lower()

        # No usable match: name absent, or its external gender is missing/empty
        matched = gender_by_name.get(firstname)
        if not isinstance(matched, str) or not matched:
            continue

        gender = matched[0]  # keep only the first character of the external value
        print(firstname, gender, 1)
        names_genders.loc[i, 'gender'] = gender
        names_genders.loc[i, 'prob'] = 1  # an exact match is recorded with prob 1

### Use the Chinese data to fetch names

In [None]:
# Load the prepared Chinese/English names table (it is built in the retrieval section further down)
chinese_names = pd.read_csv('data/chinese_names.csv')

# Fill in genders for the names found in that table
search_gender_from_data(
    names_genders,
    chinese_names,
    ext_firstname='name_eng',
    ext_gender='gender',
)

# Re-order so that the rows still without a gender come first
names_genders = names_genders.sort_values(
    by=['gender', 'prob', 'first_name'],
    ascending=[0, 0, 1],
    na_position='first',
)
names_genders = names_genders.reset_index(drop=True)

# Persist the updated table
names_genders.to_csv('data/names_genders.csv', index=False)

### Use the UCI data

In [None]:
# NOTE: uncommenting the two lines below disables HTTPS certificate
# verification for the download (presumably a workaround for SSL errors --
# use with care).
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# Download the UCI name-gender dataset directly from the archive
uci_names = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00591/name_gender_dataset.csv')

# Save a csv version to the repo
# (fixed: the method is `to_csv`; `o_csv` raised AttributeError)
uci_names.to_csv('data/uci_names.csv', index = False)

# Run the function
search_gender_from_data(names_genders, uci_names, 'Name', 'Gender')

# Sort the data so that rows without a retrieved gender come first
names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'], 
                                               ascending = [0,0,1], 
                                               na_position= 'first').reset_index(drop = True)
# Update the names-genders csv
names_genders.to_csv('data/names_genders.csv', index = False)

### Use the Behind the Name data

In [None]:
# Load the Behind the Name given-names file
btn_names = pd.read_table('data/btn_givennames.txt')

# Clean-up: transliterate to ASCII, keep names that contain a Latin letter,
# drop names shorter than 4 characters, then strip spaces and uppercase the gender
btn_names['name'] = btn_names['name'].apply(unidecode)
btn_names = (
    btn_names
    .loc[lambda df: df['name'].str.contains('[A-Za-z]', na=False)]
    .reset_index(drop=True)
    .loc[lambda df: ~(df['name'].str.len() < 4)]
    .reset_index(drop=True)
)
btn_names['name'] = btn_names['name'].str.replace(' ', '')
btn_names['gender'] = btn_names['gender'].str.upper()

# Fill in genders for the names found in this dataset
search_gender_from_data(
    names_genders,
    btn_names,
    ext_firstname='name',
    ext_gender='gender',
)

# Re-order so that the rows still without a gender come first
names_genders = names_genders.sort_values(
    by=['gender', 'prob', 'first_name'],
    ascending=[0, 0, 1],
    na_position='first',
)
names_genders = names_genders.reset_index(drop=True)

# Persist the updated table
names_genders.to_csv('data/names_genders.csv', index=False)

### Use the Indian Names Data

In [None]:
# Load the prepared Indian names table
ind_names = pd.read_csv('data/indian_names.csv')

# Fill in genders for the names found in that table
search_gender_from_data(
    names_genders,
    ind_names,
    ext_firstname='name',
    ext_gender='gender',
)

# Re-order so that the rows still without a gender come first
names_genders = names_genders.sort_values(
    by=['gender', 'prob', 'first_name'],
    ascending=[0, 0, 1],
    na_position='first',
)
names_genders = names_genders.reset_index(drop=True)

# Persist the updated table
names_genders.to_csv('data/names_genders.csv', index=False)

### Use the Arabic Names Data

In [None]:
# Load the prepared Arabic names table
ar_names = pd.read_csv('data/arabic_names.csv')

# Fill in genders for the names found in that table
search_gender_from_data(
    names_genders,
    ar_names,
    ext_firstname='Name',
    ext_gender='Gender',
)

# Re-order so that the rows still without a gender come first
names_genders = names_genders.sort_values(
    by=['gender', 'prob', 'first_name'],
    ascending=[0, 0, 1],
    na_position='first',
)
names_genders = names_genders.reset_index(drop=True)

# Persist the updated table
names_genders.to_csv('data/names_genders.csv', index=False)

# Pull the data from Genderize.io API

In [3]:
# A helper function to update data from Genderize.io API

def update_names_table(names_genders, limit = 1000):
    """Query Genderize.io for rows that have no probability yet, then sort and save.

    The first `limit` rows (at most the whole table) are visited in order.
    A row whose 'prob' already lies in [0, 1] is skipped; any other row is
    looked up by its 'first_name', and the returned 'gender' and
    'probability' are written into 'gender' and 'prob'. If a query fails,
    the loop stops and the progress made so far is still saved.

    Args:
        names_genders (pd.DataFrame): table with 'first_name', 'gender' and
            'prob' columns and a default 0..n-1 index (rows are addressed
            with `.loc[i]`). Looked-up values are written into it in place.
        limit (int): maximum number of leading rows to visit.

    Returns:
        pd.DataFrame: the sorted table that was written to
        'data/names_genders.csv'. The frame passed in is updated but NOT
        re-sorted, so use the return value (or re-read the csv) to continue.
    """
    # Never index past the end of the table; a KeyError there would be
    # misreported below as an exceeded API limit.
    n_rows = min(limit, len(names_genders))

    i = None  # keeps the progress message valid even if nothing was iterated
    try:
        # One client is enough for all queries
        client = Genderize()

        for i in tqdm(range(n_rows)):

            # Query only if the name hasn't been checked already
            # (NaN fails both comparisons, so unchecked rows fall through)
            prob = names_genders.loc[i, 'prob']
            if prob >= 0 and prob <= 1:
                continue

            first_name = names_genders.loc[i, 'first_name']
            gender_info = client.get([first_name])
            names_genders.loc[i, 'gender'] = gender_info[0]['gender']
            names_genders.loc[i, 'prob'] = gender_info[0]['probability']

    # Best effort: on any failure (or a manual interrupt) stop querying but
    # still save what was retrieved. The actual error is shown so that a
    # non-quota problem is not mistaken for the daily limit.
    except (Exception, KeyboardInterrupt) as err:
        print(f'Iteration nr {i}')
        print('Limit likely exceeded.')
        print(f'Reason: {err!r}')

    # Sort the data so that rows without retrieved data come first
    names_genders = names_genders.sort_values(['gender', 'prob', 'first_name'],
                                              ascending = [0,0,1],
                                              na_position= 'first').reset_index(drop = True)
    # Write to csv once data is pulled
    names_genders.to_csv('data/names_genders.csv', index = False)

    return names_genders

In [11]:
# Query Genderize.io for up to the first 1000 rows of the table
update_names_table(names_genders, limit=1000)

100%|███████████████████████████████████████| 1000/1000 [19:21<00:00,  1.16s/it]


## Get the datasets from GitHub repos: the sloppy way

### Chinese Names Data

In [None]:
# Clone the Github repo
!git clone https://github.com/wainshine/Chinese-Names-Corpus

# Import the data
chinese_names = pd.read_table('./Chinese-Names-Corpus/English_Names_Corpus/English_Cn_Name_Corpus_Gender（48W）.txt',
                             sep = '|',
                            names=['name_chi', 'name_eng', 'gender']
                         )

# Remove rows that do not include alphabetical letters
chinese_names = chinese_names[chinese_names['name_eng'].str.contains('[A-Za-z]', na=False)].reset_index(drop = True)
chinese_names = chinese_names[~(chinese_names['name_eng'].str.len() < 4)].reset_index(drop = True)
chinese_names['name_eng'] = chinese_names['name_eng'].str.replace(' ', '')

# Drop duplicates
chinese_names = chinese_names.drop_duplicates('name_eng').reset_index(drop = True)

# # Select only the English version and gender
chinese_names = chinese_names[['name_eng', 'gender']]

# Save to csv
chinese_names.to_csv('data/chinese_names.csv', index = False)

# Remove the directory, as it is not needed anymore
!rm -rf Chinese-Names-Corpus

### Indian Names data

In [None]:
# Clone the Github repo
!git clone https://github.com/laxmimerit/indian-names-dataset
    
# Data import
ind_males = pd.read_csv('indian-names-dataset/Indian-Male-Names.csv')
ind_females = pd.read_csv('indian-names-dataset/Indian-Female-Names.csv')
ind_names = pd.concat([ind_males,ind_females], ignore_index=True)

# Remove rows that do not include alphabetical letters
ind_names = ind_names[ind_names['name'].str.contains('[A-Za-z]', na=False)].reset_index(drop = True)
ind_names = ind_names[~(ind_names['name'].str.len() < 4)].reset_index(drop = True)
ind_names['name'] = ind_names['name'].str.replace(' ', '')

# Drop duplicates
ind_names = ind_names.drop_duplicates('name').reset_index(drop = True)

# Uppercase the gender values
ind_names['gender']= ind_names['gender'].str.upper()

# Pick only name and gender
ind_names = ind_names[['name', 'gender']]

# Save to csv
ind_names.to_csv('data/indian_names.csv', index = False)

# Remove the directory, as it is not needed anymore
!rm -rf indian-names-dataset

### Arabic Names Data

In [None]:
!git clone https://github.com/zakahmad/ArabicNameGenderFinder
    
# Data import
ar_males = pd.read_csv('ArabicNameGenderFinder/males_en.csv')
ar_females = pd.read_csv('ArabicNameGenderFinder/females_en.csv')
ar_names = pd.concat([ar_males, ar_females], ignore_index=True)

# Remove rows that do not include alphabetical letters
ar_names = ar_names[ar_names['Name'].str.contains('[A-Za-z]', na=False)].reset_index(drop = True)
ar_names = ar_names[~(ar_names['Name'].str.len() < 4)].reset_index(drop = True)
ar_names['Name'] = ar_names['Name'].str.replace(' ', '')

# Drop duplicates
ar_names = ar_names.drop_duplicates('Name').reset_index(drop = True)

# Uppercase the gender values
ar_names['Gender']= ar_names['Gender'].str[:1]

# Save to csv
ar_names.to_csv('data/arabic_names.csv', index = False)

# Remove the directory, as it is not needed anymore
!rm -rf ArabicNameGenderFinder