# Prepare Male and Female Names from GenderComputer

## 1. Prepare Unique Male and Female Names

This data is used to obtain 2 csv files:
* **a list of unique male names**, saved as `../male_names_only.csv` : a list of male names such that for each male name, it doesn't exist as a female name in any country
* **a list of unique female names**, saved as `../female_names_only.csv` : a list of female names such that for each female name, it doesn't exist as a male name in any country

In [20]:
from genderComputer import GenderComputer
gc = GenderComputer()

name_to_gender = {}
for country in gc.nameLists.keys():
    for gender, names in gc.nameLists[country].items():
        for name in names:
            if name in name_to_gender and  name_to_gender[name] != gender:
                name_to_gender[name] = 'invalid'
            else:
                name_to_gender[name] = gender
                
result = [(name, gender) for name, gender in name_to_gender.items() if gender != 'invalid']

Loaded dictionary from /Users/mhilmiasyrofi/Documents/FairnessTestInSA/asset/gender_computer/genderComputer/nameLists/gender.dict
Finished initialization


In [21]:
# result

In [22]:
male_name = [name for name, gender in name_to_gender.items() if gender == 'male']
len(male_name)

51518

In [23]:
female_name = [ name for name, gender in name_to_gender.items() if gender == 'female']
len(female_name)

56112

In [24]:
import pandas as pd
# male_name

In [25]:
dfm = pd.DataFrame(data={"name": male_name})

In [26]:
dfm

Unnamed: 0,name
0,arsalakhan
1,aymal
2,babrack
3,jelander
4,adil
...,...
51513,wubbo
51514,ybele
51515,ygram
51516,ype


#### Check the name

In [27]:
"palona" in dfm["name"].tolist()
"danyar" in dfm["name"].tolist()

True

In [28]:
dff = pd.DataFrame(data={"name": female_name})
dff

Unnamed: 0,name
0,kawatara
1,spogmai
2,torpekai
3,dlia
4,justina
...,...
56107,ypje
56108,ytje
56109,ytske
56110,yttje


#### Check the name

In [29]:
"sveya" in dff["name"].tolist()

True

#### Save the names into csv files

In [30]:
dfm.to_csv("../male_names_only.csv", index=False)
dff.to_csv("../female_names_only.csv", index=False)

## 2. Get Unique Name by Country

This data is used to obtain 2 csv files:
* **a list of unique male names that is unique for each country**, saved as `../unique_male_names_and_country.csv` : a list of male names such that (1) for each male name it doesn't exist as a female name in any country and (2) for each male name, it's unique for each country
* **a list of unique female names that is unique for each country**, saved as `../unique_female_names_and_country.csv` : a list of female names such that (1) for each female name, it doesn't exist as a male name in any country  and (2) for each female name, it's unique for each country

In [31]:
name_to_country = {}
for country in gc.nameLists.keys():
    for gender, names in gc.nameLists[country].items():
        for name in names:
            if name in name_to_country and  name_to_country[name] != country:
                name_to_country[name] = 'invalid'
            else:
                name_to_country[name] = country
                
result = [(name, country) for name, country in name_to_country.items() if country != 'invalid']

In [32]:
df = pd.DataFrame(data=result, columns=["name", "country"])

In [33]:
df

Unnamed: 0,name,country
0,arsalakhan,Afghanistan
1,aymal,Afghanistan
2,babrack,Afghanistan
3,jelander,Afghanistan
4,kawatara,Afghanistan
...,...,...
90041,ypje,The Netherlands
90042,ytje,The Netherlands
90043,ytske,The Netherlands
90044,yttje,The Netherlands


In [34]:
df.to_csv("../unique_name_country.csv", index=None)

In [35]:
df = pd.read_csv("../unique_name_country.csv")

# gcm = pd.read_csv("../../data/gc_name/male_names_only.csv")
gcm = pd.read_csv("../male_names_only.csv")
gcm = gcm.sample(frac=1, random_state=123)
mnames = gcm["name"].tolist()# # names from GC
# gcf = pd.read_csv("../../data/gc_name/female_names_only.csv")
gcf = pd.read_csv("../female_names_only.csv")
gcf = gcf.sample(frac=1, random_state=123)
fnames = gcf["name"].tolist()# # names from GC


df = df.sample(frac=1, random_state=12345)
n = 1
gb = df.groupby("country")
unique_male_names = []
unique_female_names = []
countries = []
for k in gb.groups.keys() :
    d = gb.get_group(k)
    male = ""
    female = ""
#     print(len(d["name"].tolist()))
#     print(d["name"].tolist()[0])
    for name in d["name"].tolist():
#         if "-" not in name and " " not in name and "\\" not in name:
        if male == "" :
            if name in mnames :
                male = name
        if female == "" :
            if name in fnames :
                female = name
        if male != "" and female != "" :
            unique_male_names.append(male.title())
            unique_female_names.append(female.title())
            countries.append(k)
            break

In [36]:
print(unique_male_names)

['Jelander', 'Christop', 'Fabijano', 'Mourão', 'Kerigan', 'Vendelín', 'Ville-Veikko', 'Kotsos', 'Néci', 'Jivavijaya', 'Keyghobad', 'Airiz', 'Tsahi', '\ufeffFrancesco', 'Verners', 'Karstein', 'Albercik', 'Lascăr', 'Matevž', 'Waabberi', 'Juan Luis', 'Vide', 'Harrit', 'Yardan\xa0', 'Theoden', 'Eldridge', 'Budivid']


In [37]:
df_unique_male_names = pd.DataFrame(data={"name": unique_male_names, "country": countries})
df_unique_male_names.to_csv("../unique_male_names_and_country.csv")
df_unique_male_names 

Unnamed: 0,name,country
0,Jelander,Afghanistan
1,Christop,Australia
2,Fabijano,Belgium
3,Mourão,Brazil
4,Kerigan,Canada
5,Vendelín,Czech
6,Ville-Veikko,Finland
7,Kotsos,Greece
8,Néci,Hungary
9,Jivavijaya,India


In [38]:
df_unique_female_names = pd.DataFrame(data={"name": unique_female_names, "country": countries})
df_unique_female_names.to_csv("../unique_female_names_and_country.csv")
df_unique_female_names 

Unnamed: 0,name,country
0,Spogmai,Afghanistan
1,Kimberle,Australia
2,Manyak,Belgium
3,Tainaçã,Brazil
4,Toniko,Canada
5,Bertička,Czech
6,Josefiina,Finland
7,Pitsa,Greece
8,Bske,Hungary
9,Humaila,India
