# Prepare Male and Female Names from GenderComputer

## 1. Prepare Unique Male and Female Names

This section uses the GenderComputer name lists to produce 2 CSV files:
* **a list of unique male names**, saved as `../male_names_only.csv`: male names that are not listed as a female name in any country
* **a list of unique female names**, saved as `../female_names_only.csv`: female names that are not listed as a male name in any country

In both files, names containing any character other than the lowercase ASCII letters `a`–`z` are dropped.

In [23]:
from genderComputer import GenderComputer
# Load the GenderComputer name lists (country -> gender -> names).
gc = GenderComputer()

# Map every name to the single gender it is listed under; a name listed under
# two different genders (in any country) is flagged with the 'invalid' marker.
name_to_gender = {}
for country, names_by_gender in gc.nameLists.items():
    for gender, names in names_by_gender.items():
        for name in names:
            seen_gender = name_to_gender.get(name, gender)
            name_to_gender[name] = gender if seen_gender == gender else 'invalid'

# Keep only the (name, gender) pairs that were never flagged.
result = [pair for pair in name_to_gender.items() if pair[1] != 'invalid']

Loaded dictionary from /Users/mhilmiasyrofi/Documents/BiasFinder/asset/gender_computer/genderComputer/nameLists/gender.dict
Finished initialization


In [24]:
# result

In [25]:
# Names whose only recorded gender is 'male'; the count is echoed below.
male_name = [
    candidate
    for candidate, label in name_to_gender.items()
    if label == 'male'
]
len(male_name)

51518

In [26]:
# Names whose only recorded gender is 'female'; the count is echoed below.
female_name = [
    candidate
    for candidate, label in name_to_gender.items()
    if label == 'female'
]
len(female_name)

56112

In [27]:
import string

# Characters a name may contain: the 26 lowercase ASCII letters.
alphabet = string.ascii_lowercase


def isAlphabetOnly(name):
    """Return True when every character of *name* is a lowercase ASCII letter.

    An empty name contains no offending character, so it also yields True.
    """
    return all(char in alphabet for char in name)

# Keep only the names made up purely of lowercase ASCII letters.
alphabet_male_name = list(filter(isAlphabetOnly, male_name))
alphabet_female_name = list(filter(isAlphabetOnly, female_name))

# Report how many male, then female, names survive the filter.
print(len(alphabet_male_name))
print(len(alphabet_female_name))

42558
46148


In [28]:
# From here on, work with the letter-only name lists.
male_name, female_name = alphabet_male_name, alphabet_female_name

In [29]:
import pandas as pd

In [30]:
# One-column frame ("name") holding the filtered male names.
dfm = pd.DataFrame({"name": male_name})

In [31]:
# Echo the male-name frame for a quick visual inspection.
dfm

Unnamed: 0,name
0,arsalakhan
1,aymal
2,babrack
3,jelander
4,adil
...,...
42553,wubbo
42554,ybele
42555,ygram
42556,ype


#### Check the name

In [32]:
# Spot-check two names against the male list.
# NOTE(review): a notebook cell only echoes its last expression, so the
# "palona" check produces no visible output.
listed_male_names = dfm["name"].tolist()
"palona" in listed_male_names
"danyar" in listed_male_names

True

In [33]:
# One-column frame ("name") holding the filtered female names, echoed for
# inspection.
dff = pd.DataFrame({"name": female_name})
dff

Unnamed: 0,name
0,kawatara
1,spogmai
2,torpekai
3,dlia
4,justina
...,...
46143,ypje
46144,ytje
46145,ytske
46146,yttje


#### Check the name

In [34]:
# Spot-check one name against the female list.
"sveya" in dff["name"].tolist()

True

#### Save the names into csv files

In [35]:
# Persist both name lists; the row index is left out of the files.
for frame, destination in (
    (dfm, "../male_names_only.csv"),
    (dff, "../female_names_only.csv"),
):
    frame.to_csv(destination, index=False)

### 1b. Get Unique Names from a Country

In [36]:
# List the countries for which GenderComputer provides name lists.
print("Country List: ", gc.nameLists.keys())

Country List:  dict_keys(['Afghanistan', 'Albania', 'Australia', 'Belgium', 'Brazil', 'Canada', 'Czech', 'Finland', 'Greece', 'Hungary', 'India', 'Iran', 'Ireland', 'Israel', 'Italy', 'Latvia', 'Norway', 'Poland', 'Romania', 'Russia', 'Slovenia', 'Somalia', 'Spain', 'Sweden', 'Turkey', 'UK', 'Ukraine', 'USA', 'The Netherlands'])


**We choose to use names from the USA** (names that are listed for the USA and for no other country)

In [37]:
selected_country = "USA"

names_from_selected_country = set()
names_outside_selected_country = set()

# Route every name, regardless of gender, into the "selected" or "outside"
# pool depending on which country's lists it comes from.
for country, names_by_gender in gc.nameLists.items():
    if country == selected_country:
        pool = names_from_selected_country
    else:
        pool = names_outside_selected_country
    for names in names_by_gender.values():
        pool.update(names)

# Names listed for the selected country and for no other country.
exclusive_names = names_from_selected_country - names_outside_selected_country

# Restrict the gender-unique lists to those exclusive names, keeping order.
selected_male_name = [name for name in male_name if name in exclusive_names]
selected_female_name = [name for name in female_name if name in exclusive_names]

In [40]:
# Report the number of selected male names, then selected female names.
for selection in (selected_male_name, selected_female_name):
    print(len(selection))

64
644


In [41]:
# Wrap each selected list in a one-column frame ("name") ready for export.
selected_dfm, selected_dff = (
    pd.DataFrame({"name": chosen})
    for chosen in (selected_male_name, selected_female_name)
)

In [42]:
# Persist the country-specific name lists; the row index is left out.
for frame, destination in (
    (selected_dfm, "../male_names_only_USA.csv"),
    (selected_dff, "../female_names_only_USA.csv"),
):
    frame.to_csv(destination, index=False)

## 2. Get Unique Names by Country

This section produces 2 CSV files, each holding at most one name per country:
* **unique male names with their country**, saved as `../unique_male_names_and_country.csv`: male names such that (1) the name is not listed as a female name in any country and (2) the name is listed for only one country
* **unique female names with their country**, saved as `../unique_female_names_and_country.csv`: female names such that (1) the name is not listed as a male name in any country and (2) the name is listed for only one country

In [43]:
# Map every name to the one country that lists it; a name found in the lists
# of two different countries is flagged with the 'invalid' marker.
name_to_country = {}
for country, names_by_gender in gc.nameLists.items():
    for names in names_by_gender.values():
        for name in names:
            owner = name_to_country.setdefault(name, country)
            if owner != country:
                name_to_country[name] = 'invalid'

# Keep only the (name, country) pairs that belong to a single country.
result = [pair for pair in name_to_country.items() if pair[1] != 'invalid']

In [44]:
# Tabulate the single-country names as (name, country) rows.
df = pd.DataFrame(result, columns=["name", "country"])

In [45]:
# Echo the (name, country) frame for a quick visual inspection.
df

Unnamed: 0,name,country
0,arsalakhan,Afghanistan
1,aymal,Afghanistan
2,babrack,Afghanistan
3,jelander,Afghanistan
4,kawatara,Afghanistan
...,...,...
90041,ypje,The Netherlands
90042,ytje,The Netherlands
90043,ytske,The Netherlands
90044,yttje,The Netherlands


In [46]:
# Export the single-country (name, country) pairs without the row index.
# index=False is the documented boolean form and matches the other exports in
# this notebook; the previous index=None relied on None being treated as falsy.
df.to_csv("../unique_name_country.csv", index=False)

In [47]:
# Reload the single-country (name, country) pairs.
df = pd.read_csv("../unique_name_country.csv")

# Gender-unique names from GenderComputer, shuffled with a fixed seed.
gcm = pd.read_csv("../male_names_only.csv")
gcm = gcm.sample(frac=1, random_state=123)
mnames = gcm["name"].tolist()
gcf = pd.read_csv("../female_names_only.csv")
gcf = gcf.sample(frac=1, random_state=123)
fnames = gcf["name"].tolist()

# Every candidate name below is tested for membership; hashed lookups keep
# each test O(1) instead of scanning lists of tens of thousands of names.
male_lookup = set(mnames)
female_lookup = set(fnames)

# Shuffle with a fixed seed so the name picked per country is reproducible.
df = df.sample(frac=1, random_state=12345)
n = 1  # not used in this cell; kept in case another cell reads it
gb = df.groupby("country")
unique_male_names = []
unique_female_names = []
countries = []
for k in gb.groups.keys():
    d = gb.get_group(k)
    male = ""
    female = ""
    for name in d["name"].tolist():
        # Remember the first gender-unique male / female name encountered.
        if male == "" and name in male_lookup:
            male = name
        if female == "" and name in female_lookup:
            female = name
        # A country is recorded only once both a male and a female name are
        # found; countries lacking either one contribute nothing.
        if male != "" and female != "":
            unique_male_names.append(male.title())
            unique_female_names.append(female.title())
            countries.append(k)
            break

In [48]:
# Show the male name picked for each recorded country.
print(unique_male_names)

['Jelander', 'Christop', 'Fabijano', 'Odisseu', 'Kerigan', 'Kajinek', 'Kauko', 'Kotsos', 'Lenci', 'Jivavijaya', 'Keyghobad', 'Airiz', 'Tsahi', 'Verners', 'Karstein', 'Albercik', 'Oliviu', 'Matevz', 'Waabberi', 'Vide', 'Harrit', 'Pars', 'Theoden', 'Eldridge', 'Budivid']


In [49]:
# Pair each picked male name with its country, export the table and echo it.
# NOTE(review): to_csv is called without index=False here, unlike the other
# exports in this notebook, so the row index is presumably written as an
# extra column — confirm that downstream readers expect it.
male_country_columns = {"name": unique_male_names, "country": countries}
df_unique_male_names = pd.DataFrame(male_country_columns)
df_unique_male_names.to_csv("../unique_male_names_and_country.csv")
df_unique_male_names

Unnamed: 0,name,country
0,Jelander,Afghanistan
1,Christop,Australia
2,Fabijano,Belgium
3,Odisseu,Brazil
4,Kerigan,Canada
5,Kajinek,Czech
6,Kauko,Finland
7,Kotsos,Greece
8,Lenci,Hungary
9,Jivavijaya,India


In [50]:
# Pair each picked female name with its country, export the table and echo it.
# NOTE(review): to_csv is called without index=False here, unlike the other
# exports in this notebook, so the row index is presumably written as an
# extra column — confirm that downstream readers expect it.
female_country_columns = {"name": unique_female_names, "country": countries}
df_unique_female_names = pd.DataFrame(female_country_columns)
df_unique_female_names.to_csv("../unique_female_names_and_country.csv")
df_unique_female_names

Unnamed: 0,name,country
0,Spogmai,Afghanistan
1,Kimberle,Australia
2,Manyak,Belgium
3,Ibiacema,Brazil
4,Toniko,Canada
5,Dagmarka,Czech
6,Josefiina,Finland
7,Pitsa,Greece
8,Bske,Hungary
9,Humaila,India
