# Data Preparation from Gender Computer

**Source:** https://github.com/tue-mdse/genderComputer/tree/master/nameLists

In [65]:
import pandas as pd
import numpy as np
import math
import os

#### Reading One Corpus

In [119]:
def read_data(fpath) :
    df = pd.read_csv(fpath, sep=";", names=["Name", "_1", "_2", "_3"])
    return df

    
# fpath = "../data/gc_name/raw/AlbaniaMaleUTF8.csv"
# fpath = "../data/gc_name/raw/CanadaMaleUTF8.csv"
fpath = "../data/gc_name/raw/BrusselsMaleUTF8.csv"
# fpath = "../data/gc_name/raw/USAMaleUTF8.csv"
# fpath = "../data/gc_name/raw/ItalyMaleUTF8.csv"
df = read_data(fpath)

In [120]:
df

Unnamed: 0,Name,_1,_2,_3
0,Mohamed,11.436,,
1,Jean,9.853,,
2,Michel,5.470,,
3,Philippe,5.090,,
4,Pierre,5.015,,
...,...,...,...,...
12734,Zulfi,2.000,,
12735,Zurkani,2.000,,
12736,Zühtü,2.000,,
12737,Ügur,2.000,,


#### Reading All Corpus from Database

In [121]:
from os import walk

dirname = "../data/gc_name/raw/"

mpaths = []
fpaths = []

for f in listdir(dirname): 
    if isfile(join(dirname, f)) :
        if ".csv" == f[-4:] :
            if "Male" in f :
                mpaths.append(join(dirname, f))
            if "Female" in f :
                fpaths.append(join(dirname, f))

In [122]:
mpaths

['../data/gc_name/raw/UKMaleUTF8.csv',
 '../data/gc_name/raw/FlandersMaleUTF8.csv',
 '../data/gc_name/raw/IranMaleUTF8.csv',
 '../data/gc_name/raw/TurkeyMaleUTF8.csv',
 '../data/gc_name/raw/FrisiaMaleUTF8.csv',
 '../data/gc_name/raw/IsraelMaleUTF8.csv',
 '../data/gc_name/raw/BrazilMaleUTF8.csv',
 '../data/gc_name/raw/SwedenMaleUTF8.csv',
 '../data/gc_name/raw/SloveniaMaleUTF8.csv',
 '../data/gc_name/raw/AfghanistanMaleUTF8.csv',
 '../data/gc_name/raw/FinlandMaleUTF8.csv',
 '../data/gc_name/raw/PolandMaleUTF8.csv',
 '../data/gc_name/raw/CanadaMaleUTF8.csv',
 '../data/gc_name/raw/BrusselsMaleUTF8.csv',
 '../data/gc_name/raw/SomaliaMaleUTF8.csv',
 '../data/gc_name/raw/GreeceMaleUTF8.csv',
 '../data/gc_name/raw/IndiaMaleUTF8.csv',
 '../data/gc_name/raw/BelgiumMaleUTF8.csv',
 '../data/gc_name/raw/UkraineMaleUTF8.csv',
 '../data/gc_name/raw/VietnamMaleUTF8.csv',
 '../data/gc_name/raw/WalloniaMaleUTF8.csv',
 '../data/gc_name/raw/IrelandMaleUTF8.csv',
 '../data/gc_name/raw/AustraliaMaleUTF8.cs

In [123]:
fpaths

['../data/gc_name/raw/CzechFemaleUTF8.csv',
 '../data/gc_name/raw/WalloniaFemaleUTF8.csv',
 '../data/gc_name/raw/BelgiumFemaleUTF8.csv',
 '../data/gc_name/raw/SpainFemaleUTF8.csv',
 '../data/gc_name/raw/ItalyFemaleUTF8.csv',
 '../data/gc_name/raw/RussiaFemaleUTF8.csv',
 '../data/gc_name/raw/IrelandFemaleUTF8.csv',
 '../data/gc_name/raw/UkraineFemaleUTF8.csv',
 '../data/gc_name/raw/VietnamFemaleUTF8.csv',
 '../data/gc_name/raw/CanadaFemaleUTF8.csv',
 '../data/gc_name/raw/FrisiaFemaleUTF8.csv',
 '../data/gc_name/raw/AlbaniaFemaleUTF8.csv',
 '../data/gc_name/raw/AfghanistanFemaleUTF8.csv',
 '../data/gc_name/raw/FlandersFemaleUTF8.csv',
 '../data/gc_name/raw/UKFemaleUTF8.csv',
 '../data/gc_name/raw/RomaniaFemaleUTF8.csv',
 '../data/gc_name/raw/LatviaFemaleUTF8.csv',
 '../data/gc_name/raw/IndiaFemaleUTF8.csv',
 '../data/gc_name/raw/SomaliaFemaleUTF8.csv',
 '../data/gc_name/raw/AustraliaFemaleUTF8.csv',
 '../data/gc_name/raw/SwedenFemaleUTF8.csv',
 '../data/gc_name/raw/IsraelFemaleUTF8.csv',

#### Gather Data from Corpus

Take `n` name from from each male-female for each country

Discard the country if the number of available name is less than `n`

In [124]:
n = 10 # number of name that want to take from each country

# list of discarded country
discarded_country = set()

for path in mpaths :
    d = read_data(path)
    country = path[len(dirname):-12]
    
    if len(d) < n :
        print(country, " is discarded")
        print("Length data is less than ", n)
        discarded_country.add(country)

for path in fpaths :
    d = read_data(path)
    country = path[len(dirname):-14]
    
    if len(d) < n :
        print(country, " is discarded")
        print("Length data is less than ", n)
        discarded_country.add(country)

Afghanistan  is discarded
Length data is less than  10
Albania  is discarded
Length data is less than  10
Albania  is discarded
Length data is less than  10
Afghanistan  is discarded
Length data is less than  10


In [128]:
seed = 123456

df = pd.DataFrame(columns=["Name", "Gender", "Country"])

for path in mpaths :
    d = read_data(path)
    country = path[len(dirname):-12]
    gender = "male"
    
    if country not in discarded_country :    
        for index, rows in d.sample(n=n, random_state=seed).iterrows() :
            df = df.append({"Name": str(rows["Name"]).title(), "Gender": gender, "Country" : country}, ignore_index=True)
            
for path in fpaths :
    d = read_data(path)
    country = path[len(dirname):-14]
    gender = "female"
    
    if country not in discarded_country :
        for index, rows in d.sample(n=n, random_state=seed).iterrows() :
            df = df.append({"Name": str(rows["Name"]).title(), "Gender": gender, "Country" : country}, ignore_index=True)

In [129]:
df

Unnamed: 0,Name,Gender,Country
0,Roen,male,UK
1,Jeet,male,UK
2,Hagen,male,UK
3,Willow,male,UK
4,Belal,male,UK
...,...,...,...
615,Virág,female,Hungary
616,Adél,female,Hungary
617,Olga,female,Hungary
618,Jolán,female,Hungary


In [127]:
df.to_csv("../data/gc_name/data.csv", index=None)