In [1]:
import numpy as np
import pickle
import os
import pandas as pds
from names_dataset import NameDataset
from tqdm import tqdm
import datetime

Sources:

- Two-letter country codes from: https://gist.github.com/mlisovyi/e8df5c907a8250e14cc1e5933ed53ffd

- Age distributions from: the World Bank (compiled with the help of ChatGPT), the World Health Organization (compiled with the help of ChatGPT), and the World Factbook (cia.gov)

- Population sizes from: the United Nations Department of Economic and Social Affairs, Eurostat, and the World Bank (compiled with the help of ChatGPT)

- First names and last names from: https://github.com/philipperemy/name-dataset (Facebook data leak, 2019)

In [2]:
# Load the three per-country lookup tables used by the dataset generation below.
# NOTE: pickle.load executes arbitrary code from the file it reads; only use
# pickle files that come from a trusted source.
def _load_pickled_dict(filename):
    """Unpickle and return the object stored in 'data dictionaries/<filename>'."""
    with open(os.path.join('data dictionaries', filename), 'rb') as handle:
        return pickle.load(handle)


# country -> country code (the code is used to locate '<code>.csv' name files)
european_countries_code = _load_pickled_dict('european_countries_code.pkl')
# country -> share of the total sample size allotted to that country
european_countries_pop = _load_pickled_dict('european_countries_pop.pkl')
# country -> {age interval string: probability}
european_countries_age = _load_pickled_dict('european_countries_age.pkl')

In [3]:
# Generate a dataset of identifiers (names, birth year, nationality, gender).
#
# For every country we sample int(N * population share) rows from that country's
# name file, attach a birth year drawn from the country's age distribution, and
# record the empirical frequency of every sampled first name in the full file.
# Sampling uses NumPy's global random state (also used by DataFrame.sample), so
# call np.random.seed(...) beforehand if a reproducible dataset is needed.

N = 50000               # target number of records over all countries
REFERENCE_YEAR = 2020   # birth year = REFERENCE_YEAR - sampled age
NAME_COLUMNS = ['name', 'family name', 'gender', 'country']


def _sample_birth_years(age_distr, size):
    """Draw `size` birth years from an age distribution.

    age_distr maps an interval string to its probability. The first two
    characters of the key are parsed as the lower age bound and everything
    from index 3 onwards as the upper bound (presumably keys look like
    '15-24' -- confirm against the pickled dictionary).

    Returns a NumPy integer array of length `size`.
    """
    intervals = list(age_distr.keys())
    lower_bounds = np.array([int(interval[:2]) for interval in intervals])
    upper_bounds = np.array([int(interval[3:]) for interval in intervals])

    # First pick an interval for every person according to the distribution...
    picked = np.random.choice(len(intervals), size=size, p=list(age_distr.values()))
    # ...then draw the age *per person* inside the picked interval. Drawing a
    # single age per interval would collapse every bracket onto one birth year.
    # The upper bound is exclusive, as with a scalar np.random.randint(low, high).
    ages = np.random.randint(lower_bounds[picked], upper_bounds[picked])
    return REFERENCE_YEAR - ages


def _build_country_sample(country):
    """Sample records for one country and compute name frequencies.

    Returns a tuple (country_sample, name_probas):
      - country_sample: sampled rows of the country's name file with an added
        'birth year' column (original row index is kept);
      - name_probas: {first name: share of rows of the full file with that name}
        for every first name present in the sample.

    Raises FileNotFoundError when the country's name file does not exist.
    """
    country_code = european_countries_code[country]
    pop_proportion = european_countries_pop[country]
    age_distr = european_countries_age[country]

    names_df = pds.read_csv(
        os.path.join('data names', country_code + '.csv'),
        delimiter=',',
        names=NAME_COLUMNS,
    )
    country_sample = names_df.sample(n=int(N * pop_proportion))
    country_sample['birth year'] = _sample_birth_years(age_distr, country_sample.shape[0])

    # Count every name once instead of scanning the whole frame per sampled name.
    # value_counts() ignores missing names, so a NaN name gets frequency 0, which
    # is what an equality comparison against NaN would also yield.
    name_counts = names_df['name'].value_counts().to_dict()
    total_rows = names_df.shape[0]
    name_probas = {
        name: int(name_counts.get(name, 0)) / total_rows
        for name in country_sample['name'].unique()
    }
    return country_sample, name_probas


country_proba_names = {}
country_samples = []
missing_countries = []

for country in tqdm(list(european_countries_code.keys())):
    try:
        country_sample, name_probas = _build_country_sample(country)
    except FileNotFoundError:
        # Best effort: countries without a name file are skipped, but reported below.
        missing_countries.append(country)
        continue
    country_samples.append(country_sample)
    country_proba_names[country] = name_probas

if missing_countries:
    print(f"Skipped countries without a name file: {missing_countries}")

# Concatenate once at the end rather than growing the frame inside the loop.
sample_df = pds.concat(country_samples)

sample_df

100%|██████████| 42/42 [1:12:02<00:00, 102.92s/it]


Unnamed: 0,name,family name,gender,country,birth year
188250,Najres,Emuk,M,AL,1965
284993,Sebastian,Kroi,M,AL,1995
213638,Desi,Mata,F,AL,1995
200361,Ramirez,Varvarica,M,AL,1953
132451,Vasil,Cipaj,M,AL,1995
...,...,...,...,...,...
8780460,Robin,Frost,M,GB,2013
4097044,Rich,Lafferty,M,GB,2003
5809051,Chris,White,M,GB,1958
5803623,Damen,Shepherd,M,GB,1958


In [4]:
# Persist the results: the per-country name frequencies as a pickle whose file
# name carries the total number of recorded names, and the sampled records as
# a CSV whose file name carries the row count and today's date.
n_recorded_names = sum(len(name_probas) for name_probas in country_proba_names.values())

proba_path = f'country_proba_names_{n_recorded_names}names.pkl'
with open(proba_path, 'wb') as handle:
    pickle.dump(country_proba_names, handle)

csv_path = f"DF_N={sample_df.shape[0]}_{datetime.date.today()}.csv"
sample_df.to_csv(csv_path, index = False)