In [1]:
from io import BytesIO
import requests
from math import ceil
from dateutil.relativedelta import relativedelta
from datetime import date

import numpy as np
import pandas as pd

In [2]:
# Snapshot of the current date, taken once; generate_birthdate counts back from it.
today = date.today()

In [3]:
def _read_url(url, timeout=60):
    """Download ``url`` and return its body as an in-memory binary buffer.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    io.BytesIO
        Buffer holding the raw response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        HTML error page would be handed to the CSV/Excel parsers as data.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BytesIO(response.content)
    
def read_csv_from_url(url, usecols=None, sep=';', compression=None):
    """Fetch a delimited text file over HTTP and parse it with pandas.

    ``usecols``, ``sep`` and ``compression`` are forwarded unchanged to
    ``pd.read_csv``; the downloaded bytes are read from memory.
    """
    buffer = _read_url(url)
    return pd.read_csv(
        buffer,
        sep=sep,
        usecols=usecols,
        compression=compression,
    )

def read_excel_from_url(url, skiprows=None):
    """Fetch an Excel workbook over HTTP and parse it with pandas.

    ``skiprows`` is forwarded unchanged to ``pd.read_excel``.
    """
    buffer = _read_url(url)
    return pd.read_excel(buffer, skiprows=skiprows)

def handle_commune(x):
    """Turn a commune name into a normalised key.

    The name is lower-cased and stripped of spaces, hyphens and apostrophes.
    The single special case 'lehautbréda' is mapped to 'hautbréda'.
    """
    # One translation pass removes the three separator characters.
    normalised = x.lower().translate(str.maketrans('', '', " -'"))
    if normalised == 'lehautbréda':
        return 'hautbréda'
    return normalised

def generate_birthdate(x, reference=None):
    """Draw a random birth date matching an age in completed years.

    Parameters
    ----------
    x : int, str
        Age in completed years (anything ``int()`` accepts), or the literal
        ``'100 ou plus'`` for the open-ended top bracket.
    reference : datetime.date, optional
        Date on which the person has age ``x``. Defaults to the current date.

    Returns
    -------
    datetime.date
        For a numeric age, a date drawn uniformly among all dates that give
        exactly that age on ``reference``. For ``'100 ou plus'``, the 100th
        anniversary moved back by an exponentially distributed number of days
        (mean of about one year).

    Notes
    -----
    The bounds are computed on the calendar rather than with a fixed number
    of days per year. A 364-day approximation drifts by more than a day per
    year of age, so the age recomputed from the birth date came out too low.
    """
    ref = date.today() if reference is None else reference

    def _years_before(day, years):
        # Same month/day `years` earlier; 29 Feb falls back to 28 Feb.
        try:
            return day.replace(year=day.year - years)
        except ValueError:
            return day.replace(year=day.year - years, day=28)

    if x == '100 ou plus':
        latest = _years_before(ref, 100)
        # Scalar draw: int() on a size-1 array is deprecated in recent NumPy.
        offset = int(np.random.exponential(365.25))
    else:
        age = int(x)
        # Born at the latest exactly `age` years ago...
        latest = _years_before(ref, age)
        # ...and at the earliest one day after the date `age + 1` years ago.
        earliest_ordinal = _years_before(ref, age + 1).toordinal() + 1
        span = latest.toordinal() - earliest_ordinal
        offset = int(np.random.randint(0, span + 1))
    return date.fromordinal(latest.toordinal() - offset)

def birthdate_to_age(birthdate):
    """Return the number of completed years between ``birthdate`` and today."""
    born = pd.to_datetime(birthdate)
    elapsed = relativedelta(date.today(), born)
    return elapsed.years

In [4]:
# Addresses come from the national address base:
# https://adresse.data.gouv.fr/donnees-nationales
department_idxes = [69]
cols = ('numero', 'nom_voie', 'code_postal', 'nom_commune')

# One gzip-compressed CSV is downloaded per department.
res = []
for department_idx in department_idxes:
    url = f'https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-{department_idx:02d}.csv.gz'
    dfx = read_csv_from_url(url, cols, compression='gzip')
    res.append(dfx)

# ignore_index avoids duplicated row labels when several departments are loaded.
df_adresses = pd.concat(res, ignore_index=True)

# Full postal address as one string. The columns are selected explicitly so
# the field order follows `cols` rather than the column order of the file.
df_adresses['adresse'] = df_adresses[list(cols)].apply(lambda r: ' '.join(map(str, r.values)), axis=1)

# Department code = first two characters of the postal code. Zero-padding
# restores the leading zero lost when postal codes are parsed as integers
# (e.g. 1000 -> '01000' -> '01').
df_adresses['code_dep'] = df_adresses['code_postal'].astype(str).str.zfill(5).str[:2]

# Keep only postal codes in departments 38 and 69. The .copy() makes the
# column assignments below act on an independent frame instead of a slice.
df_adresses = df_adresses[df_adresses['code_dep'].isin(('38', '69'))].copy()

# Key used to match communes with the population table: '<dep>_<normalised name>'.
df_adresses['key_commune'] = df_adresses['code_dep'] + '_' + df_adresses['nom_commune'].map(handle_commune)

# Number of addresses per commune, and the resulting uniform per-address probability.
s_n_address_per_commune = df_adresses.groupby('key_commune').size()
df_adresses['proba_adresse'] = 1 / df_adresses['key_commune'].map(s_n_address_per_commune)

In [5]:
# Commune keys for which at least one address is available.
s1 = set(df_adresses['key_commune'].unique())

In [6]:
# Population of each town, from
# https://www.insee.fr/fr/statistiques/4989724?sommaire=4989761#consulter
df_pop = pd.read_csv('data/Communes.csv', sep=';', usecols=('COM', 'CODDEP', 'PTOT'))

# Same '<dep>_<normalised name>' key as the address table.
df_pop['key_commune'] = df_pop.apply(lambda row: f"{row['CODDEP']}_{handle_commune(row['COM'])}", axis=1)

# Keep only communes that have addresses. The .copy() makes the assignment
# below act on an independent frame instead of a filtered slice.
df_pop = df_pop[df_pop['key_commune'].isin(s1)].copy()

# Probability of drawing a commune, proportional to its PTOT value.
df_pop['proba_commune'] = df_pop['PTOT'] / df_pop['PTOT'].sum()
del df_pop['COM']

In [7]:
# French last names with their counts, from
# https://www.data.gouv.fr/fr/datasets/liste-de-prenoms-et-patronymes/#_
url_last_name = 'https://static.data.gouv.fr/resources/liste-de-prenoms-et-patronymes/20181014-162921/patronymes.csv'
df_last_name = read_csv_from_url(url_last_name, sep=',')
# Most frequent names first, with a fresh 0..n-1 index.
df_last_name = df_last_name.sort_values('count', ascending=False).reset_index(drop=True)
df_last_name.head()

Unnamed: 0,patronyme,count
0,MARTIN,31781
1,BERNARD,16475
2,THOMAS,14396
3,DURAND,13901
4,ROBERT,13786


In [8]:
# French first names, from
# https://www.insee.fr/fr/statistiques/2540004?sommaire=4767262#consulter
url_first_name = 'https://www.insee.fr/fr/statistiques/fichier/2540004/nat2019_csv.zip'

# Written as one chain so no column is ever assigned on a filtered slice
# (the previous in-place `annais` assignment triggered SettingWithCopyWarning).
df_first_name = (
    read_csv_from_url(url_first_name, sep=';', compression='zip')
    # drop rows with an unknown year and the aggregated rare-names row
    .query('annais != "XXXX" and preusuel != "_PRENOMS_RARES"')
    .assign(annais=lambda d: d['annais'].astype(int))
    .query('annais > 1960')
    # total count per (first name, sex) over the retained years
    .groupby(['preusuel', 'sexe'])['nombre'].sum()
    .reset_index()
)
df_first_name['proba'] = df_first_name['nombre'] / df_first_name['nombre'].sum()

In [9]:
# Age pyramid, from
# https://www.insee.fr/fr/statistiques/fichier/2381472/demo-pop-pyram.xlsx
df_age = read_excel_from_url('https://www.insee.fr/fr/statistiques/fichier/2381472/demo-pop-pyram.xlsx',
                             skiprows=2)
df_age = df_age.rename(columns={'Âge révolu': 'age', 'Ensemble': 'count'})[['age', 'count']].dropna()

# Keep ages strictly above 14, plus the open-ended '100 ou plus' bracket.
mask = df_age['age'].apply(lambda x: x == '100 ou plus' or int(x) > 14)

# The .copy() makes the assignment below act on an independent frame
# instead of a filtered slice (avoids SettingWithCopyWarning).
df_age = df_age[mask].copy()
df_age['proba'] = df_age['count'] / df_age['count'].sum()

In [10]:
# generate file

In [11]:
# Number of synthetic people to draw.
n = 120000

# Last names are weighted by their share of the total count.
last_name_weights = df_last_name['count'] / df_last_name['count'].sum()

# Independent weighted draws (with replacement) for each attribute.
first_names_idxes = np.random.choice(df_first_name.index, size=n, p=df_first_name['proba'])
last_names = np.random.choice(df_last_name['patronyme'], size=n, p=last_name_weights)
towns = np.random.choice(df_pop['key_commune'], size=n, p=df_pop['proba_commune'])
age = np.random.choice(df_age['age'], size=n, p=df_age['proba'])

In [12]:
# Stack the sampled attributes column-wise: one row per person.
X = np.hstack((last_names[:, np.newaxis],
               towns[:, np.newaxis],
               age[:, np.newaxis]))

# Per-commune arrays of addresses and postal codes, built in a single
# groupby pass. Position i in both arrays refers to the same address row.
d_commune = {kc: (grp['adresse'].values, grp['code_postal'].values)
             for kc, grp in df_adresses.groupby('key_commune')}
d_adresse = {kc: pair[0] for kc, pair in d_commune.items()}
d_cp = {kc: pair[1] for kc, pair in d_commune.items()}

df = pd.DataFrame(X, columns=('nom', 'key_commune', 'age'))

# Draw ONE position per person and read both fields from it. Drawing the
# address and the postal code independently gave rows whose postal code
# contradicted the one inside the address string in communes that have
# several postal codes.
picks = [np.random.randint(len(d_adresse[kc])) for kc in df['key_commune']]
df['adresse'] = [d_adresse[kc][i] for kc, i in zip(df['key_commune'], picks)]
df['code_postal'] = [d_cp[kc][i] for kc, i in zip(df['key_commune'], picks)]
del df['key_commune']

# Replace the sampled age label by the age computed from a generated birth date.
df['date_naissance'] = df['age'].apply(generate_birthdate)
df['age'] = df['date_naissance'].apply(birthdate_to_age)

# Attach first name and sex; both frames share the same 0..n-1 index.
df_first_name_gender = df_first_name.iloc[first_names_idxes].reset_index(drop=True).rename(columns={'preusuel': 'prenom'})
df = pd.concat((df, df_first_name_gender), axis=1)
df = df[['nom', 'prenom', 'sexe', 'age', 'adresse', 'code_postal']]

In [13]:
# Row position at which the generated population is split in two.
split_at = 100000
df_personal_data = df.iloc[:split_at].copy()            # first 100000 rows
df_personal_data_remaining = df.iloc[split_at:].copy()  # everything after them

In [15]:
# Write the first split to disk: semicolon-separated, without the index column.
df_personal_data.to_csv(
    'personal_data.csv',
    sep=';',
    index=False,
)

In [36]:
# Physical-activity table, read from a local semicolon-separated file.
# NOTE(review): the displayed data contains '?' placeholders, which keep that
# column as strings; consider na_values='?' if numeric values are needed.
df_activite = pd.read_csv(
    'data/activite_physique.csv',
    sep=';',
)

In [38]:
# Display the table loaded from data/activite_physique.csv.
df_activite

Unnamed: 0,Sexe,Age,Objectifs_activite_physique
0,1,18-39,69.1
1,1,40-54,70.8
2,1,55-74,71.8
3,1,75 et plus,?
4,2,18-39,50.3
5,2,40-54,49.4
6,2,55-74,57.8
7,2,75 et plus,?


In [39]:
# Alcohol-consumption table, read from a local semicolon-separated file.
# NOTE(review): the displayed data contains '?' placeholders, which keep those
# columns as strings; consider na_values='?' if numeric values are needed.
df_alcool = pd.read_csv(
    'data/consommation_alcool.csv',
    sep=';',
)

In [40]:
# Display the table loaded from data/consommation_alcool.csv.
df_alcool

Unnamed: 0,Sexe,Age,Alcool quotidien,Alcool annuel
0,1,18-24,3.9,87.9
1,1,25-34,5.9,88.9
2,1,35-44,9.6,86.9
3,1,45-54,11,87.7
4,1,55-64,22.7,91.6
5,1,65-74,38.5,94
6,1,75 et plus,?,?
7,2,18-24,0.7,79.5
8,2,25-34,1.4,76.4
9,2,35-44,2,79.6


In [31]:
# Hypertension (HTA) prevalence table, read from a local semicolon-separated file.
df_hta = pd.read_csv(
    'data/prevalence_hta.csv',
    sep=';',
)

In [32]:
# Display the table loaded from data/prevalence_hta.csv.
df_hta

Unnamed: 0,Sexe,Age,Prevalence_avecsans,Prevalence_hors_pathologies
0,1,0-14,0.06,5
1,1,15-34,0.71,6
2,1,35-54,9.87,689
3,1,55-64,34.66,1976
4,1,65-74,54.1,2621
5,1,75 et plus,67.13,2784
6,2,0-14,0.04,4
7,2,15-34,0.82,75
8,2,35-54,9.0,742
9,2,55-64,28.15,2067


In [None]:
def generate_pathologie(age, sexe, pathologies=None):
    """Randomly decide which pathologies a person of given age and sex has.

    Parameters
    ----------
    age : int, str
        Age in years (anything ``int()`` accepts), or ``'100 ou plus'``,
        which is treated as 100.
    sexe : scalar
        First-level key of the table's row index.
    pathologies : pandas.DataFrame, optional
        Table whose rows are indexed by ``(sexe, age label)`` and whose
        columns are pathologies. Defaults to the module-level
        ``df_pathologie``.
        NOTE(review): values are compared against uniform draws in [0, 1),
        so they are assumed to be probabilities on that scale; confirm
        against the source spreadsheet.

    Returns
    -------
    pandas.Series
        Boolean Series indexed by pathology: True where the table value
        exceeds an independent uniform random draw.
    """
    table = df_pathologie if pathologies is None else pathologies
    age = 100 if age == '100 ou plus' else int(age)

    # (exclusive upper bound, label) for each bracket, in ascending order.
    # Anything at or above the last bound falls in the open-ended bracket.
    # This replaces an if/elif chain whose '55-64' branch started at 54.
    brackets = (
        (15, '<15'),
        (35, '15-34'),
        (55, '35-54'),
        (65, '55-64'),
        (75, '65-74'),
    )
    age_lbl = '75>='
    for upper_bound, label in brackets:
        if age < upper_bound:
            age_lbl = label
            break

    s_probas = table.loc[sexe, age_lbl]
    return s_probas > np.random.rand(len(s_probas))

In [None]:
# For each source frame: sample sizes for ages 80-99, 60-79 and under 60.
# Everyone aged 100 or more is kept without sampling.
strata = (
    (df_personal_data, (200, 250, 500)),
    (df_personal_data_remaining, (20, 25, 50)),
)

parts = []
for frame, (n_80_99, n_60_79, n_under_60) in strata:
    parts.append(frame.query('age >= 100'))
    parts.append(frame.query('age < 100 and age>=80').sample(n_80_99))
    parts.append(frame.query('age < 80 and age>=60').sample(n_60_79))
    parts.append(frame.query('age < 60').sample(n_under_60))

df_sensitive_data = pd.concat(parts)

# Give every selected person a birth date generated from their age.
df_sensitive_data['date_naissance'] = df_sensitive_data['age'].apply(generate_birthdate)

In [None]:
# Order by last name then first name, and renumber rows from 0.
df_sensitive_data = (
    df_sensitive_data
    .sort_values(by=['nom', 'prenom'])
    .reset_index(drop=True)
)

In [None]:
# Disease prevalence, source:
# https://www.ameli.fr/fileadmin/user_upload/documents/Effectifs_par_pathologie_et_par_classe_d_age_selon_le_sexe_en_2018.xls
def _load_pathologie_sheet(sheet, sexe_code):
    """Read one sheet of data/pathologie.xlsx, transposed and indexed by (sexe, original column label)."""
    table = pd.read_excel('data/pathologie.xlsx', sheet_name=sheet, header=0, index_col=0).T
    # Temporary column, used only to build the first index level.
    table['sexe'] = sexe_code
    table = table.set_index([table.sexe, table.index])
    del table['sexe']
    return table


df_pathologie_h = _load_pathologie_sheet(0, 1)  # first sheet, sexe code 1
df_pathologie_f = _load_pathologie_sheet(1, 2)  # second sheet, sexe code 2

df_pathologie = pd.concat((df_pathologie_h, df_pathologie_f))

In [None]:
# Display the combined table built from both sheets of data/pathologie.xlsx.
df_pathologie

In [None]:
# TODO: think about how to also produce sensitive data that is numeric and discrete

In [None]:
# One boolean column per pathology, drawn row by row from age and sex.
pathologie_flags = df_sensitive_data.apply(
    lambda row: generate_pathologie(row['age'], row['sexe']),
    axis=1,
)
# Append the flags to the right of the existing columns.
df_sensitive_data = pd.concat((df_sensitive_data, pathologie_flags), axis=1)

In [None]:
# Columns kept in the final file, in output order.
cols = [
    'nom', 'prenom', 'sexe', 'age', 'date_naissance', 'adresse', 'code_postal',
    'Maladies cardioneurovasculaires', 'Diabète', 'Cancers', 'Maternité',
]
# Pathology columns get short lower-case names without accents.
renamed_columns = {
    'Maladies cardioneurovasculaires': 'maladies_cardiaques',
    'Diabète': 'diabete',
    'Cancers': 'cancers',
    'Maternité': 'maternite',
}
df_sensitive_data = df_sensitive_data[cols].rename(columns=renamed_columns)

In [None]:
# Write the sensitive data set to disk: semicolon-separated, without the index column.
df_sensitive_data.to_csv(
    'sensitive_data.csv',
    sep=';',
    index=False,
)