In [None]:
from io import BytesIO
import requests
from math import ceil
from dateutil.relativedelta import relativedelta
from datetime import date
from bisect import bisect

import numpy as np
import pandas as pd

In [None]:
# Reference date shared by the helpers that turn ages into birthdates.
today = date.today()

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# same synthetic population; change SEED to draw a different one.
SEED = 42
np.random.seed(SEED)

In [None]:
def _read_url(url):
    """Download ``url`` and return its body as an in-memory binary buffer.

    Args:
        url: Address of the resource to fetch with an HTTP GET.

    Returns:
        BytesIO: The raw response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Without a timeout ``requests`` may wait indefinitely on a stalled server.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of handing an error page to the pandas parsers.
    response.raise_for_status()
    return BytesIO(response.content)
    
def read_csv_from_url(url, usecols=None, sep=';', compression=None):
    """Fetch a delimited text file over HTTP and parse it into a DataFrame.

    Args:
        url: Address of the file to download.
        usecols: Columns to keep, forwarded to ``pandas.read_csv``.
        sep: Field separator, forwarded to ``pandas.read_csv``.
        compression: Compression scheme, forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table.
    """
    buffer = _read_url(url)
    parse_options = {'compression': compression, 'sep': sep, 'usecols': usecols}
    return pd.read_csv(buffer, **parse_options)

def read_excel_from_url(url, skiprows=None):
    """Fetch an Excel workbook over HTTP and parse it into a DataFrame.

    Args:
        url: Address of the workbook to download.
        skiprows: Rows to skip, forwarded to ``pandas.read_excel``.

    Returns:
        pandas.DataFrame: The parsed sheet.
    """
    workbook = _read_url(url)
    frame = pd.read_excel(workbook, skiprows=skiprows)
    return frame

def handle_commune(x):
    """Normalise a commune name into a key usable for joining datasets.

    The name is lower-cased and stripped of spaces, hyphens and apostrophes.
    The normalised form 'lehautbréda' is mapped to 'hautbréda'.

    Args:
        x: Commune name as a string.

    Returns:
        str: The normalised key.
    """
    normalised = x.lower()
    for separator in (' ', '-', '\''):
        normalised = normalised.replace(separator, '')
    if normalised == 'lehautbréda':
        return 'hautbréda'
    return normalised

def generate_birthdate(x):
    """Draw a random birthdate for a person whose age in completed years is ``x``.

    The date is computed relative to the module-level ``today``.

    Args:
        x: Age in completed years (anything ``int()`` accepts), or the label
            '100 ou plus' for the open-ended top bracket.

    Returns:
        datetime.date: A date between ``x`` and ``x + 1`` years before
        ``today``. For '100 ou plus' the date is at least 100 years before
        ``today``.
    """
    if x == '100 ou plus':
        years = 100
        # Open-ended bracket: exponentially distributed extra days beyond 100 years.
        # A scalar draw avoids the deprecated int() conversion of a 1-element array.
        extra_days = int(np.random.exponential(365))
    else:
        years = int(x)
        # 0..364 extra days keeps the person strictly younger than x + 1.
        extra_days = int(np.random.randint(0, 365))
    # Shift by calendar years: the former ``age * 364`` days undercounts real
    # years, so the generated birthdate frequently implied an age of x - 1.
    last_birthday = today - relativedelta(years=years)
    return last_birthday - pd.Timedelta(days=extra_days)

def birthdate_to_age(birthdate):
    """Return the number of completed years between ``birthdate`` and today.

    Args:
        birthdate: Anything ``pandas.to_datetime`` can parse into a timestamp.

    Returns:
        int: The ``years`` component of the elapsed ``relativedelta``.
    """
    born = pd.to_datetime(birthdate)
    elapsed = relativedelta(date.today(), born)
    return elapsed.years

In [None]:
# get addresses from https://adresse.data.gouv.fr/donnees-nationales
department_idxes = [69]
cols = ('numero', 'nom_voie', 'code_postal', 'nom_commune')

# Download one gzip-compressed CSV per requested department.
res = []
for department_idx in department_idxes:
    url = f'https://adresse.data.gouv.fr/data/ban/adresses/latest/csv/adresses-{department_idx:02d}.csv.gz'
    dfx = read_csv_from_url(url, cols, compression='gzip')
    res.append(dfx)

# ignore_index gives every row a unique label even when several files are stacked.
df_adresses = pd.concat(res, ignore_index=True)

# One display string per address: the four fields joined by spaces, in `cols` order.
address_parts = df_adresses[list(cols)].astype(str)
df_adresses['adresse'] = address_parts[cols[0]].str.cat(address_parts[list(cols[1:])], sep=' ')

# Postal codes parsed as integers lose their leading zero (departments 01-09),
# so pad back to five characters before taking the department prefix.
df_adresses['code_dep'] = df_adresses['code_postal'].astype(str).str.zfill(5).str[:2]

# Keep only addresses whose postal code belongs to a requested department
# (previously hard-coded to '69' regardless of `department_idxes`).
wanted_deps = {f'{idx:02d}' for idx in department_idxes}
df_adresses = df_adresses[df_adresses['code_dep'].isin(wanted_deps)].copy()

df_adresses['key_commune'] = df_adresses['code_dep'] + '_' + df_adresses['nom_commune'].map(handle_commune)
s_n_address_per_commune = df_adresses.groupby('key_commune').size()
# Uniform probability of each address within its commune.
df_adresses['proba_adresse'] = 1 / df_adresses['key_commune'].map(s_n_address_per_commune)

In [None]:
# Commune keys for which at least one address is available.
s1 = set(df_adresses['key_commune'])

In [None]:
# get population for each town from https://www.insee.fr/fr/statistiques/4989724?sommaire=4989761#consulter
# CODDEP is read as text so department codes keep their exact spelling
# (leading zeros included) when building the join key.
df_pop = pd.read_csv('data/Communes.csv', sep=';', usecols=['COM', 'CODDEP', 'PTOT'],
                     dtype={'CODDEP': str})
df_pop['key_commune'] = df_pop['CODDEP'] + '_' + df_pop['COM'].map(handle_commune)
# Keep only communes for which addresses were loaded; copy so the column
# assignment below does not target a view of the unfiltered frame.
df_pop = df_pop[df_pop['key_commune'].isin(s1)].copy()
# Probability of drawing a commune, proportional to its PTOT value.
df_pop['proba_commune'] = df_pop['PTOT'] / df_pop['PTOT'].sum()
del df_pop['COM']

In [None]:
# get french names from https://www.data.gouv.fr/fr/datasets/liste-de-prenoms-et-patronymes/#_
url_last_name = 'https://static.data.gouv.fr/resources/liste-de-prenoms-et-patronymes/20181014-162921/patronymes.csv'
# Rows ordered from the largest 'count' to the smallest, with a fresh 0..n-1 index.
df_last_name = read_csv_from_url(url_last_name, sep=',')
df_last_name = df_last_name.sort_values(by='count', ascending=False)
df_last_name = df_last_name.reset_index(drop=True)
df_last_name.head()

In [None]:
# get french first names from https://www.insee.fr/fr/statistiques/2540004?sommaire=4767262#consulter
url_first_name = 'https://www.insee.fr/fr/statistiques/fichier/2540004/nat2019_csv.zip'
df_first_name = (
    read_csv_from_url(url_first_name, sep=';', compression='zip')
    # Drop the 'XXXX' year rows and the '_PRENOMS_RARES' bucket.
    .query('annais != "XXXX" and preusuel != "_PRENOMS_RARES"')
    .assign(annais=lambda frame: frame['annais'].astype(int))
    .query('annais > 1960')
    # Total of 'nombre' per (first name, sex) over the retained years.
    .groupby(['preusuel', 'sexe'])['nombre']
    .sum()
    .reset_index()
)
df_first_name['proba'] = df_first_name['nombre'] / df_first_name['nombre'].sum()

In [None]:
# get age pyramid from https://www.insee.fr/fr/statistiques/fichier/2381472/demo-pop-pyram.xlsx
df_age = read_excel_from_url('https://www.insee.fr/fr/statistiques/fichier/2381472/demo-pop-pyram.xlsx',
                             skiprows=2)
df_age = df_age.rename(columns={'Âge révolu': 'age', 'Ensemble': 'count'})[['age', 'count']].dropna()
# Keep the open-ended '100 ou plus' bracket and every numeric age from 19 upwards.
mask = df_age.age.apply(lambda x: x == '100 ou plus' or int(x) >= 19)
# Copy so the column assignment below does not target a view of the unfiltered frame.
df_age = df_age[mask].copy()
# Stray trailing text on this line previously made the cell a SyntaxError.
df_age['proba'] = df_age['count'] / df_age['count'].sum()

In [None]:
# generate file

In [None]:
# Number of synthetic people to generate.
n = 120000

# Each attribute is drawn independently, weighted by its observed frequency.
first_names_idxes = np.random.choice(df_first_name.index, size=n, p=df_first_name['proba'])

last_name_weights = df_last_name['count'] / df_last_name['count'].sum()
last_names = np.random.choice(df_last_name['patronyme'], size=n, p=last_name_weights)

towns = np.random.choice(df_pop['key_commune'], size=n, p=df_pop['proba_commune'])
age = np.random.choice(df_age['age'], size=n, p=df_age['proba'])

In [None]:
# Stack the independently drawn attributes column-wise, one row per person.
X = np.hstack((last_names[:, np.newaxis],
               towns[:, np.newaxis],
               age[:, np.newaxis]))

# Candidate addresses and their postal codes per commune, in the same row
# order, so that one position selects a matching (address, postal code) pair.
commune_groups = list(df_adresses.groupby('key_commune'))
d_adresse = {kc: dfx.adresse.values for kc, dfx in commune_groups}
d_cp_all = {kc: dfx.code_postal.values for kc, dfx in commune_groups}
# First postal code of each commune, kept under its original name and meaning.
d_cp = {kc: codes[0] for kc, codes in d_cp_all.items()}

df = pd.DataFrame(X, columns=('nom', 'key_commune', 'age'))
# Draw one address position per person. Taking the postal code from that same
# position keeps `code_postal` consistent with the code embedded in `adresse`
# (the commune's first postal code was used before, which disagrees for
# communes that have several postal codes).
picked = [(kc, np.random.randint(len(d_adresse[kc]))) for kc in df['key_commune']]
df['adresse'] = [d_adresse[kc][pos] for kc, pos in picked]
df['code_postal'] = [d_cp_all[kc][pos] for kc, pos in picked]
del df['key_commune']

# Replace the drawn age by the age implied by a randomly generated birthdate.
df['date_naissance'] = df['age'].apply(generate_birthdate)
df['age'] = df['date_naissance'].apply(birthdate_to_age)

# `first_names_idxes` holds index labels of df_first_name, hence label-based `.loc`.
df_first_name_gender = df_first_name.loc[first_names_idxes].reset_index(drop=True).rename(columns={'preusuel': 'prenom'})
df = pd.concat((df, df_first_name_gender), axis=1)
df = df[['nom', 'prenom', 'sexe', 'age', 'adresse', 'code_postal']]

In [None]:
# The first 100000 rows form the main dataset; the rest is kept separately.
split_at = 100000
df_personal_data = df.iloc[:split_at].copy()
df_personal_data_remaining = df.iloc[split_at:].copy()

In [None]:
# Write the main dataset to 'personal_data.csv' (semicolon-separated, index column omitted).
df_personal_data.to_csv('personal_data.csv', sep=';', index=False)

Reference (Santé publique France, BEH 2020, no. 15): http://beh.santepubliquefrance.fr/beh/2020/15/pdf/2020_15_1.pdf

In [None]:
# Load the table whose 'atteinte_objectif_quotidien' column is read further down,
# looked up by 'sexe' and 'age' bracket in generate_sensitive_value.
df_activite = pd.read_csv('data/activite_physique.csv', sep=';')

In [None]:
# Load the table whose 'alcool_quotidien' and 'alcool_annuel' columns are read
# further down, looked up by 'sexe' and 'age' bracket in generate_sensitive_value.
df_alcool = pd.read_csv('data/consommation_alcool.csv', sep=';')

In [None]:
# Load the table whose 'prevalence_diagnostic' and 'prevalence_traitement' columns
# are read further down, looked up by 'sexe' and 'age' bracket in generate_sensitive_value.
df_hta = pd.read_csv('data/prevalence_hta.csv', sep=';')

In [None]:
# Scale both prevalence columns in place by 1.3.
# NOTE(review): the reason for the 1.3 factor is not stated in the notebook, and
# re-running this cell without reloading df_hta applies it again.
for prevalence_col in ('prevalence_traitement', 'prevalence_diagnostic'):
    df_hta[prevalence_col] *= 1.3

In [None]:
def generate_sensitive_value(dfx, age, sexe, col, low=None, lbls=None):
    """Randomly draw an attribute for one person from a percentage table.

    The row of ``dfx`` matching ``sexe`` and the age bracket containing ``age``
    supplies one percentage per requested column; each percentage is compared
    with its own independent uniform draw.

    Args:
        dfx: DataFrame with a ``sexe`` column, an ``age`` column of bracket
            labels whose part before the first '-' is the bracket's lower
            bound, and the percentage columns named by ``col`` / ``low``.
            Rows are assumed sorted by increasing lower bound within each
            sex -- TODO confirm against the CSV files.
        age: Age of the person, compared with the bracket lower bounds.
        sexe: Value compared for equality with ``dfx['sexe']``.
        col: Name of the primary percentage column.
        low: Optional name of a second percentage column.
        lbls: Three labels, required when ``low`` is given.

    Returns:
        When ``low`` is given: ``lbls[0]`` if the ``col`` draw succeeds,
        otherwise ``lbls[1]`` if the ``low`` draw succeeds, otherwise
        ``lbls[2]``. When ``low`` is None: a NumPy boolean telling whether the
        ``col`` draw succeeded.

    Raises:
        ValueError: If ``low`` is given without three labels, or if no bracket
            of the table covers ``age`` for this ``sexe``.
    """
    if low is not None and (lbls is None or len(lbls) < 3):
        raise ValueError('lbls must provide three labels when low is given')

    # Boolean mask instead of an f-string query: also works for non-numeric sexe values.
    dfy = dfx[dfx['sexe'] == sexe].reset_index(drop=True)
    lower_bounds = [int(label.split('-')[0]) for label in dfy['age']]
    idx_age = bisect(lower_bounds, age) - 1
    if idx_age < 0:
        raise ValueError(f'no age bracket found for age={age!r}, sexe={sexe!r}')

    cols = [col, low] if low is not None else [col]
    # Work on a plain array: integer keys on a string-labelled Series are
    # deprecated positional access in pandas.
    probas = dfy.loc[idx_age, cols].astype(float).to_numpy() * .01
    hits = probas > np.random.rand(len(cols))

    if low is None:
        return hits[0]
    if hits[0]:
        return lbls[0]
    if hits[1]:
        return lbls[1]
    return lbls[2]

In [None]:
# Flag, row by row, the postal codes whose text form starts with '690'.
df_personal_data['code_postal'].astype(str).str.startswith('690')

In [None]:
# (source frame, age filter, number of people to sample), in drawing order.
sampling_plan = [
    (df_personal_data, 'age < 75 and age>=60', 250),
    (df_personal_data, 'age < 60', 500),
    (df_personal_data_remaining, 'age < 75 and age>=60', 100),
    (df_personal_data_remaining, 'age < 60', 200),
]
df_sensitive_data = pd.concat(
    [frame.query(age_filter).sample(size) for frame, age_filter, size in sampling_plan]
)

# Give each sampled person a birthdate drawn from their age.
df_sensitive_data['date_naissance'] = df_sensitive_data['age'].apply(generate_birthdate)

In [None]:
# Order by last name then first name, renumbering the rows from 0.
df_sensitive_data = df_sensitive_data.sort_values(by=['nom', 'prenom'], ignore_index=True)

In [None]:
# target column -> (percentage table, primary column, secondary column, labels)
sensitive_columns = {
    'consommation_alcool': (df_alcool, 'alcool_quotidien', 'alcool_annuel',
                            ['élevée', 'modérée', 'faible']),
    'activite_physique': (df_activite, 'atteinte_objectif_quotidien', None, None),
    'tension_arterielle': (df_hta, 'prevalence_diagnostic', 'prevalence_traitement',
                           ['très élevée', 'élevée', 'normale']),
}
# Draw each attribute row by row from the table matching the person's age and sex.
for target, (table, col, low, lbls) in sensitive_columns.items():
    df_sensitive_data[target] = df_sensitive_data.apply(
        lambda row: generate_sensitive_value(table, row.age, row.sexe, col, low=low, lbls=lbls),
        axis=1,
    )

In [None]:
# Six random people, restricted to the listed identifying columns.
find_columns = ['nom', 'prenom', 'sexe', 'age', 'code_postal']
df_find = df_sensitive_data.sample(6)[find_columns]

In [None]:
# Render the sampled rows as a Markdown table string
# (pandas relies on the optional `tabulate` package for this).
df_find.to_markdown()

In [None]:
# Write the sensitive dataset to 'sensitive_data.csv' (semicolon-separated, index column omitted).
df_sensitive_data.to_csv('sensitive_data.csv', sep=';', index=False)