In [1]:
from io import BytesIO
from pathlib import Path
from math import ceil
from dateutil.relativedelta import relativedelta
from datetime import date
from bisect import bisect

import numpy as np
import pandas as pd

In [2]:
# Run-wide configuration.
# `today` is captured once so that every birthdate generated from it during
# this run is measured from the same reference day.
today = date.today()
data_dir = Path('../data')
generate_data_dir = data_dir / 'generate'

# Seed NumPy's global generator so that "Restart & Run All" regenerates the
# same synthetic dataset. Every np.random.* call below draws from this global
# state; pandas' .sample() without random_state also falls back on it.
# Change SEED to draw a different dataset.
SEED = 42
np.random.seed(SEED)

In [3]:
def handle_commune(x):
    """Normalise a commune name into a lookup key.

    The name is lower-cased and stripped of spaces, hyphens and apostrophes.
    The resulting key 'lehautbréda' is additionally rewritten to 'hautbréda'.
    """
    normalized = x.lower()
    for separator in (' ', '-', '\''):
        normalized = normalized.replace(separator, '')
    if normalized == 'lehautbréda':
        return 'hautbréda'
    return normalized

def generate_birthdate(x, reference_date=None):
    """Draw a random birthdate for a person who is `x` completed years old.

    The birthdate is the person's most recent birthday (exactly `x` calendar
    years before the reference day) moved back by a random number of days.
    Using calendar years instead of a fixed number of days per year guarantees
    that the age recomputed from the returned date equals `x`.

    Parameters
    ----------
    x : int or str
        Age in completed years (anything `int()` accepts), or the open-ended
        bracket label '100 ou plus'.
    reference_date : datetime.date, optional
        Day on which the person is `x` years old. Defaults to the
        notebook-level `today`.

    Returns
    -------
    datetime.date
        For a regular age, a date in the year preceding the most recent
        birthday. For '100 ou plus', a date at least 100 years before the
        reference day, with an exponentially distributed surplus (mean 365
        days).
    """
    ref = today if reference_date is None else reference_date
    if x == '100 ou plus':
        years = 100
        # Scalar draw: calling int() on a size-1 array is deprecated in NumPy.
        extra_days = int(np.random.exponential(365))
    else:
        years = int(x)
        # 0..364 days: never reaches the previous birthday, even without a leap day.
        extra_days = int(np.random.randint(0, 365))

    try:
        last_birthday = ref.replace(year=ref.year - years)
    except ValueError:
        # The reference day is 29 February and the target year is not a leap year.
        last_birthday = ref.replace(year=ref.year - years, month=2, day=28)
    return last_birthday - pd.Timedelta(days=extra_days)

def birthdate_to_age(birthdate):
    """Return the number of completed years between `birthdate` and the current day."""
    born = pd.to_datetime(birthdate)
    elapsed = relativedelta(date.today(), born)
    return elapsed.years

In [4]:
# Load the address file, keeping only the columns that make up a postal address.
cols = ('numero', 'nom_voie', 'code_postal', 'nom_commune')
df_adresses = pd.read_csv(generate_data_dir / 'adresses-69.csv.gz', sep=';', usecols=cols, compression='gzip')

# One-line address: every loaded column, in the order it appears in the frame,
# converted with str() and joined by single spaces. Built column-wise instead
# of with a row-wise apply, which is slow on a large address file.
address_parts = [df_adresses[c].map(str) for c in df_adresses.columns]
df_adresses['adresse'] = address_parts[0].str.cat(address_parts[1:], sep=' ')

# Département code = first two characters of the postal code; keep only '69'.
df_adresses['code_dep'] = df_adresses['code_postal'].map(lambda x: str(x)[:2])
# .copy() makes the filtered frame independent, so the column assignments
# below do not write to a slice of the unfiltered frame (SettingWithCopyWarning).
df_adresses = df_adresses[df_adresses['code_dep'] == '69'].copy()

# Commune key '<code_dep>_<normalised commune name>'.
df_adresses['key_commune'] = df_adresses['code_dep'] + '_' + df_adresses['nom_commune'].map(handle_commune)

# Uniform probability of each address within its commune.
s_n_address_per_commune = df_adresses.groupby('key_commune').size()
df_adresses['proba_adresse'] = df_adresses['key_commune'].map(1 / s_n_address_per_commune)

In [5]:
# Keys of every commune that has at least one row in df_adresses.
s1 = {*df_adresses['key_commune'].unique()}

In [6]:
# Population per commune; PTOT is used below as the sampling weight.
df_pop = pd.read_csv(generate_data_dir / 'Communes.csv', sep=';', usecols=('COM', 'CODDEP', 'PTOT'))

# Same '<département>_<normalised name>' key as in df_adresses, built column-wise.
df_pop['key_commune'] = df_pop['CODDEP'].map(str) + '_' + df_pop['COM'].map(handle_commune)

# Keep only communes with known addresses. .copy() makes the result an
# independent frame, so adding 'proba_commune' does not write to a slice
# of the unfiltered frame (SettingWithCopyWarning).
df_pop = df_pop[df_pop['key_commune'].isin(s1)].copy()

# Probability of drawing a commune, proportional to its population.
df_pop['proba_commune'] = df_pop['PTOT'] / df_pop['PTOT'].sum()
df_pop = df_pop.drop(columns='COM')

In [7]:
# Surnames ordered from the highest to the lowest 'count', re-indexed from 0.
df_last_name = (
    pd.read_csv(generate_data_dir / 'patronymes.csv', sep=',')
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
df_last_name.head()

Unnamed: 0,patronyme,count
0,MARTIN,31781
1,BERNARD,16475
2,THOMAS,14396
3,DURAND,13901
4,ROBERT,13786


In [8]:
# First-name frequency table.
df_first_name = pd.read_csv(generate_data_dir / 'nat2019_csv.zip', sep=';', compression='zip')

# Drop rows with an unknown birth year ('XXXX') and the rare-names aggregate.
df_first_name = df_first_name.query('annais != "XXXX" and preusuel != "_PRENOMS_RARES"')

# assign() returns a new frame; setting the column directly on the query
# result would write to a slice (SettingWithCopyWarning).
df_first_name = df_first_name.assign(annais=df_first_name['annais'].astype(int))
df_first_name = df_first_name.query('annais > 1960')

# Total count per (first name, sex). reset_index() gives a 0..n-1 index, which
# the sampling of first-name indices further down relies on.
df_first_name = df_first_name.groupby(['preusuel', 'sexe'])['nombre'].sum().reset_index()
df_first_name['proba'] = df_first_name['nombre'] / df_first_name['nombre'].sum()

In [9]:
# Age pyramid: one row per age, with the head count for both sexes combined.
df_age = pd.read_excel(generate_data_dir / 'demo-pop-pyram.xlsx', skiprows=2)
df_age = df_age.rename(columns={'Âge révolu': 'age', 'Ensemble': 'count'})[['age', 'count']].dropna()

# Keep ages of 19 and over, plus the open-ended '100 ou plus' bracket.
mask = df_age['age'].map(lambda x: x == '100 ou plus' or int(x) >= 19)
# .copy() makes the filtered frame independent, so adding 'proba' does not
# write to a slice of the unfiltered frame (SettingWithCopyWarning).
df_age = df_age[mask].copy()
df_age['proba'] = df_age['count'] / df_age['count'].sum()

In [10]:
# Number of synthetic individuals to draw.
n = 120000

# Each attribute is drawn independently, with replacement, from its own
# frequency table. The order of the four draws is kept as is, since they all
# consume NumPy's global random state.
first_names_idxes = np.random.choice(df_first_name.index, size=n, p=df_first_name['proba'])
last_names = np.random.choice(
    df_last_name['patronyme'],
    size=n,
    p=df_last_name['count'] / df_last_name['count'].sum(),
)
towns = np.random.choice(df_pop['key_commune'], size=n, p=df_pop['proba_commune'])
age = np.random.choice(df_age['age'], size=n, p=df_age['proba'])

In [11]:
# Assemble one row per synthetic person from the independent draws.
X = np.hstack((last_names[:, np.newaxis],
               towns[:, np.newaxis],
               age[:, np.newaxis]))
df = pd.DataFrame(X, columns=('nom', 'key_commune', 'age'))

# Row positions of df_adresses per commune, and rural flag (PTOT < 500) per commune.
d_adresse_pos = df_adresses.groupby('key_commune').indices
d_rural = {kc: dfx.PTOT.values[0] < 500 for kc, dfx in df_pop.groupby('key_commune')}

# Pick one address row per person and take BOTH the address text and the postal
# code from that same row. Previously the postal code came from the first
# address of the commune, so it could disagree with the postal code embedded in
# 'adresse' whenever a commune spans several postal codes.
picked_pos = df['key_commune'].apply(lambda kc: np.random.choice(d_adresse_pos[kc])).to_numpy()
picked_addresses = df_adresses.iloc[picked_pos]
df['adresse'] = picked_addresses['adresse'].to_numpy()
df['code_postal'] = picked_addresses['code_postal'].to_numpy()
df['is_rural'] = df['key_commune'].apply(lambda kc: d_rural[kc])
df = df.drop(columns='key_commune')

# Replace the sampled age (which may be the '100 ou plus' label) by an integer
# age derived from a generated birthdate.
df['date_naissance'] = df['age'].apply(generate_birthdate)
df['age'] = df['date_naissance'].apply(birthdate_to_age)

# first_names_idxes holds index *labels* of df_first_name, hence .loc rather
# than .iloc. The index is then reset so the columns line up with df.
df_first_name_gender = df_first_name.loc[first_names_idxes].reset_index(drop=True).rename(columns={'preusuel': 'prenom'})
df = pd.concat((df, df_first_name_gender), axis=1)
df = df[['nom', 'prenom', 'sexe', 'age', 'adresse', 'code_postal', 'is_rural']]

In [12]:
# The first 100000 rows form the personal-data population; the rows after
# them are kept in a separate frame.
df_personal_data = df.head(100000).copy()
df_personal_data_remaining = df.iloc[100000:].copy()

In [13]:
# Write the personal data as a semicolon-separated CSV; 'is_rural' is not exported.
df_personal_data.to_csv(
    data_dir / 'personal_data.csv',
    sep=';',
    index=False,
    columns=['nom', 'prenom', 'sexe', 'age', 'adresse', 'code_postal'],
)

In [14]:
# Physical-activity reference table (semicolon-separated CSV).
df_activite = pd.read_csv(generate_data_dir.joinpath('activite_physique.csv'), sep=';')

In [15]:
# Alcohol-consumption reference table (semicolon-separated CSV).
df_alcool = pd.read_csv(generate_data_dir.joinpath('consommation_alcool.csv'), sep=';')

In [16]:
# Hypertension (HTA) prevalence reference table (semicolon-separated CSV).
df_hta = pd.read_csv(generate_data_dir.joinpath('prevalence_hta.csv'), sep=';')

In [17]:
# Scale both HTA prevalence columns by a factor of 1.3.
# NOTE: the scaling is applied in place, so re-running this cell without
# reloading df_hta compounds the factor.
for _col in ('prevalence_traitement', 'prevalence_diagnostic'):
    df_hta[_col] *= 1.3

In [18]:
def generate_sensitive_value(dfx, age, sexe, col, low=None, lbls=None):
    """Randomly draw a sensitive attribute from a prevalence table.

    The rows of `dfx` for the given `sexe` are searched for the age bracket
    containing `age` (the bracket's lower bound is the text before the first
    '-' in its 'age' value). The percentages found in that row are then
    compared with independent uniform random draws.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Prevalence table with 'sexe' and 'age' columns plus the percentage
        column(s) named by `col` / `low`. Brackets are assumed to be listed
        in increasing order for each sex, which the bisection requires.
    age : int
        Age of the person.
    sexe : int
        Value interpolated into the query on the 'sexe' column.
    col : str
        Percentage column tested first.
    low : str, optional
        Second percentage column, tested only when the first draw fails.
    lbls : sequence of 3 labels, optional
        Required when `low` is given: label returned when `col` hits, when
        `low` hits, and when neither hits.

    Returns
    -------
    numpy.bool_ or label
        Without `low`: whether the draw fell under the `col` percentage.
        With `low`: one of the three entries of `lbls`.

    Raises
    ------
    KeyError
        If `age` is below the lower bound of the first bracket.
    """
    dfy = dfx.query(f'sexe == {sexe}').reset_index(drop=True)
    lower_bounds = [int(bracket.split('-')[0]) for bracket in dfy['age']]
    idx_age = bisect(lower_bounds, age) - 1

    cols = [col] if low is None else [col, low]
    # Percentages -> probabilities, as a plain array so that all indexing
    # below is positional. Integer keys on the label-indexed Series used to
    # rely on pandas' deprecated positional fallback.
    proba = dfy.loc[idx_age, cols].to_numpy(dtype=float) * .01
    hits = proba > np.random.rand(len(cols))

    if low is None:
        return hits[0]
    if hits[0]:
        return lbls[0]
    if hits[1]:
        return lbls[1]
    return lbls[2]

In [19]:
# Cohort for the sensitive dataset, stacked in this order:
#   - every rural person aged 60 to 74, plus 250 sampled non-rural ones;
#   - every rural person under 60, plus 350 sampled non-rural ones;
#   - 300 people sampled from the rows left out of the personal data.
# The three .sample() calls keep their original order.
df_sensitive_data = pd.concat(
    objs=(
        df_personal_data.query('age < 75 and age>=60 and is_rural==True'),
        df_personal_data.query('age < 75 and age>=60 and is_rural==False').sample(n=250),
        df_personal_data.query('age < 60 and is_rural==True'),
        df_personal_data.query('age < 60 and is_rural==False').sample(n=350),
        df_personal_data_remaining.sample(n=300),
    )
)

# Draw a birthdate for each selected person from their age.
df_sensitive_data = df_sensitive_data.assign(
    date_naissance=df_sensitive_data['age'].map(generate_birthdate)
)

In [20]:
# Order the cohort by surname, then first name, and renumber the rows from 0.
df_sensitive_data = (
    df_sensitive_data
    .sort_values(by=['nom', 'prenom'])
    .reset_index(drop=True)
)

In [21]:
# Draw the three sensitive attributes, one column at a time, from the
# prevalence table matching each person's age and sex.
df_sensitive_data['consommation_alcool'] = df_sensitive_data.apply(
    lambda person: generate_sensitive_value(
        df_alcool, person.age, person.sexe, 'alcool_quotidien',
        low='alcool_annuel', lbls=['élevée', 'modérée', 'faible'],
    ),
    axis=1,
)
df_sensitive_data['activite_physique'] = df_sensitive_data.apply(
    lambda person: generate_sensitive_value(
        df_activite, person.age, person.sexe, 'atteinte_objectif_quotidien',
    ),
    axis=1,
)
df_sensitive_data['tension_arterielle'] = df_sensitive_data.apply(
    lambda person: generate_sensitive_value(
        df_hta, person.age, person.sexe, 'prevalence_diagnostic',
        low='prevalence_traitement', lbls=['très élevée', 'élevée', 'normale'],
    ),
    axis=1,
)

In [22]:
# One randomly chosen person from each of six strata: rural / non-rural,
# crossed with age <= 20, age >= 70, and ages in between.
df_find = pd.concat(
    [
        df_sensitive_data.query(stratum).sample(n=1)
        for stratum in (
            '(is_rural == True) and (age <=20)',
            '(is_rural == False) and (age <=20)',
            '(is_rural == True) and (age >=70)',
            '(is_rural == False) and (age >=70)',
            '(is_rural == True) and (age <70) and (age > 20)',
            '(is_rural == False) and (age <70) and (age >20)',
        )
    ]
)[['nom', 'prenom', 'age', 'sexe', 'code_postal']]

In [23]:
# Write the patients to find as a Markdown table and as a semicolon-separated CSV.
# The Markdown file is written explicitly as UTF-8, which is also to_csv's
# default encoding, so accented characters come out the same in both files
# regardless of the platform's locale encoding.
with open(data_dir / 'patients_to_find.md', 'w', encoding='utf-8') as f:
    f.write(df_find.to_markdown(index=False))

df_find.to_csv(data_dir / 'patients_to_find.csv', sep=';', index=False)

In [24]:
# 'is_rural' was only needed to build the strata above; remove it before export.
df_sensitive_data = df_sensitive_data.drop(columns=['is_rural'])

In [25]:
# Write the sensitive dataset as a semicolon-separated CSV, without the index.
df_sensitive_data.to_csv(
    path_or_buf=data_dir / 'sensitive_data.csv',
    sep=';',
    index=False,
)