# DITP experimentation n°2 - part 1
# Notebook for the LM campaign on the 19th of September that begins on the 26th of September
Analysis deadlines: 10 October, 10 November, 31 December

- 8 files are to be generated by this script
  - 4 files for parents
  - 4 files for direct beneficiaries
- The CSV format is similar to the previous campaign, except that there is a new column named "pronom" whose value can be "il" or "elle"
- 1 file that will contain exhaustive information about these 8 files is also generated for further analysis with DITP later on at step 3 

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
import json
import numpy as np
from datetime import date

# Make the variables declared in the .env file available through os.environ.
# Every lookup below uses os.environ[...], so a missing variable raises
# KeyError right here instead of failing later in the notebook.
load_dotenv()

# Input: path of the database export read by this notebook.
db_export_filepath = os.environ['DB_EXPORT_PEOPLE_NOT_ACTIVATED_PASS_SPORT_PATHFILE']

# Output: exhaustive file, kept for the later DITP analysis that updates the
# people who activated their pass sport.
consolidated_output_pathfile = os.environ['CAMPAIGN_LINK_MOBILITY_19_SEPTEMBER_CONSOLIDATED_OUTPUT_PATHFILE']

# Output: filename prefixes of the 4 beneficiary files and the 4 parent files
# written at the end of the notebook ("-<n>.csv" is appended to each prefix).
benef_part_file_format = os.environ['CAMPAIGN_SPLITTED_FILES_BENEF_OUTPUT_PREFIX']
parent_part_file_format = os.environ['CAMPAIGN_SPLITTED_FILES_PARENTS_OUTPUT_PREFIX']

# QR code URL generation: base64-encoded secret and base URL.
qr_code_secret_key = os.environ['BENEF_2024_QR_CODE_URL_SECRET']
qr_code_base_url = os.environ['BENEF_2024_QR_CODE_BASE_URL']

# RGPD blacklist: the configured path is resolved relative to the parent
# directory.
pathfile_rgpd_users_blacklist = os.path.join(
    '..',
    os.environ['RGPD_USERS_BLACKLIST_CSV_PATH_FILE'],
)

In [None]:
# Load the database export, restricted to the columns used by this notebook.
columns = [
    'id',
    'nom',
    'prenom',
    'genre',
    'allocataire',
    'adresse_allocataire',
    'id_psp',
    'date_naissance',
    'zrr',
    'qpv',
]

# The export is comma-separated, which is the read_csv default separator.
df_db = pd.read_csv(db_export_filepath, usecols=columns)

In [None]:
# Flatten the JSON stored in 'allocataire' into one column per key, each
# prefixed with 'allocataire-'.
allocataire_fields = [
    'allocataire-courriel',
    'allocataire-qualite',
    'allocataire-nom',
    'allocataire-prenom',
    'allocataire-telephone',
    'allocataire-date_naissance',
]
df_json_allocataire = pd.json_normalize(df_db['allocataire'].map(json.loads)).add_prefix('allocataire-')

# Both frames are joined on their index, so force a plain 0..n-1 index on the
# export before merging.
df_db.index = pd.RangeIndex(len(df_db))

# Attach the allocataire fields, then drop the raw JSON column.
df_db_unwrapped = df_db.merge(
    df_json_allocataire[allocataire_fields],
    left_index=True,
    right_index=True,
).drop(columns=['allocataire'])

In [None]:
# Flatten the JSON stored in 'adresse_allocataire'; only the postal code is kept.
df_json_adresse_allocataire = pd.json_normalize(df_db['adresse_allocataire'].map(json.loads))

# Same index-based join as for 'allocataire': force a plain 0..n-1 index first.
df_db_unwrapped.index = pd.RangeIndex(len(df_db))

df_db_unwrapped = df_db_unwrapped.merge(
    df_json_adresse_allocataire[['code_postal']],
    left_index=True,
    right_index=True,
).drop(columns=['adresse_allocataire'])

In [None]:
# A row is dropped when the allocataire has no contact channel at all, or when
# any identity field of the beneficiaire is missing. "Missing" means NaN or ''.
contact_fields = df_db_unwrapped[['allocataire-courriel', 'allocataire-telephone']]
identity_fields = df_db_unwrapped[['nom', 'prenom', 'date_naissance', 'genre']]

# True when BOTH the email and the phone number are missing.
mask_contact_empty = (contact_fields.isna() | (contact_fields == '')).all(axis=1)

# True when AT LEAST ONE of nom / prenom / date_naissance / genre is missing.
mask_info_missing = (identity_fields.isna() | (identity_fields == '')).any(axis=1)

# Final mask: rows to remove.
mask_email_info_missing = mask_contact_empty | mask_info_missing

df_db_unwrapped_reachable = df_db_unwrapped[~mask_email_info_missing]

deleted_rows = len(df_db_unwrapped) - len(df_db_unwrapped_reachable)
print(f"{deleted_rows} rows deleted because they are not reachable by email or phone")

In [None]:
# Map the export's column names to the names used in the campaign files.
# Columns that are not listed here keep their name.
column_mapping = {
    # beneficiaire
    'nom': 'beneficiaire_nom',
    'prenom': 'beneficiaire_prenom',
    'genre': 'beneficiaire_genre',
    'date_naissance': 'beneficiaire_date_naissance',
    'id_psp': 'code',
    # allocataire
    'allocataire-courriel': 'email',
    'allocataire-telephone': 'telephone',
    'allocataire-qualite': 'allocataire_qualite',
    'allocataire-nom': 'allocataire_nom',
    'allocataire-prenom': 'allocataire_prenom',
    'allocataire-date_naissance': 'allocataire_date_naissance',
}

df_db_unwrapped_reachable = df_db_unwrapped_reachable.rename(columns=column_mapping)

In [None]:
# Columns carried through the rest of the notebook, in output order.
campaign_columns = [
    'email',
    'allocataire_nom',
    'allocataire_prenom',
    'allocataire_date_naissance',
    'allocataire_qualite',
    'beneficiaire_prenom',
    'beneficiaire_nom',
    'beneficiaire_genre',
    'beneficiaire_date_naissance',
    'code',
    'telephone',
    'zrr',
    'qpv',
    'code_postal',
    'id',
]

df_campaign = df_db_unwrapped_reachable[campaign_columns]

In [None]:
# Remove the users listed in the RGPD blacklist from the campaign.
# https://www.notion.so/Suivi-remont-s-utilisateurs-0bfd5c50ac67460a99ef651e3f8a0f45?pvs=4#cd6cbf85cbe6498c8ebbeda96ecba42d
df_rgpd = pd.read_csv(pathfile_rgpd_users_blacklist, usecols=['email'], dtype={ 'email': 'string' })

# Compare normalised addresses (trimmed, lower-cased) so that a blacklisted
# user is still excluded when the two files differ only by letter case or
# surrounding spaces. Blank blacklist rows are dropped so that they can never
# match campaign rows that have no email (phone-only contacts).
blacklisted_emails = df_rgpd['email'].dropna().str.strip().str.lower()
campaign_emails = df_campaign['email'].astype('string').str.strip().str.lower()

# .copy(): the following cells add and overwrite columns on df_campaign;
# working on an explicit copy avoids pandas' SettingWithCopyWarning.
df_campaign = df_campaign.loc[~campaign_emails.isin(blacklisted_emails)].copy()

In [None]:
# Parse the beneficiaire and allocataire birth dates as datetimes.
# errors='coerce' turns any value that cannot be parsed into NaT.
for birth_date_column in ('beneficiaire_date_naissance', 'allocataire_date_naissance'):
    df_campaign[birth_date_column] = pd.to_datetime(df_campaign[birth_date_column], errors='coerce')

In [None]:
# Gendered "born on" label of the beneficiaire: 'Née le' when the gender is
# 'F', 'Né le' for every other value.
mask_girl = df_campaign['beneficiaire_genre'].eq('F')
df_campaign['neele'] = np.where(mask_girl, 'Née le', 'Né le')

In [None]:
# Gender of the allocataire, derived from the title: 'Mme' gives 'F', any
# other value (including a missing title) gives 'M'.
is_madame = df_campaign['allocataire_qualite'].eq('Mme')
df_campaign['allocataire_genre'] = is_madame.map({True: 'F', False: 'M'})

In [None]:
# Capitalize first names and last names (first letter upper-case, the rest
# lower-case).
# astype(str) is needed so that non-string values do not break the .str
# accessor, but it also turns missing values into the text 'nan' / 'None',
# which capitalize() would then export as 'Nan' / 'None'. The .where(...)
# puts the missing values back so they stay empty in the generated files.
for name_column in (
    'allocataire_prenom',
    'allocataire_nom',
    'beneficiaire_prenom',
    'beneficiaire_nom',
):
    raw_names = df_campaign[name_column]
    df_campaign[name_column] = raw_names.astype(str).str.capitalize().where(raw_names.notna())

In [None]:
# Internationalize the phone numbers: a leading '0' becomes '+33'.
# Series.replace with regex=True only rewrites string values; anything else
# (e.g. NaN) is left untouched.
df_campaign['telephone'] = df_campaign['telephone'].replace(
    to_replace=r'^0',
    value='+33',
    regex=True,
)

In [None]:
# Rows without an 'id' are considered new beneficiaires and are left out.
mask_new_benef = df_campaign['id'].isna()

# .copy(): the following cells add columns to df_campaign_existing; working on
# an explicit copy (instead of a boolean-mask slice of df_campaign) avoids
# pandas' SettingWithCopyWarning and keeps df_campaign untouched.
df_campaign_existing = df_campaign[~mask_new_benef].copy()

In [None]:
# "il", "elle" values for column "pronom"
# 'il' when the beneficiaire gender is 'M', 'elle' for every other value.
is_male = df_campaign_existing['beneficiaire_genre'].eq('M')
df_campaign_existing['pronom'] = np.where(is_male, 'il', 'elle')

In [None]:
# age of beneficiaire + allocataire (if it exists)
def calculate_age(born, today=None):
    """Return the age, in completed years, of someone born on ``born``.

    Args:
        born: Birth date; any object exposing ``year``, ``month`` and ``day``
            (``datetime.date``, ``pandas.Timestamp``, ...). ``None``, ``NaT``
            and NaN are treated as "unknown birth date".
        today: Date at which the age is evaluated. Defaults to
            ``date.today()``; pass an explicit date to get reproducible
            results.

    Returns:
        The age as an ``int``, or ``float('nan')`` when ``born`` is missing.
    """
    # Missing birth dates are handled explicitly instead of relying on NaN
    # arithmetic on NaT attributes (and instead of crashing on None).
    if pd.isna(born):
        return float('nan')

    if today is None:
        today = date.today()

    # One year is subtracted as long as this year's birthday has not happened.
    birthday_passed = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (0 if birthday_passed else 1)

# Compute the age of the beneficiaire and of the allocataire from their
# respective birth-date columns.
for age_column, birth_date_column in (
    ('beneficiaire_age', 'beneficiaire_date_naissance'),
    ('allocataire_age', 'allocataire_date_naissance'),
):
    birth_dates = df_campaign_existing[birth_date_column]
    df_campaign_existing[age_column] = birth_dates.apply(calculate_age)

In [None]:
# Store the ages with the nullable integer dtype 'Int64', which keeps whole
# numbers while still allowing missing values.
df_campaign_existing = df_campaign_existing.astype({
    'beneficiaire_age': 'Int64',
    'allocataire_age': 'Int64',
})

In [None]:
# Render the beneficiaire birth date as a 'dd/mm/YYYY' string for the output
# files. NOTE(review): the column was already cast with pd.to_datetime in an
# earlier cell, so this to_datetime call looks redundant; confirm before removing.
beneficiaire_birth_dates = pd.to_datetime(
    df_campaign_existing['beneficiaire_date_naissance'],
    format='%d-%m-%Y',
)
df_campaign_existing['beneficiaire_date_naissance'] = beneficiaire_birth_dates.dt.strftime('%d/%m/%Y')

In [None]:
# Generate the URLs for the QR codes
import urllib.parse
import base64

from Crypto.Cipher import AES
from Crypto.Util.Padding import pad

# AES key: the secret read from the environment is base64-encoded, decode it
# to raw bytes.
base_64_key = base64.b64decode(qr_code_secret_key)

# Column name -> short key used for that value in the query string that gets
# encrypted into the QR code URL.
key_mapping = dict(
    beneficiaire_prenom='bp',
    beneficiaire_nom='bn',
    beneficiaire_genre='bg',
    beneficiaire_date_naissance='bdn',
    code='c',
)

def encrypt(data):
    """Encrypt ``data`` with AES-CBC and return it as base64 text.

    Args:
        data: Text to encrypt; it is encoded as UTF-8 and padded to the AES
            block size before encryption.

    Returns:
        The base64 encoding (as ``str``) of the IV followed by the ciphertext.
    """
    # No IV is passed to AES.new, so a new cipher (and the IV it exposes as
    # cipher.iv) is created for every call.
    cipher = AES.new(base_64_key, AES.MODE_CBC)
    padded_plaintext = pad(data.encode('utf-8'), AES.block_size)
    # The IV is prepended to the ciphertext so both travel in the same token.
    token = cipher.iv + cipher.encrypt(padded_plaintext)
    return base64.b64encode(token).decode('utf-8')

def generate_encrypted_url_column(row):
    """Build the QR code URL for one row.

    The columns listed in ``key_mapping`` are read from ``row``, renamed to
    their short keys, URL-encoded as a query string, encrypted with
    ``encrypt`` and appended (percent-quoted) as the fragment of
    ``qr_code_base_url``.

    Args:
        row: Mapping-like row (for instance the Series passed by
            ``DataFrame.apply(..., axis=1)``) containing every column named
            in ``key_mapping``.

    Returns:
        The full URL as a string: ``<qr_code_base_url>#<quoted token>``.

    Raises:
        KeyError: If ``row`` lacks one of the columns named in ``key_mapping``.
    """
    # Drive the lookup from key_mapping instead of iterating the columns of
    # the global df_campaign: the function now depends only on the row it is
    # given, and the parameters follow the key_mapping order.
    params = {short_key: row[column] for column, short_key in key_mapping.items()}
    query_string = urllib.parse.urlencode(params)
    encrypted_query = encrypt(query_string)
    return f"{qr_code_base_url}#{urllib.parse.quote_plus(encrypted_query)}"

In [None]:
# Generate the QR code URLs.
# Drop the column first if it is already there (e.g. the cell is run again),
# then compute one URL per row.
df_campaign_existing = df_campaign_existing.drop(columns=['url_qr_code'], errors='ignore')

df_campaign_existing['url_qr_code'] = df_campaign_existing.apply(generate_encrypted_url_column, axis=1)

In [None]:
# Full snapshot, with every column, kept to update the information for DITP
# later on in October / November / December.
df_campaign_existing_consolidated = df_campaign_existing.copy()

# Link Mobility only receives the columns below, in this order.
link_mobility_columns = [
    'email',
    'allocataire_nom',
    'allocataire_prenom',
    'beneficiaire_prenom',
    'beneficiaire_nom',
    'beneficiaire_genre',
    'beneficiaire_date_naissance',
    'code',
    'telephone',
    'neele',
    'pronom',
    'url_qr_code',
]
df_campaign_existing = df_campaign_existing[link_mobility_columns]

In [None]:
# Existing rows where the allocataire is NOT the beneficiaire: the two first
# names differ once lower-cased.
beneficiaire_first_names = df_campaign_existing['beneficiaire_prenom'].str.lower()
allocataire_first_names = df_campaign_existing['allocataire_prenom'].str.lower()

mask_alloc_diff_benef = beneficiaire_first_names.ne(allocataire_first_names)
df_campaign_existing_alloc_diff_benef = df_campaign_existing.loc[mask_alloc_diff_benef]

In [None]:
# Existing rows where the allocataire IS the beneficiaire: the two first names
# are equal once lower-cased.
beneficiaire_first_names = df_campaign_existing['beneficiaire_prenom'].str.lower()
allocataire_first_names = df_campaign_existing['allocataire_prenom'].str.lower()

mask_alloc_eq_benef = beneficiaire_first_names.eq(allocataire_first_names)
df_campaign_existing_alloc_eq_benef = df_campaign_existing.loc[mask_alloc_eq_benef]

In [None]:
# Shuffle both populations. frac=1 samples every row; the fixed random_state
# makes the order reproducible from one run to the next. The index is rebuilt
# as 0..n-1 afterwards.
shuffle_options = dict(frac=1, random_state=1)

df_campaign_existing_alloc_eq_benef_shuffled = (
    df_campaign_existing_alloc_eq_benef
    .sample(**shuffle_options)
    .reset_index(drop=True)
)
df_campaign_existing_alloc_diff_benef_shuffled = (
    df_campaign_existing_alloc_diff_benef
    .sample(**shuffle_options)
    .reset_index(drop=True)
)

In [None]:
# Split each shuffled population into 4 consecutive parts whose sizes differ
# by at most one row.
# np.array_split is applied to the row positions rather than to the DataFrame
# itself: splitting a DataFrame directly goes through DataFrame.swapaxes,
# which recent pandas versions deprecate (FutureWarning). The resulting parts
# contain the same rows in the same order.
df_campaign_existing_alloc_eq_benef_shuffled_split = [
    df_campaign_existing_alloc_eq_benef_shuffled.iloc[row_positions]
    for row_positions in np.array_split(
        np.arange(len(df_campaign_existing_alloc_eq_benef_shuffled)), 4
    )
]
df_campaign_existing_alloc_diff_benef_shuffled_split = [
    df_campaign_existing_alloc_diff_benef_shuffled.iloc[row_positions]
    for row_positions in np.array_split(
        np.arange(len(df_campaign_existing_alloc_diff_benef_shuffled)), 4
    )
]

In [None]:
# Direct beneficiaires: one CSV per part, named '<prefix>-1.csv' to
# '<prefix>-4.csv', written without the index column.
for part_number, part_df in enumerate(df_campaign_existing_alloc_eq_benef_shuffled_split, start=1):
    part_df.to_csv(f'{benef_part_file_format}-{part_number}.csv', index=False)

In [None]:
# Parents: one CSV per part, named '<prefix>-1.csv' to '<prefix>-4.csv',
# written without the index column.
for part_number, part_df in enumerate(df_campaign_existing_alloc_diff_benef_shuffled_split, start=1):
    part_df.to_csv(f'{parent_part_file_format}-{part_number}.csv', index=False)

In [None]:
# Write the exhaustive dataframe, which still holds the columns removed from
# the Link Mobility mailing files. It is reused later for the DITP analysis.
df_campaign_existing_consolidated.to_csv(
    consolidated_output_pathfile,
    index=False,
)