In [None]:
import yaml, json
import pandas as pd
import geopandas as gpd
import numpy as np
from tqdm.notebook import tqdm
import requests
import time, os
import shapely.geometry as geo

### Prepare depots
This notebook loads the addresses of the operators' depots, as defined in the configuration input, from the enterprise census. The addresses are then geocoded to obtain detailed coordinates using the open BAN API.

In [None]:
# Manage inputs and outputs
# Default paths, used when no `snakemake` object is available in the namespace
study_area_path = "../../resources/spatial/study_area.gpkg"
siret_path = "../../resources/external/sirene/StockEtablissement_utf8.zip"
operators_path = "../../resources/operators.yml"
output_path = "../../results/depots/prepared.gpkg"

# Default coordinate reference system, can be overridden via the "crs" parameter
crs = "EPSG:2154"

# If a `snakemake` object is present, take inputs, outputs and parameters from it
if "snakemake" in locals():
    study_area_path = snakemake.input["study_area"]
    siret_path = snakemake.input["siret"]
    operators_path = snakemake.input["operators"]
    output_path = snakemake.output[0]

    # Parameters may arrive as one single unnamed mapping or as named parameters
    params = snakemake.params[0] if len(snakemake.params) == 1 and len(snakemake.params.keys()) == 0 else snakemake.params

    if "crs" in params:
        # Fix: the value was previously assigned to an unused name `rs`,
        # so the configured CRS was silently ignored
        crs = params["crs"]

In [None]:
# Load operator data: only the "operators" section of the YAML file is kept
with open(operators_path) as operators_file:
    operators = yaml.safe_load(operators_file)["operators"]

In [None]:
# Find all SIREN identifiers over all operators.
# The identifiers are normalised to strings because the "siren" column of the
# enterprise census is read as `str`; values that YAML parsed as integers
# would otherwise never match when filtering the census.
all_siren = set()

for operator in operators.values():
    all_siren |= {str(siren) for siren in operator["siren"]}

In [None]:
# Load study area and collect the distinct municipality codes as strings
df_study_area = gpd.read_file(study_area_path)
municipalities = {str(code) for code in df_study_area["INSEE_COM"].unique()}

In [None]:
# Read relevant information from enterprise census
# The file is read in chunks; for every chunk only the establishments that are
# located in the study area, have administrative state "A" and belong to one of
# the configured SIREN are kept. For those, a textual address is assembled.
df_siret = []

# NOTE(review): the total is hard-coded, presumably the row count of one specific
# release of the census file; it only affects the progress bar - confirm
with tqdm(desc = "Reading SIRET", total = 33835760) as progress:
    for df_chunk in pd.read_csv(siret_path, usecols = [
        "siren", "codeCommuneEtablissement",
        "codePostalEtablissement", "libelleCommuneEtablissement",
        "libelleVoieEtablissement", "numeroVoieEtablissement",
        "complementAdresseEtablissement", "typeVoieEtablissement",
        "trancheEffectifsEtablissement", "etatAdministratifEtablissement"
    ], dtype = { "codeCommuneEtablissement": str, "siren": str, 
                "codePostalEtablissement": str,
                "trancheEffectifsEtablissement": str }, chunksize = 10240):
        progress.update(len(df_chunk))
        
        # Row filter: municipality in study area, state "A", SIREN of interest
        f = df_chunk["codeCommuneEtablissement"].isin(municipalities)
        f &= df_chunk["etatAdministratifEtablissement"] == "A"
        f &= df_chunk["siren"].isin(all_siren)
        
        # Chunks without any matching row are skipped entirely
        if np.count_nonzero(f) > 0:
            df_chunk = df_chunk[f].copy()
            
            # Street number as string; missing numbers become 0 and are then
            # blanked (a genuine street number 0 would be blanked as well)
            df_chunk["number"] = df_chunk["numeroVoieEtablissement"].astype(float).fillna(0).astype(int).astype(str)
            df_chunk.loc[df_chunk["number"] == "0", "number"] = ""
            
            # Address format: "<number> <street type> <street name>, <postcode> <municipality>"
            # NOTE(review): astype(str) presumably turns missing values into the
            # literal "nan" inside the address, and the postcode is concatenated
            # without a cast, so a missing postcode likely makes the whole
            # address missing - verify against the data
            df_chunk["address"] = df_chunk["number"] + " "
            df_chunk["address"] += df_chunk["typeVoieEtablissement"].astype(str) + " "
            df_chunk["address"] += df_chunk["libelleVoieEtablissement"].astype(str) + ", "
            df_chunk["address"] += df_chunk["codePostalEtablissement"] + " "
            df_chunk["address"] += df_chunk["libelleCommuneEtablissement"].astype(str)
            
            # Blank two specific address complements (matched as whole values)
            # NOTE(review): presumably because they disturb the geocoding - confirm
            df_chunk["complementAdresseEtablissement"] = df_chunk["complementAdresseEtablissement"].replace(
                "ZI MI PLAINE", "").replace("ZAC LES MARCHES DU RHONE", "")
            
            # Append the complement only where it is non-empty
            # (`f` is reused here as a mask on the filtered chunk)
            f = df_chunk["complementAdresseEtablissement"].str.len() > 0
            df_chunk.loc[f, "address"] += ", " + df_chunk.loc[f, "complementAdresseEtablissement"].astype(str)

            # Rename the census columns that are kept for later steps
            df_chunk["employment"] = df_chunk["trancheEffectifsEtablissement"]
            df_chunk["municipality"] = df_chunk["codeCommuneEtablissement"]
            df_chunk["postcode"] = df_chunk["codePostalEtablissement"]
            
            df_siret.append(df_chunk[["siren", "address", "employment", "postcode", "municipality"]].copy())

# NOTE(review): pd.concat raises if the list is empty, i.e. if no establishment matched
df_siret = pd.concat(df_siret)

In [None]:
# Check that the number of distinct SIREN found in the census equals the number
# of configured SIREN; on failure, list the configured ones that are missing
found_siren = set(df_siret["siren"].unique())
assert len(all_siren) == len(found_siren), "No establishment found for SIREN: {}".format(
    sorted({str(siren) for siren in all_siren} - {str(siren) for siren in found_siren}))

In [None]:
# Attach the operator identifier to every establishment, based on its SIREN
for operator_id, operator in operators.items():
    operator_siren = {str(siren) for siren in operator["siren"]}
    is_operator = df_siret["siren"].isin(operator_siren)
    df_siret.loc[is_operator, "operator"] = operator_id

In [None]:
# Apply employment filters, mainly for the post offices
for operator_id, operator in operators.items():
    # Operators without a configured filter are left untouched
    if "employment_filter" not in operator:
        continue

    is_operator = df_siret["operator"] == operator_id

    # Remove establishments of this operator whose employment class is
    # missing or listed in the filter
    is_filtered = df_siret["employment"].isna()
    is_filtered |= df_siret["employment"].isin(list(operator["employment_filter"]))
    is_filtered &= is_operator

    print(operator_id, "filtering out", np.count_nonzero(is_filtered), "of", np.count_nonzero(is_operator))

    df_siret = df_siret[~is_filtered].copy()

In [None]:
# Assign identifiers: consecutive integers following the sorted order
df_siret = (
    df_siret
    .sort_values(by = ["operator", "siren", "address"])
    .assign(depot_id = lambda df: np.arange(len(df)))
)

In [None]:
# Perform geocoding using the BAN API
# One request is sent per depot; the first returned feature is used as location.
# A timeout is set so that a stalled connection raises an exception (see
# requests.exceptions.Timeout) instead of blocking the notebook forever.
geocoding_url = "https://api-adresse.data.gouv.fr/search/"
geocoding_timeout = 30 # seconds

df_coordinates = []
failed_depots = [] # (depot_id, address) of depots that could not be geocoded

for index, row in tqdm(df_siret.iterrows(), total = len(df_siret)):
    response = requests.get(geocoding_url, params = {
        "q": row["address"],
        "autocomplete": "0",
        "citycode": row["municipality"]
    }, timeout = geocoding_timeout)

    # Unsuccessful responses are treated like responses without any result
    features = json.loads(response.content)["features"] if response.ok else []

    if len(features) > 0:
        coordinates = features[0]["geometry"]["coordinates"]

        df_coordinates.append({
            "depot_id": row["depot_id"],
            "geometry": geo.Point(coordinates[0], coordinates[1])
        })
    else:
        failed_depots.append((row["depot_id"], row["address"]))

    # Pause between requests, presumably to respect the rate limit of the API
    time.sleep(0.5)

# Report failures explicitly, so they can be traced back to their address
if len(failed_depots) > 0:
    print("Geocoding failed for", len(failed_depots), "depots:", failed_depots)

df_coordinates = pd.DataFrame.from_records(df_coordinates)
df_coordinates = gpd.GeoDataFrame(df_coordinates, crs = "EPSG:4326")

In [None]:
# There must be as many geocoded locations as depots ...
n_geocoded = len(df_coordinates)
n_depots = len(df_siret)
assert n_geocoded == n_depots

# ... and as many distinct depot identifiers on both sides
ids_geocoded = df_coordinates["depot_id"].unique()
ids_depots = df_siret["depot_id"].unique()
assert len(ids_geocoded) == len(ids_depots)

In [None]:
# Final clean-up and output
# Join the operator of each depot with its geocoded location
df_operators = df_siret[["depot_id", "operator"]]
df_centers = df_operators.merge(df_coordinates, on = "depot_id")

# Locations are in EPSG:4326; reproject to the configured CRS before writing
df_centers = gpd.GeoDataFrame(df_centers, crs = "EPSG:4326")
df_centers = df_centers.to_crs(crs)
df_centers.to_file(output_path)