In [None]:
import duckdb
from pipelines.tasks.config.common import DUCKDB_FILE
from pipelines.tasks.client.https_client import HTTPSClient
from pipelines.tasks.config.common import CACHE_FOLDER
import json
import os
from tqdm import tqdm
import pandas as pd

In [None]:
# The geographic data is downloaded from https://public.opendatasoft.com/explore/dataset/georef-france-commune/information
# NOTE(review): the URL given here is presumably the base URL that the client
# prepends to the relative path passed to `download_file_from_https` - confirm.
https_client = HTTPSClient(
    "https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/"
)

In [None]:
# Path of the GeoJSON export of the `georef-france-commune` dataset, handed to
# `https_client` together with the local destination below.
path = "georef-france-commune/exports/geojson?lang=fr&timezone=Africa%2FLagos"
# Local destination of the download, inside the cache folder.
filepath = os.path.join(CACHE_FOLDER, "georef-france-commune.geojson")
https_client.download_file_from_https(path, filepath)

In [None]:
# Open the DuckDB database in read-only mode: this notebook only queries it.
con = duckdb.connect(database=DUCKDB_FILE, read_only=True)

In [None]:
# Fetch every row of `ana__resultats_communes` into a pandas DataFrame.
# Despite the `_2024` suffix in the variable names, the query has no filter on
# the year: all the rows of the table are returned.
query_2024 = """
select * from ana__resultats_communes
"""

prelevements_2024 = con.sql(query_2024)
prelevements_2024_df = prelevements_2024.df()
# Preview the first two rows.
prelevements_2024_df.head(2)

In [None]:
# Load the downloaded GeoJSON into memory.
# The encoding is explicit because `open()` otherwise falls back on the
# platform's locale encoding, whereas JSON files are UTF-8 (RFC 8259); this
# also matches the cell that writes the output file with encoding="utf-8".
with open(
    os.path.join(CACHE_FOLDER, "georef-france-commune.geojson"),
    "r",
    encoding="utf-8",
) as file:
    data_geo = json.load(file)

In [None]:
# Attach the CVM results to every commune feature of the GeoJSON.
#
# Step 1 - index the results by INSEE code in a single pass over the
# DataFrame, so that each commune costs one dictionary lookup instead of a
# filter over the whole DataFrame.
resultats_par_commune = {}
for code, annee, resultat in zip(
    prelevements_2024_df["commune_code_insee"],
    prelevements_2024_df["annee"],
    prelevements_2024_df["resultat_cvm"],
):
    # If a commune has several rows for the same year, the last row wins.
    resultats_par_commune.setdefault(code, {})[annee] = resultat

# Step 2 - build the output features.
# New feature dicts are created instead of editing `data_geo` in place: the
# raw GeoJSON stays available to the other cells and this cell can be re-run
# without failing on the missing `com_code` key.
data_geo_features = []
for feature in tqdm(data_geo["features"]):
    com_code = feature["properties"]["com_code"]
    com_name = feature["properties"]["com_name"]
    if com_code is None:
        # No INSEE code to match on: the feature is kept untouched.
        data_geo_features.append(feature)
        continue

    # `com_code` and `com_name` are indexed with [0]: only the first entry is used.
    code_insee = com_code[0]
    properties = {
        "commune_code_insee": code_insee,
        "commune_nom": com_name[0],
        # {year: result}; a commune without any result gets an empty mapping
        # rather than a placeholder value. The copy keeps the features
        # independent from the lookup table.
        "resultat_cvm": dict(resultats_par_commune.get(code_insee, {})),
    }
    # Shallow copy: same keys in the same order (the geometry object is
    # shared), with only `properties` replaced.
    data_geo_features.append({**feature, "properties": properties})

In [None]:
# Wrap the processed features in a GeoJSON FeatureCollection.
new_geo_json = {
    "type": "FeatureCollection",
    "features": data_geo_features,
}

In [None]:
# Serialise the FeatureCollection as a GeoJSON file in the cache folder.
filename = "georef-france-commune-prelevement.geojson"
write_filepath = os.path.join(CACHE_FOLDER, filename)
with open(write_filepath, "w", encoding="utf-8") as file:
    json.dump(new_geo_json, file)

In [None]:
from pipelines.utils.storage_client import ObjectStorageClient
from pipelines.config.config import load_env_variables

# Environment variables are loaded before the storage client is created.
# NOTE(review): presumably the client reads its credentials from them - confirm.
load_env_variables()
s3 = ObjectStorageClient()

s3_path = "dev/geojson/georef-france-commune-prelevement.geojson.removeme"  # Destination key on S3

# Upload the GeoJSON file written to `write_filepath`, with public read access.
s3.upload_object(local_path=write_filepath, file_key=s3_path, public_read=True)
print(f"✅ geojson uploadée sur s3://{s3.bucket_name}/{s3_path}")
# The data can now be retrieved from https://pollution-eau-s3.s3.fr-par.scw.cloud/dev/geojson/georef-france-commune-prelevement.geojson.removeme

# Tests


In [None]:
# Flatten the `properties` of the generated features into columns for inspection.
df_geo = pd.DataFrame(new_geo_json["features"])
properties_flat = pd.json_normalize(df_geo["properties"])
df_geo_flatten = df_geo.join(properties_flat)
df_geo_flatten

In [None]:
# Distinct INSEE codes present in the generated features.
df_geo_flatten["commune_code_insee"].unique()

In [None]:
# Number of features that have no CVM result for 2024.
int(df_geo_flatten["resultat_cvm.2024"].isna().sum())

In [None]:
# s3.delete_object("dev/geojson/georef-france-commune-prelevement.csv")

In [None]:
# from pipelines.tasks.config.common import download_file_from_https

# download_file_from_https(
#     url="https://pollution-eau-s3.s3.fr-par.scw.cloud/dev/geojson/georef-france-commune-prelevement-small.geojson.removeme",
#     filepath="test.geojson",
# )

In [None]:
# Distribution of the number of entries held by `com_code` in the features of
# `data_geo` (counted as 0 when `com_code` is None).
df_geo = pd.DataFrame(data_geo["features"])
df_geo_flatten = df_geo.join(pd.json_normalize(df_geo["properties"]))
com_code_lengths = df_geo_flatten.com_code.map(
    lambda codes: 0 if codes is None else len(codes)
)
com_code_lengths.value_counts()