In [None]:
import pandas as pd
import numpy as np
import os, glob, gc
from google.colab import drive
import geopandas as gpd
from shapely.geometry import Point

# Mount Google Drive so that the project files are reachable under /content/drive
drive.mount('/content/drive')

# Paths
# Directory that is scanned for the raw 'Q_*.csv' files.
INPUT_RAW_METEO_DATA_PATH = '/content/drive/MyDrive/Stage_CEFREM_Salma_2025/Data/Meteo/InSitu/OriginalFromMeteoFrance/'
# Station metadata CSV; it must contain a NUM_POSTE column, which is used as the join key.
INPUT_METADATA_WITH_GIS_PATH = '/content/drive/MyDrive/Stage_CEFREM_Salma_2025/Data/Meteo/InSitu/Consolidated_Stations_With_Raster_Values_And_Aspect_Components_StrictlyFiltered.csv'
# Destination of the final consolidated CSV written at the end of the notebook.
FINAL_CONSOLIDATED_DATA_PATH = '/content/drive/MyDrive/Stage_CEFREM_Salma_2025/Data/Meteo/InSitu/Fully_Consolidated_Stations_Meteo_GIS_Data.csv'


In [None]:
# Column mappings for the raw files (original column name -> name used in this notebook).
# Columns that are not listed keep their original name when the mapping is applied
# with DataFrame.rename.

# Mapping applied to the '*_RR-T-Vent.csv' files.
column_mapping_rrt_vent = {
    'NUM_POSTE': 'NUM_POSTE', 'AAAAMMJJ': 'DATE', 'RR': 'RR_obs',
    'TX': 'TX_obs', 'TN': 'TN_obs', 'TM': 'TM_obs', 'FFM': 'FFM_obs'
    # ... add other columns as needed
}

# Mapping applied to the '*_autres-parametres.csv' files.
column_mapping_autres_params = {
    'NUM_POSTE': 'NUM_POSTE', 'AAAAMMJJ': 'DATE',
    'ETPMON': 'ETPMON_obs', 'HNEIGEF': 'HNEIGEF_obs'
    # ... add other columns as needed
}

# Memory reduction
def reduce_mem_usage(df, verbose=True):
    """Shrink the memory footprint of ``df`` by downcasting its columns.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Rules applied per column:

    * integer columns are cast to the smallest of int8 / int16 / int32 whose
      range contains BOTH the column minimum and maximum;
    * float columns are cast to float32 when both extremes fit in its range
      (note: this reduces precision to roughly 7 significant digits);
    * object columns become ``category`` when fewer than half of the values
      are distinct.

    A column is never cast to a dtype that is as wide as, or wider than, its
    current one, and columns whose extremes are NaN (empty or all-missing)
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise (mutated in place).
    verbose : bool, default True
        When true, print the memory usage before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        type_name = str(col_type)
        if pd.api.types.is_numeric_dtype(col_type):
            if 'int' in type_name:
                candidates, limits_of = (np.int8, np.int16, np.int32), np.iinfo
            elif 'float' in type_name:
                candidates, limits_of = (np.float32,), np.finfo
            else:
                # e.g. bool: numeric for pandas, but there is nothing to downcast.
                continue
            c_min, c_max = df[col].min(), df[col].max()
            # Extension dtypes may not expose an itemsize; assume 8 bytes then.
            current_size = getattr(col_type, 'itemsize', 8)
            for candidate in candidates:
                if np.dtype(candidate).itemsize >= current_size:
                    # Candidates are ordered by width: none of the remaining
                    # ones would save memory.
                    break
                limits = limits_of(candidate)
                # Both ends must fit, otherwise the cast would silently wrap
                # (integers) or overflow to infinity (floats). Comparisons
                # with NaN are False, so all-missing columns are skipped.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'object':
            # Guard against empty frames (division by zero).
            if len(df) > 0 and df[col].nunique() / len(df) < 0.5:
                df[col] = df[col].astype('category')
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f" Mémoire réduite de {start_mem:.2f} MB à {end_mem:.2f} MB")
    return df


In [None]:
print("--- Étape 1 : Chargement des métadonnées des stations ---")
# Read the station metadata, force the station identifier to text so it can be
# compared with the identifiers of the raw files, then shrink the dtypes.
stations_metadata_df = reduce_mem_usage(
    pd.read_csv(INPUT_METADATA_WITH_GIS_PATH).astype({'NUM_POSTE': str})
)
station_count = len(stations_metadata_df)
print(f"{station_count} stations chargées avec GIS.")


In [None]:
print("\n--- Étape 2 : Lecture des fichiers bruts ---")
all_raw_meteo_for_pyrenees = []
raw_data_by_department = {}
all_raw_csv_files = glob.glob(os.path.join(INPUT_RAW_METEO_DATA_PATH, 'Q_*.csv'))

# File names are split on '_': the 2nd piece is taken as the department code
# and the 3rd as the period. Names with fewer than three pieces are ignored.
dept_period_pairs = set()
for csv_path in all_raw_csv_files:
    name_parts = os.path.basename(csv_path).split('_')
    if len(name_parts) >= 3:
        dept_period_pairs.add((name_parts[1], name_parts[2]))
unique_dept_periods = sorted(dept_period_pairs)


def _load_raw_table(csv_path, column_mapping):
    """Read one raw ';'-separated file and normalise its key columns.

    Returns an empty DataFrame when the file does not exist. Otherwise the
    columns are renamed with ``column_mapping``, DATE is parsed from the
    YYYYMMDD format (unparseable values become NaT), NUM_POSTE is cast to
    text and the dtypes are downcast. Malformed lines are skipped by the
    CSV parser.
    """
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    table = pd.read_csv(csv_path, sep=';', on_bad_lines='skip')
    if table.empty:
        return table
    table.rename(columns=column_mapping, inplace=True)
    table['DATE'] = pd.to_datetime(table['DATE'], format='%Y%m%d', errors='coerce')
    table['NUM_POSTE'] = table['NUM_POSTE'].astype(str)
    return reduce_mem_usage(table, verbose=False)


for dept_code, period in unique_dept_periods:
    print(f" Département {dept_code}, période {period}")
    rr_t_wind = _load_raw_table(
        os.path.join(INPUT_RAW_METEO_DATA_PATH, f"Q_{dept_code}_{period}_RR-T-Vent.csv"),
        column_mapping_rrt_vent,
    )
    other_params = _load_raw_table(
        os.path.join(INPUT_RAW_METEO_DATA_PATH, f"Q_{dept_code}_{period}_autres-parametres.csv"),
        column_mapping_autres_params,
    )

    # Use whichever table is available; when both are, join them per
    # station and day, keeping rows that exist on only one side.
    # NOTE(review): any non-key column present in both files gets pandas'
    # default _x/_y suffixes here - confirm this is intended.
    if rr_t_wind.empty:
        period_data = other_params
    elif other_params.empty:
        period_data = rr_t_wind
    else:
        period_data = pd.merge(rr_t_wind, other_params, on=['NUM_POSTE', 'DATE'], how='outer')

    if period_data.empty:
        continue

    period_data = reduce_mem_usage(period_data, verbose=False)
    already_loaded = raw_data_by_department.get(dept_code)
    if already_loaded is None:
        raw_data_by_department[dept_code] = period_data
        continue

    # Append this period to the department table; for a (station, day) seen
    # in several periods the first occurrence is kept.
    combined = pd.concat([already_loaded, period_data])
    combined = combined.drop_duplicates(subset=['NUM_POSTE', 'DATE'])
    raw_data_by_department[dept_code] = reduce_mem_usage(combined, verbose=False)


In [None]:
# Keep, for every station listed in the metadata, its rows from the raw
# per-department tables built in step 2.
processed_ids = set()
# Reset the accumulator here so that this cell never appends to leftovers
# from a previous execution.
all_raw_meteo_for_pyrenees = []

for station_id in stations_metadata_df['NUM_POSTE'].unique():
    # Météo-France station numbers are 8 digits long and start with the
    # 2-digit department code. When the CSV column was parsed as an integer the
    # leading zero is lost ("09024004" -> "9024004"), so pad the identifier
    # BEFORE slicing; padding the 2-character slice afterwards has no effect
    # and would look up department "90" instead of "09".
    dept = station_id.zfill(8)[:2]
    if dept in raw_data_by_department:
        df_station = raw_data_by_department[dept]
        station_data = df_station[df_station['NUM_POSTE'] == station_id]
        if not station_data.empty:
            all_raw_meteo_for_pyrenees.append(station_data.copy())
            processed_ids.add(station_id)

# The per-department tables are no longer needed: free the memory.
del raw_data_by_department
gc.collect()

if not all_raw_meteo_for_pyrenees:
    print(" Aucune donnée brute trouvée pour les stations.")
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, whereas an exception stops the run and keeps the session alive.
    raise RuntimeError("Aucune donnée brute trouvée pour les stations.")

consolidated_meteo_df = pd.concat(all_raw_meteo_for_pyrenees, ignore_index=True)
consolidated_meteo_df = reduce_mem_usage(consolidated_meteo_df)


In [None]:
print("\n--- Étape 3 : Fusion finale ---")
# Attach the station metadata to every observation row.
# Columns that exist on both sides are tagged '_raw_data' on the observation
# side and keep their plain name on the metadata side. Without explicit
# suffixes pandas would name them '_x' / '_y' and the clean-up below would
# never match anything.
final_df = pd.merge(
    consolidated_meteo_df,
    stations_metadata_df,
    on='NUM_POSTE',
    how='left',
    suffixes=('_raw_data', ''),
)
del consolidated_meteo_df
gc.collect()

# Drop the duplicated columns coming from the raw files; the metadata version is kept.
cols_to_drop = [c for c in final_df.columns if c.endswith('_raw_data')]
final_df = final_df.drop(columns=cols_to_drop)
# Make sure DATE is a datetime column (unparseable values become NaT).
final_df['DATE'] = pd.to_datetime(final_df['DATE'], errors='coerce')
final_df = reduce_mem_usage(final_df)


In [None]:
# Persist the consolidated table (the index is not written).
final_df.to_csv(FINAL_CONSOLIDATED_DATA_PATH, index=False)
print(f"\n Données finales enregistrées dans : {FINAL_CONSOLIDATED_DATA_PATH}")

print("\n--- Analyse des NaN ---")
# Per-column count of missing values and its share of all rows, in percent.
nan_counts = final_df.isna().sum()
nan_summary = pd.DataFrame({
    'NaN': nan_counts,
    '%': (nan_counts / len(final_df)) * 100,
})
# Only columns that actually have missing values, worst first.
columns_with_gaps = nan_summary.loc[nan_summary['NaN'] > 0]
print(columns_with_gaps.sort_values(by='%', ascending=False))

print(f"\n Fin du script : {len(processed_ids)} stations des Pyrénées traitées.")
