In [None]:
# === Block 1: Mount Google Drive ===
# Mounts the Drive at /content/drive; the data paths used by the later cells
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# === Block 2: Detect the columns to drop ===
# Scans the consolidated CSV once, chunk by chunk, and records two lists of
# column names in text files under /content (one name per line):
#   - columns whose every value is NaN,
#   - the '_x' / '_y' / '_raw_data' variants of the core metadata columns.
import pandas as pd
import gc

INPUT_CONSOLIDATED_PATH = '/content/drive/MyDrive/Stage_CEFREM_Salma_2025/Data/Meteo/InSitu/Fully_Consolidated_Stations_Meteo_GIS_Data.csv'
CHUNKSIZE = 100_000

# Read the header with the same parser that reads the data, so the names match
# `chunk.columns` exactly (quoted names, duplicate-name mangling, ...). A naive
# split on ',' can produce different names and a KeyError in the loop below.
header = pd.read_csv(INPUT_CONSOLIDATED_PATH, nrows=0).columns.tolist()
nan_counts = {col: 0 for col in header}

# Count NaN values per column and, in the same pass, the number of data rows
# as pandas sees them. Counting physical lines instead disagrees with pandas
# whenever the file holds blank lines or quoted multi-line fields, and the
# 100%-NaN comparison below would then silently miss columns.
total_lines = 0
for chunk in pd.read_csv(INPUT_CONSOLIDATED_PATH, chunksize=CHUNKSIZE):
    total_lines += len(chunk)
    for col, n_missing in chunk.isna().sum().items():
        nan_counts[col] += int(n_missing)
    del chunk
    gc.collect()

# Columns that are 100% NaN. With zero data rows every count equals the total
# (0 == 0), so require at least one row before flagging anything.
cols_100_nan = [
    col for col, count in nan_counts.items()
    if total_lines > 0 and count == total_lines
]

# Suffixed variants (_x, _y, _raw_data) of the core metadata columns that are
# actually present in the file.
# NOTE(review): presumably leftovers from earlier merges - confirm.
core_metadata_cols = ['NOM_USUEL', 'LAT', 'LON', 'ALTI']
suffixes = ['_x', '_y', '_raw_data']
cols_suffixes = [f"{col}{suf}" for col in core_metadata_cols for suf in suffixes if f"{col}{suf}" in header]

# Save both lists, one column name per line.
with open('/content/cols_to_drop_100_nan.txt', 'w') as f:
    f.writelines(col + '\n' for col in cols_100_nan)
with open('/content/cols_to_remove_suffixes.txt', 'w') as f:
    f.writelines(col + '\n' for col in cols_suffixes)


In [None]:
# === Block 3: Clean the CSV file ===
OUTPUT_CLEANED_PATH = '/content/drive/MyDrive/Stage_CEFREM_Salma_2025/Data/Meteo/InSitu/Cleaned_Consolidated_Meteo_GIS_Data.csv'
# GIS metadata columns checked for missing values during the cleaning pass.
critical_meta = ['LAT', 'LON', 'ALTI', 'ALTI_MNT', 'SLOPE', 'ASPECT']

# Reload the two saved column lists (one name per line, whitespace stripped).
with open('/content/cols_to_drop_100_nan.txt') as nan_list_file:
    drop_100 = list(map(str.strip, nan_list_file))
with open('/content/cols_to_remove_suffixes.txt') as suffix_list_file:
    drop_suffix = list(map(str.strip, suffix_list_file))

# Merge both lists into one, collapsing duplicates.
cols_to_drop = list(set([*drop_100, *drop_suffix]))

def reduce_mem_usage(df):
    """Narrow column dtypes to reduce the memory used by ``df``.

    - ``float64`` columns go through ``pd.to_numeric(..., downcast='float')``.
    - ``int64`` columns go through ``pd.to_numeric(..., downcast='integer')``.
    - ``object`` columns whose ratio of distinct values to rows is below 0.5
      are converted to ``category``.

    Columns are reassigned on ``df`` itself and the same object is returned.
    A frame with zero rows is accepted: the distinct-value ratio is not
    computed (it would be 0 / 0), so object columns are left unchanged.
    """
    n_rows = len(df)
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif col_dtype == 'int64':
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif col_dtype == 'object':
            # Guard on n_rows: dividing by the row count of an empty frame
            # raises ZeroDivisionError.
            if n_rows and df[col].nunique() / n_rows < 0.5:
                df[col] = df[col].astype('category')
    return df

# Stream the consolidated CSV chunk by chunk and append each cleaned chunk to
# the output file, so the full dataset never has to be held in memory.
first = True
# NUM_POSTE is read as text from the start. Casting to str *after* pandas has
# inferred a numeric dtype is too late: leading zeros are already gone, and
# chunks containing missing IDs would yield strings such as '123.0' or 'nan'.
# (A dtype entry for a column that is absent from the file is simply unused.)
for chunk in pd.read_csv(INPUT_CONSOLIDATED_PATH, chunksize=CHUNKSIZE, dtype={'NUM_POSTE': str}):
    chunk.drop(columns=[c for c in cols_to_drop if c in chunk.columns], inplace=True)
    if 'DATE' in chunk.columns:
        # Unparseable dates become NaT instead of raising.
        chunk['DATE'] = pd.to_datetime(chunk['DATE'], errors='coerce')
    # NOTE(review): the float downcast happens before the chunk is written, so
    # values are serialized from the narrower dtype - confirm that the reduced
    # precision is acceptable for the persisted file.
    chunk = reduce_mem_usage(chunk)

    # Drop rows with NaN in any GIS metadata column that is present; skip the
    # call entirely when none of those columns exist in this file.
    present_meta = [c for c in critical_meta if c in chunk.columns]
    if present_meta:
        chunk.dropna(subset=present_meta, inplace=True)

    # The first chunk creates/overwrites the file and writes the header; every
    # later chunk is appended without a header.
    chunk.to_csv(OUTPUT_CLEANED_PATH, mode='w' if first else 'a', index=False, header=first)
    first = False
    del chunk
    gc.collect()


In [None]:
# === Block 4: Keep only rows dated 1970-01-01 or later ===
INPUT_CLEANED = OUTPUT_CLEANED_PATH
OUTPUT_FILTERED_PATH = '/content/drive/MyDrive/Stage_CEFREM_Salma_2025/Data/Meteo/InSitu/Filtered_1970_Consolidated_Meteo_GIS_Data.csv'
START_DATE = pd.to_datetime('1970-01-01')

first = True
# NUM_POSTE is read as text so the station identifiers are written back
# exactly as they appear in the input instead of being re-inferred as numbers.
for chunk in pd.read_csv(INPUT_CLEANED, chunksize=CHUNKSIZE, dtype={'NUM_POSTE': str}):
    if 'DATE' in chunk.columns:
        chunk['DATE'] = pd.to_datetime(chunk['DATE'], errors='coerce')
        # Rows whose date could not be parsed (NaT) fail the comparison and
        # are therefore excluded as well.
        filtered = chunk[chunk['DATE'] >= START_DATE]
        # The first write creates/overwrites the file with a header; later
        # chunks are appended without one.
        filtered.to_csv(OUTPUT_FILTERED_PATH, mode='w' if first else 'a', index=False, header=first)
        first = False
        del filtered
    del chunk
    gc.collect()

# Nothing is written when the input has no 'DATE' column (or yields no chunk).
# Say so explicitly so a stale output file is not mistaken for a fresh one.
if first:
    print(f"WARNING: nothing was written to {OUTPUT_FILTERED_PATH} "
          f"(no 'DATE' column or no data in {INPUT_CLEANED}).")
