In [1]:
import pandas as pd
import os
import re

# Load every raw CSV in `data_folder`, tag its rows with the year taken from the
# file name, normalise the column labels, and stack everything into one frame.

# Folder containing the raw CSV files
data_folder = "raw_datasets"

all_dfs = []

# os.listdir() yields entries in arbitrary order; sorting the names makes the
# row order of the merged frame (and of any export of it) reproducible.
for file in sorted(os.listdir(data_folder)):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(data_folder, file)
    df = pd.read_csv(file_path)

    # Phase 1: Normalisation

    # 1. Extract the year from the file name (first run of 4 digits found)
    match = re.search(r"\d{4}", file)
    if match:
        year = int(match.group(0))
    else:
        raise ValueError(f"No valid year found in filename: {file}")

    # 2. Normalise the column labels: lower-case, trim surrounding whitespace,
    #    then turn spaces and hyphens into underscores. regex=False keeps the
    #    replacements literal regardless of the installed pandas default.
    df.columns = (
        df.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_", regex=False)
        .str.replace("-", "_", regex=False)
    )

    # 3. Add the "year" column only AFTER normalising the labels: a raw file
    #    that already has a "Year" column would otherwise end up with two
    #    columns labelled "year" once the labels are lower-cased.
    df["year"] = year

    # Keep the cleaned frame for the merge step
    all_dfs.append(df)

# Phase 2: Merging
if not all_dfs:
    # pd.concat() on an empty list raises a generic "No objects to concatenate";
    # fail with the same exception type but say which folder was empty.
    raise ValueError(f"No CSV files found in folder: {data_folder}")

whr_2015_2023 = pd.concat(all_dfs, ignore_index=True)

print("Merged dataset shape", whr_2015_2023.shape)
whr_2015_2023.head()


Merged dataset shape (1367, 10)


Unnamed: 0,country,region,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,generosity,perceptions_of_corruption,year
0,Switzerland,Western Europe,7.587,1.39651,1.34951,0.94143,0.66557,0.29678,0.41978,2015
1,Iceland,Western Europe,7.561,1.30232,1.40223,0.94784,0.62877,0.4363,0.14145,2015
2,Denmark,Western Europe,7.527,1.32548,1.36058,0.87464,0.64938,0.34139,0.48357,2015
3,Norway,Western Europe,7.522,1.459,1.33095,0.88521,0.66973,0.34699,0.36503,2015
4,Canada,North America and ANZ,7.427,1.32629,1.32261,0.90563,0.63297,0.45811,0.32957,2015


In [2]:
# Write the merged frame to a CSV file in the working directory,
# leaving the row index out of the file.
whr_2015_2023.to_csv(
    path_or_buf="whr_2015_2023.csv",
    index=False,
)