In [1]:
# run this cell to download data!

import os

directory_path = "data/00--raw/macro"

if not os.path.exists(directory_path) or not os.listdir(directory_path):
    !python3 -m src.fetch.pipeline


In [2]:
import pandas as pd
from typing import *
from src.preprocess.dataset import Dataset, DatasetConfig
from src.preprocess.result import ResultData


# Usage example: load the raw datasets through the Dataset facade.
#
# Dataset.get(...) hands back a ResultData container. Each boolean flag asks for
# one of its fields to be populated; the field types noted by the original cell:
#   datadict -> Optional[Dict[str, pd.DataFrame]]
#   ml_ready -> Optional[pd.DataFrame]
#   metadata -> Optional["Metadata"]
#
# (A placeholder ResultData(datadict=True, ml_ready=True, metadata=True) used to
# be built here; it stored booleans in those fields and was overwritten by the
# call below, so it has been dropped.)
dataset = Dataset(DatasetConfig(type="raw"))

result_data = dataset.get(datadict=True, ml_ready=True, metadata=True)

# Later cells clean this mapping, so expose it under its own name.
datadict = result_data.datadict



In [3]:
from src.clean.clean import clean_datadict
# Cleaning options gathered in one place so they are easy to tweak.
# NOTE(review): the meaning of each threshold is defined by clean_datadict in
# src.clean.clean — confirm there before changing values.
clean_kwargs = dict(
    start_year=1991,
    spline_order=3,
    feat_missing_thresh=0.2,
    country_missing_thresh=0.2,
)

# clean_datadict returns two values, unpacked here in order.
filtered_dd, cleaned_dd = clean_datadict(datadict, **clean_kwargs)

In [4]:
from pathlib import Path

def save_cleaned_datasets(
        result: ResultData,
        base_dir: str = "data/01--clean/macro"
    )-> None:
    """
    Write every dataset in ``result.datadict`` to a per-category CSV file.

    Each entry is saved as ``<base_dir>/<category>/<name>_world_bank.csv``,
    where the category is looked up in ``result.metadata.category_dict`` and
    falls back to ``"uncategorized"`` when the name has no entry. Category
    folders are created on demand.

    Args:
        result: Container exposing ``datadict`` (name -> object with
            ``to_csv``) and ``metadata.category_dict`` (name -> category).
        base_dir: Root output directory; anything ``pathlib.Path`` accepts.

    Raises:
        ValueError: If ``result.datadict`` is None, or if ``result.metadata``
            or its ``category_dict`` is None.
    """
    if result.datadict is None:
        raise ValueError("result data must contain a valid datadict")

    # metadata is optional on ResultData, so check it before reaching into it;
    # otherwise a missing metadata surfaces as an AttributeError.
    metadata = result.metadata
    if metadata is None or metadata.category_dict is None:
        raise ValueError("result data must contain valid metadata.category_dict")

    # Keep the caller's argument untouched and work with a Path copy.
    output_root = Path(base_dir)
    category_dict = metadata.category_dict

    for name, df in result.datadict.items():
        folder = output_root / category_dict.get(name, "uncategorized")
        folder.mkdir(parents=True, exist_ok=True)

        file_path = folder / f"{name}_world_bank.csv"
        df.to_csv(file_path)
        print(f"✅ Saved: {file_path}")


# Fetch a ResultData from the raw dataset twice so that each output set has its
# own container carrying the metadata that save_cleaned_datasets requires.
# NOTE(review): this assumes each get() call returns a distinct object; if the
# same object were returned, the second datadict assignment below would
# overwrite the first — confirm against Dataset.get.
filt_rd:  ResultData = Dataset(DatasetConfig(type="raw")).get(datadict=True, metadata=True)
clean_rd: ResultData = Dataset(DatasetConfig(type="raw")).get(datadict=True, metadata=True)

# Replace the loaded datadicts with the two outputs of clean_datadict.
filt_rd.datadict = filtered_dd
clean_rd.datadict = cleaned_dd

# Each set is written under its own root; both calls override the function's
# default base_dir.
save_cleaned_datasets(result=filt_rd, base_dir="data/01--filter/")
save_cleaned_datasets(result=clean_rd, base_dir="data/02--clean/")

✅ Saved: data/01--filter/trade-and-commerce/fdi_net_inflows_current_usd_world_bank.csv
✅ Saved: data/01--filter/geographic/area_world_bank.csv
✅ Saved: data/01--filter/education/education_years_world_bank.csv
✅ Saved: data/01--filter/sectoral-performance/services_value_added_percent_of_gdp_world_bank.csv
✅ Saved: data/01--filter/demography/total_population_world_bank.csv
✅ Saved: data/01--filter/demography/economic_activity_world_bank.csv
✅ Saved: data/01--filter/demography/life_expectancy_at_birth_total_years_world_bank.csv
✅ Saved: data/01--filter/demography/population_size_world_bank.csv
✅ Saved: data/01--filter/energy/petroleum_energy_production_world_bank.csv
✅ Saved: data/01--filter/energy/hydro_electric_energy_production_world_bank.csv
✅ Saved: data/01--filter/macroeconomic/unemployment_rate_percent_of_total_labor_force_world_bank.csv
✅ Saved: data/01--filter/macroeconomic/gdp_current_usd_world_bank.csv
✅ Saved: data/02--clean/trade-and-commerce/fdi_net_inflows_current_usd_world