# Preprocessing

## download data

In [1]:
# run this cell to download data!

import os

directory_path = "data/00--raw/macro"

if not os.path.exists(directory_path) or not os.listdir(directory_path):
    !python3 -m src.fetch.pipeline


In [2]:
import pandas as pd
from typing import *
from src.preprocess.dataset import Dataset
from src.preprocess.result import ResultData


# Usage Example:

dataset_names: Optional[List[str]]

result_data = ResultData(
    datadict = True, # Optional[Dict[str, pd.DataFrame]]
    ml_ready = True, # Optional[pd.DataFrame]
    metadata = True  # Optional["Metadata"]
    )

dataset = Dataset()

result_data = dataset.get(result_data)


In [3]:
# dictionary with key=names : value=dataframe

"""
{"feature1" : dataframe}

frames in format:
year | country1 | country2 ...
2019 | value1   | value2   
"""

datadict : Dict[str, pd.DataFrame] = result_data.datadict

for key in list(datadict.keys()): 
    print(f"{key}") 

individuals_using_the_internet_percent_of_population
net_trade_in_goods_and_services_current_usd
fdi_net_inflows_current_usd
area
education_years
services_value_added_percent_of_gdp
manufacturing_value_added_percent_of_gdp
total_population
economic_activity
life_expectancy_at_birth_total_years
gdp_per_person_employed_constant_2011_ppp_usd
population_size
petroleum_energy_production
renewables_excluding_hydro_share
hydro_electric_energy_production
gas_energy_production
coal_energy_production
consumer_price_index_change
unemployment_rate_percent_of_total_labor_force
gdp_current_usd


In [4]:
# single dataframe in format
"""
year country   |  feature1 |  feature2 ...
2019 country1  |  value    |  value    ...
2019 country2  |  value    |  value    ...
2019 country3  |  value    |  value    ...
...
2020 country1  |  feature1 |  value    ...
2020 country2  |  feature1 |  value    ...
2020 country3  |  feature1 |  value    ...
...
"""


ml_data : pd.DataFrame = result_data.ml_ready
print(ml_data.head().to_markdown())

|    | date                | country     |   individuals_using_the_internet_percent_of_population |   net_trade_in_goods_and_services_current_usd |   fdi_net_inflows_current_usd |             area |   education_years |   services_value_added_percent_of_gdp |   manufacturing_value_added_percent_of_gdp |   total_population |   economic_activity |   life_expectancy_at_birth_total_years |   gdp_per_person_employed_constant_2011_ppp_usd |   population_size |   petroleum_energy_production |   renewables_excluding_hydro_share |   hydro_electric_energy_production |   gas_energy_production |   coal_energy_production |   consumer_price_index_change |   unemployment_rate_percent_of_total_labor_force |   gdp_current_usd |
|---:|:--------------------|:------------|-------------------------------------------------------:|----------------------------------------------:|------------------------------:|-----------------:|------------------:|--------------------------------------:|----------------------

In [5]:


import re
from typing import List, Dict, Tuple

def normalize_name(name: str) -> str:
    name = name.strip().lower()
    name = re.sub(r"[^\w]+", "_", name)  # replaces any non-alphanumeric (incl. apostrophes) with "_"
    return re.sub(r"_+", "_", name).strip("_")

def check_country_coverage(unique_keys: List[str], countries: Dict[str, Dict]) -> Tuple[List[str], List[str]]:
    matched = []
    unmatched = []

    # Build a set of all normalized aliases
    alias_map = set()
    for entry in countries.values():
        alias_map.update(normalize_name(alias) for alias in entry["aliases"])

    for raw_key in unique_keys:
        normalized = normalize_name(raw_key)
        if normalized in alias_map:
            matched.append(raw_key)
        else:
            unmatched.append(raw_key)

    return matched, unmatched

unique_keys = ml_data["country"].unique().tolist()
countries = result_data.metadata.countries
matched, unmatched = check_country_coverage(unique_keys, countries)

print("Matched:", matched)
print("Unmatched:", unmatched)


Matched: ['afghanistan', 'albania', 'algeria', 'angola', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'belarus', 'belgium', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia_and_herzegovina', 'botswana', 'brazil', 'brunei', 'burkina_faso', 'burundi', 'cabo_verde', 'cambodia', 'cameroon', 'canada', 'central_african_republic', 'chad', 'chile', 'china', 'colombia', 'congo_democratic_republic', 'congo_republic', 'costa_rica', 'croatia', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominican_republic', 'ecuador', 'egypt', 'el_salvador', 'equatorial_guinea', 'estonia', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'gambia', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'guinea', 'guinea_bissau', 'guyana', 'haiti', 'honduras', 'hong_kong', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'ivory_coast', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'kuwait', 'kyrgyzstan', 'laos', 'latvia', 'lebano

In [6]:

print(ml_data.to_markdown())

|      | date                | country                   |   individuals_using_the_internet_percent_of_population |   net_trade_in_goods_and_services_current_usd |   fdi_net_inflows_current_usd |             area |   education_years |   services_value_added_percent_of_gdp |   manufacturing_value_added_percent_of_gdp |   total_population |   economic_activity |   life_expectancy_at_birth_total_years |   gdp_per_person_employed_constant_2011_ppp_usd |   population_size |   petroleum_energy_production |   renewables_excluding_hydro_share |   hydro_electric_energy_production |   gas_energy_production |   coal_energy_production |   consumer_price_index_change |   unemployment_rate_percent_of_total_labor_force |   gdp_current_usd |
|-----:|:--------------------|:--------------------------|-------------------------------------------------------:|----------------------------------------------:|------------------------------:|-----------------:|------------------:|------------------------------