# Preprocessing

## Download data

In [1]:
# run this cell to download data!

import os

directory_path = "data/00--raw/macro"

if not os.path.exists(directory_path) or not os.listdir(directory_path):
    !python3 -m src.fetch.pipeline


In [2]:
import pandas as pd
from typing import *
from src.preprocess.dataset import Dataset
from src.preprocess.result import ResultData


# Usage example: build a ResultData with all three fields set to True and pass
# it to Dataset.get().

dataset_names: Optional[List[str]]  # annotation only; no value is bound here

# Field types as annotated by the original author:
#   datadict -> Optional[Dict[str, pd.DataFrame]]
#   ml_ready -> Optional[pd.DataFrame]
#   metadata -> Optional["Metadata"]
# NOTE(review): presumably True asks Dataset.get() to populate that field;
# confirm against src.preprocess.result.
result_data = ResultData(datadict=True, ml_ready=True, metadata=True)

dataset = Dataset()
result_data = dataset.get(result_data)  # the name is rebound to get()'s return value


In [3]:
# Per-feature view: a dictionary mapping feature name -> dataframe.

"""
{"feature1" : dataframe}

frames in format:
year | country1 | country2 ...
2019 | value1   | value2   
"""

datadict: Dict[str, pd.DataFrame] = result_data.datadict

# Print every available feature name, one per line.
for key in datadict:
    print(key)

research_and_development_expenditure_percent_of_gdp
individuals_using_the_internet_percent_of_population
net_trade_in_goods_and_services_current_usd
fdi_net_inflows_current_usd
area
education_expenditures
education_years
services_value_added_percent_of_gdp
manufacturing_value_added_percent_of_gdp
net_official_development_assistance_received_current_usd
total_population
poverty_headcount_ratio_at_1.90_a_day_2011_ppp_percent_of_population
economic_activity
life_expectancy_at_birth_total_years
gdp_per_person_employed_constant_2011_ppp_usd
population_size
gini_income_inequality
petroleum_energy_production
energy_use_kg_of_oil_equivalent_per_capita
hydro_electric_energy_production
gas_energy_production
coal_energy_production
consumer_price_index_change
unemployment_rate_percent_of_total_labor_force
gdp_current_usd
political_stability_and_absence_of_violence_terrorism_percentile_rank


In [4]:
# Single combined dataframe: `date` and `country` columns followed by one
# column per feature. Layout sketch (values are placeholders):
#
# date | country   |  feature1 |  feature2 ...
# 2019 | country1  |  value    |  value    ...
# 2019 | country2  |  value    |  value    ...
# 2019 | country3  |  value    |  value    ...
# ...
# 2020 | country1  |  value    |  value    ...
# 2020 | country2  |  value    |  value    ...
# 2020 | country3  |  value    |  value    ...
# ...

ml_data: pd.DataFrame = result_data.ml_ready

# Preview the first rows as a markdown table (DataFrame.to_markdown relies on
# the optional `tabulate` package).
print(ml_data.head().to_markdown())

|    | date                | country                     |   research_and_development_expenditure_percent_of_gdp |   individuals_using_the_internet_percent_of_population |   net_trade_in_goods_and_services_current_usd |   fdi_net_inflows_current_usd |   area |   education_expenditures |   education_years |   services_value_added_percent_of_gdp |   manufacturing_value_added_percent_of_gdp |   net_official_development_assistance_received_current_usd |   total_population |   poverty_headcount_ratio_at_1.90_a_day_2011_ppp_percent_of_population |   economic_activity |   life_expectancy_at_birth_total_years |   gdp_per_person_employed_constant_2011_ppp_usd |   population_size |   gini_income_inequality |   petroleum_energy_production |   energy_use_kg_of_oil_equivalent_per_capita |   hydro_electric_energy_production |   gas_energy_production |   coal_energy_production |   consumer_price_index_change |   unemployment_rate_percent_of_total_labor_force |   gdp_current_usd |   political_stabilit

In [5]:


import re
from typing import List, Dict, Tuple

def normalize_name(name: str) -> str:
    """Return a lowercase, underscore-separated, accent-free key for ``name``.

    Steps:
      1. Trim surrounding whitespace and lowercase.
      2. Fold diacritics (NFKD decomposition, combining marks dropped) so that
         "Côte d'Ivoire" and "cote d'ivoire" produce the same key.
      3. Replace every run of characters outside ``\\w`` (letters, digits and
         "_") with a single "_"; this covers spaces, commas, apostrophes, "&".
      4. Collapse repeated "_" and strip leading/trailing "_".

    Args:
        name: Raw country or alias string.

    Returns:
        The normalized key; an empty string if nothing word-like remains.
    """
    import unicodedata  # local import keeps this definition self-contained

    name = name.strip().lower()
    # Decompose accented letters, then drop the combining marks.
    name = "".join(
        ch for ch in unicodedata.normalize("NFKD", name)
        if not unicodedata.combining(ch)
    )
    name = re.sub(r"[^\w]+", "_", name)
    return re.sub(r"_+", "_", name).strip("_")

def check_country_coverage(unique_keys: List[str], countries: Dict[str, Dict]) -> Tuple[List[str], List[str]]:
    """Partition ``unique_keys`` by whether they match a known country alias.

    Every alias found under the ``"aliases"`` key of each ``countries`` value
    is normalized with ``normalize_name``; each key is normalized the same way
    and looked up in that set.

    Args:
        unique_keys: Raw names to check.
        countries: Mapping whose values each hold an ``"aliases"`` iterable of
            strings. The mapping's own keys are not consulted.

    Returns:
        ``(matched, unmatched)``: the original, un-normalized keys, each list
        in input order.

    Raises:
        KeyError: If an entry in ``countries`` has no ``"aliases"`` key.
    """
    known_aliases = {
        normalize_name(alias)
        for entry in countries.values()
        for alias in entry["aliases"]
    }

    matched: List[str] = []
    unmatched: List[str] = []
    for raw_key in unique_keys:
        # Route the raw key to whichever list applies.
        bucket = matched if normalize_name(raw_key) in known_aliases else unmatched
        bucket.append(raw_key)

    return matched, unmatched




# Check which country labels in the ML frame match an alias in the metadata.
countries = result_data.metadata.countries
unique_keys = ml_data["country"].unique().tolist()

matched, unmatched = check_country_coverage(unique_keys, countries)

# Report both groups; `unmatched` lists the labels with no matching alias.
print("Matched:", matched)
print("Unmatched:", unmatched)




Matched: ['afghanistan', 'africa eastern and southern', 'africa western and central', 'albania', 'algeria', 'american samoa', 'andorra', 'angola', 'antigua and barbuda', 'arab world', 'argentina', 'armenia', 'aruba', 'australia', 'austria', 'azerbaijan', 'bahamas, the', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bermuda', 'bhutan', 'bolivia', 'bosnia and herzegovina', 'botswana', 'brazil', 'british virgin islands', 'brunei darussalam', 'bulgaria', 'burkina faso', 'burundi', 'cabo verde', 'cambodia', 'cameroon', 'canada', 'caribbean small states', 'cayman islands', 'central african republic', 'central europe and the baltics', 'chad', 'channel islands', 'chile', 'china', 'colombia', 'comoros', 'congo, dem. rep.', 'congo, rep.', 'costa rica', "cote d'ivoire", 'croatia', 'cuba', 'curacao', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominica', 'dominican republic', 'early-demographic dividend', 'east asia & pacific', 'east asia & pacific (excluding high