In [1]:
import pandas as pd
from typing import *
from src.preprocess.dataset import Dataset, DatasetConfig
from src.preprocess.result import ResultData


# Usage example: fetch the same set of outputs from the raw and from the
# cleaned variant of the dataset so the two can be compared side by side.

# Bare annotation only: it records a type for `dataset_names` but binds no
# value, so the name cannot be read afterwards.
# NOTE(review): presumably documents an optional argument of the API — confirm.
dataset_names: Optional[List[str]]

# One request object per dataset variant, each with all three flags switched on.
# According to the original inline notes the matching result fields are typed:
#   datadict -> Optional[Dict[str, pd.DataFrame]]
#   ml_ready -> Optional[pd.DataFrame]
#   metadata -> Optional["Metadata"]
# NOTE(review): presumably `True` means "populate this field" — confirm
# against ResultData.
rd_raw = ResultData(datadict=True, ml_ready=True, metadata=True)
rd_clean = ResultData(datadict=True, ml_ready=True, metadata=True)

# The two datasets differ only in the `use_raw` switch of their config.
dataset_raw = Dataset(DatasetConfig(use_raw=True))
dataset_clean = Dataset(DatasetConfig(use_raw=False))

# Resolve each request against its dataset.
result_raw = dataset_raw.get(rd_raw)
result_clean = dataset_clean.get(rd_clean)

In [2]:
# Total number of missing cells in each ML-ready frame
# (per-column NaN counts, summed again across columns).
nan_total_raw = result_raw.ml_ready.isna().sum().sum()
print(f"total NaN in raw:     {nan_total_raw}")
nan_total_clean = result_clean.ml_ready.isna().sum().sum()
print(f"total NaN in cleaned: {nan_total_clean}")

# Echo the config switch of each dataset, one value per line.
print(dataset_raw.config.use_raw, dataset_clean.config.use_raw, sep="\n")

# Number of rows per country in each frame.
rows_per_country_raw = result_raw.ml_ready.groupby('country').size()
print(f"\ngroup sizes raw:   {rows_per_country_raw}")
rows_per_country_clean = result_clean.ml_ready.groupby('country').size()
print(f"\ngroup sizes clean: {rows_per_country_clean}")


total NaN in raw:     168192
total NaN in cleaned: 0
True
False

group sizes raw:   country
afghanistan           65
albania               65
algeria               65
american_samoa        65
andorra               65
                      ..
virgin_islands_us     65
west_bank_and_gaza    65
yemen                 65
zambia                65
zimbabwe              65
Length: 219, dtype: int64

group sizes clean: country
afghanistan           25
albania               25
algeria               25
angola                25
armenia               25
                      ..
vanuatu               25
vietnam               25
west_bank_and_gaza    25
zambia                25
zimbabwe              25
Length: 158, dtype: int64


In [3]:
# Show the first rows of both ML-ready frames as markdown tables,
# each preceded by a one-line label.
print("raw dataset", result_raw.ml_ready.head().to_markdown(), sep="\n")

print("cleaned dataset", result_clean.ml_ready.head().to_markdown(), sep="\n")

raw dataset
|    | date                | country        |   research_and_development_expenditure_percent_of_gdp |   individuals_using_the_internet_percent_of_population |   net_trade_in_goods_and_services_current_usd |   fdi_net_inflows_current_usd |   area |   education_expenditures |   education_years |   services_value_added_percent_of_gdp |   manufacturing_value_added_percent_of_gdp |   net_official_development_assistance_received_current_usd |   total_population |   poverty_headcount_ratio_at_1.90_a_day_2011_ppp_percent_of_population |   economic_activity |   life_expectancy_at_birth_total_years |   gdp_per_person_employed_constant_2011_ppp_usd |   population_size |   gini_income_inequality |   petroleum_energy_production |   renewables_excluding_hydro_share |   energy_use_kg_of_oil_equivalent_per_capita |   hydro_electric_energy_production |   gas_energy_production |   coal_energy_production |   consumer_price_index_change |   unemployment_rate_percent_of_total_labor_force |   gd

In [4]:
# `datadict` is a dictionary keyed by feature name, one DataFrame per feature:
#
#     {"feature1" : dataframe}
#
# Each frame is laid out as:
#
#     year | country1 | country2 ...
#     2019 | value1   | value2

datadict: Dict[str, pd.DataFrame] = result_clean.datadict

# Iterating a dict yields its keys, so this prints one feature name per line.
for key in datadict:
    print(key)

individuals_using_the_internet_percent_of_population
net_trade_in_goods_and_services_current_usd
fdi_net_inflows_current_usd
area
education_years
services_value_added_percent_of_gdp
manufacturing_value_added_percent_of_gdp
total_population
economic_activity
life_expectancy_at_birth_total_years
gdp_per_person_employed_constant_2011_ppp_usd
population_size
petroleum_energy_production
renewables_excluding_hydro_share
hydro_electric_energy_production
gas_energy_production
coal_energy_production
consumer_price_index_change
unemployment_rate_percent_of_total_labor_force
gdp_current_usd


In [5]:
# `ml_ready` is a single long-format DataFrame: one row per (year, country)
# pair and one column per feature (the time column is named `date` in the
# printed frame):
#
#     year country   |  feature1 |  feature2 ...
#     2019 country1  |  value    |  value    ...
#     2019 country2  |  value    |  value    ...
#     2019 country3  |  value    |  value    ...
#     ...
#     2020 country1  |  value    |  value    ...
#     2020 country2  |  value    |  value    ...
#     2020 country3  |  value    |  value    ...
#     ...

ml_data: pd.DataFrame = result_clean.ml_ready

# Render only the first rows as a markdown table.
print(ml_data.head().to_markdown())

|    | date                | country     |   individuals_using_the_internet_percent_of_population |   net_trade_in_goods_and_services_current_usd |   fdi_net_inflows_current_usd |             area |   education_years |   services_value_added_percent_of_gdp |   manufacturing_value_added_percent_of_gdp |   total_population |   economic_activity |   life_expectancy_at_birth_total_years |   gdp_per_person_employed_constant_2011_ppp_usd |   population_size |   petroleum_energy_production |   renewables_excluding_hydro_share |   hydro_electric_energy_production |   gas_energy_production |   coal_energy_production |   consumer_price_index_change |   unemployment_rate_percent_of_total_labor_force |   gdp_current_usd |
|---:|:--------------------|:------------|-------------------------------------------------------:|----------------------------------------------:|------------------------------:|-----------------:|------------------:|--------------------------------------:|----------------------