<div class="alert alert-block alert-info">A notebook that extracts data from the World Bank. It requires completing the indicators list with the World Bank indicator code, an arbitrary name, and an arbitrary category. The data is then extracted by year and country, with the country code added to help with future merging.</div>

# Import

In [None]:
# import joblib
# joblib.Memory(location=None)  ## disable persistent cache

In [None]:
import wbdata
import pandas as pd
import polars as pl
from polars import col as d
from datetime import datetime
from collections import defaultdict

# Define indicators

For more information about an indicator, see https://data.worldbank.org/indicator/{code} (replace `{code}` with the World Bank indicator code).

In [None]:
## Indicator table: one (World Bank indicator code, output column name, category) row per indicator.
## The rows are expanded into the `indicators` list of dicts at the bottom of this cell.
_INDICATOR_FIELDS = ("code", "name", "category")

_INDICATOR_ROWS = [
    ################
    ## Population
    ################
    ("SP.POP.TOTL", "POPU", "Population"),  ## population, total
    ("SP.POP.GROW", "POPU_GROWTH_PCT", "Population"),  ## population growth (annual %)

    ("SP.URB.TOTL", "URBAN_POPU", "Population"),  ## urban population
    ("SP.URB.TOTL.IN.ZS", "URBAN_POPU_PCT", "Population"),  ## urban population (% of total population)

    ("SP.RUR.TOTL", "RURAL_POPU", "Population"),  ## rural population
    ("SP.RUR.TOTL.ZS", "RURAL_POPU_PCT", "Population"),  ## rural population (% of total population)

    ("EN.URB.LCTY", "POPU_LARGEST_CITY", "Population"),  ## population in the largest city
    ("EN.URB.LCTY.UR.ZS", "POPU_LARGEST_CITY_PCT", "Population"),  ## population in the largest city (% of urban population)

    ("EN.URB.MCTY", "POPU_URBAN_1M", "Population"),  ## population in urban agglomerations of more than 1 million
    ("EN.URB.MCTY.TL.ZS", "POPU_URBAN_1M_PCT", "Population"),  ## same, as % of total population

    ("SP.POP.0014.TO.ZS", "POPU_0_14_PCT", "Population"),  ## population ages 0-14 (% of total population)
    ("SP.POP.1564.TO.ZS", "POPU_15_64_PCT", "Population"),  ## population ages 15-64 (% of total population)
    ("SP.POP.65UP.TO.ZS", "POPU_65_PLUS_PCT", "Population"),  ## population ages 65+ (% of total population)

    ("EN.POP.DNST", "POPU_DENSITY", "Population"),  ## population density (people per sq. km of land area)


    ################
    ## Economic
    ################
    ("NY.GDP.MKTP.KD", "GDP_USD2015", "Economic"),  ## GDP (constant 2015 US$)
    ("NY.GDP.MKTP.KN", "GDP_LCU", "Economic"),  ## GDP (constant LCU) -- TODO: base year is not known here
    ("NY.GDP.MKTP.KD.ZG", "GDP_GROWTH_PCT", "Economic"),  ## GDP growth (annual %)

    ("NY.GDP.MKTP.PP.KD", "GDP_PPP_USD2021_INTER", "Economic"),  ## GDP, PPP (constant 2021 international $)

    ("NY.GDP.PCAP.KD", "GDP_PER_CAPITA_USD2015", "Economic"),  ## GDP per capita (constant 2015 US$)
    ("NY.GDP.PCAP.KN", "GDP_PER_CAPITA_LCU", "Economic"),  ## GDP per capita (constant LCU)
    ("NY.GDP.PCAP.KD.ZG", "GDP_PER_CAPITA_GROWTH_PCT", "Economic"),  ## GDP per capita growth (annual %)

    ("NY.GNP.MKTP.KD", "GNI_USD2015", "Economic"),  ## GNI (constant 2015 US$)
    ("NY.GNP.MKTP.KN", "GNI_LCU", "Economic"),  ## GNI (constant LCU)
    ("NY.GNP.MKTP.KD.ZG", "GNI_GROWTH_PCT", "Economic"),  ## GNI growth (annual %)

    ("NY.GNP.MKTP.PP.KD", "GNI_PPP_USD2021_INTER", "Economic"),  ## GNI, PPP (constant 2021 international $)

    ("NY.GNP.PCAP.KD", "GNI_PER_CAPITA_USD2015", "Economic"),  ## GNI per capita (constant 2015 US$)
    ("NY.GNP.PCAP.KN", "GNI_PER_CAPITA_LCU", "Economic"),  ## GNI per capita (constant LCU)
    ("NY.GNP.PCAP.KD.ZG", "GNI_PER_CAPITA_GROWTH_PCT", "Economic"),  ## GNI per capita growth (annual %)

    ## NOTE(review): an earlier note on this row said "base 2015 US$"; an annual % rate
    ## should not need a currency base -- confirm against the indicator page.
    ("FP.CPI.TOTL.ZG", "INFLATION_PCT", "Economic"),  ## inflation, consumer prices (annual %)

    ("NE.EXP.GNFS.ZS", "EXPORT_GDP_PCT", "Economic"),  ## exports of goods and services (% of GDP)
    ("NE.EXP.GNFS.KD", "EXPORT_USD2015", "Economic"),  ## exports of goods and services (constant 2015 US$)
    ("NE.EXP.GNFS.KN", "EXPORT_LCU", "Economic"),  ## exports of goods and services (constant LCU)
    ("NE.EXP.GNFS.KD.ZG", "EXPORT_GROWTH_PCT", "Economic"),  ## exports of goods and services (annual % growth)

    ("NE.IMP.GNFS.ZS", "IMPORT_GDP_PCT", "Economic"),  ## imports of goods and services (% of GDP)
    ("NE.IMP.GNFS.KD", "IMPORT_USD2015", "Economic"),  ## imports of goods and services (constant 2015 US$)
    ("NE.IMP.GNFS.KN", "IMPORT_LCU", "Economic"),  ## imports of goods and services (constant LCU)
    ("NE.IMP.GNFS.KD.ZG", "IMPORT_GROWTH_PCT", "Economic"),  ## imports of goods and services (annual % growth)

    ## Candidates kept for later (disabled).
    ## NOTE(review): "NY.GNS.ICTR.KS" looks off for "% of GDP" (presumably NY.GNS.ICTR.ZS) -- verify before enabling.
    # ("NY.GNS.ICTR.KS", "GROSS_SAVINGS_GDP_PCT", "Economic"),  ## gross savings (% of GDP)
    # ("NY.GNS.ICTR.GN.ZS", "GROSS_SAVINGS_GNI_PCT", "Economic"),  ## gross savings (% of GNI)
    # ("GC.XPN.TOTL.GD.ZS", "EXPENSE_GDP_PCT", "Economic"),  ## expense (% of GDP)
    # ("NY.GDP.DEFL.KD.ZG", "GDP_DEFLATOR_PCT", "Economic"),  ## inflation, GDP deflator (annual %)


    ################
    ## Energy
    ################
    ## TODO: the author is not sure this category is needed
    ("EG.USE.PCAP.KG.OE", "NRJ_USE", "Energy"),  ## energy use (kg of oil equivalent per capita)
    ("EG.USE.COMM.GD.PP.KD", "ENERGY_USE_1000GDP", "Energy"),  ## energy use (kg of oil equivalent) per $1000 GDP (constant 2021 PPP)
    ("EG.USE.COMM.FO.ZS", "FOSSIL_FUEL_NRJ_PCT", "Energy"),  ## fossil fuel energy consumption (% of total)
    ("EG.USE.COMM.CL.ZS", "ALTER_AND_NUCLEAR_NRJ_PCT", "Energy"),  ## alternative and nuclear energy (% of total energy use)
    ("EG.IMP.CONS.ZS", "NRJ_IMPORT_NET_PCT", "Energy"),  ## energy imports, net (% of energy use)
    ("EG.FEC.RNEW.ZS", "RNEW_NRJ_PCT", "Energy"),  ## renewable energy consumption (% of total final energy consumption)
    ("EG.USE.ELEC.KH.PC", "ELEC_CONSUMPTION", "Energy"),  ## electric power consumption (kWh per capita)
    ("TX.VAL.FUEL.ZS.UN", "FUEL_EXPORT_PCT", "Energy"),  ## fuel exports (% of merchandise exports)
    ("TM.VAL.FUEL.ZS.UN", "FUEL_IMPORT_PCT", "Energy"),  ## fuel imports (% of merchandise imports)


    ################
    ## Infrastructure
    ################
    ("IS.RRS.TOTL.KM", "RAIL_LINES_KM", "Infrastructure"),  ## rail lines (total route-km) -- TODO: normalise by country size
    ("IS.RRS.PASG.KM", "RAILWAYS_PAX", "Infrastructure"),  ## railways, passengers carried (million passenger-km)
    ("IE.PPI.TRAN.CD", "INVEST_TRANSPORT", "Infrastructure"),  ## investment in transport with private participation (current US$)


    ################
    ## Labor
    ################
    ("SL.TLF.TOTL.IN", "LABOR_FORCE_TOTAL", "Labor"),  ## labor force, total
    ## TODO: clarify what "modeled ILO estimate" means for this series
    ("SL.UEM.TOTL.ZS", "UNEMPLOYMENT", "Labor"),  ## unemployment, total (% of total labor force) (modeled ILO estimate)

    ################
    ## Geography
    ################
    ("AG.LND.TOTL.K2", "LAND_AREA_KM2", "Geography"),  ## land area (sq. km)
    ("AG.LND.TOTL.UR.K2", "URBAN_LAND_AREA_km2", "Geography"),  ## urban land area (sq. km)
    ("AG.LND.TOTL.RU.K2", "RURAL_LAND_AREA_km2", "Geography"),  ## rural land area (sq. km)
    ("AG.LND.FRST.ZS", "FOREST_AREA_PCT", "Geography"),  ## forest area (% of land area)
    ("AG.LND.AGRI.ZS", "AGR_LAND_PCT", "Geography"),  ## agricultural land (% of land area)

    ################
    ## Tourism
    ################
    ("ST.INT.RCPT.CD", "INTER_TOURISM_RECEIPT_USD", "Tourism"),  ## international tourism, receipts (current US$)
    ("ST.INT.RCPT.XP.ZS", "INTER_TOURISM_RECEIPT_PCT", "Tourism"),  ## international tourism, receipts (% of total exports)

    ("ST.INT.XPND.CD", "INTER_TOURISM_EXPENDITURE_USD", "Tourism"),  ## international tourism, expenditures (current US$)
    ("ST.INT.XPND.MP.ZS", "INTER_TOURISM_EXPENDITURE_PCT", "Tourism"),  ## international tourism, expenditures (% of total imports)

    ("ST.INT.ARVL", "INTER_TOURISM_ARRIVAL", "Tourism"),  ## international tourism, number of arrivals
    ("ST.INT.DPRT", "INTER_TOURISM_DEPARTURE", "Tourism"),  ## international tourism, number of departures
]

## Expand every row into a {"code": ..., "name": ..., "category": ...} dict, keeping the row order.
indicators = [dict(zip(_INDICATOR_FIELDS, row)) for row in _INDICATOR_ROWS]

# Create lists of column names by category

This function is useful for analysis when we want to zoom in on or filter a single category more easily (it is not needed in this notebook, so it is left commented out).

In [None]:
# def create_category_variables(data):
#     grouped = defaultdict(list)
#     for item in data:
#         grouped[item['category']].append(item['name'])
    
#     created_categories = []
#     for category, names in grouped.items():
#         var_name = f"{category}_columns_list"
#         globals()[var_name] = names
#         created_categories.append(var_name)
    
#     return created_categories

# categories = create_category_variables(indicators)

# print("Variables :", categories)


# Download data

In [None]:
## Guard against copy-paste mistakes in the hand-edited indicators list.
## A repeated code would silently overwrite an earlier entry in the dict built below;
## a repeated name would presumably make two indicators collide on one output column.
_indicator_codes = [ind["code"] for ind in indicators]
_indicator_names = [ind["name"] for ind in indicators]
_duplicate_codes = sorted({code for code in _indicator_codes if _indicator_codes.count(code) > 1})
_duplicate_names = sorted({name for name in _indicator_names if _indicator_names.count(name) > 1})
if _duplicate_codes or _duplicate_names:
    raise ValueError(
        "Duplicate entries in `indicators` -- "
        f"codes: {_duplicate_codes}, names: {_duplicate_names}"
    )

## create a dictionary for wbdata: {World Bank indicator code: output column name}
ind_dict = {ind["code"]: ind["name"] for ind in indicators}

## define the date range passed to wbdata
## (the original author notes that only datetime objects worked for this)
start_date = datetime(2000, 1, 1)
end_date = datetime(2023, 1, 1)

In [None]:
## Fetch every indicator listed in ind_dict, for all countries, over the configured date range.
print("Downloading data from World Bank API...")
_wb_frame = wbdata.get_dataframe(
    ind_dict,
    country="all",
    date=(start_date, end_date),
)
## Move the index levels into regular columns; the processing step below
## renames the resulting `country` and `date` columns.
df_data_extract_raw = _wb_frame.reset_index()
print("Download completed!")

# Process the raw extract

In [None]:
## Build a name -> code lookup table so the extract can carry a country code for later merges.
countries = wbdata.get_countries()
_country_names = [entry["name"] for entry in countries]
_country_codes = [entry["id"] for entry in countries]
country_df = pl.DataFrame(
    {
        "COUNTRY_NAME": _country_names,
        "COUNTRY_CODE": _country_codes,
    }
)

In [None]:
## Step 1: convert the pandas extract to polars and give the key columns their final names.
_column_renames = {"date": "YEAR", "country": "COUNTRY_NAME"}
_extract_renamed = pl.from_pandas(df_data_extract_raw).rename(_column_renames)

## Step 2: store the year as a 32-bit integer.
_extract_typed = _extract_renamed.with_columns(d("YEAR").cast(pl.Int32).alias("YEAR"))

## Step 3: attach COUNTRY_CODE by country name. A left join keeps every extracted row,
## leaving COUNTRY_CODE null where no name matches.
df_data_extract_processed = _extract_typed.join(country_df, on="COUNTRY_NAME", how="left")

In [None]:
# df_data_extract_processed.head()

# Save csv

In [None]:
# df_data_extract_processed.write_csv("worldbank_indicators.csv")
# print("Saved CSV: worldbank_indicators.csv")