## Basic country data

In [31]:
import json
import pandas as pd
import numpy as np
import itertools
import datetime
from utils import *
import os

# Ensure we're running in the right directory
chdir_this_file()

# ------------------------------------------------------------------------------------------------------------------
# Import data
# Read through a context manager so the file handle is closed as soon as parsing is done.
with open("../../public/data/local/countries_processed.json", encoding="utf-8") as countries_file:
    df = pd.DataFrame(json.load(countries_file))

# Filter & rename columns
# NOTE: assigning df.columns raises if the JSON does not yield exactly these 22 columns.
df.columns = ['iso', 'iso3', 'iso_numeric', 'fips', 'name', 'capital',
              'area_km2', 'population', 'continent', 'tld',
              'currency_code', 'currency_name', 'phone', 'zip_format', 'zip_regex',
              'languages', 'geonameid', 'neighbors', 'eq_fips', 'parent', 'territories', 'neighbors_t']

subset = ['iso', 'iso3', 'name', 'capital', 'continent',
          'area_km2', 'population',
          'currency_code', 'currency_name', 'languages',
          'territories', 'neighbors', 'neighbors_t']
df = df[subset]

# Import GDP data
# GDP data from https://github.com/datasets/gdp/blob/master/data/gdp.csv
gdp_data = pd.read_csv("../../public/data/local/gdp.csv")
# Keep only the row with the latest "Year" per "Country Code", indexed by that code.
gdp = gdp_data.sort_values("Year").groupby("Country Code").tail(1).set_index("Country Code").rename(columns={"Value": "gdp"})
df = df.join(gdp["gdp"], on="iso3")
# print("no gdp data:")
# print(df[df["gdp"].isna()][["iso", "name", "population", "gdp"]])
# Manual values for entries that are set by hand here.
df.loc[df.iso == "TW", "gdp"] = 790.7e9  # https://en.wikipedia.org/wiki/Economy_of_Taiwan (2023 data, accessed Aug 2023)
df.loc[df.iso == "KP", "gdp"] = 28.5e9  # https://en.wikipedia.org/wiki/Economy_of_North_Korea (2016 data, accessed Aug 2023)
df["gdp_per_capita"] = df["gdp"] / df["population"]
df.loc[df.iso == "VA", "gdp_per_capita"] = 21198  #  https://en.wikipedia.org/wiki/Economy_of_Vatican_City (2016 data, accessed Aug 2023)
# Recompute the total from the per-capita figure so the VA override above also produces a "gdp" value.
df["gdp"] = df["gdp_per_capita"] * df["population"]


# Custom data changes

In [32]:
# Individual fixes
df.loc[df["name"] == "Palau", "capital"] = "Ngerulmud"  # old value seems wrong

# Consider territorial borders as alternative values (e.g. France-Brazil)
# neighbors_alt = entries of neighbors_t that are not already listed in neighbors.
neighbors_alt = df.apply(lambda row: [c for c in row["neighbors_t"] if c not in row["neighbors"]], axis=1)
df = df.drop(columns=["neighbors_t"])
# Place the new column directly after "neighbors".
df.insert(list(df.columns).index("neighbors") + 1, "neighbors_alt", neighbors_alt)

# Border fixes
remove_border(df, "US", "Cuba")  # not so narrow maritime border
remove_border(df, "US", "Bahamas")  # not so narrow maritime border
add_alternative_border(df, "Singapore", "Malaysia")  # narrow maritime border
add_alternative_border(df, "Spain", "Morocco")  # Ceuta/Melilla provinces

# Additional columns & global fixes
# North America fix: assign the filled column back instead of calling fillna(inplace=True) on the
# column selection, which warns on recent pandas and does not update df under Copy-on-Write.
# (Presumably the "NA" code was read as a missing value upstream - TODO confirm.)
df["continent"] = df["continent"].fillna("NA")
df["landlocked"] = df["iso"].isin("AF,AD,AM,AT,AZ,BY,BT,BO,BW,BF,BI,CF,TD,CZ,SZ,ET,HU,KZ,XK,KG,LA,LS,LI,LU,MW,ML,MD,MN,NP,NE,MK,PY,RW,SM,RS,SK,SS,CH,TJ,TM,UG,UZ,VA,ZM,ZW".split(","))
# A country counts as an island if it has no neighbors or is one of the listed codes.
df["island"] = (df["neighbors"].apply(len) == 0) | df["iso"].isin("ID,PG,TL,SG,BN,GB,IE,DO,HT".split(","))
add_alternative_value(df, "island", "Australia", False, True)

# Alternative values
# Names
add_alternative_value(df, "name", "CI", "Ivory Coast", "Côte d'Ivoire")
add_alternative_value(df, "name", "MK", "North Macedonia", "Macedonia")
add_alternative_value(df, "name", "PS", "Palestine", "Palestinian Territory")
add_alternative_value(df, "name", "TR", "Türkiye", "Turkey")
add_alternative_value(df, "name", "VA", "Vatican", "Vatican City")
add_alternative_value(df, "name", "US", "United States", "United States of America", "USA")
add_alternative_value(df, "name", "CZ", "Czech Republic", "Czechia")
add_alternative_value(df, "name", "CV", "Cabo Verde", "Cape Verde")

# Multiple continents (source: https://en.wikipedia.org/wiki/List_of_transcontinental_countries)
add_alternative_value(df, "continent", "Armenia", "AS", "EU")
add_alternative_value(df, "continent", "Georgia", "AS", "EU")
add_alternative_value(df, "continent", "Azerbaijan", "AS", "EU")
add_alternative_value(df, "continent", "Trinidad and Tobago", "NA", "SA")
add_alternative_value(df, "continent", "Panama", "NA", "SA")
add_alternative_value(df, "continent", "Egypt", "AF", "AS")
add_alternative_value(df, "continent", "Russia", "EU", "AS")
add_alternative_value(df, "continent", "TR", "AS", "EU")
add_alternative_value(df, "continent", "Timor Leste", "AS", "OC")

# Multiple/unclear capital (source: https://en.wikipedia.org/wiki/List_of_countries_with_multiple_capitals)
add_alternative_value(df, "capital", "Kazakhstan", "Astana", "Nur-Sultan")
add_alternative_value(df, "capital", "Bolivia", "La Paz", "Sucre")
add_alternative_value(df, "capital", "Burundi", "Gitega", "Bujumbura")
add_alternative_value(df, "capital", "CI", "Yamoussoukro", "Abidjan")
add_alternative_value(df, "capital", "Eswatini", "Mbabane", "Lobamba")
add_alternative_value(df, "capital", "Malaysia", "Kuala Lumpur", "Putrajaya")
add_alternative_value(df, "capital", "Netherlands", "Amsterdam", "The Hague")
add_alternative_value(df, "capital", "Palestine", "Ramallah", "Jerusalem", "East Jerusalem")
add_alternative_value(df, "capital", "South Africa", "Pretoria", "Cape Town", "Bloemfontein")
add_alternative_value(df, "capital", "Sri Lanka", "Colombo", "Sri Jayawardenepura Kotte")

# Capitals with multiple spellings / alternative names
add_alternative_value(df, "capital", "US", "Washington", "Washington, DC")
add_alternative_value(df, "capital", "Chile", "Santiago", "Santiago de Chile")

Australia/island: 'True' is already set as main value - swapping with 'False'
CI/name: 'Ivory Coast' is already set as main value - skipping
MK/name: 'North Macedonia' is already set as main value - skipping
PS/name: 'Palestinian Territory' is already set as main value - swapping with 'Palestine'
TR/name: 'Turkey' is already set as main value - swapping with 'Türkiye'
VA/name: 'Vatican' is already set as main value - skipping
US/name: 'United States' is already set as main value - skipping
CZ/name: 'Czechia' is already set as main value - swapping with 'Czech Republic'
CV/name: 'Cabo Verde' is already set as main value - skipping
Armenia/continent: 'AS' is already set as main value - skipping
Georgia/continent: 'AS' is already set as main value - skipping
Azerbaijan/continent: 'AS' is already set as main value - skipping
Trinidad and Tobago/continent: 'NA' is already set as main value - skipping
Panama/continent: 'NA' is already set as main value - skipping
Egypt/continent: 'AF' is alr

True

In [33]:

# ------------------------------------------------------------------------------------------------------------------
# Flag colors: hand the country table to the project helper and keep the frame it returns.
from colors import add_flag_colors
df = df.pipe(add_flag_colors)

Assigned colors. 0 countries missing a flag.


## Elevation Data

In [34]:
import re
import unicodedata

# Load the semicolon-separated elevation table and give its six columns short names.
elev = pd.read_csv("../../public/data/local/elevation/elevation.csv", encoding="utf-8", sep=";")
elev.columns = ["name", "max_elev_name", "max_elev", "min_elev_name", "min_elev", "elev_span"]
# Rows without a name inherit the name from the row above. Series.ffill() replaces the deprecated
# fillna(method='ffill', inplace=True), and assigning back avoids the chained in-place update.
elev["name"] = elev["name"].ffill()
# One row per name; every other column becomes the list of its values for that name.
elev = elev.groupby("name").agg({col: list for col in elev.columns[1:]}).reset_index()

def parse_name(s):
    """Clean a raw name string.

    Applies Unicode NFKD normalisation, removes bracketed footnote markers
    such as ``[1]`` or ``[note 2]`` and trims surrounding whitespace.

    Args:
        s: The raw string, or None.

    Returns:
        The cleaned string, or None when ``s`` is None.
    """
    if s is None:
        return None
    s = unicodedata.normalize("NFKD", s)
    # remove footnote
    s = re.sub(r"\[.*?\]", "", s)
    # Strip last: removing a footnote can expose whitespace that stood in front of it
    # ("Foo [1]" -> "Foo "), which would break exact-match lookups on the name.
    return s.strip()
    
def parse_elevation(specs):
    """Return the first usable elevation from a list of raw cell values.

    Entries are examined in order; None/NaN entries and strings without a
    "<number> m" pattern are skipped.

    Args:
        specs: Iterable of raw values (strings, numbers, None or NaN).

    Returns:
        0 for "sea level", the integer-truncated number preceding "m" for a
        string entry, the entry itself when it is already a non-string value,
        or None when the entry is a "data missing" marker or nothing usable
        is found.
    """
    for s in specs:
        if s is None or pd.isna(s):
            continue
        if isinstance(s, str):
            lowered = s.lower()
            if lowered == "sea level":
                return 0
            # Both spellings of the marker are compared against the lowered text
            # (previously the bracketed form was compared to the unbound method).
            if lowered in ("data missing", "[data missing]"):
                return None
            # Optional sign (ASCII or Unicode minus), then digits/separators, then "m".
            # NOTE(review): the pattern admits "," but float() rejects it - confirm
            # that the data never uses thousands separators.
            match = re.search(r"([\-−\+]?[\d\.,]+)\s*m", s)
            if match is None:
                continue
            return int(float(match.group(1).replace("−", "-")))
        # Non-string, non-missing entry: return that entry, not the whole list.
        return s
    return None
def parse_elevation_name(specs):
    """Return the cleaned name from the first string entry in ``specs``.

    None/NaN and non-string entries are skipped.

    Args:
        specs: Iterable of raw values (strings, None or NaN).

    Returns:
        ``parse_name`` applied to the first string entry, or None when that
        entry is a "data missing" marker (which is also printed) or when there
        is no string entry.
    """
    for s in specs:
        if s is None or pd.isna(s):
            continue
        if isinstance(s, str):
            # Compare the lowered text for both marker spellings; the bracketed form
            # used to slip through and come back as an empty string.
            if s.lower() in ("data missing", "[data missing]"):
                print(s)
                return None
            return parse_name(s)
    return None
# Clean the names, then collapse each list-valued column to a single value.
# The value column is processed before its label column, highest point first.
elev["name"] = elev["name"].apply(parse_name)
for value_col, label_col in (("max_elev", "max_elev_name"), ("min_elev", "min_elev_name")):
    elev[value_col] = elev[value_col].apply(parse_elevation).astype(float)
    elev[label_col] = elev[label_col].apply(parse_elevation_name)

# Rename rows so their names match the spelling used in df["name"].
# NOTE(review): the keys are hard-coded row labels of the grouped table - re-check
# them whenever elevation.csv changes.
name_fixes = {
    48: "Cabo Verde",
    79: "Micronesia",
    220: "Sao Tome and Principe",
    225: "Timor Leste",
    231: "Türkiye",
    252: "Vatican",
}
for row_label, fixed_name in name_fixes.items():
    elev.loc[row_label, "name"] = fixed_name

elev = elev.sort_values("name")
elev = elev.drop(columns="elev_span")

# Left merge keeps every row of df; rows without an elevation match get NaN.
df = df.merge(elev, on="name", how="left")
print(f"Added elevation data ({df['max_elev'].isna().sum()} countries without data)")

Added elevation data (0 countries without data)


In [35]:
# set(df["name"]).difference(set(elev["name"]))

In [36]:
# elev[elev.name.apply(lambda name: name in ["Vatican City", "Turkey", "Cape Verde"] or any(s in name for s in ["Micro", "ncipe", "Timor", "Verde"]))]

In [37]:
# df[df['max_elev'].isna()]

In [38]:
# Display all alternative values
altcols = [col for col in df.columns if col.endswith("_alt")]
print("\nAll countries with alternative values:")
# Count the entries in every *_alt cell. Series.map(len) per column replaces the
# deprecated DataFrame.applymap and also works on pandas versions without DataFrame.map.
alt_counts = df[altcols].apply(lambda col: col.map(len))
# Keep rows where at least one *_alt column is non-empty; the last expression is the cell's display.
df[alt_counts.sum(axis=1) > 0]



All countries with alternative values:


Unnamed: 0,iso,iso3,name,name_alt,capital,capital_alt,continent,continent_alt,area_km2,population,...,gdp_per_capita,landlocked,island,island_alt,flag_colors,flag_colors_alt,max_elev_name,max_elev,min_elev_name,min_elev
5,AM,ARM,Armenia,[],Yerevan,[],AS,[EU],29800.0,2951776,...,3581.673657,True,False,[],"[Blue, Orange, Red]",[],Mount Aragats,4090.0,Debed,400.0
7,AR,ARG,Argentina,[],Buenos Aires,[],SA,[],2766890.0,44494502,...,12259.404621,False,False,[],"[Blue, White]",[Yellow/Gold],Aconcagua,6960.0,Laguna del Carbón,-105.0
9,AU,AUS,Australia,[],Canberra,[],OC,[],7686850.0,24992369,...,48199.369969,False,False,[True],"[Blue, Red, White]",[],Mount Kosciuszko,2228.0,Lake Eyre,-15.0
10,AZ,AZE,Azerbaijan,[],Baku,[],AS,[EU],86600.0,9942334,...,3806.723425,True,False,[],"[Blue, Green, Red, White]",[],Mount Bazardüzü,4485.0,Caspian Sea,-28.0
18,BI,BDI,Burundi,[],Gitega,[Bujumbura],AF,[],27830.0,11175378,...,269.07627,True,False,[],"[Green, Red, White]",[],Mount Heha,2684.0,Lake Tanganyika,772.0
21,BO,BOL,Bolivia,[],La Paz,[Sucre],SA,[],1098580.0,11353142,...,2977.712735,True,False,[],"[Green, Red, Yellow/Gold]",[],Sajama,6542.0,Paraguay River,90.0
22,BR,BRA,Brazil,[],Brasilia,[],SA,[],8511965.0,209469333,...,8574.938205,False,False,[],"[Blue, Green, White, Yellow/Gold]",[],Pico da Neblina,2995.0,Atlantic Ocean,0.0
27,BZ,BLZ,Belize,[],Belmopan,[],,[],22966.0,383071,...,4545.110436,False,False,[],"[Blue, Red, White]",[Green],Doyle's Delight,1124.0,Caribbean Sea,0.0
28,CA,CAN,Canada,[],Ottawa,[],,[],9984670.0,37058856,...,41279.215208,False,False,[],"[Red, White]",[],Mount Logan,5959.0,North Atlantic Ocean,0.0
33,CI,CIV,Ivory Coast,[Côte d'Ivoire],Yamoussoukro,[Abidjan],AF,[],322460.0,25069229,...,1450.886783,False,False,[],"[Green, Orange, White]",[],Mont Nimba,1752.0,Gulf of Guinea,0.0


# Export country data (EN)

In [39]:
# Export the assembled table. Per the recorded cell output this writes countries-en.csv and
# countries-en.json under ../../public/data/countries/; the "en" argument appears in both file names.
export_country_data(df, "en")

CSV Country data exported to ../../public/data/countries/countries-en.csv.
JSON Country data exported to ../../public/data/countries/countries-en.json.
