## Download the obesity rates from the World Health Organization

In [1]:
import io
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


# simple function for downloading file from url
def download_file(url, destination_path, timeout=60):
    """Download ``url`` to ``destination_path`` unless that path already exists.

    Args:
        url: Address fetched with a single HTTP GET request.
        destination_path: File the response body is written to (binary mode).
            An existing path is never overwritten or re-downloaded.
        timeout: Seconds handed to ``requests.get`` so that a server which
            stops responding cannot block the notebook indefinitely.

    Returns:
        None. The outcome is reported with ``print``; a non-200 response
        leaves no file behind.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (connection failure, timeout, ...).
    """
    if os.path.exists(destination_path):
        print(f"File {destination_path} already exists.")
        return

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download {destination_path} from url {url}")
        return

    with open(destination_path, "wb") as file:
        file.write(response.content)
    # report success only after the file has been written and closed
    print(f"File {destination_path} downloaded successfully")

In [2]:
# WHO endpoint for indicator NCD_BMI_30C, requested as a CSV cross-table
# (countries on the side axis; year / indicator / sex on the top axis)
url_obesity = (
    "https://apps.who.int/gho/athena/data/GHO/NCD_BMI_30C"
    "?filter=AGEGROUP:*;COUNTRY:*;SEX:*"
    "&ead="
    "&x-sideaxis=COUNTRY"
    "&x-topaxis=YEAR;GHO;SEX"
    "&profile=crosstable"
    "&format=csv"
)

# every raw download of this notebook is stored in this folder
source_data_folder = "source_data"
if not os.path.exists(source_data_folder):
    print(f"Creating directory '{source_data_folder}'")
    os.makedirs(source_data_folder)

obesity_csv_path = os.path.join(source_data_folder, "obesity_rate_data.csv")
download_file(url_obesity, obesity_csv_path)

File source_data/obesity_rate_data.csv already exists.


In [3]:
# Load the raw WHO cross-table exactly as downloaded; the preview below shows
# that its first two rows are header rows (indicator name, then sex).
obesity_csv_path = os.path.join(source_data_folder, "obesity_rate_data.csv")
df_obesity_temp = pd.read_csv(obesity_csv_path)
df_obesity_temp.head()

Unnamed: 0.1,Unnamed: 0,2016,2016.1,2016.2,2015,2015.1,2015.2,2014,2014.1,2014.2,...,1978.2,1977,1977.1,1977.2,1976,1976.1,1976.2,1975,1975.1,1975.2
0,,"Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...",...,"Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;...","Prevalence of obesity among adults, BMI &amp;..."
1,"Countries, territories and areas",Both sexes,Male,Female,Both sexes,Male,Female,Both sexes,Male,Female,...,Female,Both sexes,Male,Female,Both sexes,Male,Female,Both sexes,Male,Female
2,Afghanistan,4.5 [2.8-6.7],2.7 [1.1-5.4],6.2 [3.5-10.2],4.3 [2.6-6.4],2.6 [1.1-5.1],6.0 [3.3-9.8],4.1 [2.5-6.1],2.4 [1.0-4.8],5.7 [3.2-9.4],...,0.8 [0.3-1.8],0.5 [0.2-1.0],0.2 [0.0-0.6],0.7 [0.2-1.8],0.4 [0.2-1.0],0.2 [0.0-0.6],0.7 [0.2-1.7],0.4 [0.1-0.9],0.2 [0.0-0.6],0.7 [0.2-1.6]
3,Albania,22.3 [17.4-27.4],21.9 [14.9-29.4],22.8 [16.1-30.1],21.7 [17.0-26.6],21.1 [14.6-28.4],22.2 [15.9-29.2],21.1 [16.6-25.7],20.4 [14.1-27.6],21.7 [15.6-28.4],...,7.9 [3.9-13.5],6.0 [3.5-9.4],4.3 [1.8-8.4],7.8 [3.7-13.6],5.8 [3.3-9.4],4.1 [1.6-8.3],7.7 [3.6-13.7],5.7 [3.2-9.3],4.0 [1.5-8.3],7.6 [3.4-13.7]
4,Algeria,26.6 [21.7-32.0],19.4 [13.2-26.5],34.0 [26.6-41.9],25.7 [21.0-30.8],18.5 [12.7-25.3],33.0 [25.9-40.6],24.8 [20.3-29.7],17.7 [12.1-24.1],32.0 [25.2-39.4],...,10.1 [5.4-16.2],6.2 [3.6-9.6],2.6 [1.0-5.3],9.8 [5.2-16.0],6.1 [3.4-9.5],2.5 [0.9-5.2],9.5 [4.9-15.9],5.9 [3.3-9.4],2.4 [0.8-5.2],9.3 [4.6-15.9]


In [4]:
# name the first column, which holds the country names
obesity_renamed = df_obesity_temp.rename(columns={"Unnamed: 0": "country"})

# After the country column the table repeats (both sexes, male, female) for
# every year, so the "both sexes" values sit at positions 1, 4, 7, ...
# Keeping those positions drops every male and female column in one step.
both_sexes_positions = [0, *range(1, obesity_renamed.shape[1], 3)]

# Rows 0 and 1 are the two leftover header rows (indicator name and sex label),
# not country data, so they are removed before renumbering the index.
df_obesity_temp = (
    obesity_renamed.iloc[:, both_sexes_positions]
    .drop(index=[0, 1])
    .reset_index(drop=True)
)

df_obesity_temp.head()

Unnamed: 0,country,2016,2015,2014,2013,2012,2011,2010,2009,2008,...,1984,1983,1982,1981,1980,1979,1978,1977,1976,1975
0,Afghanistan,4.5 [2.8-6.7],4.3 [2.6-6.4],4.1 [2.5-6.1],3.9 [2.4-5.8],3.7 [2.3-5.5],3.5 [2.2-5.2],3.3 [2.1-4.9],3.1 [1.9-4.7],2.9 [1.8-4.4],...,0.7 [0.3-1.3],0.7 [0.3-1.3],0.6 [0.3-1.2],0.6 [0.3-1.2],0.6 [0.2-1.1],0.5 [0.2-1.1],0.5 [0.2-1.0],0.5 [0.2-1.0],0.4 [0.2-1.0],0.4 [0.1-0.9]
1,Albania,22.3 [17.4-27.4],21.7 [17.0-26.6],21.1 [16.6-25.7],20.5 [16.2-25.0],19.9 [15.8-24.3],19.3 [15.3-23.7],18.7 [14.8-22.9],18.1 [14.3-22.2],17.5 [13.7-21.5],...,7.2 [4.8-10.3],7.0 [4.6-10.2],6.8 [4.4-10.0],6.6 [4.2-9.8],6.4 [4.0-9.7],6.3 [3.8-9.5],6.1 [3.6-9.5],6.0 [3.5-9.4],5.8 [3.3-9.4],5.7 [3.2-9.3]
2,Algeria,26.6 [21.7-32.0],25.7 [21.0-30.8],24.8 [20.3-29.7],23.9 [19.6-28.6],23.1 [18.9-27.6],22.2 [18.2-26.6],21.4 [17.6-25.6],20.7 [16.9-24.7],19.9 [16.3-23.8],...,8.1 [5.4-11.3],7.8 [5.1-10.9],7.4 [4.8-10.7],7.2 [4.6-10.4],6.9 [4.3-10.2],6.7 [4.1-10.0],6.4 [3.8-9.8],6.2 [3.6-9.6],6.1 [3.4-9.5],5.9 [3.3-9.4]
3,Andorra,28.0 [22.0-34.0],27.7 [21.8-33.5],27.3 [21.7-33.0],27.0 [21.6-32.5],26.6 [21.4-32.1],26.3 [21.2-31.7],26.0 [21.0-31.3],25.7 [20.7-30.9],25.4 [20.5-30.6],...,17.9 [13.3-22.9],17.4 [12.9-22.4],17.0 [12.4-22.0],16.5 [12.0-21.6],16.0 [11.5-21.2],15.6 [11.0-20.7],15.0 [10.5-20.2],14.5 [10.0-19.7],14.0 [9.4-19.2],13.4 [8.9-18.7]
4,Angola,6.8 [4.2-10.3],6.5 [4.0-9.8],6.2 [3.8-9.4],5.9 [3.6-8.9],5.6 [3.5-8.5],5.3 [3.3-8.1],5.1 [3.1-7.7],4.8 [2.9-7.3],4.6 [2.8-7.0],...,1.2 [0.5-2.2],1.1 [0.5-2.1],1.1 [0.4-2.1],1.0 [0.4-2.0],0.9 [0.4-1.9],0.9 [0.3-1.8],0.8 [0.3-1.8],0.8 [0.3-1.7],0.8 [0.3-1.7],0.7 [0.2-1.6]


In [5]:
# Each value looks like "4.5 [2.8-6.7]": a point estimate followed by a range.
# Keep only the first whitespace-separated token (the point estimate).

# Every column except the leading country column is a year column. Taking them
# from the frame itself avoids a hard-coded column count, which would raise or
# silently skip columns as soon as the download covers a different span of years.
year_columns = df_obesity_temp.columns[1:]

df_obesity_temp[year_columns] = df_obesity_temp[year_columns].apply(
    lambda column: column.str.split().str[0]
)

df_obesity_temp.head()

Unnamed: 0,country,2016,2015,2014,2013,2012,2011,2010,2009,2008,...,1984,1983,1982,1981,1980,1979,1978,1977,1976,1975
0,Afghanistan,4.5,4.3,4.1,3.9,3.7,3.5,3.3,3.1,2.9,...,0.7,0.7,0.6,0.6,0.6,0.5,0.5,0.5,0.4,0.4
1,Albania,22.3,21.7,21.1,20.5,19.9,19.3,18.7,18.1,17.5,...,7.2,7.0,6.8,6.6,6.4,6.3,6.1,6.0,5.8,5.7
2,Algeria,26.6,25.7,24.8,23.9,23.1,22.2,21.4,20.7,19.9,...,8.1,7.8,7.4,7.2,6.9,6.7,6.4,6.2,6.1,5.9
3,Andorra,28.0,27.7,27.3,27.0,26.6,26.3,26.0,25.7,25.4,...,17.9,17.4,17.0,16.5,16.0,15.6,15.0,14.5,14.0,13.4
4,Angola,6.8,6.5,6.2,5.9,5.6,5.3,5.1,4.8,4.6,...,1.2,1.1,1.1,1.0,0.9,0.9,0.8,0.8,0.8,0.7


In [6]:
# reshape from one column per year to one row per (country, year)
obesity_long = df_obesity_temp.melt(id_vars=["country"], value_name="obesity_rate")
df_obesity = obesity_long.rename(columns={"variable": "year"})

# A direct float conversion is attempted first; the printed error shows that
# some cells hold the text "No" instead of a number.
try:
    df_obesity["obesity_rate"] = df_obesity["obesity_rate"].astype(float)
except ValueError as conversion_error:
    print(str(conversion_error))

# countries with at least one "No" value
no_data_mask = df_obesity["obesity_rate"] == "No"
countries_to_skip = df_obesity.loc[no_data_mask, "country"].unique()
print(f"Countires with no obesity data: {countries_to_skip}")

# drop those countries entirely, then give both value columns numeric dtypes
has_data_mask = ~df_obesity["country"].isin(countries_to_skip)
df_obesity = df_obesity[has_data_mask].astype({"obesity_rate": float, "year": int})

could not convert string to float: 'No'
Countires with no obesity data: ['Monaco' 'San Marino' 'South Sudan' 'Sudan']


In [7]:
# WHO spelling -> unified country name used across all tables in this notebook
# (generated using dataprep.clean.clean_country); names not listed stay as-is
countrymap = {
    "Bolivia (Plurinational State of)": "Bolivia",
    "Congo": "Congo Republic",
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Czechia": "Czech Republic",
    "Democratic People's Republic of Korea": "North Korea",
    "Democratic Republic of the Congo": "DR Congo",
    "Iran (Islamic Republic of)": "Iran",
    "Kyrgyzstan": "Kyrgyz Republic",
    "Lao People's Democratic Republic": "Laos",
    "Micronesia (Federated States of)": "Micronesia, Fed. Sts.",
    "North Macedonia": "Macedonia",
    "Republic of Korea": "South Korea",
    "Republic of Moldova": "Moldova",
    "Russian Federation": "Russia",
    "Saint Kitts and Nevis": "St. Kitts and Nevis",
    "Saint Lucia": "St. Lucia",
    "Saint Vincent and the Grenadines": "St. Vincent and the Grenadines",
    "Sudan (former)": "Sudan",
    "Syrian Arab Republic": "Syria",
    "Türkiye": "Turkey",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "United Republic of Tanzania": "Tanzania",
    "United States of America": "United States",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Viet Nam": "Vietnam",
}

# swap in the unified names; this is the key the tables are later merged on
unified_names = df_obesity["country"].replace(countrymap)
df_obesity = df_obesity.assign(country=unified_names)

# preview of the cleaned obesity table
df_obesity.head()

Unnamed: 0,country,year,obesity_rate
0,Afghanistan,2016,4.5
1,Albania,2016,22.3
2,Algeria,2016,26.6
3,Andorra,2016,28.0
4,Angola,2016,6.8


## Scrape Wikipedia for calorie intake per country

In [8]:
URL = "https://en.wikipedia.org/wiki/List_of_countries_by_food_energy_intake"

# a timeout keeps a stalled connection from hanging the notebook
resp = requests.get(URL, timeout=60)

# Stop here with a clear message on a failed request; otherwise `tab` would
# never be bound and the parsing step below would fail with a NameError.
if resp.status_code != 200:
    raise RuntimeError(f"Request to {URL} failed with HTTP status {resp.status_code}")

soup = BeautifulSoup(resp.content, "html.parser")

# first table on the page carrying the "wikitable" class
tab = soup.find("table", {"class": "wikitable"})
if tab is None:
    raise ValueError(f"No table with class 'wikitable' found at {URL}")

# Passing literal HTML to read_html is deprecated, so the markup is wrapped in a
# file-like object. The first table row is skipped and the next one is used as
# the header.
df_calorie_intake = pd.read_html(io.StringIO(str(tab)), skiprows=[0], header=0)[0]

# keep only the country and its value, ordered alphabetically by country
df_calorie_intake = (
    df_calorie_intake.drop(columns=["Rank", "Year"])
    .rename(columns={"Country": "country"})
    .sort_values(by="country")
    .reset_index(drop=True)
)

In [9]:
# spelling used in the scraped table -> unified country name; names that are
# not listed are left unchanged
countrymap = {
    "Brunei": "Brunei Darussalam",
    "Cape Verde": "Cabo Verde",
    "Ivory Coast": "Cote d'Ivoire",
    "Kyrgyzstan": "Kyrgyz Republic",
    "New Caledonia (France)": "New Caledonia",
    "North Macedonia": "Macedonia",
    "Republic of the Congo": "Congo Republic",
    "Saint Kitts and Nevis": "St. Kitts and Nevis",
    "Saint Lucia": "St. Lucia",
    "Saint Vincent and the Grenadines": "St. Vincent and the Grenadines",
    "São Tomé and Príncipe": "Sao Tome and Principe",
}

unified_names = df_calorie_intake["country"].replace(countrymap)
df_calorie_intake = df_calorie_intake.assign(country=unified_names)

# preview of the cleaned calorie intake table
df_calorie_intake.head()

Unnamed: 0,country,kilocalories
0,Afghanistan,2040
1,Albania,3360
2,Algeria,3322
3,Angola,2385
4,Antigua and Barbuda,2445


## Download GDP per capita

In [10]:
# World Bank download link for indicator NY.GDP.PCAP.CD (GDP per capita, per
# this section's title). The next cell saves the response as a zip archive and
# extracts the data CSV from it.
url = "https://api.worldbank.org/v2/en/indicator/NY.GDP.PCAP.CD?downloadformat=csv"

In [11]:
import zipfile

# Download the World Bank GDP archive once and keep only its data file, stored
# as source_data/gdp_data.csv.
gdp_csv_path = os.path.join(source_data_folder, "gdp_data.csv")
gdp_zip_path = os.path.join(source_data_folder, "gdp_data.zip")

if not os.path.exists(gdp_csv_path):
    download_file(url, gdp_zip_path)

    with zipfile.ZipFile(gdp_zip_path) as zip_file:
        # The data file was named "API_NY.GDP.PCAP.CD_DS2_en_csv_v2_5454904.csv"
        # when this notebook was written. The numeric suffix is presumably a
        # release id (TODO confirm); pinning it makes extract() raise KeyError
        # as soon as the archive ships a different one, so the member is located
        # by its indicator prefix instead.
        gdp_members = [
            member
            for member in zip_file.namelist()
            if member.startswith("API_NY.GDP.PCAP.CD_") and member.endswith(".csv")
        ]
        if len(gdp_members) != 1:
            raise ValueError(
                f"Expected exactly one GDP data file in {gdp_zip_path}, "
                f"found {gdp_members}"
            )
        # extract() returns the path of the file it created
        extracted_path = zip_file.extract(gdp_members[0], path=source_data_folder)

    # the archive is no longer needed once the csv is extracted
    os.remove(gdp_zip_path)

    # give the extracted file a stable name for the following cells
    os.rename(extracted_path, gdp_csv_path)
else:
    print("Gdp data already downloaded")

Gdp data already downloaded


In [12]:
# Load the World Bank GDP table (the first 4 lines of the file are skipped) and
# drop the descriptive columns that are not needed.
df_gdp_temp = pd.read_csv(
    os.path.join(source_data_folder, "gdp_data.csv"), skiprows=4
).drop(columns=["Indicator Name", "Indicator Code", "Country Code"])

# reshape from one column per year to one row per (country, year)
df_gdp = df_gdp_temp.melt(id_vars="Country Name", value_name="gdp").rename(
    columns={"Country Name": "country", "variable": "year"}
)

# Keep only rows whose "year" label is numeric. The file carries an extra
# column that pandas labels "Unnamed: <position>"; matching the exact label
# "Unnamed: 67" stops working when the number of year columns changes, and the
# integer conversion below would then fail.
df_gdp = df_gdp[df_gdp["year"].str.isdigit()]

# numeric years, and only rows that actually have a gdp value
df_gdp = df_gdp.astype({"year": int}).dropna(subset=["gdp"])

# ignore values newer than 2016, the obesity year used in the final merge
df_gdp = df_gdp[df_gdp["year"] <= 2016]

# leave only the newest remaining gdp value for each country
df_gdp = df_gdp.loc[df_gdp.groupby("country")["year"].idxmax()]

# the year itself is not needed any more
df_gdp = df_gdp.drop(columns=["year"]).reset_index(drop=True)

In [13]:
# World Bank name -> unified country name. Entries mapped to the string "nan"
# are not countries (regional and income aggregates and similar) and are
# dropped below.
countrymap = {
    "Africa Eastern and Southern": "nan",
    "Africa Western and Central": "nan",
    "Arab World": "nan",
    "Bahamas, The": "Bahamas",
    "Caribbean small states": "nan",
    "Central Europe and the Baltics": "nan",
    "Congo, Dem. Rep.": "DR Congo",
    "Congo, Rep.": "Congo Republic",
    "Czechia": "Czech Republic",
    "Early-demographic dividend": "nan",
    "East Asia & Pacific": "nan",
    "East Asia & Pacific (IDA & IBRD countries)": "nan",
    "East Asia & Pacific (excluding high income)": "nan",
    "Egypt, Arab Rep.": "Egypt",
    "Euro area": "nan",
    "Europe & Central Asia": "nan",
    "Europe & Central Asia (IDA & IBRD countries)": "nan",
    "Europe & Central Asia (excluding high income)": "nan",
    "European Union": "nan",
    "Faroe Islands": "Faeroe Islands",
    "Fragile and conflict affected situations": "nan",
    "Gambia, The": "Gambia",
    "Heavily indebted poor countries (HIPC)": "nan",
    "High income": "nan",
    "Hong Kong SAR, China": "nan",
    "IBRD only": "nan",
    "IDA & IBRD total": "nan",
    "IDA blend": "nan",
    "IDA only": "nan",
    "IDA total": "nan",
    "Iran, Islamic Rep.": "Iran",
    "Korea, Rep.": "South Korea",
    "Lao PDR": "Laos",
    "Late-demographic dividend": "nan",
    "Latin America & Caribbean": "nan",
    "Latin America & Caribbean (excluding high income)": "nan",
    "Latin America & the Caribbean (IDA & IBRD countries)": "nan",
    "Least developed countries: UN classification": "nan",
    "Low & middle income": "nan",
    "Low income": "nan",
    "Lower middle income": "nan",
    "Macao SAR, China": "nan",
    "Middle East & North Africa": "nan",
    "Middle East & North Africa (IDA & IBRD countries)": "nan",
    "Middle East & North Africa (excluding high income)": "nan",
    "Middle income": "nan",
    "North America": "nan",
    "North Macedonia": "Macedonia",
    "OECD members": "nan",
    "Other small states": "nan",
    "Pacific island small states": "nan",
    "Post-demographic dividend": "nan",
    "Pre-demographic dividend": "nan",
    "Russian Federation": "Russia",
    "Sint Maarten (Dutch part)": "Sint Maarten",
    "Slovak Republic": "Slovakia",
    "Small states": "nan",
    "South Asia": "nan",
    "South Asia (IDA & IBRD)": "nan",
    "St. Martin (French part)": "Saint-Martin",
    "Sub-Saharan Africa": "nan",
    "Sub-Saharan Africa (IDA & IBRD countries)": "nan",
    "Sub-Saharan Africa (excluding high income)": "nan",
    "Syrian Arab Republic": "Syria",
    # A country, not an aggregate: use the same "Turkey" label the obesity table
    # is mapped to. It was "nan" before, which removed the country from the
    # GDP table and therefore from the merged result.
    "Turkiye": "Turkey",
    "Upper middle income": "nan",
    "Venezuela, RB": "Venezuela",
    "Virgin Islands (U.S.)": "United States Virgin Islands",
    "West Bank and Gaza": "Palestine",
    "World": "nan",
    "Yemen, Rep.": "Yemen",
}

# replace countries with clean, unified names
df_gdp["country"] = df_gdp["country"].replace(countrymap)

# drop the rows whose country was mapped to 'nan' (non-country entries)
df_gdp = df_gdp[df_gdp["country"] != "nan"]

df_gdp.reset_index(drop=True, inplace=True)

# countries left in the cleaned gdp dataframe
df_gdp.country.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Cayman Islands', 'Central African Republic', 'Chad',
       'Channel Islands', 'Chile', 'China', 'Colombia', 'Comoros',
       'DR Congo', 'Congo Republic', 'Costa Rica', "Cote d'Ivoire",
       'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czech Republic',
       'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador',
       'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Faeroe Islands', 'Fiji', 'Finland',
       'France', 'French P

## Download urbanization data

In [14]:
# World Bank download link for indicator SP.URB.TOTL.IN.ZS (the urbanization
# data of this section). The next cell saves the response as a zip archive and
# extracts the data CSV from it. Note that this rebinds the `url` name used
# for the GDP download above.
url = "https://api.worldbank.org/v2/en/indicator/SP.URB.TOTL.IN.ZS?downloadformat=csv"

In [15]:
# Download the World Bank urbanization archive once and keep only its data
# file, stored as source_data/urbanization_data.csv.
urbanization_csv_path = os.path.join(source_data_folder, "urbanization_data.csv")
urbanization_zip_path = os.path.join(source_data_folder, "urbanization_data.zip")

if not os.path.exists(urbanization_csv_path):
    download_file(url, urbanization_zip_path)

    with zipfile.ZipFile(urbanization_zip_path) as zip_file:
        # The data file was named
        # "API_SP.URB.TOTL.IN.ZS_DS2_en_csv_v2_5455584.csv" when this notebook
        # was written. The numeric suffix is presumably a release id (TODO
        # confirm); pinning it makes extract() raise KeyError as soon as the
        # archive ships a different one, so the member is located by its
        # indicator prefix instead.
        urbanization_members = [
            member
            for member in zip_file.namelist()
            if member.startswith("API_SP.URB.TOTL.IN.ZS_") and member.endswith(".csv")
        ]
        if len(urbanization_members) != 1:
            raise ValueError(
                f"Expected exactly one urbanization data file in "
                f"{urbanization_zip_path}, found {urbanization_members}"
            )
        # extract() returns the path of the file it created
        extracted_path = zip_file.extract(
            urbanization_members[0], path=source_data_folder
        )

    # the archive is no longer needed once the csv is extracted
    os.remove(urbanization_zip_path)

    # give the extracted file a stable name for the following cells
    os.rename(extracted_path, urbanization_csv_path)
else:
    # message fixed: this branch is about the urbanization file, not gdp
    print("Urbanization data already downloaded")

Gdp data already downloaded


In [16]:
# Load the World Bank urbanization table (the first 4 lines of the file are
# skipped) and drop the descriptive columns that are not needed.
df_urbanization_temp = pd.read_csv(
    os.path.join(source_data_folder, "urbanization_data.csv"), skiprows=4
).drop(columns=["Indicator Name", "Indicator Code", "Country Code"])

# reshape from one column per year to one row per (country, year)
df_urbanization = df_urbanization_temp.melt(
    id_vars="Country Name", value_name="urbanization"
).rename(columns={"Country Name": "country", "variable": "year"})

# Keep only rows whose "year" label is numeric. The file carries an extra
# column that pandas labels "Unnamed: <position>"; matching the exact label
# "Unnamed: 67" stops working when the number of year columns changes, and the
# integer conversion below would then fail.
df_urbanization = df_urbanization[df_urbanization["year"].str.isdigit()]

# numeric years, and only rows that actually have an urbanization value
df_urbanization = df_urbanization.astype({"year": int}).dropna(
    subset=["urbanization"]
)

# ignore values newer than 2016, the obesity year used in the final merge
df_urbanization = df_urbanization[df_urbanization["year"] <= 2016]

# leave only the newest remaining urbanization value for each country
df_urbanization = df_urbanization.loc[
    df_urbanization.groupby("country")["year"].idxmax()
]

# the year itself is not needed any more
df_urbanization = df_urbanization.drop(columns=["year"]).reset_index(drop=True)

In [17]:
# World Bank name -> unified country name. Entries mapped to the string "nan"
# are not countries (regional and income aggregates and similar) and are
# dropped below.
countrymap = {
    "Africa Eastern and Southern": "nan",
    "Africa Western and Central": "nan",
    "Arab World": "nan",
    "Bahamas, The": "Bahamas",
    "Caribbean small states": "nan",
    "Central Europe and the Baltics": "nan",
    "Congo, Dem. Rep.": "DR Congo",
    "Congo, Rep.": "Congo Republic",
    "Czechia": "Czech Republic",
    "Early-demographic dividend": "nan",
    "East Asia & Pacific": "nan",
    "East Asia & Pacific (IDA & IBRD countries)": "nan",
    "East Asia & Pacific (excluding high income)": "nan",
    "Egypt, Arab Rep.": "Egypt",
    "Euro area": "nan",
    "Europe & Central Asia": "nan",
    "Europe & Central Asia (IDA & IBRD countries)": "nan",
    "Europe & Central Asia (excluding high income)": "nan",
    "European Union": "nan",
    "Faroe Islands": "Faeroe Islands",
    "Fragile and conflict affected situations": "nan",
    "Gambia, The": "Gambia",
    "Heavily indebted poor countries (HIPC)": "nan",
    "High income": "nan",
    "Hong Kong SAR, China": "nan",
    "IBRD only": "nan",
    "IDA & IBRD total": "nan",
    "IDA blend": "nan",
    "IDA only": "nan",
    "IDA total": "nan",
    "Iran, Islamic Rep.": "Iran",
    "Korea, Rep.": "South Korea",
    "Lao PDR": "Laos",
    "Late-demographic dividend": "nan",
    "Latin America & Caribbean": "nan",
    "Latin America & Caribbean (excluding high income)": "nan",
    "Latin America & the Caribbean (IDA & IBRD countries)": "nan",
    "Least developed countries: UN classification": "nan",
    "Low & middle income": "nan",
    "Low income": "nan",
    "Lower middle income": "nan",
    "Macao SAR, China": "nan",
    "Middle East & North Africa": "nan",
    "Middle East & North Africa (IDA & IBRD countries)": "nan",
    "Middle East & North Africa (excluding high income)": "nan",
    "Middle income": "nan",
    "North America": "nan",
    "North Macedonia": "Macedonia",
    "OECD members": "nan",
    "Other small states": "nan",
    "Pacific island small states": "nan",
    "Post-demographic dividend": "nan",
    "Pre-demographic dividend": "nan",
    "Russian Federation": "Russia",
    "Sint Maarten (Dutch part)": "Sint Maarten",
    "Slovak Republic": "Slovakia",
    "Small states": "nan",
    "South Asia": "nan",
    "South Asia (IDA & IBRD)": "nan",
    "St. Martin (French part)": "Saint-Martin",
    "Sub-Saharan Africa": "nan",
    "Sub-Saharan Africa (IDA & IBRD countries)": "nan",
    "Sub-Saharan Africa (excluding high income)": "nan",
    "Syrian Arab Republic": "Syria",
    # A country, not an aggregate: use the same "Turkey" label the obesity table
    # is mapped to. It was "nan" before, which removed the country from the
    # urbanization table and therefore from the merged result.
    "Turkiye": "Turkey",
    "Upper middle income": "nan",
    "Venezuela, RB": "Venezuela",
    "Virgin Islands (U.S.)": "United States Virgin Islands",
    "West Bank and Gaza": "Palestine",
    "World": "nan",
    "Yemen, Rep.": "Yemen",
}

# replace countries with clean, unified names
df_urbanization["country"] = df_urbanization["country"].replace(countrymap)

# drop the rows whose country was mapped to 'nan' (non-country entries)
df_urbanization = df_urbanization[df_urbanization["country"] != "nan"]

df_urbanization.reset_index(drop=True, inplace=True)

# preview of the cleaned urbanization dataframe
df_urbanization.head()

Unnamed: 0,country,urbanization
0,Afghanistan,25.02
1,Albania,58.421
2,Algeria,71.459
3,American Samoa,87.198
4,Andorra,88.248


## Combine cleaned data

In [18]:
# obesity rows for 2016 only (the newest year column in the WHO table)
obesity_2016 = df_obesity[df_obesity["year"] == 2016]

# Join the four tables on the unified country name. These are inner joins, so
# a country missing from any one table is left out of the result.
df_merged = (
    obesity_2016.merge(df_calorie_intake, on="country")
    .merge(df_gdp, on="country")
    .merge(df_urbanization, on="country")
    .drop(columns=["year"])
)

# preview of the final merged dataframe
df_merged.head()

Unnamed: 0,country,obesity_rate,kilocalories,gdp,urbanization
0,Afghanistan,4.5,2040,520.252064,25.02
1,Albania,22.3,3360,4124.05539,58.421
2,Algeria,26.6,3322,3967.199451,71.459
3,Angola,6.8,2385,1709.515534,64.149
4,Antigua and Barbuda,19.1,2445,15862.651663,24.846


## Serialize cleaned data

In [19]:
processed_data_folder = "processed_data"

# create the output directory on first run
if not os.path.exists(processed_data_folder):
    print(f"Creating directory '{processed_data_folder}'")
    os.makedirs(processed_data_folder)

# pickle file name -> cleaned data frame stored under that name
frames_to_save = {
    "df_obesity.pkl": df_obesity,
    "df_calorie_intake.pkl": df_calorie_intake,
    "df_gdp.pkl": df_gdp,
    "df_urbanization.pkl": df_urbanization,
    "df_merged.pkl": df_merged,
}

for pickle_name, frame in frames_to_save.items():
    frame.to_pickle(os.path.join(processed_data_folder, pickle_name))