In [1]:
# Import Modules
import pandas as pd
import numpy as np
import altair as alt
import warnings
import re


In [2]:
# Set the notebook to display all columns of a dataframe
# (None removes the column limit for every dataframe rendered below).
pd.set_option("display.max_columns", None)

# Suppress warnings for clean cell outputs.
# NOTE(review): no category or module is given, so this is a blanket filter
# that silences every warning raised for the rest of the session.
warnings.filterwarnings("ignore")


# Load Data

In [3]:
# Load in cleaned & combined data.
# The path is relative, so it is resolved against the directory the
# notebook kernel is running in.
df = pd.read_csv("./data/complete_dataset_with_interpolation.csv")


# **Explore the cleaned dataset**

We're going to start by looking at the combined dataset. This cleaned dataset includes the `World Bank's Development Indicators`, `Climate Watch Greenhouse Gas Emissions Data`, and the `UN's Sustainable Development Poverty Data`. Recording observations on the structure of this data will guide further analysis.

In [4]:
# Look at five random records that carry a `gas` label (GHG emissions rows).
# The sample is not seeded, so the rows shown differ between runs.
df[df["gas"].notna()].sample(5)


Unnamed: 0,country,c_code,indicator,i_code,gas,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
4376,Brunei,BRN,Fugitive Emissions,,CO2,0.06,0.06,0.06,0.06,0.13,0.1,0.11,0.14,0.2,0.24,0.29,0.4,0.26,0.27,0.18,0.17,0.3,0.3,0.69,0.44,0.31,0.32,0.33,0.36,0.38,0.42,0.56,0.58,0.4,0.4
21952,Netherlands,NLD,Other Fuel Combustion,,CO2,10.04,10.96,10.64,11.0,10.4,10.61,11.34,9.89,9.77,9.68,9.79,9.55,8.82,8.73,8.72,8.83,7.72,7.61,7.62,7.52,8.78,7.66,7.53,7.55,6.72,7.11,7.12,6.86,7.16,6.98
7589,Cyprus,CYP,Manufacturing/Construction,,CO2,0.8,1.29,1.13,1.19,1.22,1.24,1.37,1.29,1.33,1.37,1.4,1.34,1.33,1.37,1.4,1.01,0.89,0.91,0.89,0.77,0.68,0.57,0.47,0.54,0.69,0.61,0.6,0.62,0.55,0.56
24765,Portugal,PRT,Bunker Fuels,,All GHG,3.34,3.43,3.49,3.13,3.08,3.12,3.15,3.2,2.92,3.76,4.08,3.4,3.37,3.82,4.23,4.05,4.36,4.15,4.35,4.02,4.14,4.58,4.75,4.91,4.89,5.24,5.94,6.39,6.74,7.49
3016,Belize,BLZ,Bunker Fuels,,CO2,0.04,0.05,0.07,0.04,0.05,0.03,0.04,0.05,0.08,0.08,0.1,0.06,0.07,0.07,0.07,0.08,0.08,0.06,0.06,0.15,0.05,0.04,0.06,0.04,0.04,0.05,0.05,0.05,0.05,0.05


In [5]:
# Examine records for the poverty variable.
# The indicator label is matched exactly, so it must be spelled
# character-for-character as it appears in the `indicator` column.
df[
    df["indicator"]
    == "Proportion of population below international poverty line (%)PERCENTALLAGEALLAREABOTHSEX"
].head()


Unnamed: 0,country,c_code,indicator,i_code,gas,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
320,Albania,ALB,Proportion of population below international p...,SI_POV_DAY1,,,,0.9,0.9,0.9,0.9,0.9,1.016667,1.133333,1.25,1.366667,1.483333,1.6,1.366667,1.133333,0.9,0.7,0.5,0.3,0.425,0.55,0.675,0.8,1.2,1.6,0.2,0.4,0.4,0.1,0.0
505,Algeria,DZA,Proportion of population below international p...,SI_POV_DAY1,,,5.6,5.6,5.6,5.6,5.6,5.275,4.95,4.625,4.3,,,,,,,,1.7,1.375,1.05,0.725,0.4,0.4,0.4,0.4,0.4,,,,
767,Angola,AGO,Proportion of population below international p...,SI_POV_DAY1,,,,,,,,36.4,36.4,36.4,36.4,36.4,36.15,35.9,35.65,35.4,35.15,34.9,34.65,34.4,35.95,37.5,39.05,40.6,,43.7,45.25,46.8,48.35,49.9,49.9
1115,Argentina,ARG,Proportion of population below international p...,SI_POV_DAY1,,1.3,1.3,2.5,2.8,2.7,4.9,5.3,4.8,5.6,5.5,6.8,10.7,16.9,7.9,5.7,4.2,3.3,2.6,2.5,2.3,1.4,1.1,1.2,1.0,0.9,1.0,1.1,0.9,1.4,1.3
1318,Armenia,ARM,Proportion of population below international p...,SI_POV_DAY1,,,,,,,12.2,12.2,12.2,12.2,12.2,13.3,14.4,10.5,7.9,5.5,2.7,2.1,1.6,0.9,1.3,1.0,1.2,0.9,1.8,1.5,1.3,1.2,0.9,1.4,1.1


In [6]:
# Look at unique Greenhouse Gases.
# NaN is included in the result for rows that carry no `gas` label.
df["gas"].unique()


array(['All GHG', 'CH4', 'N2O', nan, 'CO2', 'F-Gas'], dtype=object)

In [7]:
# List the distinct sector labels found on rows that carry a gas label,
# i.e. the sectors for which GHG emissions are reported.
df.loc[df["gas"].notna(), "indicator"].unique()


array(['Agriculture', 'Building', 'Bunker Fuels', 'Electricity/Heat',
       'Energy', 'Fugitive Emissions', 'Industrial Processes',
       'Land-Use Change and Forestry', 'Manufacturing/Construction',
       'Other Fuel Combustion', 'Total excluding LUCF',
       'Total including LUCF', 'Transportation', 'Waste'], dtype=object)

### Observations 
* Data is broken down for various GHG types in the climate watch data. To maintain consistency - we will analyze relationships in our combined dataset using only `All GHG` climate watch data. 
* It looks like the GHG climate watch data is also broken down by sector of a country's economy. There may be some interesting relationships that exist within these sub groups of emissions data.

In [8]:
# Build the working dataframe from the raw one. Every change below is made
# on a new object, so the raw `df` loaded earlier stays unaltered in the
# workspace and this cell can be re-run safely.

# Relabel rows whose country is "Republic of Congo".
# NOTE(review): "Republic of Congo" and "Democratic Republic of the Congo"
# are normally two different countries. Confirm this relabel corrects a
# labelling problem in the source data rather than merging two distinct
# countries, because later cells group by country name.
combined_df = df.assign(
    country=df["country"].replace(
        {"Republic of Congo": "Democratic Republic of the Congo"}
    )
)

# Drop the World and European Union records (aggregates, not countries).
# The trailing .copy() makes the result an independent dataframe rather than
# a filtered slice, so later column assignments on it are unambiguous.
to_drop = ["European Union (27)", "World"]
combined_df = combined_df[~combined_df["country"].isin(to_drop)].copy()

# Filter data to grab only the variables we care about.
# Keys: indicator labels to pull from the dataset (GHG labels are written as
#       "<indicator>, <gas>").
# Values: short column names used for those variables in the long-form data.
# NOTE(review): the name `vars` shadows the Python builtin of the same name;
# it is kept because the call that builds `out_df` refers to it.
# NOTE(review): the rendered `out_df` output in this notebook has no
# Total_Population column, which suggests "Population, total" does not match
# any indicator label in the dataset - verify the spelling.
vars = {
    "Proportion of population below international poverty line (%)PERCENTALLAGEALLAREABOTHSEX": "Percent_Poverty",
    "Total including LUCF, All GHG": "Total_GHG",
    "Total excluding LUCF, All GHG": "Total_GHG_exc",
    "GDP, PPP (current international $)": "GDP_Total",
    "Population, total": "Total_Population",
}


def get_long_form(to_melt_df, vars_of_interest, var_dict):
    """
    Converts a wide-format dataframe (one column per year) into a tidy
    dataframe with one row per country-year and one column per variable.

    The input dataframe is never modified, so the function can be called
    repeatedly on the same dataframe and always returns the same result.

    Parameters:

         to_melt_df: dataframe in wide format. Must contain the columns
         "country", "c_code", "indicator" and "gas", plus one column per
         year whose label is exactly four digits (e.g. "1990").

         vars_of_interest: iterable of indicator labels to keep. Rows that
         have a gas value are matched on the combined label
         "<indicator>, <gas>"; all other rows on the plain indicator.

         var_dict: a dictionary mapping the labels in vars_of_interest to
         the column names used in the output.

    Returns:

         dataframe: columns "Country", "c_code", "Year" (int) followed by
         one column per variable found in the data. Duplicate
         country/variable/year records are averaged.

    """
    # Build the combined "<indicator>, <gas>" label on a separate Series
    # instead of writing it back into the caller's dataframe.
    has_gas = to_melt_df["gas"].notna()
    combined_label = (
        to_melt_df["indicator"].astype(str) + ", " + to_melt_df["gas"].astype(str)
    )
    indicator = to_melt_df["indicator"].where(~has_gas, combined_label)

    # Filter the dataframe to grab vars of interest. list() lets callers pass
    # any iterable, e.g. dict.keys().
    keep = indicator.isin(list(vars_of_interest))
    interest_df = to_melt_df.loc[keep].assign(indicator=indicator[keep].to_numpy())

    # Get the year columns: labels made of exactly four digits. fullmatch
    # (rather than search) keeps out any column that merely contains four
    # digits somewhere in its name.
    year_cols = [
        col for col in interest_df.columns if re.fullmatch(r"\d{4}", str(col))
    ]

    # Melt the interest_df to long form
    out_dat = interest_df.melt(
        id_vars=["country", "c_code", "indicator"],
        value_vars=year_cols,
        var_name="Year",
        value_name="Value",
    )

    # Rename the variable labels. Only the indicator column is touched so a
    # matching string in another column can never be rewritten by accident.
    out_dat["indicator"] = out_dat["indicator"].replace(var_dict)

    # Groupby the indicator-year-country combos and get the mean
    # value to avoid duplicates in the index
    out_dat = (
        out_dat.groupby(["country", "c_code", "indicator", "Year"])["Value"]
        .mean()
        .reset_index()
    )

    # Pivot the dataframe to get the variables as the column labels.
    # Scalar `columns`/`values` give flat column labels directly.
    out_dat = out_dat.pivot(
        index=["country", "c_code", "Year"], columns="indicator", values="Value"
    ).reset_index()

    # Drop the leftover "indicator" axis name and capitalise the country label
    out_dat = out_dat.rename_axis(None, axis=1).rename(columns={"country": "Country"})

    # Convert year column to type int
    out_dat["Year"] = out_dat["Year"].astype(int)

    return out_dat

# Build the tidy country-year table: the keys of `vars` select the
# indicators, and the mapping itself supplies their short column names.
out_df = get_long_form(
    to_melt_df=combined_df,
    vars_of_interest=vars.keys(),
    var_dict=vars,
)

# View the result
out_df.head()


Unnamed: 0,Country,c_code,Year,GDP_Total,Percent_Poverty,Total_GHG,Total_GHG_exc
0,Afghanistan,AFG,1990,,,9.58,11.97
1,Afghanistan,AFG,1991,,,9.81,12.2
2,Afghanistan,AFG,1992,,,9.03,11.42
3,Afghanistan,AFG,1993,,,9.11,11.5
4,Afghanistan,AFG,1994,,,9.15,11.54


In [9]:
# Send long form of the data to CSV so it can be referenced in future notebooks

# out_df.to_csv("./data/complete_dataset_LongFormat.csv", index=False)


##  **Examine Missing Values for each variable and year**
Here I will call a custom groupby function (defined below) that gives a total count of non-null values for each variable in `out_df`. This will help give us an idea of where we have missing data in the dataframe. 

In [10]:
def summarize_na(dat):
    """
    Count the non-missing values in every column of a dataframe.

    Intended to be passed to a groupby ``apply`` so the counts are
    produced once per group.

    Parameters:

        dat: dataframe whose columns are to be summarized

    Returns:

        Series: indexed by the column labels of ``dat``; each value is the
        number of non-null entries in that column
    """

    # DataFrame.count() tallies the non-null cells column by column
    non_missing = dat.count()

    return non_missing.rename_axis(None)


# Look at counts of non-NaN values per variable, by year.
# summarize_na is applied once per Year group and returns one row of counts.
count_by_year = out_df.groupby("Year").apply(summarize_na)
count_by_year.tail(20)


Unnamed: 0_level_0,Country,c_code,Year,GDP_Total,Percent_Poverty,Total_GHG,Total_GHG_exc
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000,191,191,191,183,128,191,191
2001,191,191,191,183,131,191,191
2002,191,191,191,183,140,191,191
2003,191,191,191,183,139,191,191
2004,191,191,191,184,139,191,191
2005,191,191,191,184,143,191,191
2006,191,191,191,184,146,191,191
2007,191,191,191,184,147,191,191
2008,191,191,191,184,149,191,191
2009,191,191,191,186,149,191,191


### Observations of missing values:
* SDG poverty data is sparse for the first decade of the 21st century. 
* More poverty data is available from 2010 onward, but still only ~25% of all countries have this data. 
* We have good coverage of total population, GHG data, and GDP across all years and countries. 

### **Quantify Emissions Reductions by Country**
I'm wondering about which countries have decreased their emissions most from their peak values. Below I group-by country and calculate the change in greenhouse gas emissions between the peak value and the 2019 value. Questions I hope to answer here include:

1. Which countries have reduced their emissions _**the most**_?
2. Of these countries that have decreased emissions from their peak values, how many of them are high GDP countries or low GDP countries?

In [11]:
def get_GHG_delta(dat):
    """
    Compare a country's peak emissions with its most recent emissions.

    Intended to be used in a groupby of countries, receiving the rows of
    one country at a time.

    Parameters:

        dat: dataframe with a "Year" column and a "Total_GHG_exc" column.

    Returns:

        Series with the entries
            Peak_Emissions_Yr:   year of the highest Total_GHG_exc value
                                 (the earliest such row on ties)
            Peak_Emissions_Val:  that highest value
            Most_Recent_Yr:      the largest year present in dat
            Recent_Emission_Val: Total_GHG_exc in that year (may be NaN)
            Emissions_Delta:     peak value minus most recent value

        If dat has no non-null Total_GHG_exc value, the peak fields and the
        delta are NaN instead of an error being raised.
    """
    # Work positionally so the result does not depend on dat's index labels
    emissions = dat["Total_GHG_exc"].reset_index(drop=True)
    years = dat["Year"].reset_index(drop=True)

    # Peak emissions: idxmax skips NaN and returns the first maximum
    if emissions.notna().any():
        peak_pos = emissions.idxmax()
        peak_emissions_val = emissions.iloc[peak_pos]
        peak_emissions_yr = int(years.iloc[peak_pos])
    else:
        # No emissions data at all for this group
        peak_emissions_val = float("nan")
        peak_emissions_yr = float("nan")

    # Most recent emissions: the row holding the largest year
    recent_pos = years.idxmax()
    emissions_recent_val = emissions.iloc[recent_pos]
    emission_recent_yr = int(years.iloc[recent_pos])

    # Get the delta between peak and most recent emissions amounts
    delta = peak_emissions_val - emissions_recent_val

    # output dictionary with data
    out_dict = {
        "Peak_Emissions_Yr": peak_emissions_yr,
        "Peak_Emissions_Val": peak_emissions_val,
        "Most_Recent_Yr": emission_recent_yr,
        "Recent_Emission_Val": emissions_recent_val,
        "Emissions_Delta": delta,
    }

    return pd.Series(out_dict, index=list(out_dict.keys()))


# Groupby Country and Apply the Custom Function.
# Groups are formed on the country name only, so rows that share a name
# are summarized together as one country.
reductions_df = out_df.groupby("Country").apply(get_GHG_delta).reset_index()

# View Results (unseeded sample, so the rows shown differ between runs)
reductions_df.sample(5)


Unnamed: 0,Country,Peak_Emissions_Yr,Peak_Emissions_Val,Most_Recent_Yr,Recent_Emission_Val,Emissions_Delta
172,Trinidad and Tobago,2014.0,33.03,2019.0,28.34,4.69
142,Saint Lucia,2009.0,1.65,2019.0,0.88,0.77
102,Malaysia,2019.0,313.02,2019.0,313.02,0.0
70,Guyana,2019.0,5.68,2019.0,5.68,0.0
60,Gabon,1998.0,15.35,2019.0,13.17,2.18


### **Visualize Emissions Reductions**

In [12]:
def get_reductions_chart(top_n=10):
    """
    Build a bar chart of the countries with the largest emissions
    reductions since their peak.

    Reads the module-level `reductions_df` and keeps only countries whose
    Emissions_Delta is positive.

    Parameters:

        top_n: number of countries to draw, largest reduction first.
        Defaults to 10.

    Returns:

        tuple: (chart, sort_order) where chart is the layered altair chart
        (bars plus value labels) and sort_order is the list of ALL countries
        with a positive reduction, ordered from largest to smallest.
    """
    # sort_values returns a new dataframe; sorting the filtered selection
    # in place would modify a slice of reductions_df.
    chart_df = reductions_df[reductions_df["Emissions_Delta"] > 0].sort_values(
        "Emissions_Delta", ascending=False
    )

    # Full ranking, taken before the dataframe is trimmed to the top rows
    sort_order = chart_df["Country"].unique().tolist()

    chart_df = chart_df.head(top_n)

    # Specs for the bar chart
    bars = (
        alt.Chart(chart_df)
        .mark_bar()
        .encode(
            y=alt.Y("Country:N", sort=sort_order, axis=alt.Axis(labelFontSize=12)),
            x=alt.X(
                "Emissions_Delta:Q",
                axis=alt.Axis(
                    title="Reductions in GHG Since Peak Emissions (MtCO2, excluding LUCF)",
                    titleFontSize=12,
                    labelFontSize=12,
                ),
            ),
        )
    )

    # Value labels drawn just past the end of each bar
    text = bars.mark_text(align="left", baseline="middle", dx=3).encode(
        y=alt.Y("Country:N", axis=alt.Axis(title=None), sort=sort_order),
        x=alt.X("Emissions_Delta:Q"),
        text=alt.Text("Emissions_Delta:Q", format=".0f"),
    )

    out = (
        alt.layer(bars, text)
        .properties(
            title={
                "text": "Reductions in Greenhouse Gasses since Peak Emissions by Country"
            },
        )
        .configure_title(fontSize=14, align="left")
        .configure_text(fontSize=12)
    )

    return out, sort_order


# Build the chart; `order` receives the country ranking returned with it
chart, order = get_reductions_chart()

# Display the chart as the cell output
chart


## **Identify Top 10 GHG emitting countries**
### First: prep the data for the visualization.

In [13]:
# Variable used to rank the countries
var = "Total_GHG"

# Filter for only data from 2010 onward
plot_df = out_df[out_df["Year"] >= 2010]

# Grab columns to summarize
plot_df = plot_df[["Country", "GDP_Total", "Total_GHG"]]

# Grab the mean over those years for each country.
# .mean() is used directly; passing the numpy function to .agg() is
# deprecated in newer pandas releases.
plot_df = plot_df.groupby("Country").mean().reset_index()

# edit the field specifying GDP to show $ amount in billions
plot_df["GDP_Total"] = plot_df["GDP_Total"] / 1_000_000_000

# Names of the 10 highest and 10 lowest emitting countries, in rank order
ranked_desc = plot_df.sort_values(var, ascending=False)
sort_top = ranked_desc["Country"].head(10).tolist()
ranked_asc = plot_df.sort_values(var, ascending=True)
sort_bottom = ranked_asc["Country"].head(10).tolist()


# Melt the data so the values are in one column
viz_df = plot_df.melt(
    id_vars=["Country"],
    value_vars=["GDP_Total", "Total_GHG"],
    var_name="Parameter",
    value_name="Value",
)

# Give the parameters readable labels; only the Parameter column is touched
viz_df["Parameter"] = viz_df["Parameter"].replace(
    {"GDP_Total": "GDP PPP (billions $)", "Total_GHG": "Emissions (MtCO2)"}
)

# Long-form rows for the countries we are interested in.
# The names top_15 / bottom_15 are kept for the cells that follow, but each
# holds the rows of 10 countries.
top_15 = viz_df[viz_df["Country"].isin(sort_top)]
bottom_15 = viz_df[viz_df["Country"].isin(sort_bottom)]


### Second: Make the visual.

In [14]:
# Use altair to render the visualization: one horizontal bar per parameter,
# repeated in a single-column facet with one panel per country.

# Title block shown above the faceted chart
chart_title = {
    "text": "Emissions and GDP Values for the top 10 Greenhouse Gas Emitting Countries",
    "subtitle": "Emissions and GDP values shown are the average of values between 2010 and 2019",
}

# One facet per country, ordered by the emissions ranking
country_facet = alt.Facet(
    "Country:N",
    sort=sort_top,
    title=None,
    header=alt.Header(labelFontSize=14, labelFontWeight="bold"),
)

# Bars for each parameter, colored by parameter
parameter_bars = (
    alt.Chart(top_15)
    .mark_bar()
    .encode(
        x="Value:Q",
        y=alt.Y("Parameter:N", axis=alt.Axis(title=None)),
        color="Parameter:N",
    )
)

# Assemble the facets and apply the chart-wide styling
out_chart = (
    parameter_bars.facet(facet=country_facet, columns=1, spacing=4)
    .properties(title=chart_title)
    .configure_title(fontSize=16, subtitleFontSize=12)
    .configure_legend(
        titleFontSize=14,
        labelFontSize=12,
        titlePadding=10,
        strokeColor="gray",
        padding=10,
    )
    .configure_header(
        labelAnchor="middle", labelOrient="right", labelAngle=0, labelPadding=20
    )
)

out_chart


In [15]:
%reload_ext watermark

%watermark -iv -v -m

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.5.0

Compiler    : Clang 13.1.6 (clang-1316.0.21.2.5)
OS          : Darwin
Release     : 21.5.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

sys   : 3.10.6 (main, Aug 30 2022, 05:12:36) [Clang 13.1.6 (clang-1316.0.21.2.5)]
re    : 2.2.1
altair: 4.2.0
numpy : 1.23.3
pandas: 1.5.0

