In [5]:
# Import Modules 
import pandas as pd 
import numpy as np 
import altair as alt 
import warnings

In [6]:
# Show every column when a dataframe is displayed (no column truncation)
pd.options.display.max_columns = None

# Silence all warnings so that cell outputs stay uncluttered
warnings.simplefilter("ignore")

# Load Data

In [7]:
# Cleaned and combined dataset, stored in long format
out_df = pd.read_csv(
    "./data/complete_dataset_LongFormat.csv",
)

# Country-level correlation table written out by notebook 07
country_corr_df = pd.read_csv(
    "./data/corr_country.csv",
)


## **Examine Negative Correlations between Poverty and Emissions**

Here I'm interested in evaluating the negative correlations between the percent poverty and emissions variables that occur within countries. Since the global correlation was very weak, it is interesting that some countries may show strong negative correlations here. 

### Identify countries with strong negative correlations between poverty and emissions

In [8]:
# Keep only the rows holding the poverty vs. total-GHG correlation
pov_ghg = country_corr_df.loc[
    country_corr_df["Var_Label"].eq("Percent_Poverty + Total_GHG")
]

# Most negative Pearson coefficients first; display the first five rows
pov_ghg.sort_values(by="Pearson", ascending=True).head(5)

Unnamed: 0,Country,Var_1,Var_2,Var_Label,Pearson,Pearson_Pval,Spearman,Spearman_Pval,n
736,Philippines,Percent_Poverty,Total_GHG,Percent_Poverty + Total_GHG,-0.985929,1.382829e-18,-0.903899,1.418452e-09,24
814,Seychelles,Percent_Poverty,Total_GHG,Percent_Poverty + Total_GHG,-0.981364,9.384581e-08,-0.953245,5.653471e-06,11
380,Guyana,Percent_Poverty,Total_GHG,Percent_Poverty + Total_GHG,-0.979046,5.979104e-09,-0.949415,7.192982e-07,13
410,India,Percent_Poverty,Total_GHG,Percent_Poverty + Total_GHG,-0.975914,4.878877e-16,-0.993457,3.145828e-22,24
573,Mali,Percent_Poverty,Total_GHG,Percent_Poverty + Total_GHG,-0.972769,2.5566929999999997e-19,-0.98562,3.62738e-23,30


### **Visualize the Negative Correlations for Poverty and Emissions**

In [9]:
# Countries drawn from the 15 most negative within-country correlations between
# poverty and emissions. We chose these countries because of their dramatic
# difference in poverty values, the Pearson r for the top 15 is within 0.05 and
# they all show very strong trends.
# Entries that are commented out are left out of the figure.
top_15_neg_pov_ghg = [
    "Philippines",
    # "Seychelles",
    # "Guyana",
    "India",
    "Mali",
    "Senegal",
    # "Bangladesh",
    # "Peru",
    # "Algeria",
    # "Niger",
    "China",
    # "Chad",
    # "Uganda",
    # "Maldives",
    # "Tunisia",
]

# Data frame for the visual. Take an explicit copy so that relabelling the
# Country column below writes to an independent frame rather than to a slice
# of out_df (chained assignment).
corr_df_2 = out_df[out_df["Country"].isin(top_15_neg_pov_ghg)].copy()


# Map each selected country to its facet title,
# "<Country> (r=<Pearson rounded to 2 decimals>, n=<n>)".
# Rows are sorted by Pearson first, so the dict keeps the countries ordered
# from the most negative correlation upwards.
developing_labels = {
    row.Country: f"{row.Country} (r={round(row.Pearson, 2)}, n={row.n})"
    for row in country_corr_df[
        country_corr_df["Var_Label"].eq("Percent_Poverty + Total_GHG")
        & country_corr_df["Country"].isin(top_15_neg_pov_ghg)
    ]
    .sort_values("Pearson")
    .itertuples(index=False)
}

# Redefine the country field for titles. The explicit lookup raises KeyError
# if a selected country has no row in the correlation table.
corr_df_2["Country"] = corr_df_2["Country"].map(lambda name: developing_labels[name])

# Get the order display for facet visual (most negative correlation first)
sort_order = list(developing_labels.values())


def get_neg_corr_viz(country):
    """
    Build the poverty-vs-emissions correlation chart for one country.

    Layers a scatter plot of Percent_Poverty against Total_GHG with a
    fitted regression line, using the rows of the module-level
    ``corr_df_2`` whose Country value equals ``country``.

    Parameters:

        country: str, a value of the (relabelled) Country column of
            ``corr_df_2``; it is also used as the chart title.

    Returns:
        A single layered chart (scatter + regression line) showing the
        relationship between the emissions and percent poverty variables.
    """

    # Only the first chart in the display order carries the y-axis title.
    # Compare against the module-level ``sort_order`` instead of a hard-coded
    # label so the check keeps working when the country list or the label
    # format changes.
    is_first_chart = bool(sort_order) and country == sort_order[0]

    # title=None removes the axis title on the remaining charts; their ticks
    # and tick labels are kept.
    y_title = "Percent Poverty (%)" if is_first_chart else None
    y_params = alt.Y("Percent_Poverty:Q", axis=alt.Axis(title=y_title))

    # Scatter plot layer -1 specifications
    base = (
        alt.Chart(corr_df_2)
        .transform_filter(alt.datum.Country == country)
        .mark_circle()
        .encode(
            x=alt.X(
                "Total_GHG:Q",
                axis=alt.Axis(
                    # A list of strings renders as a multi-line axis title
                    title=[
                        "Total Greenhouse",
                        "Gas Emissions",
                        "(including LUCF, MtCO2)",
                    ]
                ),
            ),
            y=y_params,
        )
    )

    # Points: the year is shown on hover, colour is keyed on the country label
    scatter = base.encode(
        tooltip=["Year:N"], color=alt.Color("Country:N", legend=None)
    ).properties(height=150, width=200)

    # Regression plot - layer 2 specs (fit of Percent_Poverty on Total_GHG)
    reg = base.mark_line(color="black", opacity=0.5).transform_regression(
        "Total_GHG", "Percent_Poverty"
    )

    # Layer regression plot and scatter plot
    out_regress = (scatter + reg).properties(title=country)

    return out_regress


# Build one layered correlation chart per country label, in display order
lines = list(map(get_neg_corr_viz, sort_order))

# Lay the per-country charts out side by side in a single row
out = alt.hconcat(*lines)

out


In [10]:
# Load (or reload) the watermark IPython extension
%reload_ext watermark

# Record the environment for reproducibility: Python/IPython versions (-v),
# machine information (-m) and versions of the imported packages (-iv)
%watermark -iv -v -m

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.5.0

Compiler    : Clang 13.1.6 (clang-1316.0.21.2.5)
OS          : Darwin
Release     : 21.5.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

altair: 4.2.0
numpy : 1.23.3
pandas: 1.5.0
sys   : 3.10.6 (main, Aug 30 2022, 05:12:36) [Clang 13.1.6 (clang-1316.0.21.2.5)]

