# Analyzing the top Emitters in the World


In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

df = pd.read_csv("./data/complete_dataset_with_interpolation.csv")


In [2]:
ghg_gdp_df = df[
    # Omit EU and World aggregates
    ~df["c_code"].isin(["EUU", "WLD"])
    # Grab emission totals
    & (df["gas"].eq("All GHG") & df["indicator"].eq("Total including LUCF"))
    # Grab GDP indicator
    | df["indicator"].eq("GDP, PPP (current international $)")
]

years = list(map(str, range(1990, 2020)))


In [3]:
ghg_top_5 = ghg_gdp_df[ghg_gdp_df["indicator"].eq("Total including LUCF")].nlargest(
    n=5, columns="2019"
)

ghg_df = ghg_top_5.melt(
    id_vars=["country", "c_code", "indicator", "i_code", "gas"],
    value_vars=years,
    var_name="year",
    value_name="ghg",
)

# Melt the data into a plot-able format.
gdp_df = ghg_gdp_df[
    ghg_gdp_df["indicator"].eq("GDP, PPP (current international $)")
].melt(
    id_vars=["country", "c_code", "indicator", "i_code", "gas"],
    value_vars=years,
    var_name="year",
    value_name="gdp",
)

ghg_gdp_all_others = ghg_gdp_df[~ghg_gdp_df["country"].isin(ghg_top_5["country"])]
gdp_all_others = (
    ghg_gdp_all_others[
        ghg_gdp_all_others["indicator"].eq("GDP, PPP (current international $)")
    ]
    .sum(numeric_only=True)
    .to_frame(name="gdp")
    .reset_index(names=["year"])
    .assign(country="All Other Countries")
)

ghg_all_others = (
    ghg_gdp_all_others[ghg_gdp_all_others["indicator"].eq("Total including LUCF")]
    .sum(numeric_only=True)
    .to_frame(name="ghg")
    .reset_index(names=["year"])
    .assign(country="All Other Countries")
)

all_others = pd.merge(
    ghg_all_others,
    gdp_all_others,
    how="left",
    on=["country", "year"],
)

plot_df = pd.concat(
    [
        pd.merge(
            ghg_df[["country", "year", "ghg"]],
            gdp_df[["country", "year", "gdp"]],
            how="left",
            on=["country", "year"],
        ),
        # Initially wanted to show all other countries, but this distorted the plot too much.
        # This insight is recalculated below and written into the report.
        all_others,
    ]
).sort_values(["country", "year"])

plot_df.sample(5, random_state=42)


Unnamed: 0,country,year,ghg,gdp
19,All Other Countries,2005,21056.48,117326100000000.0
40,China,1998,4095.97,3040856000000.0
11,United States,1992,5456.12,6520327000000.0
88,Indonesia,2007,1107.1,1682011000000.0
124,Russia,2014,1621.85,3763535000000.0


In [4]:
fig = px.line(
    # Just show the top 5 in this plot
    plot_df[plot_df['country'].isin(ghg_top_5["country"])],
    x="gdp",
    y="ghg",
    hover_data=["gdp", "ghg", "country", "year"],
    color="country",
    markers=True,
    title="30 years of GDP vs GHG Emissions for the Top 5 Emitting Countries",
    labels={
        "gdp": "GDP, Purchasing Power Parity (international trillion $)",
        "ghg": "Total Greenhouse Gas Emissions (including LUCF)",
        "country": "Country",
    },
    width=1000,
    height=800,
)

fig.update_layout(legend={"yanchor": "top", "y": 0.98, "xanchor": "left", "x": 0.01})

fig.show()


In [5]:
last_5_years = list(map(str, range(2015, 2020)))

emissions_df = df[
    # Omit EU and World aggregates, we're going to recalculate the world totals
    ~df["c_code"].isin(["EUU", "WLD"])
    # Grab emission totals
    & (df["gas"].eq("All GHG") & df["indicator"].eq("Total including LUCF"))
]

last_5_mean = emissions_df[[*last_5_years]].mean(axis="columns")

last_5_mean_df = pd.DataFrame({"country": emissions_df['country'], "last_5_mean": last_5_mean})

last_5_mean_df.sample(5, random_state=42)

Unnamed: 0,country,last_5_mean
8202,Democratic Republic of the Congo,679.676
24721,Poland,329.086
13995,India,3203.756
25983,Saint Kitts and Nevis,0.342
20510,Mongolia,53.674


In [6]:
world_total = last_5_mean_df["last_5_mean"].sum()
# print(world_total)
# 46558.462

groups = last_5_mean_df.groupby("last_5_mean")
top_emitters = groups.filter(lambda x: x["last_5_mean"].gt(350))
top_emitters_sum = top_emitters["last_5_mean"].sum()
remainder = groups.filter(lambda x: x["last_5_mean"].lt(350))
remainder_sum = remainder["last_5_mean"].sum()

print(
    f"This data is heavily skewed with the top {len(top_emitters)} countries emitting "
    + f"{round((top_emitters_sum / world_total) * 100, 2)}% of the world's greenhouse gas, "
    + f"while the bottom {len(remainder)} countries make up the remaining "
    + f"{round((remainder_sum / world_total) * 100, 2)}% based on the mean aggregation "
    + "of total GHG emissions for 2015-2019."
)


This data is heavily skewed with the top 25 countries emitting 78.53% of the world's greenhouse gas, while the bottom 168 countries make up the remaining 21.47% based on the mean aggregation of total GHG emissions for 2015-2019.


In [7]:
%reload_ext watermark

%watermark -iv -v -m

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.5.0

Compiler    : Clang 13.1.6 (clang-1316.0.21.2.5)
OS          : Darwin
Release     : 21.5.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

pandas: 1.5.0
sys   : 3.10.6 (main, Aug 30 2022, 05:12:36) [Clang 13.1.6 (clang-1316.0.21.2.5)]
plotly: 5.10.0

