# Exploration of historical carbon dioxide emission data from 1751 to 2017

# Preparations

In [None]:
# packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read the emission CSV into a data frame and preview its first rows
csv_path = '../input/co2-ghg-emissionsdata/co2_emission.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Rename the verbose emission column to the short name "Emission".
# The subscript-two character of the original header is written as the
# escape \u2082 so the key stays correct regardless of the encoding this file
# is saved or opened with: a mis-decoded literal would not match the CSV
# header and, because of errors="raise", the rename would fail with KeyError.
df = df.rename(columns={"Annual CO\u2082 emissions (tonnes )": "Emission"}, errors="raise")

In [None]:
# summary statistics for every column, numeric and non-numeric alike
df.describe(include="all")

In [None]:
# clean negative emission values by setting them to zero
# (one .loc assignment instead of chained indexing: the chained form may
# write to a temporary copy, raises SettingWithCopyWarning and leaves df
# unchanged when pandas copy-on-write is active)
df.loc[df['Emission'] < 0, 'Emission'] = 0

In [None]:
# summary statistics of the emission column only
df['Emission'].describe()

In [None]:
# store log10 of the emissions in an additional column;
# the small 0.01 offset avoids evaluating log10(0) for zero emissions
df['EmissionLog10'] = np.log10(df['Emission'] + 0.01)

# Development worldwide

In [None]:
# keep only the rows whose entity is the global aggregate 'World'
df_world = df.loc[df['Entity'] == 'World']

In [None]:
# worldwide emissions over the full time range (linear scale)
plt.plot(df_world['Year'], df_world['Emission'])
plt.title('World')
plt.xlabel('Year')
plt.ylabel('Annual CO2 Emission in tons')
plt.grid()
plt.show()

In [None]:
# same curve as before, with the x-axis limited to the range 1900 to 2017
plt.plot(df_world['Year'], df_world['Emission'])
plt.title('World')
plt.xlabel('Year')
plt.ylabel('Annual CO2 Emission in tons')
plt.xlim(1900, 2017)
plt.grid()
plt.show()

In [None]:
# full time range again, this time plotting the log10 column
plt.plot(df_world['Year'], df_world['EmissionLog10'])
plt.title('World')
plt.xlabel('Year')
plt.ylabel('log10(Annual CO2 Emission in tons)')
plt.grid()
plt.show()

### Almost linear development on the log scale => approximately exponential growth

# Look at the most recent year

In [None]:
# Build the 2017 ranking as one chained expression:
#   1. select the rows of the most recent year (2017)
#   2. sort them by emission, largest first
#   3. remove aggregations, keep only states: drop the 'World' entity and
#      every row without a code (presumably regional aggregates)
#   4. renumber the rows 0..n-1
# The non-inplace reset_index returns a fresh, independent frame, so adding
# columns to df_2017 later does not trigger SettingWithCopyWarning.
df_2017 = (
    df[df['Year'] == 2017]
    .sort_values(by='Emission', ascending=False)
    .loc[lambda d: (d['Entity'] != 'World') & d['Code'].notna()]
    .reset_index(drop=True)
)

In [None]:
# total of the 2017 emissions over all rows of df_2017, printed in million tonnes
sum_2017 = df_2017['Emission'].sum()
print('Total 2017 [million t]: ', np.round(sum_2017 / 1e6, 2))

In [None]:
# share of each row in the 2017 total, expressed in percent
df_2017['Percentage'] = df_2017['Emission'].div(sum_2017).mul(100)
# display the resulting data frame
df_2017

In [None]:
# bar chart of the n_show largest emitters of 2017
# (df_2017 is sorted by emission in descending order, so these are its
# first n_show rows)
n_show = 20
fig = plt.figure(figsize=(12, 5))  # need bigger plot area!
# x and y are passed by keyword: positional x/y is deprecated in seaborn 0.11
# and raises a TypeError from seaborn 0.12 on
sns.barplot(data=df_2017.head(n_show), x='Entity', y='Emission')
plt.xticks(rotation=90)  # vertical tick labels
# title follows n_show instead of repeating the number as a literal
plt.title(f'Top {n_show} emissions in year 2017')
plt.grid()
plt.show()

# Development of top 10 countries

In [None]:
# entity names in the first ten rows of the 2017 ranking
top10 = df_2017['Entity'].head(10).tolist()
# every row (all years) of the complete data frame that belongs to one of them
df_top10 = df[df['Entity'].isin(top10)]
df_top10

In [None]:
# one emission curve per top-10 country over all available years
fig = plt.figure(figsize=(12, 5))  # enlarge the plot area
sns.lineplot(data=df_top10, x='Year', y='Emission', hue='Entity')
plt.title('Historical development for top 10 countries [in 2017]')
plt.grid()
plt.show()

In [None]:
# restrict to years after 1900 (strict comparison, so 1900 itself is excluded)
fig = plt.figure(figsize=(12, 5))  # enlarge the plot area
sns.lineplot(data=df_top10.loc[df_top10['Year'] > 1900], x='Year', y='Emission', hue='Entity')
plt.title('Historical development for top 10 countries [in 2017]')
plt.grid()
plt.show()

In [None]:
# same selection (years after 1900), plotting the log10 column instead
fig = plt.figure(figsize=(12, 5))  # enlarge the plot area
sns.lineplot(data=df_top10.loc[df_top10['Year'] > 1900], x='Year', y='EmissionLog10', hue='Entity')
plt.title('Historical development for top 10 countries [in 2017]')
plt.grid()
plt.show()

In [None]:
# log10 plot for years after 1900 with the y-axis limited to the range 5 to 11
fig = plt.figure(figsize=(12, 5))  # enlarge the plot area
sns.lineplot(data=df_top10.loc[df_top10['Year'] > 1900], x='Year', y='EmissionLog10', hue='Entity')
plt.title('Historical development for top 10 countries [in 2017]')
plt.ylim(5, 11)
plt.grid()
plt.show()