In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

In [None]:
# Load the original CO2 emissions dataset (sourced from Kaggle).
df1 = pd.read_csv(
    filepath_or_buffer='../input/co2-ghg-emissionsdata/co2_emission.csv'
)

In [None]:
# Load the additional dataset (from Datahub) that provides the continents.
df2 = pd.read_csv(
    filepath_or_buffer='../input/co2-emission-continents/continent.csv'
)

In [None]:
# Discard the lookup columns that are not needed for the continent join.
df2 = df2.drop(
    ['Continent_Code', 'Country_Name', 'Two_Letter_Country_Code', 'Country_Number'],
    axis='columns',
)

In [None]:
# Positional rename: the remaining columns become 'continent' and 'Code',
# in that order, so 'Code' can be used as the join key.
df2 = df2.set_axis(['continent', 'Code'], axis='columns')

In [None]:
# Attach a continent to every emissions row by joining both datasets on 'Code'.
# The lookup is first reduced to one row per non-null code: a left merge repeats
# an emissions row once per matching lookup row, and pandas matches NaN keys to
# NaN keys, so a repeated or missing code in df2 would duplicate rows of df1 and
# inflate every later sum.
# NOTE(review): keep='first' retains the first continent listed for a repeated
# code - confirm that choice if df2 lists a country under more than one continent.
df3 = pd.merge(
    df1,
    df2.dropna(subset=['Code']).drop_duplicates(subset='Code', keep='first'),
    how='left',
    on='Code',
    validate='many_to_one',  # guards the one-row-per-code assumption
)

In [None]:
# Visual overview of where values are missing in the merged dataframe.
msno.matrix(
    df3,
    fontsize=8,
    figsize=(10, 2),
)
plt.show()

In [None]:
# Print the share of missing values for every column that has at least one.
null_rates = df3.isna().sum() / len(df3)
for i, null_rate in null_rates.items():
    if not null_rate > 0:
        continue
    print(f'The null rate of the columns {i} is {null_rate:.2f}')

In [None]:
# Replace missing continents with the literal string 'NaN' so that those rows
# can be selected by value afterwards.
df3 = df3.assign(continent=df3['continent'].fillna('NaN'))

In [None]:
# List the entities that still carry the 'NaN' continent placeholder after the merge.
df3.loc[df3['continent'] == 'NaN', 'Entity'].unique()

In [None]:
# Czechoslovakia was a Central European state, so its continent is set to 'Europe'.
df3.loc[df3['Entity'] == 'Czechoslovakia', 'continent'] = 'Europe'

In [None]:
# Remove the rows of the 'World' entity.
df3 = df3.drop(index=df3.index[df3['Entity'] == 'World'])

In [None]:
# Give these two entities a code so they survive the later removal of rows
# whose 'Code' is null.
for entity, code in (('Kyrgysztan', 'KYG'), ('Wallis and Futuna Islands', 'WFI')):
    df3.loc[df3['Entity'] == entity, 'Code'] = code

In [None]:
#If we check the Entity column, there are 2 values that are neither a country nor a continent: Antarctic Fisheries and International transport
#Let's split them into a new dataframe called OTHERS

In [None]:
# Split the two non-country entities into their own frame and remove the
# 'Code' and 'continent' columns from it.
others = (
    df3[df3['Entity'].isin(['Antarctic Fisheries', 'International transport'])]
    .drop(['Code', 'continent'], axis='columns')
)

In [None]:
# Remove every row whose 'Code' is null. These rows belong to aggregated
# (continent-level) entities; continents come from the merged lookup instead.
df3 = df3.drop(index=df3.index[df3['Code'].isna()])

In [None]:
# Check once again whether any null data is left.
# Every column is inspected (no early exit), each column that still has missing
# values is reported, and the success message is printed only when none remains.
null_rates = df3.isna().sum() / len(df3)
columns_with_nulls = null_rates[null_rates > 0]
for i, null_rate in columns_with_nulls.items():
    print(f'The null rate of the columns {i} is {null_rate:.2f}')
if columns_with_nulls.empty:
    print('GO AHEAD, NO NULL DATAS ANYMORE')

In [None]:
# Inspect the continent currently assigned to Kyrgysztan and
# Wallis and Futuna Islands, which may be wrong.
df3.loc[df3['Entity'].isin(['Kyrgysztan', 'Wallis and Futuna Islands'])].head()

In [None]:
# Set the continent of both entities by hand.
for entity, continent in (('Kyrgysztan', 'Asia'), ('Wallis and Futuna Islands', 'Oceania')):
    df3.loc[df3['Entity'] == entity, 'continent'] = continent

In [None]:
# Switch both frames to short lowercase column names (assigned by position).
others = others.set_axis(['country', 'year', 'co2'], axis='columns')
df3 = df3.set_axis(['country', 'code', 'year', 'co2', 'continent'], axis='columns')

In [None]:
#NOOOOOOW I THINK THAT ALL DATAS ARE GOOD TO GO!

In [None]:
# Some preliminary questions: How do we measure CO2 emissions?
# For CO2 measurements we consider the amount of fossil fuel consumed, cattle raised, amount of waste produced, imports and exports
# Basically we know the amount of CO2 and greenhouse gases each kilogram of fossil fuel can produce, and we estimate the total CO2 emission
# based on this knowledge

In [None]:
# Main impacts of CO2 emissions
# CO2 and greenhouse gases are mainly responsible for global warming. They create a layer in the atmosphere that traps heat on Earth
# Side effects are: smog, air pollution, respiratory diseases, wildfires, climate change, crop disruptions, wildlife extinction

In [None]:
#WE HAVE 2 MAIN DATAFRAMES:

# 1. df3 - With CO2 emissions data from countries/continents
# 2. others - With CO2 emissions data from Antarctic Fisheries and International transport

In [None]:
#First, let`s take a look at the simplest dataframe: OTHERS

In [None]:
#What are Antarctic Fisheries and International transport? - Main definitions

In [None]:
#Antarctic Fishery

# This activity started in the 19th century mainly focused on seals and whales
# It changed through time, going from regional fishes to the current activity: Krill
# The main source of CO2 emissions are the fishing boats

In [None]:
# International transportation

#This activity consists of 2 main types of transport: people and goods
#People transportation includes: airplanes, boats, cruises
#Goods transportation includes: cargo ships and also cargo airplanes
#The main source of CO2 emissions is fossil fuel combustion

In [None]:
# Total CO2 per entity. Only 'co2' is aggregated: summing the whole frame would
# also add up the 'year' column, which is meaningless. The double brackets keep
# the result a DataFrame so others1['co2'] still works.
others1 = others.groupby('country')[['co2']].sum()

In [None]:
# Bar chart of the summed emissions per entity, drawn on a log-scaled y axis.
ax = others1['co2'].plot.bar(logy=True)
plt.xticks(rotation=0)
ax.grid(linestyle='-.')
ax.set_title('Emission of CO2 from 1750 - 2017')
ax.set_ylabel('Total emission of CO2 - log scale')
plt.show()

In [None]:
#This first plot shows us that, over the interval 1750 - 2017, International transport has contributed about 100000 times more CO2
#emissions than Antarctic fisheries.

In [None]:
#Emission of CO2 - Antarctic Fisheries
#Tasks - investigate historical events along 1987 - 2007 that justify the increase in CO2 emission until 1994, the lower level until 2003,
#and the ramp-up starting from 2003

In [None]:
#There was intensive fishery activity during the 70's-80's, mainly for icefish.
#This kind of fishery was highly impacted by regulations after the 90's in order to preserve wildlife and ecosystems
#In parallel, the krill fishery was also going on, but the high costs of the expeditions and the difficulty of processing krill
# for food (stinky and not nice to eat) made the activity drop sharply.
#After 2002, health food and pharmaceutical companies revived the krill market. Krill oil has a high concentration of Omega-3.
# Now, krill is also used in feed for farmed fish

In [None]:
# Antarctic Fisheries emissions between 1987 and 2007, annotated with the
# main fishery events.
antarctic = others[others['country'] == 'Antarctic Fisheries']
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=antarctic, x='year', y='co2', hue='country', ax=ax)
ax.legend()
ax.set_xlim(1987, 2007)
ax.set_xticks(antarctic['year'].unique())  # one tick per year present in the data
ax.set_ylabel('CO2 emission in tons')
ax.set_xlabel('Year (1987 - 2007)')
ax.set_title('CO2 emissions in tons - Antarctic Fisheries')
ax.text(1988, 12000, 'Icefish and Krill\nfishery increase', backgroundcolor='gray', horizontalalignment='left')
ax.text(1999, 6000, 'Icefish fishery regulation', backgroundcolor='gray', horizontalalignment='center')
ax.text(2005, 8000, 'Omega 3 Mkt increase', backgroundcolor='gray', horizontalalignment='right')
plt.show()

In [None]:
#CO2 emissions in tons - International transportation
# We see a data division that must be analysed here: Before 1970 and after 1970
# Tasks: 1. Understand the difference at this 1970 division point
#        2. Study historical events and relate them to the data
#        3. No emission data before 1959. Why? How do we measure emissions?

In [None]:
# Full International transport series, with a note on the early period.
transport = others[others['country'] == 'International transport']
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=transport, x='year', y='co2', hue='country', ax=ax)
ax.legend()
plt.xticks(
    ticks=[1750, 1775, 1800, 1825, 1850, 1875, 1900, 1925, 1950, 1975, 2000, 2009, 2017],
    rotation=45,
)
ax.set_ylabel('CO2 emission in tons (value x 10^9)')
ax.set_xlabel('Year (1750 - 2017)')
ax.set_title('CO2 emissions in tons - International transport')
ax.text(1825, 200000000, 'No data available until 60`s', backgroundcolor='gray', horizontalalignment='left')
ax.grid()
plt.show()

In [None]:
#International transport - analysis before 1970
import re

In [None]:
#No trend detected here
#No data available until 60`s

In [None]:
# International transport emissions restricted to the years before 1970.
plt.figure(figsize = (12,6))
sns.lineplot(data = others.loc[(others['country'] == 'International transport') & (others['year'] < 1970)] , x = 'year', y = 'co2', hue = 'country')
plt.legend()
plt.ylabel('CO2 emission in tons (value x 10^8)')
# The x label and title describe the filtered range, not the full 1750 - 2017 series.
plt.xlabel('Year (before 1970)')
plt.title('CO2 emissions in tons - International transport before 1970')
plt.grid()
plt.show()

In [None]:
#International transport - After 1960

In [None]:
#Peak at 70`s - Why?
#Development of world economy after WWII
#Slope until 80's - Why?
#OPEC Oil Price Shock - big recession due to oil embargo from Arab nations
#Peak until 2008 - why?
# Housing bubble in US - world crisis

In [None]:
# International transport emissions from 1960 onwards, annotated with the
# economic events used to explain the curve.
# The filter is inclusive (>= 1960) so the plotted data starts at the first
# tick and matches the 'Year (1960 - 2017)' axis label.
plt.figure(figsize = (12,6))
sns.lineplot(data = others.loc[(others['country'] == 'International transport') & (others['year'] >= 1960)] , x = 'year', y = 'co2', hue = 'country')
plt.legend()
plt.xticks(ticks = [1960,1970,1980,1990,2000,2010,2017])
plt.ylabel('CO2 emission in tons (value x 10^9)')
plt.xlabel('Year (1960 - 2017)')
plt.title('CO2 emissions in tons - International transport')
plt.text(1965, 400000000, 'World Economy\nDvp after WWII', ha = 'center')
plt.text(1975, 500000000, '1973 - Oil price Shock', ha = 'left')
plt.text(1995, 800000000, 'World Economy growth\nRising China', ha = 'center')
plt.text(2008, 1000000000, '2008 - US House Bubble', ha = 'left')
plt.grid()
plt.show()

In [None]:
###############################################################################################
###############################################################################################
###############################################################################################
###############################################################################################
###############################################################################################

In [None]:
#Let's check the total pollution from 1750-2017 of each continent

In [None]:
# Total CO2 per continent, smallest to largest.
# 'co2' is selected before summing so the 'year' and text columns are not
# aggregated for nothing.
plt.figure(figsize = (10,5))
df3.groupby('continent')['co2'].sum().sort_values().plot(kind = 'bar')
plt.xticks(rotation = 0)
plt.xlabel('Continents on Earth')
plt.ylabel('CO2 emission in tons - values x 10^11')
plt.title('Total CO2 emission from 1750 - 2017 per continent')
plt.grid()
plt.show()

In [None]:
#Let`s check the CO2 emission per continent
#Tasks - Historical Facts

In [None]:
#We must first arrange our data as below

In [None]:
# Yearly CO2 totals with one row per year and one column per continent.
df4 = df3.pivot_table(
    values='co2',
    index='year',
    columns='continent',
    aggfunc='sum',
)

In [None]:
#Oceania emissions

In [None]:
# Yearly CO2 emissions of Oceania as a whole.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df4[['Oceania']], ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^8',
    title='Oceania emissions of CO2 in tons per year',
)
ax.grid()


In [None]:
# Rows of df3 that belong to Oceania.
dfoc = df3[df3['continent'] == 'Oceania']

In [None]:
# Yearly CO2 totals of Oceania with one column per country.
dfoc1 = dfoc.pivot_table(values='co2', index='year', columns='country', aggfunc='sum')

In [None]:
# Yearly CO2 emissions of every country in Oceania, one solid line per country.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=dfoc1, dashes=False, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^7',
    title='Oceania emissions of CO2 in tons per year',
)
ax.legend()
ax.grid()

In [None]:
# Total CO2 per Oceania country, ascending, with 'country' as a regular column.
# Only 'co2' is aggregated (summing the whole frame would also aggregate 'year'
# and the text columns), and the chain avoids the in-place reset so the cell
# can be re-run safely.
dfoc2 = (
    dfoc.groupby('country')[['co2']]
    .sum()
    .sort_values(by='co2')
    .reset_index()
)

In [None]:
# Bar chart of the total CO2 emitted by each Oceania country.
plt.figure(figsize = (14,7))
sns.barplot(data = dfoc2, x = 'country', y = 'co2')
plt.xticks(rotation = 45, horizontalalignment = 'right')
plt.grid()
plt.xlabel('Countries of Oceania')
plt.ylabel('CO2 emissions in tons - value x 10^10')
plt.title('Oceania countries total emission from 1750 - 2017')
# Ends with plt.show(), as the other total-emission charts do, so the cell
# does not echo the Text object returned by plt.title.
plt.show()


In [None]:
#South America emissions

In [None]:
# Yearly CO2 emissions of South America as a whole.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df4[['South America']], ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^9',
    title='South America emissions of CO2 in tons per year',
)
ax.grid()

In [None]:
# Rows of df3 that belong to South America.
dfsa = df3[df3['continent'] == 'South America']

In [None]:
# Yearly CO2 totals of South America with one column per country.
dfsa1 = dfsa.pivot_table(values='co2', index='year', columns='country', aggfunc='sum')

In [None]:
# Yearly CO2 emissions of every South American country, one solid line each.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=dfsa1, dashes=False, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^8',
    title='South American emissions of CO2 in tons per year',
)
ax.legend(framealpha=.2)
ax.grid()

In [None]:
# Total CO2 per South American country, ascending, with 'country' as a column.
# Only 'co2' is aggregated (summing the whole frame would also aggregate 'year'
# and the text columns), and the chain avoids the in-place reset so the cell
# can be re-run safely.
dfsa2 = (
    dfsa.groupby('country')[['co2']]
    .sum()
    .sort_values(by='co2')
    .reset_index()
)

In [None]:
# Bar chart of the total CO2 emitted by each South American country.
plt.figure(figsize = (14,7))
sns.barplot(data = dfsa2, x = 'country', y = 'co2')
plt.xticks(rotation = 45, horizontalalignment = 'right')
plt.grid()
# The x label names the continent actually plotted (dfsa2 holds South America).
plt.xlabel('Countries of South America')
plt.ylabel('CO2 emissions in tons - value x 10^10')
plt.title('South American countries total emission from 1750 - 2017')
plt.show()

In [None]:
#Africa emissions

In [None]:
# Yearly CO2 emissions of Africa as a whole.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df4[['Africa']], ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^9',
    title='Africa emissions of CO2 in tons per year',
)
ax.grid()

In [None]:
# Rows of df3 that belong to Africa.
dfaf = df3[df3['continent'] == 'Africa']

In [None]:
# Yearly CO2 totals of Africa with one column per country.
dfaf1 = dfaf.pivot_table(values='co2', index='year', columns='country', aggfunc='sum')

In [None]:
# Yearly CO2 emissions of every African country, one solid line each.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=dfaf1, dashes=False, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^8',
    title='African emissions of CO2 in tons per year',
)
ax.legend(loc='upper left', ncol=3, framealpha=.2)
ax.grid()

In [None]:
# Total CO2 per African country, ascending, with 'country' as a regular column.
# Only 'co2' is aggregated (summing the whole frame would also aggregate 'year'
# and the text columns), and the chain avoids the in-place reset so the cell
# can be re-run safely.
dfaf2 = (
    dfaf.groupby('country')[['co2']]
    .sum()
    .sort_values(by='co2')
    .reset_index()
)

In [None]:
# Bar chart of the total CO2 emitted by each African country.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=dfaf2, x='country', y='co2', ax=ax)
plt.xticks(rotation=45, horizontalalignment='right')
ax.grid()
ax.set_xlabel('Countries of Africa')
ax.set_ylabel('CO2 emissions in tons - value x 10^10')
ax.set_title('African countries total emission from 1750 - 2017')
plt.show()

In [None]:
#North America emissions

In [None]:
# Yearly CO2 emissions of North America as a whole.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df4[['North America']], ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^9',
    title='North America emissions of CO2 in tons per year',
)
ax.grid()

In [None]:
# Rows of df3 that belong to North America.
dfna = df3[df3['continent'] == 'North America']

In [None]:
# Yearly CO2 totals of North America with one column per country.
dfna1 = dfna.pivot_table(values='co2', index='year', columns='country', aggfunc='sum')

In [None]:
# Yearly CO2 emissions of every North American country, one solid line each.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=dfna1, dashes=False, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^9',
    title='North American emissions of CO2 in tons per year',
)
ax.legend(loc='upper left', ncol=3, framealpha=.2)
ax.grid()

In [None]:
# Total CO2 per North American country, ascending, with 'country' as a column.
# Only 'co2' is aggregated (summing the whole frame would also aggregate 'year'
# and the text columns), and the chain avoids the in-place reset so the cell
# can be re-run safely.
dfna2 = (
    dfna.groupby('country')[['co2']]
    .sum()
    .sort_values(by='co2')
    .reset_index()
)

In [None]:
# Bar chart of the total CO2 emitted by each North American country.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=dfna2, x='country', y='co2', ax=ax)
plt.xticks(rotation=45, horizontalalignment='right')
ax.grid()
ax.set_xlabel('Countries of North America')
ax.set_ylabel('CO2 emissions in tons - value x 10^11')
ax.set_title('North American countries total emission from 1750 - 2017')
plt.show()

In [None]:
#Europe emissions

In [None]:
# Yearly CO2 emissions of Europe as a whole.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df4[['Europe']], ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^9',
    title='Europe emissions of CO2 in tons per year',
)
ax.grid()

In [None]:
# Rows of df3 that belong to Europe.
dfeu = df3[df3['continent'] == 'Europe']

In [None]:
# Yearly CO2 totals of Europe with one column per country.
dfeu1 = dfeu.pivot_table(values='co2', index='year', columns='country', aggfunc='sum')

In [None]:
# Yearly CO2 emissions of every European country, one solid line each.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=dfeu1, dashes=False, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^9',
    title='European emissions of CO2 in tons per year',
)
ax.legend(loc='upper left', ncol=3, framealpha=.2)
ax.grid()

In [None]:
# Total CO2 per European country, ascending, with 'country' as a regular column.
# Only 'co2' is aggregated (summing the whole frame would also aggregate 'year'
# and the text columns), and the chain avoids the in-place reset so the cell
# can be re-run safely.
dfeu2 = (
    dfeu.groupby('country')[['co2']]
    .sum()
    .sort_values(by='co2')
    .reset_index()
)

In [None]:
# Bar chart of the total CO2 emitted by each European country.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=dfeu2, x='country', y='co2', ax=ax)
plt.xticks(rotation=45, horizontalalignment='right')
ax.grid()
ax.set_xlabel('Countries of Europe')
ax.set_ylabel('CO2 emissions in tons - value x 10^11')
ax.set_title('European countries total emission from 1750 - 2017')
plt.show()

In [None]:
#Asia emissions

In [None]:
# Yearly CO2 emissions of Asia as a whole.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=df4[['Asia']], ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^10',
    title='Asia emissions of CO2 in tons per year',
)
ax.grid()

In [None]:
# Rows of df3 that belong to Asia.
dfas = df3[df3['continent'] == 'Asia']

In [None]:
# Yearly CO2 totals of Asia with one column per country.
dfas1 = dfas.pivot_table(values='co2', index='year', columns='country', aggfunc='sum')

In [None]:
# Yearly CO2 emissions of every Asian country, one solid line each.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=dfas1, dashes=False, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^10',
    title='Asian emissions of CO2 in tons per year',
)
ax.legend(loc='upper left', ncol=3, framealpha=.2)
ax.grid()

In [None]:
# Total CO2 per Asian country, ascending, with 'country' as a regular column.
# Only 'co2' is aggregated (summing the whole frame would also aggregate 'year'
# and the text columns), and the chain avoids the in-place reset so the cell
# can be re-run safely.
dfas2 = (
    dfas.groupby('country')[['co2']]
    .sum()
    .sort_values(by='co2')
    .reset_index()
)

In [None]:
# Bar chart of the total CO2 emitted by each Asian country.
plt.figure(figsize = (16,8))
sns.barplot(data = dfas2, x = 'country', y = 'co2')
plt.xticks(rotation = 45, horizontalalignment = 'right')
plt.grid()
# The x label and title name the continent actually plotted (dfas2 holds Asia).
plt.xlabel('Countries of Asia')
plt.ylabel('CO2 emissions in tons - value x 10^11')
plt.title('Asian countries total emission from 1750 - 2017')
plt.show()

In [None]:
#World emission

In [None]:
# Yearly CO2 emissions of all continents on a single chart.
fig, ax = plt.subplots(figsize=(15, 7.5))
sns.lineplot(data=df4, ax=ax)
ax.set(
    xlabel='Year',
    ylabel='CO2 emissions in tons - values x 10^10',
    title='All continents emissions of CO2 in tons per year',
)
ax.grid()

In [None]:
#All countries emission

In [None]:
all = pd.pivot_table(data = df3, index = 'year', columns = 'country', values = 'co2', aggfunc = 'sum')

In [None]:
plt.figure(figsize = (30,15))
sns.lineplot(data = all, dashes = False)
plt.legend(ncol = 5, framealpha = .2)

In [None]:
# Total CO2 per country, largest emitter first. Only 'co2' is aggregated:
# summing the whole frame would also aggregate 'year' and the text columns.
# The double brackets keep the result a DataFrame indexed by country.
total_countries = df3.groupby('country')[['co2']].sum().sort_values(by = 'co2', ascending = False)

In [None]:
# Move 'country' out of the index into a regular column.
total_countries = total_countries.reset_index()

In [None]:
# Bar chart of the 20 countries with the largest total CO2 emission.
# Axis labels and a title are set so the figure can be read on its own.
plt.figure(figsize = (20,10))
sns.barplot(data = total_countries.head(20), x = 'country' , y = 'co2')
plt.xticks(rotation = 45)
plt.grid()
plt.xlabel('Country')
plt.ylabel('CO2 emissions in tons')
plt.title('Top 20 countries by total CO2 emission')
plt.show()