### Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_palette("dark")
sns.set_style("whitegrid")
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found under it.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in entries):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Reading data from the CSV file

In [None]:
emission_table = pd.read_csv("/kaggle/input/international-greenhouse-gas-emissions/greenhouse_gas_inventory_data_data.csv")

In [None]:
emission_table.head()

As we can see, the category column is too wide to fit in the default pandas display, so we change the column-width option for better visibility of the category values.

In [None]:
# None removes the column-width limit; negative values such as -1 are deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
by_category  = emission_table.groupby(['category'])

In [None]:
category_count = by_category.count()

In [None]:
category_count

As we know, we need to slice the category names into several parts to get the exact name of the greenhouse gas. Every category name ends with the same suffix, *"_in_kilotonne_co2_equivalent"*. So my workflow for the next few hours will comprise the following steps.

  * Slicing the common part from each category name and updating that with a copy of the main dataframe.
  * Slicing out the gas name and the other qualifiers, such as indirect CO2 or land use and forestry.

### Breaking the problem in smaller part and preparing a small algo

In [None]:
strp = category_count.index

In [None]:
io = strp[0]
io[108]

In [None]:
io

In [None]:
hdd = len(io)
hdd

In [None]:
io.find("_in_kilotonne_co2_equivalent",0,hdd)

In [None]:
# Here we get success
io[:81]

**Conclusion: the algorithm is to run a for loop that updates each category name one by one, and then to update the main category dataframe.**

### Algorithm Begins

In [None]:
# Strip the shared "_in_kilotonne_co2_equivalent" suffix from every category name in strp.
new_category_index = []
for string in strp:
    p = len(string)  # search bound: the full length of the current name
    pos = string.find("_in_kilotonne_co2_equivalent",0,p)  # start of the suffix, or -1 when it is absent
    string = string[:pos]  # keep what precedes the suffix; NOTE(review): pos == -1 would drop only the last character
    new_category_index.append(string)

In [None]:
new_category_index

In [None]:
# Cut each cleaned category name at its first "_without" qualifier.
# str.partition returns the whole string when the marker is missing, so names
# that carry no "_without" part stay intact (slicing with find()'s -1 result
# would silently drop their last character). No length bound is needed.
new_category_index_reborn = [
    lingo.partition("_without")[0] for lingo in new_category_index
]

In [None]:
new_category_index_reborn

In [None]:
short_category = ["co2","ghg(indirect co2)","ghg","hfc","ch4","nf3","n2o","pfc","sf6","hfc+pfc"]

In [None]:
category_count["Shorted_category"] = short_category

In [None]:
category_count

### Cleaning the data for better comprehension

Replacing the category column values as follows:
* carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent	= **CO2**
* greenhouse_gas_ghgs_emissions_including_indirect_co2_without_lulucf_in_kilotonne_co2_equivalent = **GHG(Indirect CO2)**
* greenhouse_gas_ghgs_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent = **GHG**
* hydrofluorocarbons_hfcs_emissions_in_kilotonne_co2_equivalent = **HFC**
* methane_ch4_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent = **CH4**
* nitrogen_trifluoride_nf3_emissions_in_kilotonne_co2_equivalent = **NF3**
* nitrous_oxide_n2o_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent = **N2O**
* perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent = **PFC**
* sulphur_hexafluoride_sf6_emissions_in_kilotonne_co2_equivalent = **SF6**
* unspecified_mix_of_hydrofluorocarbons_hfcs_and_perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent = **HFC+PFC**

In [None]:
trying_emission = emission_table

In [None]:
# Long category name -> short gas label. Passing the mapping to DataFrame.replace
# swaps every matching cell value for its label and returns a new frame.
category_labels = {
    "carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent": "CO2",
    "greenhouse_gas_ghgs_emissions_including_indirect_co2_without_lulucf_in_kilotonne_co2_equivalent": "GHG(Indirect CO2)",
    "greenhouse_gas_ghgs_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent": "GHG",
    "hydrofluorocarbons_hfcs_emissions_in_kilotonne_co2_equivalent": "HFC",
    "methane_ch4_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent": "CH4",
    "nitrogen_trifluoride_nf3_emissions_in_kilotonne_co2_equivalent": "NF3",
    "nitrous_oxide_n2o_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent": "N2O",
    "perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent": "PFC",
    "sulphur_hexafluoride_sf6_emissions_in_kilotonne_co2_equivalent": "SF6",
    "unspecified_mix_of_hydrofluorocarbons_hfcs_and_perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent": "HFC+PFC",
}
replaced_emission = trying_emission.replace(category_labels)


In [None]:
# replacing and changing the data and it's index for better EDA(Exploratory Data Analysis)
l = replaced_emission.groupby(["category"],as_index=False)

In [None]:
l.count()

### Exploratory Data Analysis

In [None]:
# Bar chart of how many rows each gas category contributes.
plt.figure(figsize=(15,7))
# Pass the column by keyword: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer treated as the x variable.
ax = sns.countplot(x=replaced_emission["category"])
# Restyle the existing tick labels in place rather than re-setting them.
plt.setp(ax.get_xticklabels(), rotation=40, ha="right", fontsize=14)
plt.xlabel("Gas category",fontsize=16)
plt.ylabel("Count",fontsize=16)
# Run the layout only after the axis labels exist so they are part of the spacing.
plt.tight_layout()
# Default size for figures created by later cells (this figure is sized explicitly above).
plt.rcParams["figure.figsize"] = [15, 10]
plt.show()

**Conclusion -1**:
    The data itself is very irregular in terms of distribution and labels. *GHG(Indirect CO2)* is a part of the CO2 data. Hence *CO2* is the most frequently occurring greenhouse gas. Also, the *HFC*, *PFC* and *HFC+PFC* data can overlap, but the exact proportion is unknown. *NF3* turns out to occur the least. Also, *GHG* has an undefined label: we can't say exactly what proportion of the other gases it contains, or whether some of them were left out when the data was labelled.

### Calculating the Total amount of gases emitted 

In [None]:
loct = replaced_emission.groupby(['category'])['value'].sum()

In [None]:
replaced_emission['Total Emitted Gas'] = replaced_emission['value'].groupby(replaced_emission['category']).transform('sum')

In [None]:
loct.values

In [None]:
new_dataframe_emission = pd.DataFrame(loct.index)

In [None]:
new_dataframe_emission["Total Amount Emitted(In Kilotones)"] = loct.values

### Sorting the dataframe to know the amount

In [None]:
new_dataframe_emission.sort_values(by=['Total Amount Emitted(In Kilotones)'], inplace=True,ascending=False)

In [None]:
new_dataframe_emission

**Conclusion - 2:**
    Yet another problem: the *GHG* category has the maximum emitted amount, while *CO2* and *indirect CO2* sum up to make *CO2* the maximum. *NF3* has the least emitted amount.

*Lets try another approach*

In [None]:
replaced_emission.head()

In [None]:
Australia_data = replaced_emission[replaced_emission["country_or_area"]=="Australia"].groupby(["category","year"],as_index = False)

In [None]:
data_div = pd.pivot_table(replaced_emission,values="value",index = ["country_or_area", "year"],columns = ["category"])

In [None]:
data_div.head(10)

In [None]:
data_div.plot()

This doesn't depict the correct info and looks like a mess

### Plotting Country Wise

In [None]:
replaced_emission["country_or_area"].unique()

In [None]:
gases = data_div.columns.values

In [None]:
gases

In [None]:
# lets define a function that can plot the country data
def plot_the_country(name, start_year=2000, end_year=2014):
    """Plot every gas category over time for a single country.

    Parameters
    ----------
    name : str
        Country to select from the ``country_or_area`` index level of ``data_div``.
    start_year, end_year : int, optional
        Lower and upper limit of the x axis (years). Defaults to 2000 and 2014.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib figure.
    """
    # Set the default size before anything is drawn: a figure picks up its size
    # when it is created, so setting it afterwards only affects the next figure.
    plt.rcParams["figure.figsize"] = [15, 10]
    country_frame = data_div.loc[name]
    plt.plot(country_frame)
    # Label the lines from the plotted frame itself so legend and data cannot drift apart.
    plt.legend(list(country_frame.columns))
    plt.tick_params(labelsize=12)
    plt.xlim(start_year, end_year)

In [None]:
plot_the_country("Australia")

In [None]:
plot_the_country("United States of America")

During the recession from Dec 2007 to June 2009, the **United States of America** had a steep decline in CO2 emissions. There may be many reasons, one being the decrease in purchasing power for automobiles in the USA, or a fall in industrial development and production.

**Note:-** It requires more research later.

Maybe the **United States of America** changed its business practices and industrial laws after the recession in 2009, which is why the emission value of CO2 (direct and indirect) increased until 2010.

On the other hand, **Australia** showed no such effect. It had an almost uniform increase in CO2 emissions during the recession.

Lets see few more countries

In [None]:
plot_the_country("Denmark")

Maybe after the recession **Denmark** introduced new laws on pollution control. There is a strong downward trend.

Lets look at East-Asian countries

In [None]:
plot_the_country("Japan")

**Japan** had a decrease in CO2 emissions during the recession; maybe it was affected the most, or a new law arrived that reduced the purchasing power of its countrymen. It requires more research.

**Conclusion-3** - We need to compare the emission data country wise instead of gas types,that can give us a better research output.

### Comparing Emission Data Country Wise

In [None]:
area_div = pd.pivot_table(replaced_emission, values='value', index=['category', 'year'], columns=['country_or_area'])
area_div.head(20)

In [None]:
countries = area_div.columns.values

In [None]:
def country_wise_plot(name):
    """Plot one gas category over time with one line per country.

    Parameters
    ----------
    name : str
        Gas label to select from the ``category`` index level of ``area_div``.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib figure.
    """
    # Set the default size before anything is drawn: a figure picks up its size
    # when it is created, so setting it afterwards only affects the next figure.
    plt.rcParams["figure.figsize"] = [15, 10]
    gas_frame = area_div.loc[name]
    plt.plot(gas_frame)
    plt.tick_params(labelsize=14)
    # The columns of the selected frame are the countries, in plotting order.
    plt.legend(list(gas_frame.columns), loc = "center left",bbox_to_anchor=(1, 0.5),fontsize = 18,ncol = 3)

In [None]:
gases

In [None]:
country_wise_plot(gases[0])

Above plot is OK for rough estimation but not good for a better grasp, hence let's design something that can plot the selected countries on the same graph with the same context.

### Comparing Countries By Passing Required Series

In [None]:
def _plot_gas_by_country(gas_name, country_name, first_year, last_year):
    """Draw the ``gas_name`` series of the selected countries between two years."""
    # Set the default size before anything is drawn: a figure picks up its size
    # when it is created, so setting it afterwards only affects the next figure.
    plt.rcParams["figure.figsize"] = [15, 10]
    data = area_div.loc[gas_name]
    data.plot(y=country_name)
    plt.legend(country_name, loc="center left", bbox_to_anchor=(1, 0.5), fontsize=18, ncol=2)
    plt.tick_params(labelsize=14)
    plt.xlabel("Year", fontsize=14)
    plt.xlim(first_year, last_year)


def gas_accord_country1(gas_name, country_name):
    """Compare countries for one gas over the years 1990-2004.

    Parameters
    ----------
    gas_name : str
        Gas label to select from the ``category`` index level of ``area_div``.
    country_name : list-like of str
        Column names of ``area_div`` (countries) to plot and to show in the legend.
    """
    _plot_gas_by_country(gas_name, country_name, 1990, 2004)


def gas_accord_country2(gas_name, country_name):
    """Compare countries for one gas over the years 2004-2017.

    Parameters
    ----------
    gas_name : str
        Gas label to select from the ``category`` index level of ``area_div``.
    country_name : list-like of str
        Column names of ``area_div`` (countries) to plot and to show in the legend.
    """
    _plot_gas_by_country(gas_name, country_name, 2004, 2017)

In [None]:
gases

In [None]:
countries_name = replaced_emission["country_or_area"].unique()

In [None]:
countries_name

## Analysing the data WRT gas name and country

### For Methane(CH4) emission:

#### 1. Country Set - 1 :

In [None]:
gas_accord_country1(gases[0],countries_name[:8])

In [None]:
gas_accord_country2(gases[0],countries_name[:8])

#### 2. Country Set - 2 :

In [None]:
gas_accord_country1(gases[0],countries_name[8:16])

In [None]:
gas_accord_country2(gases[0],countries_name[8:16])

#### 3. Country Set - 3 :

In [None]:
gas_accord_country1(gases[0],countries_name[16:24])

In [None]:
gas_accord_country2(gases[0],countries_name[16:24])

#### 4. Country Set - 4 :

In [None]:
gas_accord_country1(gases[0],countries_name[24:32])

In [None]:
gas_accord_country2(gases[0],countries_name[24:32])

#### 5. Country Set - 5 :

In [None]:
gas_accord_country1(gases[0],countries_name[32:40])

In [None]:
gas_accord_country2(gases[0],countries_name[32:40])

#### 6. Final Set -

In [None]:
gas_accord_country1(gases[0],countries_name[40:43])

In [None]:
gas_accord_country2(gases[0],countries_name[40:43])

In [None]:
data_div.head()

### Cleaning the GHG & GHG(Indirect CO2) column

In [None]:
data_div["GHG"].plot()

In [None]:
data_div["GHG(Indirect CO2)"].plot()

It seems that **GHG(Indirect CO2)** has NaN values and is discontinuous. The plots above also suggest that the two columns are almost identical, so let's check whether there is any difference.

In [None]:
# Work on an independent copy: a plain assignment would only alias data_div, so
# columns added to cleaned_data in later cells would also appear in data_div.
cleaned_data = data_div.copy()

In [None]:
cleaned_data.head()

In [None]:
cleaned_data["Check"] = cleaned_data["GHG"] - cleaned_data["GHG(Indirect CO2)"]

In [None]:
cleaned_data.head()

In [None]:
# Rows whose "Check" value is negative (a negative value is necessarily non-zero).
cleaned_data[cleaned_data["Check"].lt(0)]

So there are 275 rows where the **GHG** and **GHG(Indirect CO2)** columns differ. So we can safely drop the **GHG(Indirect CO2)** column.

In [None]:
cleaned_data = cleaned_data.drop("GHG(Indirect CO2)",axis = 1)

In [None]:
cleaned_data = cleaned_data.drop("Check",axis = 1)

### Checking the "HFC+PFC" Column 

In [None]:
cleaned_data["HFC+PFC"].isnull().sum()

In [None]:
# Keep only the rows that carry an HFC+PFC value, then show how many there are.
has_mix_value = cleaned_data["HFC+PFC"].notnull()
Regular_data = cleaned_data[has_mix_value]
len(Regular_data)

So the conclusion is **HFC+PFC** column has only 75 values which are not nulls.

In [None]:
Regular_data = Regular_data.reset_index()

In [None]:
Regular_data.head(10)

In [None]:
Regular_data.groupby("country_or_area").count()

As we can see, the dataset has data for only three countries.

In [None]:
gases

In [None]:
gas_accord_country1(gases[5],["Germany","United States of America"])

In [None]:
gas_accord_country2(gases[5],["Germany","United States of America"])

In [None]:
gas_accord_country1(gases[5],["European Union"])

In [None]:
gas_accord_country2(gases[5],["European Union"])

### Analysing the HFC and PFC column

#### HFC

In [None]:
gases

In [None]:
cleaned_data.head()

In [None]:
countries_name

In [None]:
cleaned_data[cleaned_data["HFC"].isnull()==True]

As we can see, 99 rows have NaN values in their HFC column, so we can ignore them during our analysis.

In [None]:
gas_accord_country1(gases[4],countries_name[:10])

In [None]:
gas_accord_country2(gases[4],countries_name[:10])

In [None]:
gas_accord_country1(gases[4],countries_name[10:20])

In [None]:
gas_accord_country2(gases[4],countries_name[10:20])

In [None]:
gas_accord_country1(gases[4],countries_name[20:30])

In [None]:
gas_accord_country2(gases[4],countries_name[20:30])

In [None]:
gas_accord_country1(gases[4],countries_name[30:40])

In [None]:
gas_accord_country2(gases[4],countries_name[30:40])

In [None]:
gas_accord_country1(gases[4],countries_name[40:])

In [None]:
gas_accord_country2(gases[4],countries_name[40:])

#### PFC

In [None]:
cleaned_data[cleaned_data["PFC"].isnull()==True]

Around 20% of the data is not available in the **PFC** column. It's a big irregularity, but analysis and an approximation can still be done from the remaining 80% of the data.

In [None]:
gas_accord_country1(gases[8],countries_name[10:20])

In [None]:
gas_accord_country2(gases[8],countries_name[10:20])

In [None]:
gas_accord_country1(gases[8],countries_name[20:30])

In [None]:
gas_accord_country2(gases[8],countries_name[20:30])

In [None]:
gas_accord_country1(gases[8],countries_name[30:40])

In [None]:
gas_accord_country2(gases[8],countries_name[30:40])

In [None]:
gas_accord_country1(gases[8],countries_name[40:43])

In [None]:
gas_accord_country2(gases[8],countries_name[40:43])

### Analysing the NF3 column

In [None]:
cleaned_data[cleaned_data["NF3"].isnull()==False]

Only **248** rows have a **non-null** value in the **NF3** column, which shows how little data is available. Hence we are going to create another dataframe that contains only the rows with non-null NF3 values, and we will use that for the analysis.

In [None]:
nf3_data = cleaned_data[cleaned_data["NF3"].isnull()==False].reset_index()

In [None]:
nf3_data.groupby("country_or_area").count()

In [None]:
nf3_countries = nf3_data.groupby("country_or_area").count().index

In [None]:
nf3_countries

In [None]:
gas_accord_country1(gases[7],nf3_countries)

### Analysing the SF6 column 

In [None]:
len(cleaned_data[cleaned_data["SF6"].isnull()==True])

As we can see only a small chunk of data has Null value in the SF6 column, we can safely plot the data and analyse it.

In [None]:
gas_accord_country1(gases[9],countries_name[:10])

In [None]:
gas_accord_country2(gases[9],countries_name[:10])

In [None]:
gas_accord_country1(gases[9],countries_name[10:20])

In [None]:
gas_accord_country2(gases[9],countries_name[10:20])

In [None]:
gas_accord_country1(gases[9],countries_name[20:30])

In [None]:
gas_accord_country2(gases[9],countries_name[20:30])

In [None]:
gas_accord_country1(gases[9],countries_name[30:40])

In [None]:
gas_accord_country2(gases[9],countries_name[30:40])

In [None]:
gas_accord_country1(gases[9],countries_name[40:])

In [None]:
gas_accord_country2(gases[9],countries_name[40:])

### Analysing the N2O Column

In [None]:
gases

In [None]:
len(cleaned_data[cleaned_data["N2O"].isnull()==True])

In [None]:
gas_accord_country1(gases[6],countries_name[:10])

In [None]:
gas_accord_country2(gases[6],countries_name[:10])

In [None]:
gas_accord_country1(gases[6],countries_name[10:20])

In [None]:
gas_accord_country2(gases[6],countries_name[10:20])

In [None]:
gas_accord_country1(gases[6],countries_name[20:30])

In [None]:
gas_accord_country2(gases[6],countries_name[20:30])

In [None]:
gas_accord_country1(gases[6],countries_name[30:40])

In [None]:
gas_accord_country2(gases[6],countries_name[30:40])

In [None]:
gas_accord_country1(gases[6],countries_name[40:43])

In [None]:
# Open-ended slice: plots every remaining country, like the other final-set cells.
gas_accord_country2(gases[6],countries_name[40:])

### Analysing CO2 and GHG emission

In [None]:
len(cleaned_data[cleaned_data["GHG"].isnull()==True])

In [None]:
gases

In [None]:
gas_accord_country1(gases[1],countries_name[:5])

In [None]:
gas_accord_country1(gases[2],countries_name[:5])

In [None]:
cleaned_data.head()

In [None]:
cleaned_data["difference"] = cleaned_data["GHG"] - cleaned_data["CO2"]

In [None]:
cleaned_data.head()

In [None]:
gas_accord_country2(gases[1],countries_name[:5])

In [None]:
gas_accord_country1(gases[1],countries_name[5:10])

In [None]:
gas_accord_country2(gases[1],countries_name[5:10])

In [None]:
gas_accord_country1(gases[1],countries_name[10:15])

In [None]:
gas_accord_country2(gases[1],countries_name[10:15])

In [None]:
gas_accord_country1(gases[1],countries_name[15:25])

In [None]:
gas_accord_country2(gases[1],countries_name[15:25])

In [None]:
gas_accord_country1(gases[1],countries_name[25:30])

In [None]:
gas_accord_country2(gases[1],countries_name[25:30])

In [None]:
gas_accord_country1(gases[1],countries_name[30:35])

In [None]:
gas_accord_country2(gases[1],countries_name[30:35])

In [None]:
gas_accord_country1(gases[1],countries_name[35:40])

In [None]:
gas_accord_country2(gases[1],countries_name[35:40])

In [None]:
gas_accord_country1(gases[1],countries_name[40:43])

In [None]:
gas_accord_country2(gases[1],countries_name[40:43])

**GHG** will give plots similar to the ones above, with different ranges of values on the x and y axes, so the analysis will focus on some countries having high GHG emission.

In [None]:
new_table = pd.pivot_table(replaced_emission, values='value',index=['category'],columns=['country_or_area'])

In [None]:
new_table

In [None]:
clean_new_table = new_table.fillna(0)

In [None]:
clean_new_table['Australia'].index

In [None]:
clean_new_table = clean_new_table.reset_index()

In [None]:
clean_new_table['Australia']

In [None]:
# Remove the two aggregate GHG rows by category name rather than by position
# (they sit at positions 2 and 3), so re-running this cell cannot drop other gases.
aggregate_categories = ["GHG", "GHG(Indirect CO2)"]
clean_new_table = clean_new_table[~clean_new_table["category"].isin(aggregate_categories)]

### Checking Gas Emissions in a country

Let's define a function, that can check for the plot WRT the country name passed.

In [None]:
def check_country(name):
    """Plot the per-category values of ``clean_new_table`` for one country.

    Parameters
    ----------
    name : str
        Column of ``clean_new_table`` (a country) to draw against ``category``.

    Returns
    -------
    None
        The plot is drawn on a new matplotlib figure.
    """
    # Set the default size before plotting: a figure picks up its size when it is
    # created, so setting it afterwards only affects the next figure.
    plt.rcParams["figure.figsize"] = [15, 10]
    clean_new_table.plot(x = 'category', y = name)
    plt.tick_params(labelsize=14)
    plt.xlabel("Category Of GreenHouse Gases",fontsize=14)
    plt.legend(fontsize = 20)

In [None]:
def tabulation_new(name):
    """Print each category's value for one country together with its percentage share.

    Parameters
    ----------
    name : str
        Column of ``clean_new_table`` (a country) to tabulate.

    Returns
    -------
    None
        A table with the columns ``category``, ``name`` and ``Percent`` is printed.
    """
    point = clean_new_table[name].sum()
    # .assign builds a new frame, so no column is ever written onto a slice of
    # clean_new_table (which would raise SettingWithCopyWarning).
    data_storage = clean_new_table[['category', name]].assign(
        Percent=lambda frame: frame[name] / point * 100
    )
    print(data_storage)

#### Checking the plot with the country names

In [None]:
clean_new_table.columns

In [None]:
check_country('Australia')

In [None]:
tabulation_new('Australia')

In [None]:
check_country('Belgium')

In [None]:
tabulation_new('Belgium')

In [None]:
check_country('Canada')

In [None]:
tabulation_new('Canada')

In [None]:
check_country('European Union')

In [None]:
tabulation_new('European Union')

In [None]:
check_country('France')

In [None]:
tabulation_new('France')

In [None]:
check_country('Germany')

In [None]:
tabulation_new('Germany')

In [None]:
check_country('Italy')

In [None]:
# Tabulate the same country that the preceding cell plotted.
tabulation_new('Italy')

In [None]:
check_country('Japan')

In [None]:
tabulation_new('Japan')

In [None]:
check_country('New Zealand')

In [None]:
tabulation_new('New Zealand')

In [None]:
check_country('Norway')

In [None]:
tabulation_new('Norway')

In [None]:
clean_new_table.columns

In [None]:
check_country('Russian Federation')

In [None]:
tabulation_new('Russian Federation')

### Top 10 countries responsible for GHG Emission

In [None]:
clean_new_table

In [None]:
new_table2 = pd.pivot_table(replaced_emission, values='value',index=['country_or_area'],columns=['category'])

In [None]:
new_table2 = new_table2.fillna(0)

In [None]:
# Leave out the aggregate GHG categories, as is done for clean_new_table.
# NOTE(review): they appear to be totals over the individual gases, so summing them
# together with the per-gas columns would count emissions more than once - confirm
# against the dataset description.
sum_of_total_emission = new_table2.drop(columns=["GHG", "GHG(Indirect CO2)"]).sum(axis=1)

In [None]:
new_table2['Total'] = sum_of_total_emission

In [None]:
# Show only the ten largest totals, as the section heading promises.
sum_of_total_emission.sort_values(ascending=False).head(10)