## Assignment 3
Haider, Shabih, 21027325

In [10]:
import numpy as np
import pandas as pd
import wbgapi as wb
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster


In [2]:
# This function reads the data file and manipulates dataframe into our desired format
def world_bank_dfParser(df_file_name):
    df = pd.read_csv(df_file_name, index_col=[0,1])
    
    # Returning 2 Data Frames, One with years as columns, other with countries as columns
    return df, df.T   

In [3]:
# This function retrieves data from world bank website and saves it into a file
def get_world_bank_data(c_codes):
    # Indicator ids
    indicator_ids = ['EG.USE.PCAP.KG.OE', 'EN.ATM.METH.KT.CE', 'EN.ATM.GHGT.KT.CE']
    print(wb.series.info(indicator_ids))
    
    # Fetching data from World bank for listed countries and indicators
    wb_dataframe = wb.data.DataFrame(indicator_ids, country_codes, time=range(2010,2015)) # retrieving data for 2010 - 2015
    
    # Renaming indicator Ids with actual names
    wb_dataframe = wb_dataframe.rename(index={
        'EG.USE.PCAP.KG.OE' : 'Energy use', 
        'EN.ATM.METH.KT.CE' : 'Methane emissions', 
        'EN.ATM.GHGT.KT.CE' : 'Total greenhouse gas emissions'
    })
    
    # Renaming country and indicator column names
    wb_dataframe.index.rename(['Country', 'Indicators'], inplace=True)
    
    # Checking for null values in dataframe
    display(wb_dataframe.isnull().sum())
    
    # Writing received data to csv file
    print('Writing data to csv file.......')
    wb_dataframe.to_csv('wb_dataframe.csv', index=True)
    print('Done')
    


In [4]:
# Selected countries
country_codes = ['CHN', 'FRA', 'DEU', 'IND', 'USA', 'WLD']
country_names = {
    "CHN" : "China",
    "FRA" : "France",
    "DEU" : "Germany",
    "IND" : "India",
    "USA" : "United States",
    "WLD" : "World"
}

# This following function is used to fetch the data from WB API. Only call this function if you want to update the data.
get_world_bank_data(country_codes)

df_1, df_2 = world_bank_dfParser('wb_dataframe.csv')

id                 value
-----------------  -----------------------------------------------------
EG.USE.PCAP.KG.OE  Energy use (kg of oil equivalent per capita)
EN.ATM.METH.KT.CE  Methane emissions (kt of CO2 equivalent)
EN.ATM.GHGT.KT.CE  Total greenhouse gas emissions (kt of CO2 equivalent)
                   3 elements


YR2010    0
YR2011    0
YR2012    0
YR2013    0
YR2014    0
dtype: int64

Writing data to csv file.......
Done


In [18]:
df_2

Country,CHN,CHN,CHN,DEU,DEU,DEU,FRA,FRA,FRA,IND,IND,IND,USA,USA,USA,WLD,WLD,WLD
Indicators,Energy use,Total greenhouse gas emissions,Methane emissions,Energy use,Total greenhouse gas emissions,Methane emissions,Energy use,Total greenhouse gas emissions,Methane emissions,Energy use,Total greenhouse gas emissions,Methane emissions,Energy use,Total greenhouse gas emissions,Methane emissions,Energy use,Total greenhouse gas emissions,Methane emissions
YR2010,1954.722556,0.0,0.0,1.0,0.934923,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
YR2011,2085.083022,0.524066,0.297514,0.415198,0.220815,0.554348,0.525448,0.629661,0.515738,0.218118,0.220918,0.977337,0.545707,0.518508,0.312677,0.150436,0.459635,0.405315
YR2012,2149.602569,0.719842,0.612937,0.44797,0.571535,0.746377,0.496334,0.689884,0.302663,0.500574,0.515557,0.076487,0.0,0.0,0.0,0.375937,0.673468,0.653795
YR2013,2204.243299,0.993512,0.802009,0.735546,1.0,0.416667,0.487607,0.709816,0.210654,0.589183,0.643441,0.96034,0.116004,0.295311,0.008179,0.429133,0.921906,0.695214
YR2014,2224.354898,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.306347,0.3383,0.083674,1.0,1.0,1.0


In [5]:
# Some statistical information about the data. Statistical analysis is performed on all years.
df_2.groupby(['Country', 'Indicators'], axis = 1).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Country,Indicators,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CHN,Energy use,5.0,2123.601,108.7821,1954.723,2085.083,2149.603,2204.243,2224.355
CHN,Total greenhouse gas emissions,5.0,11279610.0,686914.1,10203820.0,11074550.0,11399830.0,11854530.0,11865310.0
CHN,Methane emissions,5.0,1141070.0,56770.93,1063830.0,1106190.0,1151100.0,1178020.0,1206210.0
DEU,Energy use,5.0,3892.567,81.6741,3779.462,3869.816,3876.948,3939.53,3997.079
DEU,Total greenhouse gas emissions,5.0,865620.0,17570.72,843660.0,852550.0,866670.0,881300.0,883920.0
DEU,Methane emissions,5.0,58660.0,1033.755,57160.0,58310.0,58690.0,59220.0,59920.0
FRA,Energy use,5.0,3838.64,126.5881,3659.088,3833.534,3836.656,3847.072,4016.848
FRA,Total greenhouse gas emissions,5.0,452400.0,17157.4,424130.0,453510.0,456320.0,457250.0,470790.0
FRA,Methane emissions,5.0,63536.0,1570.694,61860.0,62730.0,63110.0,63990.0,65990.0
IND,Energy use,5.0,596.2339,28.5288,561.6534,577.9944,599.1556,605.794,636.5718


In [11]:
def norm(array):
    min_val = np.min(array)
    max_val = np.max(array)
    
    scaled = (array-min_val) / (max_val-min_val)
    
    return scaled

def norm_df(df):
    # iterate over all columns
    for col in df.columns[1:]:     # excluding the first column
        df[col] = norm(df[col])
        
    return df

In [14]:
# normalise result and inspect results
normalised_df = norm_df(df_2)
print(normalised_df.describe())
print()


Country             CHN                                                   \
Indicators   Energy use Total greenhouse gas emissions Methane emissions   
count          5.000000                       5.000000          5.000000   
mean        2123.601269                       0.647484          0.542492   
std          108.782091                       0.413433          0.398728   
min         1954.722556                       0.000000          0.000000   
25%         2085.083022                       0.524066          0.297514   
50%         2149.602569                       0.719842          0.612937   
75%         2204.243299                       0.993512          0.802009   
max         2224.354898                       1.000000          1.000000   

Country           DEU                                                   \
Indicators Energy use Total greenhouse gas emissions Methane emissions   
count        5.000000                       5.000000          5.000000   
mean         0.51

In [15]:
def makeplot(df, col1, col2):
    plt.figure(figsize=(5.0,5.0))
    plt.plot(df[col1], df[col2], "o", markersize=3)
    
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.show()
    
    
# exploratory plots
makeplot(df_2, "Energy use", "Total greenhouse gas emissions")
makeplot(df_2, "Methane emissions", "Total greenhouse gas emissions")

KeyError: 'Energy use'

<Figure size 360x360 with 0 Axes>