In [2]:
import pandas as pd
from tasks.compute_index import format_df_for_computation, ST
from index.IndexComputation.GreenGrowthIndex import GreenGrowthIndex

import random
import numpy as np

In [11]:
YEAR = 2020

# Import data
data = pd.read_csv('data/full_data/data.csv')
data = format_df_for_computation(data)
indicators = data.loc[YEAR].set_index('ISO') # Do Analysis on single year for now

In [4]:
# Compute index
GGI = GreenGrowthIndex(indicators=indicators, sustainability_targets=ST) # Index computation
test= GGI.to_long() # Results ! 

In [13]:
test

Unnamed: 0_level_0,Variable,Value,Aggregation
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AGO,Index,51.667633,Index
ALB,Index,71.414849,Index
ARG,Index,58.238744,Index
ARM,Index,62.930841,Index
AUS,Index,67.085630,Index
...,...,...,...
XKX,GEO,,Dimension
YEM,GEO,,Dimension
ZAF,GEO,42.623329,Dimension
ZMB,GEO,,Dimension


# Tasks

Perform a Monte Carlo Analysis to study the effect of **missing values**. 

Step:
    - write a function that randomly removes value in the indicators dataframe. The function needs a parameter to control the percentage of missing values
    - Compute the index on a great number of modified dataframe (100 to 1000)
    - Combine the results to get uncertainty intervals
    

In [7]:
def monte_carlo_analysis(indicators , n):
    """
    df_indicators : dataset
    n: percentage of nan in new dataset
    """
    
    shape_old_Indicators = indicators.shape
    print("Shape old indicator", shape_old_Indicators)
    print("total number of nan in old indicator", indicators.isna().sum().sum())
    
    
    # modified indicators
    df_indicators = indicators
    ix = [(row, col) for row in range(df_indicators.shape[0]) for col in range(df_indicators.shape[1])]
    for row, col in random.sample(ix, int(round(n*len(ix)))):
        df_indicators.iat[row, col] = 0.00 #np.nan
    
    shape_modified_Indicators = df_indicators.shape
    print("Shape modified indicator", shape_modified_Indicators)
    print("total number of nan in modified indicator", df_indicators.isna().sum().sum())   
    
    # compute new index
    GGI_new = GreenGrowthIndex(indicators=df_indicators, sustainability_targets=ST) # Index computation
    GGI_new.to_long() # Results ! 
    print("new index =============")
    
    return GGI_new.to_long().sort_values(by='Value' , ascending=False).head(10)

In [8]:
monte_carlo_analysis(indicators , 0.1) # n = 10%

Shape old indicator (248, 44)
total number of nan in old indicator 2845
Shape modified indicator (248, 44)
total number of nan in modified indicator 2563


Unnamed: 0_level_0,Variable,Value,Aggregation
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAF,EQ2,8584.968676,Indicator
ISL,GN2,7926.925611,Indicator
TCD,EQ2,7667.563311,Indicator
NOR,GN2,6388.110506,Indicator
SOM,EQ2,4286.436405,Indicator
NGA,EQ2,4239.846675,Indicator
TGO,EQ2,4038.852438,Indicator
BDI,EQ2,3906.237729,Indicator
KWT,EW2,3850.5,Indicator
SSD,EQ2,3763.44363,Indicator


In [10]:
GGI.to_long().sort_values(by='Value' , ascending=False).head(10)

Unnamed: 0_level_0,Variable,Value,Aggregation
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAF,EQ2,8584.968676,Indicator
ISL,GN2,7926.925611,Indicator
TCD,EQ2,7667.563311,Indicator
NOR,GN2,6388.110506,Indicator
NER,EQ2,5703.31493,Indicator
SOM,EQ2,4286.436405,Indicator
NGA,EQ2,4239.846675,Indicator
TGO,EQ2,4038.852438,Indicator
BDI,EQ2,3906.237729,Indicator
KWT,EW2,3850.5,Indicator


In [11]:
monte_carlo_analysis(indicators , 0.5) # n = 50%

Shape old indicator (248, 44)
total number of nan in old indicator 2563
Shape modified indicator (248, 44)
total number of nan in modified indicator 1292


Unnamed: 0_level_0,Variable,Value,Aggregation
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAF,EQ2,8584.968676,Indicator
ISL,GN2,7926.925611,Indicator
SOM,EQ2,4286.436405,Indicator
NGA,EQ2,4239.846675,Indicator
TGO,EQ2,4038.852438,Indicator
BDI,EQ2,3906.237729,Indicator
SSD,EQ2,3763.44363,Indicator
GNB,EQ2,3546.849296,Indicator
CMR,EQ2,3306.786379,Indicator
SWE,GN2,2834.902468,Indicator


In [13]:
monte_carlo_analysis(indicators , 0.99) # n = 99%

Shape old indicator (248, 44)
total number of nan in old indicator 255
Shape modified indicator (248, 44)
total number of nan in modified indicator 2


Unnamed: 0_level_0,Variable,Value,Aggregation
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ZWE,ME,100.0,Category
NRU,ME1,100.0,Indicator_normed
NFK,ME1,100.0,Indicator_normed
NGA,ME1,100.0,Indicator_normed
NIC,ME1,100.0,Indicator_normed
NIU,ME1,100.0,Indicator_normed
NLD,ME1,100.0,Indicator_normed
NOR,ME1,100.0,Indicator_normed
NPL,ME1,100.0,Indicator_normed
NZL,ME1,100.0,Indicator_normed


In [None]:
# Combine the results to get uncertainty intervals