## The Engie Factor Team NOTEBOOK


<img src="https://www.engie.com/themes/engie/assets/img/thumbnail-engie.png" width=430 align="left">

## Authors
Name, Kaggle ID, email<br>
Sergio Garcia, SergioGarciadea, QK5945@engie.com<br>
Louis Giron, LGiron, Louis.Giron@engie.com<br>
Abderrahim Benseguane,AbderrahimBenseguane, abderrahim.benseguane@engie.com<br>
## CREDITS:
### The following example Jupyter Notebooks helped us get started:
#### https://www.kaggle.com/paultimothymooney/overview-of-the-eie-analytics-challenge
#### https://www.kaggle.com/parulpandey/understanding-the-data-wip
#### https://www.kaggle.com/paultimothymooney/how-to-get-started-with-the-earth-engine-data
#### https://www.kaggle.com/ragnar123/exploratory-data-analysis-and-factor-model-idea

## Useful links:
#### https://www.atmos-chem-phys.net/16/5283/2016/acp-16-5283-2016.pdf

In [None]:
from scipy import integrate
from IPython.display import Math, Latex, display, HTML

from matplotlib import animation
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime as dt
from datetime import datetime 

# rasterio and folium are only used by the map / GeoTIFF helpers defined below,
# so a failed import is tolerated. It is reported instead of being swallowed,
# because those helpers raise NameError later if the packages are missing.
try:
    import rasterio as rio
    import folium
except Exception as import_error:
    print('Optional geospatial packages unavailable:', repr(import_error))
import tifffile as tiff 
import random 

import os
from tqdm import tqdm
import ast

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import seaborn as sns

def plot_points_on_map(dataframe,begin_index,end_index,latitude_column,latitude_value,longitude_column,longitude_value,zoom):
    """Build a folium map with one marker per row of ``dataframe[begin_index:end_index]``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``primary_fuel`` column plus the two coordinate columns.
    begin_index, end_index : int
        Positional slice bounds selecting the rows to plot.
    latitude_column, longitude_column : str
        Names of the columns holding each marker's coordinates.
    latitude_value, longitude_value : float
        Coordinates the map is centred on.
    zoom : int
        Initial zoom level passed to ``folium.Map``.

    Returns
    -------
    folium.Map
        The map with all markers added.
    """
    df = dataframe[begin_index:end_index]
    plot = folium.Map(location=[latitude_value, longitude_value], zoom_start=zoom)
    for i in range(len(df)):
        row = df.iloc[i]
        # Use the scalar fuel name: str() of a one-row Series slice would also
        # embed the index label and the "Name: ..., dtype: ..." footer.
        popup = folium.Popup(str(row['primary_fuel']))
        folium.Marker([row[latitude_column], row[longitude_column]], popup=popup).add_to(plot)
    return plot

def overlay_puerto_rico(file_name,band_layer,lat,lon,zoom, bnds=[[18.6,-67.3,],[17.9,-65.7]]):
    """Overlay one band of a raster file on a folium map.

    Parameters
    ----------
    file_name : str
        Path of the raster file opened with rasterio.
    band_layer : int
        Index of the band passed to ``dataset.read``.
    lat, lon : float
        Coordinates the map is centred on.
    zoom : int
        Initial zoom level.
    bnds : list
        ``[[lat, lon], [lat, lon]]`` corners the image is stretched over.
        The default list is only read, never mutated.

    Returns
    -------
    folium.Map
        The map with the band drawn as a red overlay whose opacity follows the
        pixel value.
    """
    # Only the requested band is read, and the dataset is closed as soon as
    # the pixels are in memory.
    with rio.open(file_name) as dataset:
        band = dataset.read(band_layer)

    m = folium.Map([lat, lon], zoom_start=zoom)
    folium.raster_layers.ImageOverlay(
        image=band,
        bounds=bnds,
        colormap=lambda x: (1, 0, 0, x),
    ).add_to(m)

    return m



In [None]:
data_dir = '/kaggle/input/ds4g-environmental-insights-explorer/eie_data/'

# Collect the path of every GFS weather file. Sorting makes the list order
# reproducible, since os.walk yields entries in arbitrary filesystem order.
WEATHER_images = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(data_dir + 'gfs/')
    for filename in filenames
)

print(len(WEATHER_images))

In [None]:
# Collect the path of every Sentinel-5P NO2 file. Sorting makes the list order
# (and therefore positional picks such as NO2_images[89]) reproducible, since
# os.walk yields entries in arbitrary filesystem order.
NO2_images = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(data_dir + 's5p_no2/')
    for filename in filenames
)

print(len(NO2_images))

In [None]:
example = tiff.imread(NO2_images[89])

In [None]:
example.shape

In [None]:
latitude=18.1429005246921; longitude=-65.4440010699994
overlay_puerto_rico('/kaggle/input/my-backgrounds/division.png',band_layer=4,lat=latitude,lon=longitude,zoom=8,
                             )

In [None]:
# Average bands 3 and 4 (0-based) of every GFS image, keyed by the timestamp
# encoded at the end of the file name (...YYYYMMDDHH.tif).
# One record per timestamp is collected first and the frame is built once, so
# the numeric columns keep a float dtype instead of becoming object columns.
wind_records = {}
for image_path in tqdm(WEATHER_images):
    img = tiff.imread(image_path)
    timestamp = pd.to_datetime(image_path[-14:-6]) + dt.timedelta(hours=int(image_path[-6:-4]))
    # A repeated timestamp overwrites the earlier record (last file wins).
    wind_records[timestamp] = {
        'path': image_path,
        'WIND_X_M_SEC': np.average(np.nan_to_num(img[:, :, 3])),
        'WIND_Y_M_SEC': np.average(np.nan_to_num(img[:, :, 4])),
    }

WIND_data = pd.DataFrame.from_dict(wind_records, orient='index').sort_index()

### We are going to focus on bands 3 and 4, so that we are centered around 600 nm
### Source: Wikipedia, https://en.m.wikipedia.org/wiki/File:Mid-infrared_absorption_spectra_of_Gases.png
## ---------
![](https://upload.wikimedia.org/wikipedia/commons/thumb/3/34/Mid-infrared_absorption_spectra_of_Gases.png/440px-Mid-infrared_absorption_spectra_of_Gases.png)
## ---------

In [None]:
# NOTE: We are using nansum, given that sometimes the images had some NaN values in these band layers

# One record per image, keyed by the timestamp encoded at the end of the file
# name (..._YYYYMMDDTHHMMSS.tif). The frame is built once from the records so
# the NO2 columns keep a float dtype, and it is sorted by date straight away.
no2_records = {}
for image_path in tqdm(NO2_images):
    img = tiff.imread(image_path)

    # Sector boundaries come from the image being summed: two rows (north /
    # south) by three columns (west / centre / east).
    half_rows = int(img.shape[0]/2)
    west_cols = int(img.shape[1]/3)
    east_cols = int(img.shape[1]*2/3)
    no2_bands = img[:, :, 2:4]

    timestamp = pd.to_datetime(image_path[-19:-11]) + dt.timedelta(hours=int(image_path[-10:-8]))
    # A repeated timestamp overwrites the earlier record (last file wins).
    no2_records[timestamp] = {
        'path': image_path,
        'NO2': np.nansum(no2_bands),
        'NW_NO2': np.nansum(no2_bands[:half_rows, :west_cols]),
        'SW_NO2': np.nansum(no2_bands[half_rows:, :west_cols]),
        'N_NO2': np.nansum(no2_bands[:half_rows, west_cols:east_cols]),
        'S_NO2': np.nansum(no2_bands[half_rows:, west_cols:east_cols]),
        'NE_NO2': np.nansum(no2_bands[:half_rows, east_cols:]),
        'SE_NO2': np.nansum(no2_bands[half_rows:, east_cols:]),
    }

NO2_data = pd.DataFrame.from_dict(no2_records, orient='index').sort_index()

In [None]:
sns.set_style("darkgrid") #Available styles: darkgrid, whitegrid, dark, white, and ticks

In [None]:
# Draw on an explicit Axes: calling DataFrame.plot(figsize=...) after
# plt.figure() opens a second figure and leaves the first one empty.
# The data is sorted by date so the line is drawn in chronological order.
fig, ax = plt.subplots(figsize=(14,11))
NO2_data[['NO2']].sort_index().plot(ax=ax)
ax.set(xlabel='Date', ylabel='NO2 level')
plt.show()

In [None]:
NO2_data = NO2_data.sort_index()

In [None]:
# Let's compare a couple of regions

In [None]:
# Draw on an explicit Axes: calling DataFrame.plot(figsize=...) after
# plt.figure() opens a second figure and leaves the first one empty.
fig, ax = plt.subplots(figsize=(14,11))
NO2_data[['NE_NO2', 'SW_NO2']].plot(ax=ax)
ax.set(xlabel='Date', ylabel='NO2 level')
plt.show()

In [None]:
NO2_data.head()

In [None]:
graphs = ['NW_NO2','SW_NO2','N_NO2','S_NO2','NE_NO2','SE_NO2']
fig, ax = plt.subplots(2, 3, sharex='col', figsize=(17,9))

# graphs lists the sectors west -> east as (north, south) pairs, so index
# 2*j + i visits each of the six sectors exactly once and lays them out
# geographically: northern sectors on the top row, southern on the bottom row.
for i in range(2):
    for j in range(3):
        region = graphs[2*j + i]
        ax_tmp = ax[i, j]
        ax_tmp.set(xlabel='Date', ylabel='NO2 level')
        ax_tmp.plot(NO2_data[region])
        ax_tmp.text(0.05,0.95,str(region),fontsize=11, color='grey',transform=ax_tmp.transAxes)


In [None]:
fig, ax = plt.subplots(2, 3, sharex='col', figsize=(17,9))

# graphs lists the sectors west -> east as (north, south) pairs, so index
# 2*j + i visits each of the six sectors exactly once and lays them out
# geographically: northern sectors on the top row, southern on the bottom row.
for i in range(2):
    for j in range(3):
        region = graphs[2*j + i]
        ax_tmp = ax[i, j]
        ax_tmp.set(xlabel='Date', ylabel='NO2 daily variation')
        ax_tmp.plot(NO2_data[region].diff())
        ax_tmp.text(0.05,0.95,str(region)+' Delta',fontsize=11, color='grey',transform=ax_tmp.transAxes)

In [None]:
# Heatmap of the different region's NO2 level correlations

# 'path' holds file names, so it is dropped, and the remaining columns are cast
# to float explicitly rather than relying on fillna to convert object columns.
Var_Corr = NO2_data.drop(columns=['path']).astype(float).fillna(0).corr()
sns.heatmap(Var_Corr, xticklabels=Var_Corr.columns, yticklabels=Var_Corr.columns, annot=True, cmap = 'coolwarm')

In [None]:
for g in graphs:
    NO2_data[g+'_diff'] = NO2_data[g].diff()

In [None]:
# Heatmap of the different region's NO2 level variations correlations
# The columns are cast to float explicitly rather than relying on fillna to
# convert object columns; the first row of each diff is NaN and is filled with 0.
diff_columns = ['NW_NO2_diff','SW_NO2_diff','N_NO2_diff','S_NO2_diff','NE_NO2_diff','SE_NO2_diff']
Var_Corr = NO2_data[diff_columns].astype(float).fillna(0).corr()
sns.heatmap(Var_Corr, xticklabels=Var_Corr.columns, yticklabels=Var_Corr.columns, annot=True, cmap = 'coolwarm')

In [None]:
NO2_data.head(3)

In [None]:
NO2_data.median()

In [None]:
NO2_data.var()

## Loading the Power Plant data frame
<img src="https://cdn.pixabay.com/photo/2012/04/11/11/33/lightning-27588_960_720.png" width=230 align="left">

In [None]:
#Power plants in Puerto Rico

power_df = pd.read_csv(data_dir+'gppd/gppd_120_pr.csv')
#power_df.head(3).T

In [None]:
power_df.describe()

# Removing non-polluting Hydro, Solar, and Wind

In [None]:
# Keep every plant whose primary fuel is not Hydro, Solar or Wind.
dirty_power_df = power_df[~power_df.primary_fuel.isin(['Hydro', 'Solar', 'Wind'])]

In [None]:
dirty_power_df.describe()

## Total Installed Capacity - Polluting Capacity
### Removing renewable plants that do not contribute to emissions

In [None]:
# Total capacity of the polluting (non-renewable) plants
dirty_capacity_mw = dirty_power_df['capacity_mw'].sum()
print('Total Installed Dirty Capacity: '+'{:.2f}'.format(dirty_capacity_mw) + ' MW')


In [None]:
dirty_capacity = (dirty_power_df.groupby(['primary_fuel'])['capacity_mw'].sum()).to_frame()
dirty_capacity = dirty_capacity.sort_values('capacity_mw',ascending=False)
dirty_capacity['percentage_of_total'] = (dirty_capacity['capacity_mw']/dirty_capacity_mw)*100
dirty_capacity

### Estimated generation

In [None]:
# Total estimated generation of the polluting plants.
# The variable keeps its name because the next cell refers to it, but the
# value is a sum of the 'estimated_generation_gwh' column, i.e. it is in GWh.
total_gen_mw = dirty_power_df['estimated_generation_gwh'].sum()
print('Total Generation: '+'{:.2f}'.format(total_gen_mw) + ' GWh')

In [None]:
generation = (dirty_power_df.groupby(['primary_fuel'])['estimated_generation_gwh'].sum()).to_frame()
generation = generation.sort_values('estimated_generation_gwh',ascending=False)
generation['percentage_of_total'] = (generation['estimated_generation_gwh']/total_gen_mw)*100
generation

## Wind Information
<img src="https://cdn.pixabay.com/photo/2013/07/13/10/23/weather-157129_1280.png" width=230 align="left">

In [None]:
WIND_data.head(5)

## Applying an exponential moving average function to the various wind speed measurements to get a better idea of the <i>Average Wind Speed</i>

In [None]:
WIND_data['WIND_X_M_SEC'] = WIND_data['WIND_X_M_SEC'].ewm(com=0.67).mean()
WIND_data['WIND_Y_M_SEC'] = WIND_data['WIND_Y_M_SEC'].ewm(com=0.67).mean()
#df['Wind_Speed_X'].rolling(4).apply(integrate.trapz)

In [None]:
def wind_to_X_pixels(wind_speed):
    """Convert a horizontal wind speed into a displacement in image pixels.

    The speed is first turned into a distance in km, then scaled by the ratio
    of the image width (475 px) to the approximate island width (200 km).

    Parameters
    ----------
    wind_speed : float
        Wind speed along the image x axis (the caller passes WIND_X_M_SEC).

    Returns
    -------
    float
        Displacement along x, in pixels; the sign follows ``wind_speed``.
    """
    island_width_km = 200  # the island is approx. 200 km wide
    image_width_px = 475
    # 3600 / 1000 turns m/s into km/h. The factor 6.0 and the halving give the
    # "average km / period displacement" -- presumably a 6-hour period; confirm.
    km_per_unit_speed = (3600 / 1000.0 * 6.0)/2.0
    displacement_km = wind_speed * km_per_unit_speed
    return (displacement_km/island_width_km)*image_width_px
    
def wind_to_Y_pixels(wind_speed):
    """Convert a vertical wind speed into a displacement in image pixels.

    The speed is first turned into a distance in km, then scaled by the ratio
    of the image height (148 px) to the approximate island height (75 km).

    Parameters
    ----------
    wind_speed : float
        Wind speed along the image y axis (the caller passes WIND_Y_M_SEC).

    Returns
    -------
    float
        Displacement along y, in pixels; the sign follows ``wind_speed``.
    """
    # NOTE(review): the sign is passed through unchanged. If positive speeds
    # mean "towards north" while image rows grow towards the south, the result
    # would need to be negated -- confirm against the wind data convention.
    island_height_km = 75  # the island's height is approx. 75 km
    image_height_px = 148
    # 3600 / 1000 turns m/s into km/h. The factor 6.0 and the halving give the
    # "average km / period displacement" -- presumably a 6-hour period; confirm.
    km_per_unit_speed = (3600 / 1000.0 * 6.0)/2.0
    displacement_km = wind_speed * km_per_unit_speed
    return (displacement_km/island_height_km)*image_height_px

In [None]:
WIND_data['DAILY_DISP_X_PX'] = WIND_data['WIND_X_M_SEC'].apply(wind_to_X_pixels)

In [None]:
WIND_data['DAILY_DISP_Y_PX'] = WIND_data['WIND_Y_M_SEC'].apply(wind_to_Y_pixels)

In [None]:
WIND_data.head()

In [None]:
WIND_data.describe()

### An interesting thing to note is that wind in the East to West direction is very constant, but in the vertical direction component it sometimes goes North to South and sometimes South to North.

In [None]:
all_data = pd.concat([NO2_data, WIND_data], axis=1, join='inner')

In [None]:
all_data.head()

In [None]:
all_data.describe()

## Building the simulation DataFrame

## Simple Generative Model
<div align="center">
<i>“What I cannot create, I do not understand”</i>
<img src="https://upload.wikimedia.org/wikipedia/commons/0/06/Richard_Feynman_1959.png" width=230 align="center">
</div>


### We are going to simulate particles being emitted every day in the morning, proportional to the daily energy generated by each plant times the Emission Factor for that type of energy (we are only working with dirty types of energy). We will then calculate the correlation between the estimated pollution in the simulation and the measured NO2 in each sector. Finally, we will repeat the process varying the emission factors for each of the dirty fuels, seeking to minimize the following formula:

In [None]:
display(Math(r'\min F(C^F,O^F,G^F) = 1- \sqrt{\frac{corr_{NW}^2 + corr_{NE}^2 + corr_{N}^2 …}{6}}'))

### The emission factors that minimize the above are our proposed emission factor (EF) estimates

In [None]:
def convert_geo_px(long, lat):
    """Map a longitude / latitude pair to pixel coordinates of the reference NO2 image.

    The position is interpolated linearly between the image bounds, using one
    fixed Sentinel-5P file under ``data_dir`` as the reference grid.

    Parameters
    ----------
    long : float
        Longitude, in the units of the image bounds.
    lat : float
        Latitude, in the units of the image bounds.

    Returns
    -------
    tuple of float
        ``(x_px, y_px)``: x grows from the left edge, y grows from the top edge
        downwards. Values are not clipped, so points outside the image give
        coordinates outside ``[0, width)`` / ``[0, height)``.
    """
    image = data_dir + 's5p_no2/s5p_no2_20180701T161259_20180707T175356.tif'

    # The function is called once per plant, so the dataset is opened in a
    # context manager to release the file handle on every call.
    with rio.open(image) as sat_data:
        bounds = sat_data.bounds
        width_px = sat_data.width
        height_px = sat_data.height
        # The dataset's affine transformation applied to pixel (0, 0) gives the
        # coordinates of the upper-left corner of the image.
        left, top = sat_data.transform * (0, 0)

    width_in_projected_units = bounds.right - bounds.left
    height_in_projected_units = bounds.top - bounds.bottom

    return width_px*((long-left)/width_in_projected_units),\
            height_px*((top-lat)/height_in_projected_units)

In [None]:
# One row per day of all_data and one column per sector, filled with NaN until
# the simulation writes into it. dtype=float keeps the columns numeric; without
# it an all-empty frame is created with object columns.
sim_df = pd.DataFrame(index=all_data.index,
                      columns=['NW_NO2', 'SW_NO2', 'N_NO2', 'S_NO2', 'NE_NO2', 'SE_NO2'],
                      dtype=float)

In [None]:
sim_df.head()

In [None]:
# Take an explicit copy: the following cells overwrite and add columns on
# emission_sources, which must not act on (or warn about) a slice of
# dirty_power_df.
emission_sources = dirty_power_df[['primary_fuel', 'estimated_generation_gwh' ,'.geo']].copy()

In [None]:
emission_sources['estimated_generation_gwh']/=365

In [None]:
emission_sources = emission_sources.rename(columns={"estimated_generation_gwh": "daily_gwh"})

In [None]:
emission_sources.head()

In [None]:
emission_sources['.geo'] = emission_sources['.geo'].apply(lambda x: ast.literal_eval(x))

In [None]:
emission_sources['.geo'] = emission_sources['.geo'].apply(lambda x: x.get('coordinates'))

In [None]:
emission_sources['source_px'] = emission_sources['.geo'].apply(lambda x: convert_geo_px(x[0],x[1]))

In [None]:
emission_sources

In [None]:
def in_interval(x, interval):
    """Tell whether ``x`` lies in the half-open range ``[interval[0], interval[1])``.

    Parameters
    ----------
    x : number
        Value to test.
    interval : sequence
        ``interval[0]`` is the inclusive lower bound, ``interval[1]`` the
        exclusive upper bound.

    Returns
    -------
    bool
        True when ``interval[0] <= x < interval[1]``, otherwise False (which
        includes a NaN ``x``, since every comparison with NaN is false).
    """
    below_upper = x < interval[1]
    return bool(below_upper and x >= interval[0])

In [None]:
def assign_to_zone(pdot_x, pdot_y, pdot_size, x_size=475, y_size=148):
    """Split one pollution dot between the six image sectors.

    The image (``x_size`` by ``y_size`` pixels) is cut into a 2 x 3 grid: rows
    split at ``int(y_size/2)`` (north / south) and columns split at
    ``int(x_size/3)`` and ``int(x_size*2/3)`` (west / centre / east). The dot
    is treated as a disc of area ``pdot_size`` centred on
    ``(pdot_x, pdot_y)``, so its radius is ``sqrt(pdot_size / pi)``.

    Rules, evaluated for the sector containing the centre (the "main" sector):

    * centre outside the image: ``[['HS', pdot_size]]``;
    * disc entirely inside the main sector: everything goes to it;
    * disc crossing the north or south edge: the overhang is split off along
      the y axis only (an x overhang is ignored in that case);
    * otherwise, disc crossing the west or east edge: split along the x axis;
    * any other geometry (for example a disc wider than the sector):
      everything goes to the main sector.

    The amount split off is ``pi * overhang**2``, where ``overhang`` is how far
    the disc reaches past the edge. This is a simple approximation, not the
    exact area of the circular segment. It is credited to the neighbouring
    sector when there is one and dropped when the edge is the image border.

    Parameters
    ----------
    pdot_x, pdot_y : float
        Centre of the dot in pixel coordinates (x to the east, y to the south).
    pdot_size : float
        Area of the dot, which is also the amount of pollution it carries.
    x_size, y_size : int, optional
        Image width and height in pixels.

    Returns
    -------
    list of [str, float]
        ``[zone_name, amount]`` pairs; never ``None``.
    """
    col_edges = [0, int(x_size/3), int(x_size*2/3), x_size]
    row_edges = [0, int(y_size/2), y_size]
    zone_names = [['NW_NO2', 'N_NO2', 'NE_NO2'],
                  ['SW_NO2', 'S_NO2', 'SE_NO2']]

    def _inside(value, low, high):
        # Half-open test, identical to in_interval(value, [low, high]).
        return low <= value < high

    def _locate(value, edges):
        # Index of the band [edges[k], edges[k+1]) holding value, or None.
        for k in range(len(edges) - 1):
            if _inside(value, edges[k], edges[k + 1]):
                return k
        return None

    row = _locate(pdot_y, row_edges)
    col = _locate(pdot_x, col_edges)
    if row is None or col is None:
        # The centre is outside the image ("HS").
        return [['HS', pdot_size]]

    main_zone = zone_names[row][col]
    y_low, y_high = row_edges[row], row_edges[row + 1]
    x_low, x_high = col_edges[col], col_edges[col + 1]

    # pdot_size is considered as a surface
    r = np.sqrt(pdot_size/np.pi)

    north_in = _inside(pdot_y - r, y_low, y_high)
    south_in = _inside(pdot_y + r, y_low, y_high)
    west_in = _inside(pdot_x - r, x_low, x_high)
    east_in = _inside(pdot_x + r, x_low, x_high)

    def _split(overhang, neighbour, neighbour_first=False):
        # Move pi * overhang**2 from the main sector to the neighbour, if any.
        surface_diff = np.pi*overhang**2
        shares = [[main_zone, pdot_size - surface_diff]]
        if neighbour is not None:
            if neighbour_first:
                shares.insert(0, [neighbour, surface_diff])
            else:
                shares.append([neighbour, surface_diff])
        return shares

    if north_in and south_in:
        if west_in and east_in:
            # The entire disc is in the main sector.
            return [[main_zone, pdot_size]]
        if east_in and not west_in:
            # Spills over the west edge.
            neighbour = zone_names[row][col - 1] if col > 0 else None
            return _split(x_low - (pdot_x - r), neighbour)
        if west_in and not east_in:
            # Spills over the east edge.
            neighbour = zone_names[row][col + 1] if col < len(col_edges) - 2 else None
            return _split(x_high - (pdot_x + r), neighbour)
    elif south_in and not north_in:
        # Spills over the north edge; the northern neighbour is listed first.
        neighbour = zone_names[row - 1][col] if row > 0 else None
        return _split(y_low - (pdot_y - r), neighbour, neighbour_first=True)
    elif north_in and not south_in:
        # Spills over the south edge.
        neighbour = zone_names[row + 1][col] if row < len(row_edges) - 2 else None
        return _split(y_high - (pdot_y + r), neighbour)

    # The disc overflows both sides of an axis (or its radius is not a finite
    # number): keep the whole dot in the sector that holds its centre, so the
    # caller always receives a list.
    return [[main_zone, pdot_size]]

In [None]:
def simulate_emissions(EFs, sim_df, emission_sources, all_data):
    """Fill ``sim_df`` with the simulated pollution per sector and per day.

    For every day in ``sim_df.index`` each emission source releases a dot of
    size ``EFs[primary_fuel] * daily_gwh``, located at the source pixel shifted
    by that day's wind displacement. Each dot is shared out between the sectors
    by ``assign_to_zone``; the parts that fall outside the image ('HS') are
    ignored.

    Parameters
    ----------
    EFs : dict
        Emission factor per ``primary_fuel`` value.
    sim_df : pandas.DataFrame
        Frame indexed by day with the six ``*_NO2`` sector columns. It is
        overwritten in place.
    emission_sources : pandas.DataFrame
        Needs the columns ``primary_fuel``, ``daily_gwh`` and ``source_px``
        (an ``(x, y)`` pixel pair).
    all_data : pandas.DataFrame
        Needs the columns ``DAILY_DISP_X_PX`` and ``DAILY_DISP_Y_PX`` for
        every day in ``sim_df.index``.

    Returns
    -------
    pandas.DataFrame
        The same ``sim_df`` object, for convenience.
    """
    regions = ['NW_NO2', 'SW_NO2','N_NO2','S_NO2', 'NE_NO2','SE_NO2']

    for day in sim_df.index:
        # The wind displacement is the same for every source on a given day,
        # so it is looked up once per day instead of once per source.
        wind = all_data.loc[day]
        shift_x = wind['DAILY_DISP_X_PX']
        shift_y = wind['DAILY_DISP_Y_PX']

        day_pollution = pd.Series(0.0, index=regions)

        for _, source in emission_sources.iterrows():
            pollution_size = EFs.get(source['primary_fuel'])*source['daily_gwh']
            pdot_x = source['source_px'][0] + shift_x
            pdot_y = source['source_px'][1] + shift_y

            # Assign the dot to the matching sectors of the island, ignoring
            # the parts outside the image boundaries. "or []" guards against
            # an assign_to_zone that returns nothing for an unusual geometry.
            for zone, amount in (assign_to_zone(pdot_x, pdot_y, pollution_size) or []):
                if zone != 'HS':
                    day_pollution[zone] += amount

        sim_df.loc[day] = day_pollution

    return sim_df

In [None]:
# First Emission Factors Guess, every type roughly equal
EFs = {'Oil':.3334,'Coal':0.3333,'Gas':0.3333}
sim_df = simulate_emissions(EFs, sim_df, emission_sources, all_data)

In [None]:
sim_df.sum()

In [None]:
def evaluate_simulation(sim_df, all_data):
    """Score a simulation against the measurements; lower is better.

    For every column of ``sim_df`` the Pearson correlation with the column of
    the same name in ``all_data`` is computed. The score is
    ``1 - sqrt(mean(corr**2))`` over those columns, so it is 0 for a perfect
    (anti-)correlation in every sector and 1 for no correlation at all.

    Parameters
    ----------
    sim_df : pandas.DataFrame
        Simulated values, one column per sector.
    all_data : pandas.DataFrame
        Measured values; must contain every column of ``sim_df``.

    Returns
    -------
    float
        The score. It is NaN when any correlation is undefined (for example a
        constant column), and 1.0 when ``sim_df`` has no columns.
    """
    regions = list(sim_df.columns)
    if not regions:
        return 1.0

    sum_sq_corr = 0.0
    for region in regions:
        simulated = sim_df[region].astype('float')
        measured = all_data[region].astype('float')
        sum_sq_corr += simulated.corr(measured)**2

    # Average over the sectors actually present rather than a fixed six.
    return 1 - np.sqrt(sum_sq_corr/len(regions))

In [None]:
###############################
# lower indicates a better fit
###############################
evaluate_simulation(sim_df, all_data)

In [None]:
# let's try giving a higher value to coal
EFs = {'Oil':.40,'Coal':0.40,'Gas':0.20}
simulate_emissions(EFs, sim_df, emission_sources, all_data)
evaluate_simulation(sim_df, all_data)

In [None]:
EFs = {'Oil':.50,'Coal':0.4,'Gas':0.10}
simulate_emissions(EFs, sim_df, emission_sources, all_data)
evaluate_simulation(sim_df, all_data)

In [None]:
def random_EFs():
    """Draw a random set of relative emission factors that sum to 1.

    Three integers are drawn uniformly from 0..1000 and divided by their sum.

    Returns
    -------
    dict
        ``{'Oil': ..., 'Coal': ..., 'Gas': ...}`` with non-negative values
        adding up to 1.
    """
    r_sum = 0
    # All three draws can be 0 (about one chance in a billion), which would
    # make the normalisation divide by zero; draw again in that case.
    while r_sum == 0:
        r_1 = random.randint(0,1000)
        r_2 = random.randint(0,1000)
        r_3 = random.randint(0,1000)
        r_sum = r_1+r_2+r_3
    return {'Oil':r_1/r_sum,'Coal':r_2/r_sum,'Gas':r_3/r_sum}

In [None]:
####################################################################
# Let's test a hundred random EFs to get an idea of a reasonable fit
####################################################################

# Seed the generator so the random search (and the best fit it reports) is
# reproducible when the notebook is re-run from a fresh kernel.
random.seed(42)

best_fit_EFs = {}
best_fit = 1.0
for _ in tqdm(range(100)):
    EFs = random_EFs()
    simulate_emissions(EFs, sim_df, emission_sources, all_data)
    fit = evaluate_simulation(sim_df, all_data)
    # Lower is better. A NaN fit never compares as smaller, so it is skipped.
    if fit < best_fit:
        best_fit = fit
        best_fit_EFs = EFs.copy()
print("##############")
print("Best fit was: ", best_fit_EFs)

### It seems clear that gas is relatively clean when compared to coal and oil, but still is responsible for some emissions.

## <b>For the final EF's we'll use bayesian optimisation to get more accurate results.</b>

In [None]:
# Finally we will just adjust the best fit emission factor to the proper units
# Each fuel's best-fit share is multiplied by the summed measured NO2 and
# divided by the total estimated generation (GWh) of the polluting plants.
# NOTE(review): 1 GWh = 1e6 kWh, so turning a per-GWh value into a per-kWh
# value would mean dividing by 1e6; the "*1000" below does not look like that
# conversion -- confirm the intended units before quoting these numbers.
EF_oil = all_data['NO2'].sum()*best_fit_EFs['Oil']/generation['estimated_generation_gwh'].sum()*1000
                                                                                # Mult. by 1000 to convert to kWH
EF_coal = all_data['NO2'].sum()*best_fit_EFs['Coal']/generation['estimated_generation_gwh'].sum()*1000 
EF_gas = all_data['NO2'].sum()*best_fit_EFs['Gas']/generation['estimated_generation_gwh'].sum()*1000 

In [None]:
# The unit suffix is a raw string too: "\;" (a LaTeX space) is not a valid
# Python escape sequence and triggers a warning in a normal string literal.
display(Math(r'EF_{Oil} = ' + str(round(EF_oil,2)) + r'\;NO2\;units\;per\;kWh'))
display(Math(r'EF_{Coal} = ' + str(round(EF_coal,2)) + r'\;NO2\;units\;per\;kWh'))
display(Math(r'EF_{Gas} = ' + str(round(EF_gas,2)) + r'\;NO2\;units\;per\;kWh'))

## Hyperopt - for a better search strategy using Bayesian optimization

In [None]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error

#### HyperOpt instead of random testing

In [None]:
# pearsonr is part of the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module.
from scipy.stats import pearsonr

In [None]:
def acc_model_corr(EFs, all_data, emission_sources, sim_df):
    """Run the simulation for one set of emission factors and score it.

    The score is ``1 - sqrt(mean(corr**2))``, where ``corr`` is the Pearson
    correlation between the simulated and the measured series of each of the
    six sectors. Lower is better.

    Parameters
    ----------
    EFs : dict
        Emission factor per fuel, or a dict holding that mapping under the key
        ``'params'`` (the form the search space is sampled in).
    all_data : pandas.DataFrame
        Measurements and wind displacements, indexed by day.
    emission_sources : pandas.DataFrame
        Emission sources handed to ``simulate_emissions``.
    sim_df : pandas.DataFrame
        Simulation frame; it is overwritten in place by the simulation.

    Returns
    -------
    float
        The score.
    """
    if 'params' in EFs:
        EFs = EFs['params']

    sim_df = simulate_emissions(EFs, sim_df, emission_sources, all_data)

    regions = ['NW_NO2', 'SW_NO2', 'N_NO2', 'S_NO2', 'NE_NO2', 'SE_NO2']

    # Sum of the squared correlations over the sectors. The series are cast to
    # float first because sim_df may hold its numbers in object columns.
    corr_sq_sum = 0
    for region in regions:
        simulated = sim_df[region].astype(float)
        measured = all_data[region].astype(float)
        corr_sq_sum += pearsonr(simulated, measured)[0]**2

    metric = 1 - np.sqrt(corr_sq_sum/len(regions))

    return metric

In [None]:
def f(EFs):
    """Objective handed to hyperopt's ``fmin``: the score of one candidate.

    Parameters
    ----------
    EFs : dict
        One sample of the search space, passed on to ``acc_model_corr``.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}``. The loss is the score, capped
        at 1.0; a score that is not below 1.0 (including NaN) is reported
        as 1.0.
    """
    loss = acc_model_corr(EFs, all_data, emission_sources, sim_df)
    if not (loss < 1.0):
        loss = 1.0
    return {'loss': loss, 'status': STATUS_OK}

In [None]:
# random_EFs() is defined once, in the random-search section above, and reused
# here instead of being redefined. It pre-samples the candidate emission-factor
# sets that hyperopt chooses between.
EFs_list = [random_EFs() for _ in range(1000)]
params = {'params': hp.choice('params', EFs_list)}

In [None]:
trials = Trials()
params_best = fmin(f, params, algo=tpe.suggest, max_evals=100, trials=trials)

In [None]:
best_fit_EFs = EFs_list[params_best['params']]

In [None]:
best_fit_EFs

In [None]:
# Finally we will just adjust the best fit emission factor to the proper units
# Each fuel's best-fit share is multiplied by the summed measured NO2 and
# divided by the total estimated generation (GWh) of the polluting plants.
# NOTE(review): 1 GWh = 1e6 kWh, so turning a per-GWh value into a per-kWh
# value would mean dividing by 1e6; the "*1000" below does not look like that
# conversion -- confirm the intended units before quoting these numbers.
EF_oil = all_data['NO2'].sum()*best_fit_EFs['Oil']/generation['estimated_generation_gwh'].sum()*1000
                                                                                # Mult. by 1000 to convert to kWH
EF_coal = all_data['NO2'].sum()*best_fit_EFs['Coal']/generation['estimated_generation_gwh'].sum()*1000 
EF_gas = all_data['NO2'].sum()*best_fit_EFs['Gas']/generation['estimated_generation_gwh'].sum()*1000 

In [None]:
# The unit suffix is a raw string too: "\;" (a LaTeX space) is not a valid
# Python escape sequence and triggers a warning in a normal string literal.
display(Math(r'EF_{Oil} = ' + str(round(EF_oil,2)) + r'\;NO2\;units\;per\;kWh'))
display(Math(r'EF_{Coal} = ' + str(round(EF_coal,2)) + r'\;NO2\;units\;per\;kWh'))
display(Math(r'EF_{Gas} = ' + str(round(EF_gas,2)) + r'\;NO2\;units\;per\;kWh'))