# NO2 emission during the COVID-19 pandemic

## About the data


## Research question
1. Is there a difference in NO2 emission in the Netherlands caused by the lockdowns?


## Table of contents

In [21]:
# IMPORTS
import sys
import numpy as np
import pandas as pd
import yaml
from pathlib import Path
import glob
import os
from functools import partial
import random
from datetime import datetime
from IPython.display import Markdown as md

# Data imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

In [22]:
import hvplot.pandas
import holoviews as hv
from holoviews import dim
from bokeh.io import output_notebook, output_file
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot, column, layout, row
from bokeh.plotting import ColumnDataSource
from bokeh.models import DatetimeTickFormatter, DataTable, DateFormatter, TableColumn
from bokeh.models import CustomJS, Dropdown
from bokeh.models import Span
from bokeh.transform import jitter
import panel as pn
import regex as re


pn.extension()
output_notebook()

In [23]:
def get_config(file) -> dict:
    """
    Read in config file and return it as a dictionary.

    :parameter
    ----------
    file - String
        Location of config file
    
    :returns
    --------
    config - dict
        Configuration file in dictionary form.
    """
    try:
        with open(file, 'r') as stream:
            config = yaml.safe_load(stream)
    
        return config
    except FileNotFoundError as e:
        print(f"File: could not be found. Error {e}")
        sys.exit(1)

In [24]:
config = get_config("config.yaml")
data_dir = config['data']

In [25]:
class Data:
    
#     def __init__(self, test):
#         pass
    
    def read_NO2(self, file, *, skiprows=7, sep=";", encoding=None)-> pd.DataFrame:
        df = pd.read_csv(file, skiprows=skiprows, sep=sep, encoding=encoding)
        return df
    
    @staticmethod
    def rename_columns(df, columns) -> pd.DataFrame:
        df.rename(columns = columns, inplace = True)
        return df
    
    @staticmethod
    def reformat_date_column(column, *, format_date = '%Y%m%d%H%M') -> pd.Series:
        try:
            column = column.str.replace(" ", "").str.replace(":","")
            column = pd.to_datetime(column.astype(str), format=format_date)
            return column
        except AttributeError as e:
            print(f"Date column: {column.name}, has already been reformatted. Error: {e}")
            return column

    @staticmethod
    def missing_value_filter(df: pd.DataFrame, p_missing: float) -> pd.DataFrame:
        """
        Keep only the columns that pass the maximum required % of missing values.

        :parameters
        -----------
        df - pd.DataFrame
            Data frame
        p_missing - float
            Maximum percentage missing values allowed for a column.

        :returns
        --------
        df - pd.DataFrame
            Filtered data frame based on % missing values.
        """
        df = df[df.columns[df.isnull().mean() < p_missing]]
        return df
    
    @staticmethod
    def interpolate_values(df, method = 'cubicspline', limit_direction = 'forward', *args, **kwargs) -> pd.DataFrame:
        df.interpolate(method = method, limit_direction = limit_direction, *args, **kwargs)
        return df
    
    @staticmethod
    def tidy_df(df, id_vars, value_vars, var_name, value_name) -> pd.DataFrame:
        tidy_df = df.melt(id_vars=id_vars, value_vars=value_vars,
                          var_name = var_name,value_name = value_name)
        return tidy_df

    
data = Data()

In [26]:
class MetaData:
    
    def __init__(self, file):
        self.file = file
        self.df = self.read_data(self.file)
        self.df = self.remove_data(self.df, "StationsCode", axis = 0)
        self.create_lat_long_col()

    def read_data(self, file, *, nrows=6, sep=";", encoding='unicode_escape') -> pd.DataFrame:
        df = pd.read_csv(file, nrows=nrows, sep=sep, encoding=encoding).iloc[:, 4:].T
        df = self.rename_columns(df)
        return df
        
    def rename_columns(self, df) -> pd.DataFrame:
        df.columns = df.iloc[0,:]
        return df
    
    def create_lat_long_col(self) -> None:
        # Seperate the latitude and longitude column and assign it to their own column
        latitude_longitude = self.df["Latitude,Longitude"].str.strip("()").str.split(",", n = 1, expand = True)
        self.remove_data(self.df, "Latitude,Longitude", axis = 1) # Remove old column
        
        # Insert the chunk number column into the dataframe
        self.df.insert(1, column = "Latitude", value = latitude_longitude.iloc[:, 0])

        # Insert the patient id column into the dataframe
        self.df.insert(2, column = "Longitude", value = latitude_longitude.iloc[:, 1])
    
    @staticmethod
    def remove_data(df, name, *, axis = 1) -> pd.DataFrame:
        df.drop(name, axis = axis, inplace = True)
        return df
        
# metadata_instance = MetaData(file_2020)

# <a id="/2">1. Load data</a> 

First, load in the data from the year 2020. Then, load in the data of the year 2021 by combine all the seperate files into one data frame. 

## <a id="/3">1.1 Year 2020</a> 

In [27]:
file_2020 = Path(data_dir) / "2020" / "2020_NO2.csv"
# df_2020 = pd.read_csv(file_2020, skiprows=7, sep=";")
df_2020 = data.read_NO2(file_2020, skiprows=7, sep=";")
df_2020.head()

Unnamed: 0,Component,Bep.periode,Eenheid,Begindatumtijd,Einddatumtijd,NL01485,NL01487,NL01488,NL01489,NL01491,...,NL49022,NL49546,NL49551,NL49553,NL49561,NL49564,NL49565,NL49701,NL49703,NL49704
0,NO2,uur,�g/m�,20200101 00:00,20200101 01:00,45.8,48.5,47.8,36.9,41.4,...,30.4,26.8,35.2,21.5,27.8,13.1,22.9,49.2,21.6,17.1
1,NO2,uur,�g/m�,20200101 01:00,20200101 02:00,32.3,55.8,45.1,43.4,47.4,...,24.6,33.7,16.4,16.0,24.3,24.0,30.2,54.3,21.2,26.6
2,NO2,uur,�g/m�,20200101 02:00,20200101 03:00,32.3,42.8,32.9,39.3,37.4,...,22.9,39.6,23.9,24.8,27.6,26.9,30.6,50.9,22.4,32.6
3,NO2,uur,�g/m�,20200101 03:00,20200101 04:00,25.4,40.3,32.1,26.4,37.1,...,20.4,31.1,22.9,22.7,29.5,28.5,27.9,38.8,22.3,28.4
4,NO2,uur,�g/m�,20200101 04:00,20200101 05:00,24.3,31.3,24.3,23.1,27.1,...,25.1,26.7,26.3,25.0,29.1,25.8,27.0,29.1,25.8,28.7


### Clean data

In [28]:
# Rename the date columns
df_2020 = data.rename_columns(df_2020, columns = {"Begindatumtijd": "date_start", "Einddatumtijd": "date_end"})

# Set the datatype of the date and time colum to datetime.
df_2020['date_start'] = data.reformat_date_column(df_2020['date_start'], format_date='%Y%m%d%H%M')
df_2020['date_end'] = data.reformat_date_column(df_2020['date_end'], format_date='%Y%m%d%H%M')


df_2020.head()

Unnamed: 0,Component,Bep.periode,Eenheid,date_start,date_end,NL01485,NL01487,NL01488,NL01489,NL01491,...,NL49022,NL49546,NL49551,NL49553,NL49561,NL49564,NL49565,NL49701,NL49703,NL49704
0,NO2,uur,�g/m�,2020-01-01 00:00:00,2020-01-01 01:00:00,45.8,48.5,47.8,36.9,41.4,...,30.4,26.8,35.2,21.5,27.8,13.1,22.9,49.2,21.6,17.1
1,NO2,uur,�g/m�,2020-01-01 01:00:00,2020-01-01 02:00:00,32.3,55.8,45.1,43.4,47.4,...,24.6,33.7,16.4,16.0,24.3,24.0,30.2,54.3,21.2,26.6
2,NO2,uur,�g/m�,2020-01-01 02:00:00,2020-01-01 03:00:00,32.3,42.8,32.9,39.3,37.4,...,22.9,39.6,23.9,24.8,27.6,26.9,30.6,50.9,22.4,32.6
3,NO2,uur,�g/m�,2020-01-01 03:00:00,2020-01-01 04:00:00,25.4,40.3,32.1,26.4,37.1,...,20.4,31.1,22.9,22.7,29.5,28.5,27.9,38.8,22.3,28.4
4,NO2,uur,�g/m�,2020-01-01 04:00:00,2020-01-01 05:00:00,24.3,31.3,24.3,23.1,27.1,...,25.1,26.7,26.3,25.0,29.1,25.8,27.0,29.1,25.8,28.7


### Missing values

In [29]:
print(f"Missing values: {df_2020.iloc[:,5:].isnull().any()}")

Missing values: NL01485    True
NL01487    True
NL01488    True
NL01489    True
NL01491    True
           ... 
NL49564    True
NL49565    True
NL49701    True
NL49703    True
NL49704    True
Length: 73, dtype: bool


In [30]:
print(f" Check how many values there are missing per station:\n{df_2020.iloc[:,5:].isnull().sum()}")

 Check how many values there are missing per station:
NL01485    135
NL01487     30
NL01488     32
NL01489     29
NL01491    128
          ... 
NL49564     82
NL49565     68
NL49701     57
NL49703    130
NL49704     71
Length: 73, dtype: int64


In [31]:
# Check for station NL101485 which date the missing values are
# df_2020.query("NL10107 != NL10107")

In [32]:
p = 0.3
# Filter out all columns with to many missing values
df_2020 = data.missing_value_filter(df_2020, p)


df_2020

Unnamed: 0,Component,Bep.periode,Eenheid,date_start,date_end,NL01485,NL01487,NL01488,NL01489,NL01491,...,NL49022,NL49546,NL49551,NL49553,NL49561,NL49564,NL49565,NL49701,NL49703,NL49704
0,NO2,uur,�g/m�,2020-01-01 00:00:00,2020-01-01 01:00:00,45.8,48.5,47.8,36.9,41.4,...,30.4,26.8,35.2,21.5,27.8,13.1,22.9,49.2,21.6,17.1
1,NO2,uur,�g/m�,2020-01-01 01:00:00,2020-01-01 02:00:00,32.3,55.8,45.1,43.4,47.4,...,24.6,33.7,16.4,16.0,24.3,24.0,30.2,54.3,21.2,26.6
2,NO2,uur,�g/m�,2020-01-01 02:00:00,2020-01-01 03:00:00,32.3,42.8,32.9,39.3,37.4,...,22.9,39.6,23.9,24.8,27.6,26.9,30.6,50.9,22.4,32.6
3,NO2,uur,�g/m�,2020-01-01 03:00:00,2020-01-01 04:00:00,25.4,40.3,32.1,26.4,37.1,...,20.4,31.1,22.9,22.7,29.5,28.5,27.9,38.8,22.3,28.4
4,NO2,uur,�g/m�,2020-01-01 04:00:00,2020-01-01 05:00:00,24.3,31.3,24.3,23.1,27.1,...,25.1,26.7,26.3,25.0,29.1,25.8,27.0,29.1,25.8,28.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,NO2,uur,�g/m�,2020-12-31 19:00:00,2020-12-31 20:00:00,55.4,63.2,57.1,49.7,67.0,...,42.3,58.0,42.0,10.1,51.4,40.3,39.6,45.8,30.6,53.3
8780,NO2,uur,�g/m�,2020-12-31 20:00:00,2020-12-31 21:00:00,52.9,68.8,61.3,54.6,61.5,...,50.8,57.7,32.9,17.7,52.5,43.5,41.6,53.5,33.9,54.0
8781,NO2,uur,�g/m�,2020-12-31 21:00:00,2020-12-31 22:00:00,51.2,67.2,64.4,58.6,59.8,...,51.7,55.9,43.9,23.6,54.4,46.2,43.3,45.7,42.5,53.5
8782,NO2,uur,�g/m�,2020-12-31 22:00:00,2020-12-31 23:00:00,48.5,62.5,60.0,59.4,59.6,...,47.0,63.2,12.3,12.1,55.6,42.1,39.9,46.7,53.1,59.6


# Impute data frame based on mean Top Bottom method

Idea: use stations that are close to the station with missing data then train the data with any model and predict the missing values. K-nearest neighbours for example.


General link:
* https://towardsdatascience.com/different-imputation-methods-to-handle-missing-data-8dd5bce97583#d85f

Try stochastic regression imputation:  
1. https://henrikhain.io/post/stochastic-regression-imputation/
2. https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779 

Multiple imputation:  
1. https://towardsdatascience.com/multiple-imputation-with-random-forests-in-python-dec83c0ac55b
2. https://pubmed.ncbi.nlm.nih.gov/27862164/

PREDICTIVE MODELS (REGRESSION METHODS):
1. https://digitaltesseract.com/data-imputation-techniques-an-introduction/ 

Paper testing:
1. mean top bottom
2. linear regression
3. k nearest neighbours
4. multiple imputation

link: https://www.semanticscholar.org/paper/Imputation-methods-for-filling-missing-data-in-air-Zakaria-Noor/068aef2863fa856e8498f74674ecb4806df88c93
pdf: https://uac.incd.ro/Art/v9n2a04.pdf



## <a id="/4">1.2 Year 2021</a>  

In [33]:
# Map does not take keyword arugments: solution create a partial.
map_func = partial(data.read_NO2, skiprows=7, sep=";", encoding='unicode_escape')

# Merging the files 2021 csv files
list_2021 = glob.glob(os.path.join(data_dir, "2021", "*.csv"))
df_2021 = pd.concat(map(map_func, list_2021), ignore_index=True)
df_2021

Unnamed: 0,Component,Bep.periode,Eenheid,Begindatumtijd,Einddatumtijd,NL01485,NL01487,NL01488,NL01489,NL01491,...,NL49022,NL49546,NL49551,NL49553,NL49561,NL49564,NL49565,NL49701,NL49703,NL49704
0,NO2,uur,µg/m³,20210101 00:00,20210101 01:00,41.4,57.3,50.2,46.4,58.1,...,39.4,54.6,30.3,19.6,47.5,35.0,43.4,45.8,42.0,51.5
1,NO2,uur,µg/m³,20210101 01:00,20210101 02:00,41.8,51.4,47.8,43.0,56.2,...,36.0,53.4,39.7,17.9,49.2,33.6,44.4,40.2,43.3,48.9
2,NO2,uur,µg/m³,20210101 02:00,20210101 03:00,35.5,51.0,45.1,40.7,52.0,...,42.2,54.1,8.9,3.9,53.0,35.6,44.9,47.6,41.9,50.5
3,NO2,uur,µg/m³,20210101 03:00,20210101 04:00,20.4,52.9,49.9,45.3,54.6,...,41.2,55.1,9.0,3.0,56.6,38.9,56.0,42.7,39.3,47.1
4,NO2,uur,µg/m³,20210101 04:00,20210101 05:00,14.5,46.2,45.6,43.5,54.4,...,48.8,56.4,12.5,2.4,58.3,40.3,55.8,46.1,41.8,47.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8011,NO2,uur,µg/m³,20211130 19:00,20211130 20:00,,,,,,...,,,,,,,,,,
8012,NO2,uur,µg/m³,20211130 20:00,20211130 21:00,,,,,,...,,,,,,,,,,
8013,NO2,uur,µg/m³,20211130 21:00,20211130 22:00,,,,,,...,,,,,,,,,,
8014,NO2,uur,µg/m³,20211130 22:00,20211130 23:00,,,,,,...,,,,,,,,,,


### Clean data

In [34]:
df_2021 = data.rename_columns(df_2021, columns = {"Begindatumtijd": "date_start", "Einddatumtijd": "date_end"})

# Set the datatype of the date and time colum to datetime.
df_2021['date_start'] = data.reformat_date_column(df_2021['date_start'], format_date='%Y%m%d%H%M')
df_2021['date_end'] = data.reformat_date_column(df_2021['date_end'], format_date='%Y%m%d%H%M')


df_2021.head()

Unnamed: 0,Component,Bep.periode,Eenheid,date_start,date_end,NL01485,NL01487,NL01488,NL01489,NL01491,...,NL49022,NL49546,NL49551,NL49553,NL49561,NL49564,NL49565,NL49701,NL49703,NL49704
0,NO2,uur,µg/m³,2021-01-01 00:00:00,2021-01-01 01:00:00,41.4,57.3,50.2,46.4,58.1,...,39.4,54.6,30.3,19.6,47.5,35.0,43.4,45.8,42.0,51.5
1,NO2,uur,µg/m³,2021-01-01 01:00:00,2021-01-01 02:00:00,41.8,51.4,47.8,43.0,56.2,...,36.0,53.4,39.7,17.9,49.2,33.6,44.4,40.2,43.3,48.9
2,NO2,uur,µg/m³,2021-01-01 02:00:00,2021-01-01 03:00:00,35.5,51.0,45.1,40.7,52.0,...,42.2,54.1,8.9,3.9,53.0,35.6,44.9,47.6,41.9,50.5
3,NO2,uur,µg/m³,2021-01-01 03:00:00,2021-01-01 04:00:00,20.4,52.9,49.9,45.3,54.6,...,41.2,55.1,9.0,3.0,56.6,38.9,56.0,42.7,39.3,47.1
4,NO2,uur,µg/m³,2021-01-01 04:00:00,2021-01-01 05:00:00,14.5,46.2,45.6,43.5,54.4,...,48.8,56.4,12.5,2.4,58.3,40.3,55.8,46.1,41.8,47.1


### Missing values

In [35]:
print(f"Missing values:\n{df_2020.iloc[:,5:].isnull().any()}")

Missing values:
NL01485    True
NL01487    True
NL01488    True
NL01489    True
NL01491    True
           ... 
NL49564    True
NL49565    True
NL49701    True
NL49703    True
NL49704    True
Length: 73, dtype: bool


In [36]:
print(f"Check how many values there are missing per station:\n{df_2021.iloc[:,5:].isnull().sum()}")

Check how many values there are missing per station:
NL01485    3163
NL01487    2947
NL01488    3003
NL01489    2936
NL01491    3085
           ... 
NL49564    3787
NL49565    3709
NL49701    3687
NL49703    3899
NL49704    3836
Length: 73, dtype: int64


Inspecting the number of missing values per stations. It looks like that for the year 2021 a lot of stations contain a lot of missing values. Stations that contain to much missing values should be removed:

In [37]:
n_stations_2021 = len(df_2021.columns)

p = 0.15
# Filter out all columns with to many missing values
df_2021 = data.missing_value_filter(df_2021, p)

df_2021

Unnamed: 0,Component,Bep.periode,Eenheid,date_start,date_end,NL10107,NL10131,NL10133,NL10136,NL10138,...,NL10738,NL10741,NL10742,NL10807,NL10818,NL10918,NL10929,NL10934,NL10937,NL10938
0,NO2,uur,µg/m³,2021-01-01 00:00:00,2021-01-01 01:00:00,22.58,22.69,13.43,27.63,23.40,...,14.26,41.78,32.58,16.12,29.11,22.84,13.27,12.43,34.03,30.30
1,NO2,uur,µg/m³,2021-01-01 01:00:00,2021-01-01 02:00:00,21.79,23.09,16.36,26.93,19.52,...,12.45,42.19,33.53,15.62,32.72,20.13,14.64,15.12,24.56,22.76
2,NO2,uur,µg/m³,2021-01-01 02:00:00,2021-01-01 03:00:00,22.06,19.06,17.35,25.40,19.56,...,10.50,41.26,33.26,20.28,30.52,16.96,14.31,18.14,21.07,20.20
3,NO2,uur,µg/m³,2021-01-01 03:00:00,2021-01-01 04:00:00,20.73,25.61,15.83,25.33,20.44,...,9.52,40.00,32.02,24.03,30.22,18.91,17.05,13.87,20.22,18.98
4,NO2,uur,µg/m³,2021-01-01 04:00:00,2021-01-01 05:00:00,21.87,23.23,12.12,15.95,14.17,...,9.26,39.56,31.15,24.89,28.20,19.43,21.04,11.29,21.10,16.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8011,NO2,uur,µg/m³,2021-11-30 19:00:00,2021-11-30 20:00:00,12.36,13.51,9.59,10.96,8.26,...,17.87,24.45,28.44,4.90,7.32,4.97,3.47,4.35,29.99,11.30
8012,NO2,uur,µg/m³,2021-11-30 20:00:00,2021-11-30 21:00:00,11.26,15.39,7.32,9.31,7.86,...,15.57,23.51,28.75,6.93,6.52,3.33,5.33,8.37,28.89,12.02
8013,NO2,uur,µg/m³,2021-11-30 21:00:00,2021-11-30 22:00:00,9.34,13.00,6.84,12.06,10.17,...,10.47,18.70,24.28,8.56,6.61,15.93,6.43,7.35,17.58,10.20
8014,NO2,uur,µg/m³,2021-11-30 22:00:00,2021-11-30 23:00:00,7.48,9.65,8.42,8.17,8.97,...,9.80,17.22,24.54,13.63,11.27,14.04,6.10,5.42,15.49,7.29


In [38]:
# @hidden_cell
md(f"Number of stations removed: {n_stations_2021 - len(df_2021.columns)}")

Number of stations removed: 30

## <a id="/5">1.3 Filter out stations with no data</a>  

In [39]:
# Column names of the 2021 data frame
cols_to_keep = df_2021.columns.values.tolist()

# Only keep relevant columns
df_2020 = df_2020.loc[:,cols_to_keep]
df_2020.head()

Unnamed: 0,Component,Bep.periode,Eenheid,date_start,date_end,NL10107,NL10131,NL10133,NL10136,NL10138,...,NL10738,NL10741,NL10742,NL10807,NL10818,NL10918,NL10929,NL10934,NL10937,NL10938
0,NO2,uur,�g/m�,2020-01-01 00:00:00,2020-01-01 01:00:00,38.78,20.94,27.48,40.33,38.7,...,32.69,42.24,44.22,6.14,10.44,30.06,29.3,22.68,37.35,44.95
1,NO2,uur,�g/m�,2020-01-01 01:00:00,2020-01-01 02:00:00,38.31,26.56,37.05,43.46,37.7,...,21.93,37.59,30.06,9.11,6.53,21.39,25.27,25.78,35.03,35.87
2,NO2,uur,�g/m�,2020-01-01 02:00:00,2020-01-01 03:00:00,37.06,34.22,38.43,38.72,41.5,...,23.22,39.67,31.79,12.25,7.75,9.38,22.14,26.87,30.58,30.06
3,NO2,uur,�g/m�,2020-01-01 03:00:00,2020-01-01 04:00:00,35.16,34.82,37.74,33.25,34.47,...,22.2,38.14,32.77,15.16,8.96,6.79,16.38,24.02,22.66,22.74
4,NO2,uur,�g/m�,2020-01-01 04:00:00,2020-01-01 05:00:00,32.31,36.92,36.39,36.58,37.55,...,28.42,39.48,36.21,14.39,11.2,6.16,11.0,11.84,20.93,16.92


## <a id="/6">1.4 Impute missing data</a>  
https://digitaltesseract.com/data-imputation-techniques-an-introduction/

write a story about the decision 

In [40]:
def calculate_corr(df):
    """
    Calculate the correlation between different columns of a data frame.
    
    :parameters
    -----------
    df - pd.DataFrame
        Data frame
    
    :returns
    --------
    c - pd.DataFrame
        Correlation matrix
    """
    c = df.corr().abs()
    return c
    
    

## 2020

In [41]:
# Get the names of the different stations
stations = df_2020.columns[df_2020.columns.str.contains("NL")]

# calculate the correlation between each station for the year 2020
cor_stations_2020 = calculate_corr(df_2020.loc[:,stations])

cor_stations_2020

Unnamed: 0,NL10107,NL10131,NL10133,NL10136,NL10138,NL10230,NL10235,NL10236,NL10237,NL10240,...,NL10738,NL10741,NL10742,NL10807,NL10818,NL10918,NL10929,NL10934,NL10937,NL10938
NL10107,1.0,0.786493,0.715316,0.72973,0.814773,0.464179,0.537842,0.763899,0.715351,0.63608,...,0.667494,0.625002,0.740633,0.602159,0.586028,0.485667,0.526274,0.458137,0.467106,0.497633
NL10131,0.786493,1.0,0.607234,0.663923,0.718681,0.520781,0.529592,0.764157,0.68096,0.635263,...,0.73208,0.627954,0.757683,0.64909,0.607254,0.505326,0.59006,0.471963,0.435828,0.495598
NL10133,0.715316,0.607234,1.0,0.681717,0.793526,0.3564,0.484573,0.646003,0.581961,0.497754,...,0.567923,0.50865,0.611045,0.538856,0.515437,0.407384,0.429012,0.399553,0.45739,0.472347
NL10136,0.72973,0.663923,0.681717,1.0,0.848987,0.38228,0.382627,0.763496,0.727545,0.67512,...,0.553697,0.727328,0.709327,0.46094,0.493645,0.366129,0.388438,0.37078,0.567287,0.412287
NL10138,0.814773,0.718681,0.793526,0.848987,1.0,0.409894,0.496793,0.766782,0.707235,0.632761,...,0.644634,0.638456,0.732766,0.582537,0.589076,0.473212,0.492728,0.458794,0.505232,0.506966
NL10230,0.464179,0.520781,0.3564,0.38228,0.409894,1.0,0.283242,0.412983,0.370334,0.346175,...,0.374716,0.369199,0.425911,0.307071,0.291861,0.231281,0.347551,0.233359,0.205933,0.18191
NL10235,0.537842,0.529592,0.484573,0.382627,0.496793,0.283242,1.0,0.529989,0.505361,0.540504,...,0.627147,0.346167,0.524504,0.587549,0.593109,0.554693,0.518803,0.493699,0.427661,0.555352
NL10236,0.763899,0.764157,0.646003,0.763496,0.766782,0.412983,0.529989,1.0,0.90394,0.774329,...,0.708348,0.768457,0.842673,0.603488,0.6222,0.468061,0.47008,0.436595,0.609008,0.550909
NL10237,0.715351,0.68096,0.581961,0.727545,0.707235,0.370334,0.505361,0.90394,1.0,0.814266,...,0.658556,0.758463,0.807306,0.530467,0.55609,0.404466,0.387074,0.358492,0.617929,0.5179
NL10240,0.63608,0.635263,0.497754,0.67512,0.632761,0.346175,0.540504,0.774329,0.814266,1.0,...,0.66925,0.681749,0.735797,0.537364,0.571264,0.435276,0.401492,0.367103,0.63146,0.517321


In [42]:
# find columns to correlate well with a certain station
def find_correlated_stations(cor_df, station, cor_threshold):
    """
    Find stations that are closely related (highly correlated) to a specific station.
    
    :parameters
    -----------
    cor_df - pd.DataFrame
        Correlation matrix of the stations
    station - str
        Station of interest
    cor_threshold - float
        Cut off to be considered highly correlated
        
    :returns
    --------
    stations - list
        List of correlated stations
    """
    corr_station = cor_df.loc[:,cor_df.columns == station]
    stations = corr_station[corr_station[station] > cor_threshold].index.values.tolist()
    # stations.append(station)
    return stations

stations_NL10138 = find_correlated_stations(cor_stations_2020, "NL10138", 0.7)
stations_NL10138

['NL10107',
 'NL10131',
 'NL10133',
 'NL10136',
 'NL10138',
 'NL10236',
 'NL10237',
 'NL10247',
 'NL10742']

In [43]:
# imputer = IterativeImputer(BayesianRidge())
# new_df = pd.DataFrame(imputer.fit_transform(df_2020[stations_NL10138]), columns = stations_NL10138)
# # new_df["date_start"] = df_2020["date_start"]
# new_df.insert(0, "date_start", df_2020["date_start"].values)
# new_df

In [44]:
# Put this in a class
def data_impute_regression(col, df):
    """
    Use regression-based model to fill in the missing value for a station. Features used for 
    predictions are the stations that are highly correlated to one another. 
    
    :parameters
    -----------
    col - pd.Series
        Column of a data frame
    df - pd.DataFrame
        Data frame containing the features
    
    :returns
    --------
    imputed_col - pd.Series
        Imputed column of interest
    """
    stations = find_correlated_stations(cor_stations_2020, col.name, 0.7)
    imputer = IterativeImputer(BayesianRidge())
    imputed_df = pd.DataFrame(imputer.fit_transform(df[stations]), columns = stations)
    imputed_col = imputed_df[col.name]
    return imputed_col

In [45]:
# Imput the data for columns with missing values
df_2020_imp = df_2020.loc[:,stations].apply(lambda x: data_impute_regression(col=x, df=df_2020), axis=0)

cols_to_keep = ["Component", "Eenheid", "date_start", "date_end"]
df_2020_imp = pd.concat([df_2020.loc[:,cols_to_keep], df_2020_imp], axis=1)


[IterativeImputer] Early stopping criterion not reached.



In [46]:
df_2020_imp

Unnamed: 0,Component,Eenheid,date_start,date_end,NL10107,NL10131,NL10133,NL10136,NL10138,NL10230,...,NL10738,NL10741,NL10742,NL10807,NL10818,NL10918,NL10929,NL10934,NL10937,NL10938
0,NO2,�g/m�,2020-01-01 00:00:00,2020-01-01 01:00:00,38.78,20.94,27.48,40.33,38.70,23.65,...,32.69,42.24,44.22,6.14,10.44,30.06,29.30,22.68,37.35,44.95
1,NO2,�g/m�,2020-01-01 01:00:00,2020-01-01 02:00:00,38.31,26.56,37.05,43.46,37.70,35.95,...,21.93,37.59,30.06,9.11,6.53,21.39,25.27,25.78,35.03,35.87
2,NO2,�g/m�,2020-01-01 02:00:00,2020-01-01 03:00:00,37.06,34.22,38.43,38.72,41.50,28.47,...,23.22,39.67,31.79,12.25,7.75,9.38,22.14,26.87,30.58,30.06
3,NO2,�g/m�,2020-01-01 03:00:00,2020-01-01 04:00:00,35.16,34.82,37.74,33.25,34.47,22.57,...,22.20,38.14,32.77,15.16,8.96,6.79,16.38,24.02,22.66,22.74
4,NO2,�g/m�,2020-01-01 04:00:00,2020-01-01 05:00:00,32.31,36.92,36.39,36.58,37.55,18.47,...,28.42,39.48,36.21,14.39,11.20,6.16,11.00,11.84,20.93,16.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,NO2,�g/m�,2020-12-31 19:00:00,2020-12-31 20:00:00,20.33,13.69,11.98,27.05,15.15,24.92,...,16.16,42.22,28.04,12.44,14.87,17.47,13.86,16.16,31.23,22.49
8780,NO2,�g/m�,2020-12-31 20:00:00,2020-12-31 21:00:00,17.81,15.45,12.26,23.34,15.69,24.98,...,14.79,40.72,28.22,14.65,13.06,24.95,14.46,18.47,28.68,32.30
8781,NO2,�g/m�,2020-12-31 21:00:00,2020-12-31 22:00:00,19.03,15.01,13.61,19.41,15.88,26.05,...,17.44,37.69,30.29,13.95,16.32,31.54,12.25,16.48,31.84,28.30
8782,NO2,�g/m�,2020-12-31 22:00:00,2020-12-31 23:00:00,19.29,15.73,13.15,20.42,15.96,32.45,...,14.13,42.53,34.98,14.70,14.45,29.41,11.19,16.96,31.38,30.09


## 2021

In [47]:
# calculate the correlation between each station for the year 2021
cor_stations_2021 = calculate_corr(df_2021.loc[:,stations])

In [48]:
# Imput the data for columns with missing values
df_2021_imp = df_2021.loc[:,stations].apply(lambda x: data_impute_regression(col=x, df=df_2021), axis=0)
cols_to_keep = ["Component", "Eenheid", "date_start", "date_end"]
df_2021_imp = pd.concat([df_2021.loc[:,cols_to_keep], df_2021_imp], axis=1)


[IterativeImputer] Early stopping criterion not reached.



## <a id="/7">1.5 Meta Data</a>  

In [49]:
meta_data = MetaData(file_2020)

df_meta_data = meta_data.df
df_meta_data

StationsCode,Stationsnaam,Latitude,Longitude,Stationsgebied,Stationstype,Meetprincipe,Meetopstelling
NL01485,Hoogvliet-Leemkuil,51.867411,4.355242,stad,achtergrond,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser
NL01487,Rotterdam Zuid-Pleinweg,51.891147,4.48069,regionaal,verkeer,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser
NL01488,Rotterdam Zuid-Zwartewaalstraat,51.893617,4.487528,stad,achtergrond,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser
NL01489,Ridderkerk-Hogeweg,51.869431,4.580058,stad,verkeer,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser
NL01491,Rotterdam-Oost Sidelinge A13,51.938472,4.430692,stad,verkeer,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser
...,...,...,...,...,...,...,...
NL49564,Hoofddorp-Hoofdweg,52.327464,4.715008,onbekend,onbekend,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser
NL49565,Oude Meer-Aalsmeerderdijk,52.279991,4.770773,regionaal,achtergrond,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser
NL49701,Zaandam-Wagenschotpad,52.448011,4.816706,stad,achtergrond,Chemiluminescentie,Thermo model 42w NO/Nox analyser
NL49703,Amsterdam-Spaarnwoude,52.398437,4.728581,regionaal,achtergrond,Chemiluminescentie,Teledyne API 200E chemiluminescent Nox Analyser


## <a id="/7">1.6 Covid data</a>  

https://www.rijksoverheid.nl/onderwerpen/coronavirus-tijdlijn
https://nl.wikipedia.org/wiki/Maatregelen_tijdens_de_coronacrisis_in_Nederland#Maatregelen_naar_datum_van_aankondiging

Lockdown:
* 14 oktober 2020 - 14 december 2020: gedeeltelijke lockdown
* 15 december 2020 - 27 april 2021: harde lockdown
* 28 april 2021 - 25 september 2021: weinig maatregelen
* 26 september 2021 - 26 november 2021: maatregelen
* 27 november 2021 - 17 december 2021: avondlockdown
* 18 december 2021 -  5 januari 2022: lockdown

In [50]:
lockdowns = ["partial-lockdown", "hard-lockdown", "evening-lockdown", "corona-measures", "few-measures"]

In [51]:
def check_lockdown(start, end):
    """
    Check for each date if it was during a lockdown.
    """
    lockdown1 = (start >= pd.to_datetime('2020-10-14', utc= True)) & (end <= pd.to_datetime('2020-12-14', utc= True))
    lockdown2 = (start >= pd.to_datetime('2020-12-15', utc= True)) & (end <= pd.to_datetime('2021-04-27', utc= True))
    lockdown3 = (start >= pd.to_datetime('2021-12-18', utc= True)) & (end <= pd.to_datetime('2022-01-05', utc= True))
    return lockdown1, lockdown2, lockdown3

In [52]:
l1, l2, l3 = check_lockdown(df_2020['date_start'].dt.date, df_2020['date_end'].dt.date)

df_2020_imp["Lockdown"] = np.where(l1 | l2 | l3, "Lockdown", "No-lockdown")

# Change dtype
df_2020_imp["Lockdown"]  = df_2020_imp["Lockdown"].astype("category")


Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior.  In a future version these will be considered non-comparable.Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.



In [53]:
l1, l2, l3 = check_lockdown(df_2021['date_start'].dt.date, df_2021['date_end'].dt.date)

df_2021_imp.loc[:,"Lockdown"] = np.where(l1 | l2 | l3, "Lockdown", "No-lockdown")

# Change dtype
df_2021_imp.loc[:,"Lockdown"]  = df_2021_imp.loc[:,"Lockdown"].astype("category")

In [54]:
df_2021_imp

Unnamed: 0,Component,Eenheid,date_start,date_end,NL10107,NL10131,NL10133,NL10136,NL10138,NL10230,...,NL10741,NL10742,NL10807,NL10818,NL10918,NL10929,NL10934,NL10937,NL10938,Lockdown
0,NO2,µg/m³,2021-01-01 00:00:00,2021-01-01 01:00:00,22.58,22.69,13.43,27.63,23.40,35.88,...,41.78,32.58,16.12,29.11,22.84,13.27,12.43,34.03,30.30,Lockdown
1,NO2,µg/m³,2021-01-01 01:00:00,2021-01-01 02:00:00,21.79,23.09,16.36,26.93,19.52,38.31,...,42.19,33.53,15.62,32.72,20.13,14.64,15.12,24.56,22.76,Lockdown
2,NO2,µg/m³,2021-01-01 02:00:00,2021-01-01 03:00:00,22.06,19.06,17.35,25.40,19.56,35.85,...,41.26,33.26,20.28,30.52,16.96,14.31,18.14,21.07,20.20,Lockdown
3,NO2,µg/m³,2021-01-01 03:00:00,2021-01-01 04:00:00,20.73,25.61,15.83,25.33,20.44,28.71,...,40.00,32.02,24.03,30.22,18.91,17.05,13.87,20.22,18.98,Lockdown
4,NO2,µg/m³,2021-01-01 04:00:00,2021-01-01 05:00:00,21.87,23.23,12.12,15.95,14.17,21.38,...,39.56,31.15,24.89,28.20,19.43,21.04,11.29,21.10,16.14,Lockdown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8011,NO2,µg/m³,2021-11-30 19:00:00,2021-11-30 20:00:00,12.36,13.51,9.59,10.96,8.26,13.34,...,24.45,28.44,4.90,7.32,4.97,3.47,4.35,29.99,11.30,No-lockdown
8012,NO2,µg/m³,2021-11-30 20:00:00,2021-11-30 21:00:00,11.26,15.39,7.32,9.31,7.86,19.64,...,23.51,28.75,6.93,6.52,3.33,5.33,8.37,28.89,12.02,No-lockdown
8013,NO2,µg/m³,2021-11-30 21:00:00,2021-11-30 22:00:00,9.34,13.00,6.84,12.06,10.17,16.90,...,18.70,24.28,8.56,6.61,15.93,6.43,7.35,17.58,10.20,No-lockdown
8014,NO2,µg/m³,2021-11-30 22:00:00,2021-11-30 23:00:00,7.48,9.65,8.42,8.17,8.97,12.97,...,17.22,24.54,13.63,11.27,14.04,6.10,5.42,15.49,7.29,No-lockdown


### Merge covid cases

In [55]:
covid_data = pd.read_csv(Path(data_dir + "time_series_covid19_confirmed_global.csv"), sep=",")
covid_NL = covid_data[(covid_data["Country/Region"]=="Netherlands") & (covid_data["Province/State"].isnull())]

In [56]:
# covid_NL = covid_NL.melt(id_vars=["Country/Region", "Lat", "Long"],
#             value_vars=covid_NL.columns[4:],
#                         var_name = "Date",
#                         value_name = "covid_patients")

# Melt the covid data frame
covid_NL = data.tidy_df(covid_NL, id_vars=["Country/Region", "Lat", "Long"], value_vars=covid_NL.columns[4:],
                       var_name = "Date", value_name = "covid_patients")

covid_NL.Date = pd.to_datetime(covid_NL.Date.astype(str))
# Drop unwanted columns
covid_NL.drop(list(covid_NL.columns[:3]), axis=1, inplace=True)

In [57]:
covid_NL

Unnamed: 0,Date,covid_patients
0,2020-01-22,0
1,2020-01-23,0
2,2020-01-24,0
3,2020-01-25,0
4,2020-01-26,0
...,...,...
723,2022-01-14,3500915
724,2022-01-15,3532768
725,2022-01-16,3568999
726,2022-01-17,3611351


### 2020

In [58]:
# Merge df
NO2_cov_2020 = pd.merge_asof(df_2020_imp, covid_NL, left_on='date_start', right_on="Date")

NO2_cov_2020.drop("Date", axis=1, inplace=True)

NO2_cov_2020.covid_patients = NO2_cov_2020.covid_patients.fillna(0)

In [59]:
NO2_cov_2020

Unnamed: 0,Component,Eenheid,date_start,date_end,NL10107,NL10131,NL10133,NL10136,NL10138,NL10230,...,NL10742,NL10807,NL10818,NL10918,NL10929,NL10934,NL10937,NL10938,Lockdown,covid_patients
0,NO2,�g/m�,2020-01-01 00:00:00,2020-01-01 01:00:00,38.78,20.94,27.48,40.33,38.70,23.65,...,44.22,6.14,10.44,30.06,29.30,22.68,37.35,44.95,No-lockdown,0.0
1,NO2,�g/m�,2020-01-01 01:00:00,2020-01-01 02:00:00,38.31,26.56,37.05,43.46,37.70,35.95,...,30.06,9.11,6.53,21.39,25.27,25.78,35.03,35.87,No-lockdown,0.0
2,NO2,�g/m�,2020-01-01 02:00:00,2020-01-01 03:00:00,37.06,34.22,38.43,38.72,41.50,28.47,...,31.79,12.25,7.75,9.38,22.14,26.87,30.58,30.06,No-lockdown,0.0
3,NO2,�g/m�,2020-01-01 03:00:00,2020-01-01 04:00:00,35.16,34.82,37.74,33.25,34.47,22.57,...,32.77,15.16,8.96,6.79,16.38,24.02,22.66,22.74,No-lockdown,0.0
4,NO2,�g/m�,2020-01-01 04:00:00,2020-01-01 05:00:00,32.31,36.92,36.39,36.58,37.55,18.47,...,36.21,14.39,11.20,6.16,11.00,11.84,20.93,16.92,No-lockdown,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,NO2,�g/m�,2020-12-31 19:00:00,2020-12-31 20:00:00,20.33,13.69,11.98,27.05,15.15,24.92,...,28.04,12.44,14.87,17.47,13.86,16.16,31.23,22.49,Lockdown,796981.0
8780,NO2,�g/m�,2020-12-31 20:00:00,2020-12-31 21:00:00,17.81,15.45,12.26,23.34,15.69,24.98,...,28.22,14.65,13.06,24.95,14.46,18.47,28.68,32.30,Lockdown,796981.0
8781,NO2,�g/m�,2020-12-31 21:00:00,2020-12-31 22:00:00,19.03,15.01,13.61,19.41,15.88,26.05,...,30.29,13.95,16.32,31.54,12.25,16.48,31.84,28.30,Lockdown,796981.0
8782,NO2,�g/m�,2020-12-31 22:00:00,2020-12-31 23:00:00,19.29,15.73,13.15,20.42,15.96,32.45,...,34.98,14.70,14.45,29.41,11.19,16.96,31.38,30.09,Lockdown,796981.0


### 2021

In [60]:
# Merge df
NO2_cov_2021 = pd.merge_asof(df_2021_imp, covid_NL, left_on='date_start', right_on="Date")

NO2_cov_2021.drop("Date", axis=1, inplace=True)

NO2_cov_2021.covid_patients = NO2_cov_2021.covid_patients.fillna(0)

In [61]:
NO2_cov_2021

Unnamed: 0,Component,Eenheid,date_start,date_end,NL10107,NL10131,NL10133,NL10136,NL10138,NL10230,...,NL10742,NL10807,NL10818,NL10918,NL10929,NL10934,NL10937,NL10938,Lockdown,covid_patients
0,NO2,µg/m³,2021-01-01 00:00:00,2021-01-01 01:00:00,22.58,22.69,13.43,27.63,23.40,35.88,...,32.58,16.12,29.11,22.84,13.27,12.43,34.03,30.30,Lockdown,805164
1,NO2,µg/m³,2021-01-01 01:00:00,2021-01-01 02:00:00,21.79,23.09,16.36,26.93,19.52,38.31,...,33.53,15.62,32.72,20.13,14.64,15.12,24.56,22.76,Lockdown,805164
2,NO2,µg/m³,2021-01-01 02:00:00,2021-01-01 03:00:00,22.06,19.06,17.35,25.40,19.56,35.85,...,33.26,20.28,30.52,16.96,14.31,18.14,21.07,20.20,Lockdown,805164
3,NO2,µg/m³,2021-01-01 03:00:00,2021-01-01 04:00:00,20.73,25.61,15.83,25.33,20.44,28.71,...,32.02,24.03,30.22,18.91,17.05,13.87,20.22,18.98,Lockdown,805164
4,NO2,µg/m³,2021-01-01 04:00:00,2021-01-01 05:00:00,21.87,23.23,12.12,15.95,14.17,21.38,...,31.15,24.89,28.20,19.43,21.04,11.29,21.10,16.14,Lockdown,805164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8011,NO2,µg/m³,2021-11-30 19:00:00,2021-11-30 20:00:00,12.36,13.51,9.59,10.96,8.26,13.34,...,28.44,4.90,7.32,4.97,3.47,4.35,29.99,11.30,No-lockdown,2643176
8012,NO2,µg/m³,2021-11-30 20:00:00,2021-11-30 21:00:00,11.26,15.39,7.32,9.31,7.86,19.64,...,28.75,6.93,6.52,3.33,5.33,8.37,28.89,12.02,No-lockdown,2643176
8013,NO2,µg/m³,2021-11-30 21:00:00,2021-11-30 22:00:00,9.34,13.00,6.84,12.06,10.17,16.90,...,24.28,8.56,6.61,15.93,6.43,7.35,17.58,10.20,No-lockdown,2643176
8014,NO2,µg/m³,2021-11-30 22:00:00,2021-11-30 23:00:00,7.48,9.65,8.42,8.17,8.97,12.97,...,24.54,13.63,11.27,14.04,6.10,5.42,15.49,7.29,No-lockdown,2643176


### Tidy data frames

### 2020

In [62]:
id_vars=["date_start", "date_end", "Lockdown", "covid_patients"]
# Get all the stations
value_vars = NO2_cov_2020.columns[NO2_cov_2020.columns.str.contains("NL")]

# Create tidy data frame
tidy_2020 = data.tidy_df(NO2_cov_2020, id_vars=id_vars, value_vars=value_vars,
                       var_name = "site", value_name = "NO2")
tidy_2020

Unnamed: 0,date_start,date_end,Lockdown,covid_patients,site,NO2
0,2020-01-01 00:00:00,2020-01-01 01:00:00,No-lockdown,0.0,NL10107,38.78
1,2020-01-01 01:00:00,2020-01-01 02:00:00,No-lockdown,0.0,NL10107,38.31
2,2020-01-01 02:00:00,2020-01-01 03:00:00,No-lockdown,0.0,NL10107,37.06
3,2020-01-01 03:00:00,2020-01-01 04:00:00,No-lockdown,0.0,NL10107,35.16
4,2020-01-01 04:00:00,2020-01-01 05:00:00,No-lockdown,0.0,NL10107,32.31
...,...,...,...,...,...,...
377707,2020-12-31 19:00:00,2020-12-31 20:00:00,Lockdown,796981.0,NL10938,22.49
377708,2020-12-31 20:00:00,2020-12-31 21:00:00,Lockdown,796981.0,NL10938,32.30
377709,2020-12-31 21:00:00,2020-12-31 22:00:00,Lockdown,796981.0,NL10938,28.30
377710,2020-12-31 22:00:00,2020-12-31 23:00:00,Lockdown,796981.0,NL10938,30.09


### 2021

In [63]:
id_vars=["date_start", "date_end", "Lockdown", "covid_patients"]
# Get all the stations
value_vars = NO2_cov_2021.columns[NO2_cov_2021.columns.str.contains("NL")]

# Create tidy data frame
tidy_2021 = data.tidy_df(NO2_cov_2021, id_vars=id_vars, value_vars=value_vars,
                       var_name = "site", value_name = "NO2")
tidy_2021

Unnamed: 0,date_start,date_end,Lockdown,covid_patients,site,NO2
0,2021-01-01 00:00:00,2021-01-01 01:00:00,Lockdown,805164,NL10107,22.58
1,2021-01-01 01:00:00,2021-01-01 02:00:00,Lockdown,805164,NL10107,21.79
2,2021-01-01 02:00:00,2021-01-01 03:00:00,Lockdown,805164,NL10107,22.06
3,2021-01-01 03:00:00,2021-01-01 04:00:00,Lockdown,805164,NL10107,20.73
4,2021-01-01 04:00:00,2021-01-01 05:00:00,Lockdown,805164,NL10107,21.87
...,...,...,...,...,...,...
344683,2021-11-30 19:00:00,2021-11-30 20:00:00,No-lockdown,2643176,NL10938,11.30
344684,2021-11-30 20:00:00,2021-11-30 21:00:00,No-lockdown,2643176,NL10938,12.02
344685,2021-11-30 21:00:00,2021-11-30 22:00:00,No-lockdown,2643176,NL10938,10.20
344686,2021-11-30 22:00:00,2021-11-30 23:00:00,No-lockdown,2643176,NL10938,7.29


# <a id="/8">2 Data inspection</a>  

In [64]:
tidy_2020.describe()

Unnamed: 0,covid_patients,NO2
count,377712.0,377712.0
mean,137850.530055,15.431501
std,200904.721613,12.268614
min,0.0,-4.94
25%,13614.0,6.682349
50%,50373.5,11.98
75%,124097.0,20.61
max,796981.0,315.04


In [65]:
tidy_2021.describe()

Unnamed: 0,covid_patients,NO2
count,344688.0,344688.0
mean,1622148.0,14.740487
std,437333.9,12.13084
min,805164.0,-4.71
25%,1228647.0,6.23
50%,1676176.0,11.27
75%,1961585.0,19.59
max,2643176.0,355.13


When comparing the basic statistics of the two data frames it is clear that the year 2021 contains overall more covid cases. However, the statistics about the NO2 emission looks to be somewhat the same for both years. Something notably is that the NO2 emission values has a negative value as its minimum value, which is something that is not expected. This is caused by measuring inaccuracies. These measuring inaccuracies can be caused by, for example rapidly changing wheater conditions (i.e. humidity, temperature)[1].The max values for both years are far higher then the mean. However, these max values were not introduced by data imputation. This was validated by checking the maximum values before the data imputation step. 


[1] https://www.luchtmeetnet.nl/informatie/overige/negatieve-waarden

In [66]:
from bokeh.models import (BasicTicker, ColorBar, ColumnDataSource,
                          LinearColorMapper, PrintfTickFormatter,)
from bokeh.transform import transform
from bokeh.palettes import Viridis256

def plot_heatmap(df_cor, year):
    #reshape
    dfc = pd.DataFrame(df_cor.stack(), columns=['r']).reset_index()

    y_range = (list(reversed(df_cor.columns)))
    x_range = (list(df_cor.index))

    source = ColumnDataSource(dfc)

    #create colormapper 
    mapper = LinearColorMapper(palette=Viridis256, low=dfc.r.min(), high=dfc.r.max())

    #create plot
    p = figure(title=f"Correlation heatmap {year}", plot_width=500, plot_height=450,
               x_range=x_range, y_range=y_range, x_axis_location="above", toolbar_location=None)

    #use mapper to fill the rectangles in the plot
    p.rect(x="level_0", y="level_1", width=1, height=1, source=source,
           line_color=None, fill_color=transform('r', mapper))

    #create and add colorbar to the right
    color_bar = ColorBar(color_mapper=mapper, location=(0, 0),
                         ticker=BasicTicker(desired_num_ticks=len(x_range)), 
                         formatter=PrintfTickFormatter(format="%.1f"))
    p.add_layout(color_bar, 'right')

    #draw axis
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "10px"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 1.55

    #show
    return p 

heatmap_2021 = plot_heatmap(cor_stations_2021, "2021")
show(heatmap_2021)

In [67]:
heatmap_2020 = plot_heatmap(cor_stations_2020, "2020")
show(heatmap_2020)

From the heatmap we can see that some stations are more corelated to each other then others. This is probably because some stations are located close to each other and others are in a complete different region. Similar NO2 emission measurements are expected for stations that are closely related, espically if they are located in the same city. 

In [68]:
def plot_density(df,*, site, column, by, colors) -> figure:
    """
    Create a density plot for the specific site.
    
    :parameters
    -----------
    df - pd.DataFrame
        Data frame
    site - str
        Name of the site
    columns - list
        Which columns to use to display density
    by - str
        By which column to create density plot
    colors - list
        List of colors to used.
        
    :returns
    --------
    p - figure
        Density plot
    """    
    if site == "All":
        p = df.loc[:, [column, by]].hvplot.kde(by=by, legend = 'top_right', color = colors)
    else:
        p = df.loc[df['site'] == site, [column,by]].hvplot.kde(by=by, legend = 'top_right', color = colors)
    return p
    

In [71]:
# Plot distribution NO2 emission and lockdown yes/no
density_NO2_NL10107 = plot_density(tidy_2021, site = "NL10107", column = 'NO2', by='Lockdown', colors = ['red', 'green'])
density_NO2_NL10107

In [72]:
density_NO2_2020 = plot_density(tidy_2020, site = "All", column = 'NO2', by='Lockdown', colors = ['red', 'green'])
density_NO2_2020

In [73]:
# tidy_2021[tidy_2021["NO2"] > 100]

In [74]:
import plotly.express as px

# df_NL10230 = tidy_2021[tidy_2021.site == "NL10230"]

fig = px.line(df_2021, x="date_start", y="NL10230")
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=9, label="9m", step="month", stepmode="backward"),
            dict(step="all")
        ])
    )
)
fig.show()

In [75]:
import plotly.express as px

df_NL10230 = tidy_2021[tidy_2021.site == "NL10230"]

fig = px.line(df_NL10230, x="date_start", y="NO2")
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=3, label="3m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=9, label="9m", step="month", stepmode="backward"),
            dict(step="all")
        ])
    )
)
fig.show()

In [76]:

density_NO2_2021 = plot_density(tidy_2021, site = "All", column = 'NO2', by='Lockdown', colors = ['red', 'green'])
density_NO2_2021

From the desnity plot we can see that there is a small difference in NO2 emission levels for lockdown compared to no-lockdown.  

-- Talk about the individual site and the two years --

In [77]:
autocomplete_station = pn.widgets.AutocompleteInput(
    name='Station', options=stations.values.tolist(),
    value=stations.values.tolist()[0],
    placeholder='ID of station..')

# pn.Row(autocomplete_station, height=100)

#             pn.Row(intervention_widget, measure_widget),

In [78]:
years_df = {"2020":tidy_2020,
           "2021":tidy_2021}

select_year = pn.widgets.Select(name="Year",
                                options=["2020", "2021"])


@pn.depends(site=autocomplete_station, year=select_year)
def create_scatter(year, site="NL10247"):
    """
    Create a scatter plot to show the relation between how tired you feel and hours slept.
    
    :parameters
    -----------
    df - pandas.DataFrame
        Data frame
    
    :returns
    --------
    p - figure
        Scatter plot
    """
    df= years_df[year]

    # Create two seperate data frames for yes and no breakfast
    yes_lockdown = df.loc[(df['site'] == site) & (df['Lockdown'] == "Lockdown"), ["Lockdown", "NO2"]]
    no_lockdown = df.loc[(df['site'] == site) & (df['Lockdown'] == "No-lockdown"), ["Lockdown", "NO2"]]

    yes_lockdown.Lockdown = yes_lockdown.Lockdown.map({"No-lockdown": 0, "Lockdown":1})
    no_lockdown.Lockdown = no_lockdown.Lockdown.map({"No-lockdown": 0, "Lockdown":1})
    
    # Create ColumnDataSource objects
    source_yes = ColumnDataSource(yes_lockdown)
    source_no = ColumnDataSource(no_lockdown)

    p = figure(title = f"Relation between NO2 emission and lockdown, site {site} - {year}.",plot_width = 750, plot_height = 400, tools="pan, hover, zoom_in, zoom_out, yzoom_in, yzoom_out")


    # Create the points for patient who have died
    points = p.scatter(jitter('Lockdown', 0.1), 'NO2', source=source_yes, color = "green", marker = "dot", size = 30, legend_label = "Lockdown: yes", alpha = 0.7)
    # Create the points for patient who survived
    points2 = p.scatter(jitter('Lockdown', 0.1), 'NO2', source=source_no, color = "red", marker = "dot", size = 30, legend_label = "Lockdown: no", alpha = 0.5)

    # Set labels
    p.xaxis.axis_label = 'Lockdown (0 = No-lockdown, 1 = lockdown))'
    # Use regex to grab the info about what was measured
    p.yaxis.axis_label = 'NO2 emission'
    
    p.xaxis.ticker = [0, 1]

    # Make legend interactive
    p.legend.location = "top_center"
    p.legend.click_policy="hide"

    return p 

# p = create_scatter(tidy_2021, "NL10107")


# layout = pn.interact(create_scatter, site=autocomplete_station, year=select_year)
# pn.Row(pn.Column(layout[0], layout[1]))

pn.Column(pn.Row(autocomplete_station, select_year),
            pn.Row(create_scatter))

In [79]:
no_lockdown = tidy_2021.loc[(tidy_2021['site'] == "NL10107") & (tidy_2021['Lockdown'] == "No-lockdown"), ["Lockdown", "NO2"]]
no_lockdown

Unnamed: 0,Lockdown,NO2
2807,No-lockdown,18.29
2808,No-lockdown,19.13
2809,No-lockdown,19.01
2810,No-lockdown,24.45
2811,No-lockdown,24.25
...,...,...
8011,No-lockdown,12.36
8012,No-lockdown,11.26
8013,No-lockdown,9.34
8014,No-lockdown,7.48


In [80]:
discrete_slider = pn.widgets.DiscreteSlider(name='Number of stations', options=[1, 2, 3, 4], value=3)

def violin_plot(n_stations):
    stations = list(NO2_cov_2021.columns[NO2_cov_2021.columns.str.contains("NL")])
    random_stations = random.sample(stations, n_stations)
    df = tidy_2021[tidy_2021.site.isin(random_stations)]
    
    chart = hv.Violin(df, 
                  kdims=['site', 'Lockdown'], vdims="NO2")
    chart.opts(title=f'Violin plot', width=750, height=600,
               violin_color=dim('Lockdown'))
    return chart
    

layout = pn.interact(violin_plot, n_stations = discrete_slider)
pn.Row(pn.Column(layout[0], layout[1]))