# EDA and Maps showing the different pollutants

In [None]:
# packages

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# maps
import folium

In [None]:
# read the raw measurements and discard the columns that hold a single constant value
df = pd.read_csv('../input/air-quality-belgium/data.csv', sep=';').drop(
    columns=[
        'Country',       # always Belgium
        'Country Code',  # always BE
        'Source Name',   # always EEA Belgium
        'Unit',          # always \mu g/m^3
    ]
)
df.head()

In [None]:
# aux functions for extracting the coordinates
def split_lat(coordinates):
    """Return the text before the first comma of `coordinates` as a float (the latitude)."""
    head, _, _ = coordinates.partition(',')
    return float(head)

def split_lon(coordinates):
    """Return the second comma-separated field of `coordinates` as a float (the longitude)."""
    return float(coordinates.split(',')[1])

In [None]:
# turn the 'lat,lon' text into two numeric columns, then discard the raw text column
df['Longitude'] = [split_lon(pair) for pair in df['Coordinates']]
df['Latitude'] = [split_lat(pair) for pair in df['Coordinates']]
df = df.drop(columns='Coordinates')

In [None]:
# parse the update timestamp and keep its calendar year as a separate integer column
df['Date'] = pd.to_datetime(df['Last Updated'], format='%Y-%m-%d')
df['Year'] = [int(stamp.year) for stamp in df['Date']]

In [None]:
# summary statistics for every column; include='all' also covers the non-numeric ones
df.describe(include='all')

In [None]:
# distinct pollutant names, ordered from most to least frequent
pollutants = df['Pollutant'].value_counts().index.tolist()
print(pollutants)

In [None]:
# number of measurements per pollutant
df['Pollutant'].value_counts()

In [None]:
# number of measurements per city
fig, ax = plt.subplots(figsize=(9, 5))
df['City'].value_counts().plot.bar(ax=ax)
ax.set_title('City')
ax.grid()
plt.show()

In [None]:
# number of measurements per location, restricted to the 25 most frequent ones
fig, ax = plt.subplots(figsize=(9, 5))
df['Location'].value_counts().iloc[:25].plot.bar(ax=ax)
ax.set_title('Location')
ax.grid()
plt.show()

In [None]:
# the 25 most frequent update timestamps
fig, ax = plt.subplots(figsize=(9, 5))
df['Date'].value_counts().iloc[:25].plot.bar(ax=ax)
ax.set_title('Last Updated')
ax.grid()
plt.show()

In [None]:
# number of measurements per year of last update, in chronological order
fig, ax = plt.subplots(figsize=(9, 5))
df['Year'].value_counts().sort_index().plot.bar(ax=ax)
ax.set_title('Year (last updated)')
ax.grid()
plt.show()

In [None]:
# one histogram of the measured values per pollutant
for p in pollutants:
    fig, ax = plt.subplots()
    df.loc[df['Pollutant'] == p, 'Value'].plot(kind='hist', ax=ax)
    ax.set_title(p)
    ax.grid()
    plt.show()

In [None]:
# per pollutant: histogram of the values before 2020 (left) next to the 2020 values (right)
for p in pollutants:
    is_p = df['Pollutant'] == p
    values_before = df.loc[is_p & (df['Year'] < 2020), 'Value']
    values_2020 = df.loc[is_p & (df['Year'] == 2020), 'Value']

    fig = plt.figure(figsize=(9, 4))

    ax_before = fig.add_subplot(1, 2, 1)
    values_before.plot(kind='hist', ax=ax_before)
    ax_before.set_title(p + ' - values before 2020')
    ax_before.grid()

    # the right panel shares the left panel's x-axis so both use the same value scale
    ax_2020 = fig.add_subplot(1, 2, 2, sharex=ax_before)
    values_2020.plot(kind='hist', ax=ax_2020)
    ax_2020.set_title(p + ' - 2020 values only')
    ax_2020.grid()

    plt.show()

#### We see a massive difference in the distribution of CO!

# Maps

In [None]:
# sanity check: scatter the raw station coordinates before putting them on a map
fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(df['Longitude'], df['Latitude'])
ax.grid()
plt.show()

Looks good!

# > Pollutant CO
(Carbon Monoxide)

In [None]:
# keep only the 2020 CO measurements that have a positive value
df_select = df.loc[(df['Pollutant'] == 'CO') & (df['Year'] == 2020) & (df['Value'] > 0)]

In [None]:
# summarise the location names to see whether any location occurs more than once
df_select['Location'].describe()

In [None]:
# display the complete selection (2020 CO measurements with a positive value)
df_select

In [None]:
# interactive map
def plot_map(i_df, i_scale):
    """Draw the measurements in `i_df` as blue bubbles on an interactive folium map.

    Parameters
    ----------
    i_df : pandas.DataFrame
        Rows to plot. The columns 'Latitude', 'Longitude', 'Value' and
        'Location' are read; 'Location' is concatenated into the popup text,
        so it has to hold strings.
    i_scale : float
        Factor applied to sqrt(Value) to obtain the marker radius.

    Returns
    -------
    folium.Map
        Map centred on [50.5, 4] with one circle marker per row of `i_df`.
    """
    zoom_factor = 8  # initial zoom level of the map
    radius_scaling = i_scale  # scaling of bubbles

    my_map = folium.Map(location=[50.5, 4], zoom_start=zoom_factor)

    # Iterate over the frame that was passed in; the previous version read the
    # notebook-global `df_select` and silently ignored `i_df`.
    rows = zip(i_df['Latitude'], i_df['Longitude'], i_df['Value'], i_df['Location'])
    for lat, lon, value, location in rows:
        folium.CircleMarker(
            location=[lat, lon],
            # the radius grows with sqrt(Value), so the bubble AREA is proportional to Value
            radius=np.sqrt(value) * radius_scaling,
            popup='Location: ' + location + '\n Value: ' + str(value),
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.25,
            weight=1,  # stroke width in pixels
        ).add_to(my_map)

    return my_map

In [None]:
# interactive map including color representation of values
def plot_map_colored(i_df, i_scale):
    """Draw the measurements in `i_df` as bubbles whose fill colour encodes the value.

    Values are normalised linearly between the minimum and maximum of
    `i_df.Value` and mapped onto matplotlib's 'rainbow' colormap.

    Parameters
    ----------
    i_df : pandas.DataFrame
        Rows to plot. The columns 'Latitude', 'Longitude', 'Value' and
        'Location' are read; 'Location' is concatenated into the popup text,
        so it has to hold strings.
    i_scale : float
        Factor applied to sqrt(Value) to obtain the marker radius.

    Returns
    -------
    folium.Map
        Map centred on [50.5, 4] with one circle marker per row of `i_df`.
    """
    zoom_factor = 8  # initial zoom level of the map
    radius_scaling = i_scale  # scaling of bubbles

    my_map = folium.Map(location=[50.5, 4], zoom_start=zoom_factor)

    # Use the frame that was passed in; the previous version read the
    # notebook-global `df_select` and silently ignored `i_df`.
    v_min = np.min(i_df.Value)
    v_max = np.max(i_df.Value)
    v_range = v_max - v_min

    cmap = matplotlib.cm.rainbow
    n_colors = cmap.N  # number of entries in the colormap lookup table

    rows = zip(i_df['Latitude'], i_df['Longitude'], i_df['Value'], i_df['Location'])
    for lat, lon, value, location in rows:
        # When all values are equal (e.g. a single station) there is no range to
        # normalise over; fall back to the low end instead of dividing by zero.
        v_norm = (value - v_min) / v_range if v_range > 0 else 0.0
        # Integer colormap indices are valid in 0 .. N-1, so clamp the maximum value.
        i_col = min(int(v_norm * n_colors), n_colors - 1)
        current_color = matplotlib.colors.to_hex(cmap(i_col))
        folium.CircleMarker(
            location=[lat, lon],
            # the radius grows with sqrt(Value), so the bubble AREA is proportional to Value
            radius=np.sqrt(value) * radius_scaling,
            popup='Location: ' + location + '\n Value: ' + str(value),
            color='black',
            fill=True,
            fill_opacity=0.25,
            fill_color=current_color,
            weight=1,  # stroke width in pixels
        ).add_to(my_map)

    return my_map

In [None]:
# bubble map of the current selection (CO), bubbles scaled by a factor of 20
my_map = plot_map(df_select, 20)
my_map

# > Pollutant NO2
(Nitrogen Dioxide)

In [None]:
# 2020 NO2 measurements with a positive value, drawn as a bubble map
df_select = df.loc[(df['Pollutant'] == 'NO2') & (df['Year'] == 2020) & (df['Value'] > 0)]
my_map = plot_map(df_select, 2)
my_map

# > Pollutant PM2.5
(Particulate Matter with a diameter of 2.5 micrometers or less)

In [None]:
# 2020 PM2.5 measurements with a positive value, drawn as a bubble map
df_select = df.loc[(df['Pollutant'] == 'PM2.5') & (df['Year'] == 2020) & (df['Value'] > 0)]
my_map = plot_map(df_select, 2)
my_map

# > Pollutant PM10
(Particulate Matter with a diameter of 10 micrometers or less)

In [None]:
# 2020 PM10 measurements with a positive value, drawn as a bubble map
df_select = df.loc[(df['Pollutant'] == 'PM10') & (df['Year'] == 2020) & (df['Value'] > 0)]
my_map = plot_map(df_select, 2)
my_map

# > Pollutant SO2
(Sulfur Dioxide)

In [None]:
# 2020 SO2 measurements with a positive value, drawn as a bubble map
df_select = df.loc[(df['Pollutant'] == 'SO2') & (df['Year'] == 2020) & (df['Value'] > 0)]
my_map = plot_map(df_select, 3)
my_map

# > Pollutant O3
(Ozone)

In [None]:
# 2020 O3 measurements with a positive value, drawn as a bubble map
df_select = df.loc[(df['Pollutant'] == 'O3') & (df['Year'] == 2020) & (df['Value'] > 0)]
my_map = plot_map(df_select, 1.5)
my_map

### The values are rather similar here, so let's render an alternative map that also uses color to display the values:

In [None]:
# same selection as the previous map, with the value additionally encoded as fill colour
my_map = plot_map_colored(df_select, 2)
my_map