# Exploring the Belgium Air-Quality dataset
https://www.kaggle.com/bouweceunen/air-quality-belgium


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import folium
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



# Exploratory Data Analysis

Let's start by looking at the file. The data is separated by semi-colons.

There are different kinds of air pollutants, such as carbon monoxide, lead, nitrogen oxides, sulphur dioxide, particle pollution (also known as particulate matter, PM), and ground-level ozone.
(https://www.cdc.gov/air/pollutants.htm)



In [None]:
# Read the semicolon-separated measurements file into a DataFrame and show it.
DATA_PATH = '/kaggle/input/air-quality-belgium/data.csv'
df = pd.read_csv(DATA_PATH, sep=';')
df

In [None]:
# Column labels of the loaded frame as a plain Python list.
list(df.columns)

In [None]:
# Summary statistics of the loaded frame (pandas' default describe output).
df_summary = df.describe()
df_summary

In [None]:
# Distinct values of the City column, in order of first appearance.
pd.unique(df['City']).tolist()


## Types of pollutants in this list

In [None]:
# Distinct values of the Pollutant column, in order of first appearance.
pd.unique(df['Pollutant']).tolist()

# Visualize on a map

### Convert Coordinates column to latitude and longitude for plotting on a map

In [None]:
def _split_coordinates(value):
    """Split a 'lat,lon' string into its latitude and longitude text parts.

    Returns a ``(lat, lon)`` tuple holding the first two comma-separated
    pieces of ``value`` unchanged (still as text). When ``value`` is not a
    string (e.g. a missing cell read as NaN) or contains no comma,
    ``(np.nan, np.nan)`` is returned instead.
    """
    # np.nan is the canonical spelling: `np.Nan` never existed and the
    # `np.NaN` alias was removed in NumPy 2.0.
    if not isinstance(value, str):
        return np.nan, np.nan
    parts = value.split(',')
    if len(parts) < 2:
        return np.nan, np.nan
    return parts[0], parts[1]


# Derive both lists from the same parsed pair so they always have exactly one
# entry per row, even when a row is missing or malformed.
coordinate_pairs = [_split_coordinates(value) for value in df['Coordinates']]
lat = [pair[0] for pair in coordinate_pairs]
lon = [pair[1] for pair in coordinate_pairs]

# The parts are stored as text, exactly as they appear in the Coordinates
# column; use pd.to_numeric on these columns if numeric values are needed.
df['latitude'] = lat
df['longitude'] = lon

df

In [None]:
import geopandas

# Turn each row's longitude/latitude pair into a point geometry and attach
# it to the measurements as a GeoDataFrame.
point_geometry = geopandas.points_from_xy(x=df['longitude'], y=df['latitude'])
gdf = geopandas.GeoDataFrame(df, geometry=point_geometry)
gdf.head()

In [None]:
# NOTE(review): `geopandas.datasets` was deprecated in GeoPandas 0.14 and
# removed in 1.0; on newer versions this line needs another source for the
# Natural Earth country polygons - confirm the installed version.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

# Keep only Belgium's outline and use it as the base layer.
belgium_outline = world[world.name == 'Belgium']
ax = belgium_outline.plot(color='yellow', edgecolor='black', figsize=(10, 6))

# Overlay the measurement points on the same axes.
gdf.plot(ax=ax, color='blue')
ax.set_title('Belgium')
plt.show()

# Using folium for map visualization

In [None]:
# Base map centred on Belgium. A higher zoom_start zooms in closer;
# 'Stamen Terrain' is an alternative tile set to 'cartodbpositron'.
belgium_map = folium.Map(location=[50.79663, 4.3585386],
                         zoom_start=8,
                         tiles='cartodbpositron')

# Many rows share the same coordinates, so draw one marker per distinct
# location instead of stacking an identical marker for every row. Rows
# without coordinates are skipped because folium rejects NaN locations.
station_points = df[['latitude', 'longitude']].dropna().drop_duplicates()
for point in station_points.itertuples(index=False):
    folium.CircleMarker(location=[point.latitude, point.longitude],
                        radius=5,
                        color='red').add_to(belgium_map)

# Display the map
belgium_map

## Explore the data for PM10 pollutants

PM10 is particulate matter with a diameter of 10 micrometers or smaller.
PM2.5 is particulate matter with a diameter of 2.5 micrometers or smaller.

More information about PM can be found here: 
https://www.epa.gov/pm-pollution/particulate-matter-pm-basics#PM


In [None]:
# Boolean mask for the PM10 rows, then the matching subset of the data.
pm10 = df['Pollutant'].eq('PM10')
pm10_df = df.loc[pm10]
pm10_df

In [None]:
# Summary statistics of the PM10 readings.
pm10_values = pm10_df['Value']
pm10_values.describe()

Check which city/location recorded the highest PM10 value

In [None]:
# Row(s) whose Value equals the largest PM10 reading.
peak_value = pm10_df['Value'].max()
pm10_df.loc[pm10_df['Value'].eq(peak_value)]

In [None]:
# NOTE(review): DataFrame.plot draws every numeric column against the row
# index; if only the readings are wanted, select 'Value' first - confirm.
ax = pm10_df.plot(figsize=(6, 6), alpha=0.5, label='PM10')
ax.set_title('PM10')
plt.show()

In [None]:
# Number of distinct values in each column of the PM10 subset.
pm10_unique_counts = pm10_df.nunique()
pm10_unique_counts

## Visualize the PM10 cluster

In [None]:
# Draw the PM10 locations on a fresh map. Adding them to `belgium_map`
# would hide them among the identical red markers already drawn for every
# row, and would stack further duplicates each time this cell is re-run.
pm10_map = folium.Map(location=[50.79663, 4.3585386],
                      zoom_start=8,
                      tiles='cartodbpositron')

# One marker per distinct location; rows without coordinates are skipped
# because folium rejects NaN locations.
pm10_points = pm10_df[['latitude', 'longitude']].dropna().drop_duplicates()
for point in pm10_points.itertuples(index=False):
    folium.CircleMarker(location=[point.latitude, point.longitude],
                        radius=5,
                        color='red').add_to(pm10_map)
pm10_map

In [None]:
# Mean of the numeric columns per city, highest average PM10 value first.
# numeric_only=True is required on pandas >= 2.0, where averaging the text
# columns (e.g. Pollutant, Coordinates) raises a TypeError instead of
# silently dropping them as older versions did.
x = (
    pm10_df
    .groupby('City')
    .mean(numeric_only=True)
    .sort_values(by='Value', ascending=False)
)
x.head()


In [None]:
# Restrict the PM10 data to Flanders and keep only the columns of interest.
select_cols = ['City', 'Location', 'Value', 'Last Updated', 'latitude', 'longitude']
pm10_Flanders = pm10_df['City'].eq('Flanders')
pm10_Flanders_df = pm10_df.loc[pm10_Flanders, select_cols]

pm10_Flanders_df.head()

Flanders is one of the places with very high PM10 values. It also recorded the highest PM10 value in this dataset (127.2, in July 2017) and is the area with the highest population (over 65%).