## Final Data Matching

This notebook matches the air quality, traffic, and weather data into a single dataframe.

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import glob, json

from src.get_data import (
    get_air_locations_df, 
    get_traffic_locations_df,
    get_air_quality_df,
    get_weather_df,
    get_traffic_df
)
from src.constants import MADRID_AIR_QUALITY_ZONES
# Flat lookup from each station ("estacion") to the air-quality zone that
# lists it, obtained by inverting the zone -> stations mapping.
zones_stations_dict = dict(
    (station, zone_id)
    for zone_id, stations in MADRID_AIR_QUALITY_ZONES.items()
    for station in stations
)

from src.data_matching import match_data

### Load Air Quality, Weather, and Traffic Data

In [2]:
# --- Measurement tables (read from the processed-data folder) ---------------
# Air quality
aq_df = get_air_quality_df("../01-data/processed")

# Weather
weather_df = get_weather_df("../01-data/processed")

# Traffic
traffic_df = get_traffic_df("../01-data/processed")

# --- Location tables ---------------------------------------------------------
# Traffic locations
traffic_locations_df = get_traffic_locations_df("../01-data/processed")

# Air quality locations (read from the data root, not from `processed/`)
air_locations_df = get_air_locations_df("../01-data")

In [16]:
# Match the air quality, weather and traffic tables (together with both
# location tables) into one dataframe using the project's `match_data` helper.
madrid_data = match_data(
    aq_df,
    weather_df,
    traffic_df,
    traffic_locations_df,
    air_locations_df
)
# Replace the micro sign in the column names with a plain "u".
# `regex=False` states explicitly that this is a literal substitution, so the
# result does not depend on the pandas-version-specific default of `regex`.
madrid_data.columns = madrid_data.columns.str.replace("µ", "u", regex=False)

In [10]:
# Persist the matched dataset. The index is replaced by a fresh RangeIndex
# first (drop=True: the old index is discarded, not kept as a column).
(
    madrid_data
    .reset_index(drop=True)
    .to_feather("../01-data/processed/madrid_data.feather")
)

### Meteorologically-Normalized Air Quality data

This data is obtained by running the notebook `meteorological_normalization.ipynb`, which uses the R package `rmweather` to normalize the air quality data of each station as obtained above.

In [19]:
# Build the meteorologically-normalized air quality table:
#   1. read every per-dataset normalized CSV,
#   2. tag each one with its station name (or zone number),
#   3. concatenate, attach the zone, and join the weather data on the date,
#   4. save the result as a feather file and display it.
paths = glob.glob("../01-data/interim/normalized/aq_weather_*.csv")
# Keep only the station-level, weather-only files:
#   * files with "zone" in their path are zone-level aggregates;
#   * the glob above also matches "aq_weather_traffic_*.csv" (the weather AND
#     traffic normalization stored in the same folder), which is a different
#     dataset and would otherwise be mixed in or fail the name lookup below.
# Sorting makes the row order independent of the OS-specific glob order.
paths = sorted(
    path for path in paths
    if "zone" not in path.lower() and "aq_weather_traffic_" not in path
)
if not paths:
    raise FileNotFoundError(
        "No normalized files matching "
        "'../01-data/interim/normalized/aq_weather_*.csv' were found"
    )

# The JSON maps station name -> dataset name; invert it to look up the station
# from the dataset name encoded in each file name. A context manager closes
# the file handle deterministically.
with open(
    "../references/air_quality_data/aq_station_datasets_names.json",
    "r",
    encoding="utf-8",
) as names_file:
    aq_station_datasets_names = json.load(names_file)
station_names_dict = {dataset_name:estacion_name for estacion_name,dataset_name in aq_station_datasets_names.items()}

dfs = []
for csv_path in paths:
    df = pd.read_csv(csv_path)
    # ".../aq_weather_<name>-normalized.csv" -> "<name>"
    dataset_name = csv_path.split("-normalized")[0].split("aq_weather")[-1].strip("_")
    if "zone" in csv_path.lower():
        # Only reachable if the "zone" filter above is removed: zone files end
        # in the zone number.
        df["zone"] = int(dataset_name.split("_")[-1])
    else:
        df["estacion"] = station_names_dict[f"aq_weather_{dataset_name}"]
    dfs.append(df)

aq_normalized_df = pd.concat(dfs).reset_index(drop=True)
aq_normalized_df["date"] = pd.to_datetime(aq_normalized_df["date"])
if "estacion" in aq_normalized_df.columns:
    # Translate each station into its zone; `replace` leaves any station that
    # is missing from the mapping unchanged.
    aq_normalized_df["zone"] = aq_normalized_df.estacion.replace(zones_stations_dict)
    ind_cols = ["date","estacion","zone"]
else:
    ind_cols = ["date","zone"]
# Identifier columns first, then the remaining columns in sorted order.
cols = ind_cols+aq_normalized_df.columns.difference(["date","zone","estacion"]).tolist()
aq_normalized_df = aq_normalized_df[cols]
# Left join keeps every air quality row; dates without weather data get NaN.
aq_normalized_df = aq_normalized_df.join(
    weather_df.set_index("time"),
    how="left",
    on="date"
).rename(columns={"date":"time"})
aq_normalized_df.to_feather("../01-data/processed/aq-weather_normalized.feather")
aq_normalized_df

Unnamed: 0,time,estacion,zone,no2_ug_m3,o3_ug_m3,pm10_ug_m3,pm25_ug_m3,u_wind_component_100m,v_wind_component_100m,u_wind_component_10m,v_wind_component_10m,temperature,mean_sea_level_pressure,surface_pressure,total_precipitation
0,2014-01-01,Arturo Soria,5,37.328751,50.404527,,,,,,,,,,
1,2014-01-02,Arturo Soria,5,37.044001,50.248060,,,6.541273,6.615959,3.640051,4.119808,9.179368,101516.407090,94100.221477,0.000060
2,2014-01-03,Arturo Soria,5,36.862443,50.523813,,,6.745887,5.250344,3.718983,3.279350,11.933752,101592.550361,94249.256536,0.000001
3,2014-01-04,Arturo Soria,5,37.619649,50.464669,,,2.795416,5.758685,1.334723,3.302196,11.067902,101761.997075,94375.673417,0.001242
4,2014-01-05,Arturo Soria,5,36.962339,51.027592,,,5.896027,1.745964,2.895803,1.314621,4.196645,102262.036483,94682.387424,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66451,2021-07-27,Pza. Elíptica,7,40.704621,43.898951,16.871263,10.115675,1.478077,1.640485,0.871175,0.739209,22.733633,101358.202439,94297.325815,0.000000
66452,2021-07-28,Pza. Elíptica,7,40.015906,44.397134,16.458286,10.143112,5.314024,-1.078184,2.701377,-0.219237,25.678588,101095.431065,94063.647306,0.000000
66453,2021-07-29,Pza. Elíptica,7,40.051398,44.744623,16.600162,10.363564,2.925121,0.298342,1.469807,0.144468,26.339382,101105.016571,94090.374134,0.000000
66454,2021-07-30,Pza. Elíptica,7,40.463696,43.853293,16.728366,10.211346,6.606843,-0.235507,2.872476,-0.108473,26.906520,101086.702365,94084.622791,0.000000


In [20]:
# Match the normalized air quality data with the traffic data per station.
# The weather columns joined earlier are removed first (only "time" is kept),
# and no weather table is passed to `match_data` here.
madrid_normalized_data = match_data(
    aq_normalized_df.drop(columns=weather_df.columns.drop("time")),
    traffic_df=traffic_df,
    traffic_locations_df=traffic_locations_df,
    air_locations_df=air_locations_df,
    location_by="estacion"
)
# Replace the micro sign in the column names with a plain "u".
# `regex=False` states explicitly that this is a literal substitution, so the
# result does not depend on the pandas-version-specific default of `regex`.
madrid_normalized_data.columns = madrid_normalized_data.columns.str.replace("µ", "u", regex=False)
madrid_normalized_data

Unnamed: 0,time,estacion,zone,no2_ug_m3,o3_ug_m3,pm10_ug_m3,pm25_ug_m3,traffic_intensity,traffic_load
0,2015-01-13,Pza. de España,1,44.545089,,,,27.338800,13.735294
1,2015-01-14,Pza. de España,1,43.825188,,,,37.227380,18.147059
2,2015-01-15,Pza. de España,1,44.323731,,,,47.791727,18.859375
3,2015-01-16,Pza. de España,1,44.541437,,,,49.448797,22.529412
4,2015-01-17,Pza. de España,1,45.087627,,,,71.008819,32.765625
...,...,...,...,...,...,...,...,...,...
47956,2021-07-27,Tres Olivos,5,26.606841,61.076274,15.133098,,13.042269,4.333333
47957,2021-07-28,Tres Olivos,5,27.050943,61.928859,15.153105,,11.981436,4.416667
47958,2021-07-29,Tres Olivos,5,26.714006,61.492196,15.132037,,14.385735,8.259259
47959,2021-07-30,Tres Olivos,5,26.241367,62.720795,15.088814,,14.933396,4.722222


In [21]:
# Persist the normalized, traffic-matched dataset. The index is replaced by a
# fresh RangeIndex first (drop=True: the old index is not kept as a column).
(
    madrid_normalized_data
    .reset_index(drop=True)
    .to_feather("../01-data/processed/madrid_normalized_data.feather")
)


#### Meteorologically- and traffic-normalized air quality data


In [4]:
# Build the weather- AND traffic-normalized air quality table from the
# per-station "aq_weather_traffic_*" CSV files.
# NOTE: this rebinds `aq_normalized_df`, replacing the weather-only frame
# built earlier in the notebook; cells that need that frame must run first.
paths = glob.glob("../01-data/interim/normalized/aq_weather_traffic_*.csv")
# Keep station-level files only; sorting makes the row order independent of
# the OS-specific glob order.
paths = sorted(path for path in paths if "zone" not in path.lower())
if not paths:
    raise FileNotFoundError(
        "No normalized files matching "
        "'../01-data/interim/normalized/aq_weather_traffic_*.csv' were found"
    )

# The JSON maps station name -> dataset name; invert it to look up the station
# from the dataset name encoded in each file name. A context manager closes
# the file handle deterministically.
with open(
    "../references/air_quality_data/aq_station_datasets_names.json",
    "r",
    encoding="utf-8",
) as names_file:
    aq_station_datasets_names = json.load(names_file)
station_names_dict = {dataset_name:estacion_name for estacion_name,dataset_name in aq_station_datasets_names.items()}

dfs = []
for csv_path in paths:
    df = pd.read_csv(csv_path)
    # ".../aq_weather_traffic_<name>-normalized.csv" -> "<name>"
    dataset_name = csv_path.split("-normalized")[0].split("aq_weather_traffic")[-1].strip("_")
    if "zone" in csv_path.lower():
        # Only reachable if the "zone" filter above is removed: zone files end
        # in the zone number.
        df["zone"] = int(dataset_name.split("_")[-1])
    else:
        # The lookup keys use the "aq_weather_" prefix (no "traffic" part).
        df["estacion"] = station_names_dict[f"aq_weather_{dataset_name}"]
    dfs.append(df)

aq_normalized_df = pd.concat(dfs).reset_index(drop=True)
aq_normalized_df["date"] = pd.to_datetime(aq_normalized_df["date"])
if "estacion" in aq_normalized_df.columns:
    # Translate each station into its zone; `replace` leaves any station that
    # is missing from the mapping unchanged.
    aq_normalized_df["zone"] = aq_normalized_df.estacion.replace(zones_stations_dict)
    ind_cols = ["date","estacion","zone"]
else:
    ind_cols = ["date","zone"]
# Identifier columns first, then the remaining columns in sorted order.
cols = ind_cols+aq_normalized_df.columns.difference(["date","zone","estacion"]).tolist()
aq_normalized_df = aq_normalized_df[cols].rename(columns={"date":"time"})
aq_normalized_df

Unnamed: 0,time,estacion,zone,no2_ug_m3,o3_ug_m3,pm10_ug_m3,pm25_ug_m3
0,2015-01-12,Pza. del Carmen,1,43.107404,43.082765,,
1,2015-01-13,Pza. del Carmen,1,43.396571,43.073891,,
2,2015-01-14,Pza. del Carmen,1,42.738500,43.122104,,
3,2015-01-15,Pza. del Carmen,1,42.843685,42.934980,,
4,2015-01-16,Pza. del Carmen,1,43.376371,43.434526,,
...,...,...,...,...,...,...,...
44087,2021-07-27,Escuelas Aguirre,2,39.237426,48.043498,17.826680,8.780500
44088,2021-07-28,Escuelas Aguirre,2,39.211484,48.190403,17.970742,8.762198
44089,2021-07-29,Escuelas Aguirre,2,39.032637,47.483580,18.100054,8.868897
44090,2021-07-30,Escuelas Aguirre,2,39.037914,48.165211,18.008030,8.907478


In [16]:
# Route pandas `.plot` calls through plotly (global pandas option).
pd.options.plotting.backend = "plotly"
# Average all stations of a zone for each timestamp. `numeric_only=True`
# keeps the string column `estacion` out of the aggregation; without it,
# pandas >= 2.0 raises a TypeError when asked for the mean of text values.
aq_normalized_df_zone = (
    aq_normalized_df
    .groupby(["time","zone"])
    .mean(numeric_only=True)
    .reset_index()
)
# NO2 time series for zone 1.
fig = aq_normalized_df_zone[aq_normalized_df_zone.zone==1].plot(
    x="time",
    y="no2_ug_m3",
    title="Meteorological and traffic normalized NO2 in Madrid",
)
fig.show()

In [17]:
# Load the meteorologically normalized air quality data and average it per
# timestamp and zone for comparison with the weather + traffic normalization.
weather_normalized = get_air_quality_df("../01-data/processed/",meteo_normalized=True)
# `numeric_only=True` guards against non-numeric columns (presumably a station
# name column -- confirm against `get_air_quality_df`), on which `mean()`
# raises a TypeError in pandas >= 2.0.
weather_normalized = (
    weather_normalized
    .groupby(["time","zone"])
    .mean(numeric_only=True)
    .reset_index()
)
# NO2 time series for zone 1; the figure is the cell's displayed output.
weather_normalized[weather_normalized.zone==1].plot(
    x="time",
    y="no2_ug_m3",
    title="Meteorologically normalized NO2 in Madrid",
)