## Combining European datasets into one large dataframe

- Doing this for both with and without NO2 datasets

In [2]:
from urllib.request import urlopen
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import glob

pd.set_option('display.max_columns', None)

## Read in our newly scraped TOAR data (29/8/22)

* Note that these data have NO$_2$ and NO columns that contain NaNs. 
* If we want to train with this data, we have to drop these columns. 
* Furthermore if we want to do a robust comparison of training with and without NO$_2$, for example, we need to drop those rows when training with just O$_3$.

In [2]:
uk_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/country/uk/uk_all_data_timeidx.csv')
france_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/country/france/france_all_data_timeidx.csv')
italy_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/country/italy/italy_all_data_timeidx.csv')

KeyboardInterrupt: 

### Let's just check our data

In [4]:
uk_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,raw_time_idx,time_idx_large_temp,time_idx_new,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,22.487713,,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,731581,1731581,996713,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,31.051338,,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,731582,1731582,996714,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,20.853425,,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,731583,1731583,996715,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,29.743913,,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,731584,1731584,996716,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,27.866814,,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,731585,1731585,996717,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412343,2014-10-27,31.632138,Yarner Wood,50.596389,-3.713056,119.0,121.0,91.0,background,10.0,1.0,2639.0,2642.0,34777.0,8.0,63.0,1.61496,2.58,,,13.420833,993.808333,3.413667,5.463000,0.000000,679.383333,95.172917,68.577192,735533,1735533,999996,999996
412344,2014-10-28,37.071400,Yarner Wood,50.596389,-3.713056,119.0,121.0,91.0,background,10.0,1.0,2639.0,2642.0,34777.0,8.0,63.0,1.61496,2.58,,,13.787500,988.737500,3.531125,2.846128,0.000000,504.812500,92.852917,71.243125,735534,1735534,999997,999997
412345,2014-10-29,24.320388,Yarner Wood,50.596389,-3.713056,119.0,121.0,91.0,background,10.0,1.0,2639.0,2642.0,34777.0,8.0,63.0,1.61496,2.58,,,13.575000,991.070833,0.057857,0.974005,0.024667,222.330417,97.516250,100.000000,735535,1735535,999998,999998
412346,2014-10-30,32.798150,Yarner Wood,50.596389,-3.713056,119.0,121.0,91.0,background,10.0,1.0,2639.0,2642.0,34777.0,8.0,63.0,1.61496,2.58,,,15.133333,992.700000,2.446375,4.090292,0.000000,643.937500,97.690000,99.391667,735536,1735536,999999,999999


In [5]:
france_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,raw_time_idx,time_idx_large_temp,time_idx_new,time_idx
0,1999-01-01,23.746237,142 BVD STRASBOURG,49.493057,0.114736,4.0,40.0,40.0,background,13.0,3.0,23472.0,40594.0,40594.0,63.0,63.0,3.53716,4.08,13.924076,,8.100000,1005.208333,-0.628083,5.034625,0.018067,427.770833,84.202917,72.044042,729755,1729755,999881,999881
1,1999-01-02,32.079375,142 BVD STRASBOURG,49.493057,0.114736,4.0,40.0,40.0,background,13.0,3.0,23472.0,40594.0,40594.0,63.0,63.0,3.53716,4.08,12.355170,,9.375000,1000.741667,6.748875,7.782625,0.455327,1359.520833,79.008750,96.157500,729756,1729756,999882,999882
2,1999-01-03,36.089263,142 BVD STRASBOURG,49.493057,0.114736,4.0,40.0,40.0,background,13.0,3.0,23472.0,40594.0,40594.0,63.0,63.0,3.53716,4.08,10.263283,,10.012500,1004.495833,8.525500,4.089708,0.653547,1209.666667,78.207917,86.420417,729757,1729757,999883,999883
3,1999-01-04,28.194763,142 BVD STRASBOURG,49.493057,0.114736,4.0,40.0,40.0,background,13.0,3.0,23472.0,40594.0,40594.0,63.0,63.0,3.53716,4.08,21.114900,,13.262500,1009.666667,6.121542,6.912542,0.208625,1124.850000,89.556250,96.252917,729758,1729758,999884,999884
4,1999-01-05,24.247488,142 BVD STRASBOURG,49.493057,0.114736,4.0,40.0,40.0,background,13.0,3.0,23472.0,40594.0,40594.0,63.0,63.0,3.53716,4.08,24.579575,,11.995833,1013.583333,1.190708,6.052333,0.000000,479.175000,79.690833,55.943792,729759,1729759,999885,999885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595861,2012-12-27,40.457229,place de VERDUN,46.162556,-1.153603,10.0,8.0,8.0,background,13.0,3.0,26006.0,26006.0,26006.0,63.0,63.0,5.11447,2.01,9.021231,10.323442,12.483333,1020.666667,8.647208,-1.266454,1.084360,1518.258333,82.215833,93.790833,734864,1734864,999996,999996
1595862,2012-12-28,31.578129,place de VERDUN,46.162556,-1.153603,10.0,8.0,8.0,background,13.0,3.0,26006.0,26006.0,26006.0,63.0,63.0,5.11447,2.01,15.427620,25.958937,11.850000,1026.208333,2.087375,3.525625,0.000000,547.620833,92.065000,94.320417,734865,1734865,999997,999997
1595863,2012-12-29,39.034075,place de VERDUN,46.162556,-1.153603,10.0,8.0,8.0,background,13.0,3.0,26006.0,26006.0,26006.0,63.0,63.0,5.11447,2.01,10.655506,14.031850,9.916667,1016.041667,4.182017,3.727543,0.421450,1008.770833,84.371250,60.506273,734866,1734866,999998,999998
1595864,2012-12-30,42.104171,place de VERDUN,46.162556,-1.153603,10.0,8.0,8.0,background,13.0,3.0,26006.0,26006.0,26006.0,63.0,63.0,5.11447,2.01,8.759734,5.612740,11.337500,1024.333333,5.104583,0.770862,0.000000,912.304167,74.465000,79.811375,734867,1734867,999999,999999


In [6]:
italy_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,raw_time_idx,time_idx_large_temp,time_idx_new,time_idx
0,2005-07-01,17.794012,AB1 Autostrada del Brennero A22 21021,46.679169,11.621111,550.0,842.0,249.0,traffic,5.0,1.0,2051.0,3126.0,3349.0,16.0,53.0,0.69703,1.20,24.252725,,13.333333,877.662500,-0.098500,-2.076292,3.018834,752.507500,73.478333,64.689125,732128,1732128,997626,997626
1,2005-07-02,30.074388,AB1 Autostrada del Brennero A22 21021,46.679169,11.621111,550.0,842.0,249.0,traffic,5.0,1.0,2051.0,3126.0,3349.0,16.0,53.0,0.69703,1.20,23.860512,,14.033333,883.254167,0.272333,-1.881250,0.000000,655.424083,59.010833,15.414721,732129,1732129,997627,997627
2,2005-07-03,45.926113,AB1 Autostrada del Brennero A22 21021,46.679169,11.621111,550.0,842.0,249.0,traffic,5.0,1.0,2051.0,3126.0,3349.0,16.0,53.0,0.69703,1.20,23.729775,,15.962500,882.829167,0.106250,-0.134750,0.000000,297.831083,63.284167,21.893292,732130,1732130,997628,997628
3,2005-07-04,40.161843,AB1 Autostrada del Brennero A22 21021,46.679169,11.621111,550.0,842.0,249.0,traffic,5.0,1.0,2051.0,3126.0,3349.0,16.0,53.0,0.69703,1.20,36.542512,,17.870833,878.262500,1.185125,1.463667,0.000000,675.655000,67.292500,60.566000,732131,1732131,997629,997629
4,2005-07-05,29.447850,AB1 Autostrada del Brennero A22 21021,46.679169,11.621111,550.0,842.0,249.0,traffic,5.0,1.0,2051.0,3126.0,3349.0,16.0,53.0,0.69703,1.20,25.298687,,11.450000,876.154167,0.040042,-1.609458,3.114833,631.438042,70.458750,56.490563,732132,1732132,997630,997630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885889,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,43.798725,51.216250,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,734833,1734833,999996,999996
885890,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,57.395950,64.245850,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,734834,1734834,999997,999997
885891,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,32.751000,23.453255,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,734835,1734835,999998,999998
885892,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,22.160837,14.633225,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,734836,1734836,999999,999999


In [7]:
uk_data.shape[0] + italy_data.shape[0] + france_data.shape[0]

2894108

In [8]:
# need ignore index otherwise we have problems passing through the TFT.

combined_data = pd.concat([uk_data, france_data, italy_data], ignore_index=True)

In [11]:
combined_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,raw_time_idx,time_idx_large_temp,time_idx_new,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,22.487713,,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,731581,1731581,996713,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,31.051338,,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,731582,1731582,996714,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,20.853425,,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,731583,1731583,996715,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,29.743913,,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,731584,1731584,996716,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,27.866814,,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,731585,1731585,996717,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,43.798725,51.216250,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,734833,1734833,999996,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,57.395950,64.245850,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,734834,1734834,999997,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,32.751000,23.453255,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,734835,1734835,999998,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,22.160837,14.633225,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,734836,1734836,999999,999999


In [10]:
## quick check on length to ensure they match

uk_data.shape[0] + italy_data.shape[0] + france_data.shape[0]

2894108

In [13]:
combined_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,raw_time_idx,time_idx_large_temp,time_idx_new,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,22.487713,,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,731581,1731581,996713,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,31.051338,,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,731582,1731582,996714,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,20.853425,,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,731583,1731583,996715,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,29.743913,,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,731584,1731584,996716,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,27.866814,,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,731585,1731585,996717,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,43.798725,51.216250,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,734833,1734833,999996,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,57.395950,64.245850,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,734834,1734834,999997,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,32.751000,23.453255,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,734835,1734835,999998,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,22.160837,14.633225,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,734836,1734836,999999,999999


In [21]:
### count nans in the columns of this dataframe, we only want them in no and no2

combined_data.isna().sum()

datetime                       0
o3                             0
station_name                   0
lat                            0
lon                            0
alt                            0
station_etopo_alt              0
station_rel_etopo_alt          0
station_type                   0
landcover                      0
toar_category                  0
pop_density                    0
max_5km_pop_density            0
max_25km_pop_density           0
nightlight_1km                 0
nightlight_max_25km            0
nox_emi                        0
omi_nox                        0
no2                       760314
no                       2011080
temp                           0
press                          0
u                              0
v                              0
totprecip                      0
pblheight                      0
relhum                         0
cloudcover                     0
raw_time_idx                   0
time_idx_large_temp            0
time_idx_n

In [24]:
combined_data = combined_data.drop(['raw_time_idx', 'time_idx_large_temp', 'time_idx_new'], axis=1)

In [25]:
combined_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,22.487713,,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,31.051338,,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,20.853425,,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,29.743913,,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,27.866814,,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,43.798725,51.216250,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,57.395950,64.245850,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,32.751000,23.453255,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,22.160837,14.633225,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


In [26]:
## save this dataframe

combined_data.to_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3_nans_no2_no.csv', index=False)

In [30]:
combined_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,22.487713,,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,31.051338,,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,20.853425,,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,29.743913,,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,27.866814,,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,43.798725,51.216250,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,57.395950,64.245850,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,32.751000,23.453255,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,22.160837,14.633225,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


# Now make a dataframe with no and NO2 dropped...

In [27]:
combined_data_drop_no_no2 = combined_data.drop(['no2', 'no'], axis=1)

In [32]:
combined_data_drop_no_no2

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


In [29]:
combined_data_drop_no_no2.to_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3.csv', index=False)

In [40]:
combined_data_drop_no_no2

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


In [41]:
for name in combined_data_drop_no_no2['station_name'].unique():
    if len(combined_data_drop_no_no2[combined_data_drop_no_no2['station_name'] == name]) < 400:
        index_names = combined_data_drop_no_no2[(combined_data_drop_no_no2['station_name'] == name)].index
        combined_data_drop_no_no2 = combined_data_drop_no_no2.drop(index_names, inplace = False)
        print(index_names)
    else:
        print('Nothing to see here.')

Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to see here.
Nothing to se

In [42]:
for name in combined_data_drop_no_no2['station_name'].unique():
    if len(combined_data_drop_no_no2[combined_data_drop_no_no2['station_name'] == name]) < 400:
        print('Wanker!')

In [43]:
combined_data_drop_no_no2

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


In [44]:
combined_data_drop_no_no2.to_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3_fewer_400_dropped.csv', index=False)

In [3]:
# read back in

new_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3_fewer_400_dropped.csv')

In [4]:
new_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2869422,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2869423,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2869424,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2869425,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


## Maybe I need to reindex it here!

In [49]:
combined_data_drop_o3_drop_fewer_400 = combined_data_drop_no_no2.reset_index(drop = True) 

In [51]:
combined_data_drop_o3_drop_fewer_400.to_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3_fewer_400_dropped_reindex.csv', index=False)




In [54]:
combined_data_drop_o3_drop_fewer_400[2000000:2000010]

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
2000000,2007-10-14,29.5105,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,14.383333,1015.791667,-2.625417,0.585808,0.0,537.4225,67.061667,21.767625,998095
2000001,2007-10-15,28.427457,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,13.408333,1016.208333,-0.665563,0.325293,0.0,291.655,64.076667,1.02,998096
2000002,2007-10-16,28.257413,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,13.679167,1016.916667,-0.627451,0.47732,0.0,303.454042,70.900833,15.236521,998097
2000003,2007-10-17,26.189785,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,13.6625,1014.791667,0.02117,0.382842,0.0,296.813042,72.002917,51.936175,998098
2000004,2007-10-18,23.495638,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,14.370833,1011.25,-0.321492,-1.218367,0.0,504.28175,66.64375,20.945629,998099
2000005,2007-10-19,22.305175,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,12.575,1014.291667,-0.837996,-0.160912,0.002116,402.4625,70.514583,33.948534,998100
2000006,2007-10-20,24.99935,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,10.191667,1015.333333,-2.732267,0.205357,0.00651,1153.1625,44.963333,69.036183,998101
2000007,2007-10-21,24.623413,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,9.283333,1012.125,0.331275,-0.173647,0.0,351.123458,53.82,43.402792,998102
2000008,2007-10-22,10.275423,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,10.766667,1004.083333,1.7795,-0.161397,0.0,300.76325,45.995833,77.251667,998103
2000009,2007-10-23,13.784108,ABBADIA CERRETO 309801,45.308609,9.586111,64.0,62.0,5.0,background,12.0,0.0,2189.0,6192.0,20801.0,17.0,62.0,5.15732,6.31,12.1625,1001.833333,-1.327425,0.249009,0.0,342.067208,58.260417,75.967083,998104


In [5]:
# read back in...

new_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3_fewer_400_dropped_reindex.csv')

In [19]:
new_data['time_idx'][1020539:1647179]

1020539     998680
1020540     998681
1020541     998682
1020542     998683
1020543     998684
            ...   
1647174     999996
1647175     999997
1647176     999998
1647177     999999
1647178    1000000
Name: time_idx, Length: 626640, dtype: int64

In [6]:
new_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2869422,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2869423,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2869424,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2869425,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


# Do a comparison with the old data!

In [7]:
def load_country_data(country, species):
    data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/'+country+'_'+species+'.csv')
    return data

old_data = load_country_data('uk_france_italy', 'o3_no2_new_timeidx')

In [8]:
old_data

Unnamed: 0,datetime,station_name,o3,no2,cloudcover,relhum,press,temp,v,u,pblheight,station_type,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx,time_idx_large_temp,time_idx_new
0,1997-01-01,LONDON BROMLEY,16.916863,32.358763,100.000000,66.568333,1015.166667,-3.358333,-1.693183,-5.167833,1284.758333,traffic,13.0,26423.0,31.64350,50.0,57.0,32.0,9.11,70545.0,147838.0,62.0,63.0,3.0,999181,1729025,999181
1,1997-01-02,LONDON BROMLEY,7.267980,48.113257,41.109833,76.667500,1012.750000,-4.558333,-1.273517,-1.456917,359.566667,traffic,13.0,26423.0,31.64350,50.0,57.0,32.0,9.11,70545.0,147838.0,62.0,63.0,3.0,999182,1729026,999182
2,1997-01-03,LONDON BROMLEY,4.887090,45.694525,100.000000,81.119167,1004.500000,-2.841667,-2.483750,-3.205000,696.775000,traffic,13.0,26423.0,31.64350,50.0,57.0,32.0,9.11,70545.0,147838.0,62.0,63.0,3.0,999183,1729027,999183
3,1997-01-04,LONDON BROMLEY,8.646398,35.627325,100.000000,82.391667,1001.666667,-0.400000,-2.900500,-3.983000,1000.000000,traffic,13.0,26423.0,31.64350,50.0,57.0,32.0,9.11,70545.0,147838.0,62.0,63.0,3.0,999184,1729028,999184
4,1997-01-05,LONDON BROMLEY,15.162503,35.038988,56.576833,67.783333,1007.250000,0.208333,-2.109500,-3.869000,1090.058333,traffic,13.0,26423.0,31.64350,50.0,57.0,32.0,9.11,70545.0,147838.0,62.0,63.0,3.0,999185,1729029,999185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008315,2011-02-09,PO-ROMA 904805,35.462713,10.916994,16.666667,81.625833,1011.250000,7.141667,0.201127,0.733658,172.271250,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,999991,1734177,999991
2008316,2011-02-10,PO-ROMA 904805,25.312613,16.660314,25.137008,75.300833,1013.083333,7.475000,0.041537,0.368121,303.423083,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,999992,1734178,999992
2008317,2011-02-11,PO-ROMA 904805,25.348414,39.811075,99.362500,85.586667,1011.416667,7.916667,0.582618,0.583498,251.835667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,999993,1734179,999993
2008318,2011-02-17,PO-ROMA 904805,27.150500,20.722675,99.924167,79.872500,993.116667,9.958333,-0.017647,-1.755658,673.566667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,999999,1734185,999999


In [9]:
new_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2869422,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,999996
2869423,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,999997
2869424,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,999998
2869425,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,999999


# Crucially, let's compare the data types!

In [11]:
old_data.dtypes

datetime                  object
station_name              object
o3                       float64
no2                      float64
cloudcover               float64
relhum                   float64
press                    float64
temp                     float64
v                        float64
u                        float64
pblheight                float64
station_type              object
landcover                float64
pop_density              float64
nox_emi                  float64
alt                      float64
station_etopo_alt        float64
station_rel_etopo_alt    float64
omi_nox                  float64
max_5km_pop_density      float64
max_25km_pop_density     float64
nightlight_1km           float64
nightlight_max_25km      float64
toar_category            float64
time_idx                   int64
time_idx_large_temp        int64
time_idx_new               int64
dtype: object

In [12]:
new_data.dtypes

datetime                  object
o3                       float64
station_name              object
lat                      float64
lon                      float64
alt                      float64
station_etopo_alt        float64
station_rel_etopo_alt    float64
station_type              object
landcover                float64
toar_category            float64
pop_density              float64
max_5km_pop_density      float64
max_25km_pop_density     float64
nightlight_1km           float64
nightlight_max_25km      float64
nox_emi                  float64
omi_nox                  float64
temp                     float64
press                    float64
u                        float64
v                        float64
totprecip                float64
pblheight                float64
relhum                   float64
cloudcover               float64
time_idx                   int64
dtype: object

# They are the same.

# Now dropping unnecessary rows...to see if this helps with loading data through the dataloader.

In [16]:
new_data_drop_cols = new_data.drop(['totprecip', 'lat', 'lon'], axis=1)

In [20]:
new_data_drop_cols.to_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3_fewer_400_dropped_reindex_drop_cols.csv', index=False)

In [64]:
new_data_drop_cols['time_idx'][1020539:2869427]

1334611    994887
1334612    994888
1334613    994889
1334614    994890
1334615    994891
            ...  
2869421    999995
2869422    999996
2869423    999997
2869424    999998
2869425    999999
Name: time_idx, Length: 1534815, dtype: int64

In [18]:
combined_data

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,raw_time_idx,time_idx_large_temp,time_idx_new,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,22.487713,,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,731581,1731581,996713,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,31.051338,,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,731582,1731582,996714,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,20.853425,,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,731583,1731583,996715,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,29.743913,,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,731584,1731584,996716,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,27.866814,,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,731585,1731585,996717,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,43.798725,51.216250,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,734833,1734833,999996,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,57.395950,64.245850,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,734834,1734834,999997,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,32.751000,23.453255,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,734835,1734835,999998,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,22.160837,14.633225,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,734836,1734836,999999,999999


In [19]:
def load_country_data(country, species):
    data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/'+country+'_'+species+'.csv')
    return data

data = load_country_data('uk_france_italy', 'o3')

In [20]:
data

Unnamed: 0,datetime,station_name,o3,cloudcover,relhum,press,temp,v,u,pblheight,station_type,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
0,2009-12-02,MOLD,5.179480,94.967500,95.645000,975.691667,5.841667,3.717417,-1.381458,450.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733743
1,2009-12-03,MOLD,35.838662,95.847500,84.225833,980.191667,3.883333,-4.509917,6.030083,1212.025000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733744
2,2009-12-04,MOLD,32.956538,85.655000,85.600833,986.575000,4.416667,2.879525,1.145888,253.125000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733745
3,2009-12-05,MOLD,30.951563,94.480833,87.855833,975.425000,8.216667,3.869917,1.548700,960.008333,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733746
4,2009-12-06,MOLD,36.966437,56.391650,79.767500,972.775000,6.875000,3.432250,6.832333,1323.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3058146,2011-02-09,PO-ROMA 904805,35.462713,16.666667,81.625833,1011.250000,7.141667,0.201127,0.733658,172.271250,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734177
3058147,2011-02-10,PO-ROMA 904805,25.312613,25.137008,75.300833,1013.083333,7.475000,0.041537,0.368121,303.423083,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734178
3058148,2011-02-11,PO-ROMA 904805,25.348414,99.362500,85.586667,1011.416667,7.916667,0.582618,0.583498,251.835667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734179
3058149,2011-02-17,PO-ROMA 904805,27.150500,99.924167,79.872500,993.116667,9.958333,-0.017647,-1.755658,673.566667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734185


## Excellent, this has been saved!

In [15]:
## test this has been saved appropriately
comb_o3_csv = pd.read_csv('/home/jovyan/lustre_scratch/cas/european_data_new_temp/merged_euro_clean/uk_france_italy_o3_nans_no2_no.csv')
comb_o3_csv

Unnamed: 0,datetime,o3,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,no2,no,temp,press,u,v,totprecip,pblheight,relhum,cloudcover,raw_time_idx,time_idx_large_temp,time_idx_new,time_idx
0,2004-01-01,35.588063,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,22.487713,,3.195833,994.475000,2.621792,0.232750,0.528500,640.948333,90.354583,99.700000,731581,1731581,996713,996713
1,2004-01-02,36.214613,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,31.051338,,2.916667,1013.333333,2.003333,-0.130875,0.161458,453.892083,88.965417,98.463750,731582,1731582,996714,996714
2,2004-01-03,32.956513,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,20.853425,,4.416667,1007.125000,3.616833,0.145500,1.039625,812.766667,89.084167,95.205833,731583,1731583,996715,996715
3,2004-01-04,33.457750,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,29.743913,,3.608333,1009.333333,1.545833,-0.169125,0.139625,201.668750,91.235000,99.543333,731584,1731584,996716,996716
4,2004-01-05,14.410655,ABERDEEN,57.157360,-2.094278,5.0,21.0,21.0,background,13.0,3.0,49081.0,49081.0,49081.0,63.0,63.0,5.58598,1.80,27.866814,,6.758333,999.858333,2.052625,3.092708,0.339458,503.145250,94.327083,98.989167,731585,1731585,996717,996717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2894103,2012-11-26,19.360400,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,43.798725,51.216250,17.283333,1016.416667,-2.855062,2.077470,0.000000,507.762500,78.645000,34.774750,734833,1734833,999996,999996
2894104,2012-11-27,13.658798,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,57.395950,64.245850,17.975000,1005.825000,-0.627479,2.807800,0.054886,601.885000,78.806250,99.502083,734834,1734834,999997,999997
2894105,2012-11-28,15.914375,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,32.751000,23.453255,13.662500,994.812500,6.595833,1.037817,0.927494,1284.587500,73.621250,100.000000,734835,1734835,999998,999998
2894106,2012-11-29,12.844275,viale La Plaja 2009235,39.215557,9.105834,3.0,1.0,1.0,traffic,13.0,0.0,215.0,20856.0,22417.0,63.0,63.0,8.54064,1.57,22.160837,14.633225,12.675000,996.820833,9.165250,-3.130608,0.243413,1639.762500,69.458333,67.676708,734836,1734836,999999,999999


In [None]:
## Now save a pair of frames, one with no NO2 nans, and one with no NO nans...



### Repeat for data including NO2

In [15]:
uk_no2_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/uk_o3_no2.csv')
italy_no2_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/italy_o3_no2.csv')
france_no2_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/france_o3_no2.csv')

In [16]:
combined_no2_data = pd.concat([uk_no2_data, france_no2_data, italy_no2_data], ignore_index=True)

In [17]:
combined_no2_data

Unnamed: 0.1,Unnamed: 0,datetime,station_name,o3,no2,cloudcover,relhum,press,temp,v,...,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
0,1097,2009-12-02,MOLD,5.179480,26.805286,94.967500,95.645000,975.691667,5.841667,3.717417,...,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733743
1,1098,2009-12-03,MOLD,35.838662,25.642563,95.847500,84.225833,980.191667,3.883333,-4.509917,...,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733744
2,1099,2009-12-04,MOLD,32.956538,31.113163,85.655000,85.600833,986.575000,4.416667,2.879525,...,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733745
3,1100,2009-12-05,MOLD,30.951563,29.720614,94.480833,87.855833,975.425000,8.216667,3.869917,...,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733746
4,1101,2009-12-06,MOLD,36.966437,19.702457,56.391650,79.767500,972.775000,6.875000,3.432250,...,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046282,1925335,2011-02-09,PO-ROMA 904805,35.462713,10.916994,16.666667,81.625833,1011.250000,7.141667,0.201127,...,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734177
2046283,1925336,2011-02-10,PO-ROMA 904805,25.312613,16.660314,25.137008,75.300833,1013.083333,7.475000,0.041537,...,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734178
2046284,1925337,2011-02-11,PO-ROMA 904805,25.348414,39.811075,99.362500,85.586667,1011.416667,7.916667,0.582618,...,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734179
2046285,1925343,2011-02-17,PO-ROMA 904805,27.150500,20.722675,99.924167,79.872500,993.116667,9.958333,-0.017647,...,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734185


In [18]:
## quick check on length

uk_no2_data.shape[0] + italy_no2_data.shape[0] + france_no2_data.shape[0]

2046287

In [41]:
combined_no2_data = combined_no2_data.drop(['Unnamed: 0'], axis= 1)

In [43]:
## save this dataframe

# combined_no2_data.to_csv('/home/jovyan/lustre_scratch/cas/clean_data/uk_france_italy_o3_no2.csv', index=False)

### Repeat for data including NO2 and NO

In [2]:
uk_no2_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/uk_o3_no2_no.csv')
italy_no2_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/italy_o3_no2_no.csv')
france_no2_data = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/france_o3_no2_no.csv')

In [3]:
combined_no2_data = pd.concat([uk_no2_data, france_no2_data, italy_no2_data], ignore_index=True)

In [4]:
combined_no2_data

Unnamed: 0.1,Unnamed: 0,datetime,station_name,o3,no2,no,cloudcover,relhum,press,temp,v,u,pblheight,station_type,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
0,13323,1997-01-01,WALSALL ALUMWELL,34.752633,67.911386,9.368029,89.153333,88.371667,1006.250000,-3.416667,-1.714963,-2.522500,722.425000,background,13.0,36630.0,28.37880,20.0,130.0,11.0,6.24,41668.0,47829.0,63.0,63.0,3.0,729025
1,13324,1997-01-02,WALSALL ALUMWELL,19.548362,61.579713,14.262060,100.000000,92.149167,1001.583333,-1.766667,-1.132583,-0.381083,183.845000,background,13.0,36630.0,28.37880,20.0,130.0,11.0,6.24,41668.0,47829.0,63.0,63.0,3.0,729026
2,13325,1997-01-03,WALSALL ALUMWELL,15.538437,36.346425,34.175879,92.389333,87.984167,995.558333,-2.908333,-2.163250,-4.994500,1017.100000,background,13.0,36630.0,28.37880,20.0,130.0,11.0,6.24,41668.0,47829.0,63.0,63.0,3.0,729027
3,13326,1997-01-04,WALSALL ALUMWELL,23.934212,35.431225,33.556581,77.030833,85.156667,994.350000,-1.666667,-2.993167,-4.618833,1063.425000,background,13.0,36630.0,28.37880,20.0,130.0,11.0,6.24,41668.0,47829.0,63.0,63.0,3.0,729028
4,13327,1997-01-05,WALSALL ALUMWELL,29.698475,35.300463,15.443571,92.346167,90.380833,998.300000,-0.750000,-1.904250,-2.147917,479.550000,background,13.0,36630.0,28.37880,20.0,130.0,11.0,6.24,41668.0,47829.0,63.0,63.0,3.0,729029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802571,1925335,2011-02-09,PO-ROMA 904805,35.462713,10.916994,3.507962,16.666667,81.625833,1011.250000,7.141667,0.201127,0.733658,172.271250,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734177
802572,1925336,2011-02-10,PO-ROMA 904805,25.312613,16.660314,8.619565,25.137008,75.300833,1013.083333,7.475000,0.041537,0.368121,303.423083,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734178
802573,1925337,2011-02-11,PO-ROMA 904805,25.348414,39.811075,64.145600,99.362500,85.586667,1011.416667,7.916667,0.582618,0.583498,251.835667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734179
802574,1925343,2011-02-17,PO-ROMA 904805,27.150500,20.722675,5.512517,99.924167,79.872500,993.116667,9.958333,-0.017647,-1.755658,673.566667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734185


In [5]:
## quick check on length

uk_no2_data.shape[0] + italy_no2_data.shape[0] + france_no2_data.shape[0]

802576

In [6]:
combined_no2_data = combined_no2_data.drop(['Unnamed: 0'], axis= 1)

In [7]:
## save this dataframe

combined_no2_data.to_csv('/home/jovyan/lustre_scratch/cas/clean_data/uk_france_italy_o3_no2_no.csv', index=False)

# Testing and exploration...

In [3]:
# test it has been saved appropriately...
comb_o3_no2_csv = pd.read_csv('/home/jovyan/lustre_scratch/cas/clean_data/uk_france_italy_o3_no2.csv')
comb_o3_no2_csv

Unnamed: 0,datetime,station_name,o3,no2,cloudcover,relhum,press,temp,v,u,pblheight,station_type,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
0,2009-12-02,MOLD,5.179480,26.805286,94.967500,95.645000,975.691667,5.841667,3.717417,-1.381458,450.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733743
1,2009-12-03,MOLD,35.838662,25.642563,95.847500,84.225833,980.191667,3.883333,-4.509917,6.030083,1212.025000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733744
2,2009-12-04,MOLD,32.956538,31.113163,85.655000,85.600833,986.575000,4.416667,2.879525,1.145888,253.125000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733745
3,2009-12-05,MOLD,30.951563,29.720614,94.480833,87.855833,975.425000,8.216667,3.869917,1.548700,960.008333,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733746
4,2009-12-06,MOLD,36.966437,19.702457,56.391650,79.767500,972.775000,6.875000,3.432250,6.832333,1323.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046282,2011-02-09,PO-ROMA 904805,35.462713,10.916994,16.666667,81.625833,1011.250000,7.141667,0.201127,0.733658,172.271250,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734177
2046283,2011-02-10,PO-ROMA 904805,25.312613,16.660314,25.137008,75.300833,1013.083333,7.475000,0.041537,0.368121,303.423083,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734178
2046284,2011-02-11,PO-ROMA 904805,25.348414,39.811075,99.362500,85.586667,1011.416667,7.916667,0.582618,0.583498,251.835667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734179
2046285,2011-02-17,PO-ROMA 904805,27.150500,20.722675,99.924167,79.872500,993.116667,9.958333,-0.017647,-1.755658,673.566667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734185


### I should compare these two datasets, one with NOx and one without, in order to look at how they differ and whether there are biases...

In [50]:
comb_o3_csv

Unnamed: 0,datetime,station_name,o3,cloudcover,relhum,press,temp,v,u,pblheight,station_type,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
0,2009-12-02,MOLD,5.179480,94.967500,95.645000,975.691667,5.841667,3.717417,-1.381458,450.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733743
1,2009-12-03,MOLD,35.838662,95.847500,84.225833,980.191667,3.883333,-4.509917,6.030083,1212.025000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733744
2,2009-12-04,MOLD,32.956538,85.655000,85.600833,986.575000,4.416667,2.879525,1.145888,253.125000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733745
3,2009-12-05,MOLD,30.951563,94.480833,87.855833,975.425000,8.216667,3.869917,1.548700,960.008333,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733746
4,2009-12-06,MOLD,36.966437,56.391650,79.767500,972.775000,6.875000,3.432250,6.832333,1323.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3058146,2011-02-09,PO-ROMA 904805,35.462713,16.666667,81.625833,1011.250000,7.141667,0.201127,0.733658,172.271250,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734177
3058147,2011-02-10,PO-ROMA 904805,25.312613,25.137008,75.300833,1013.083333,7.475000,0.041537,0.368121,303.423083,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734178
3058148,2011-02-11,PO-ROMA 904805,25.348414,99.362500,85.586667,1011.416667,7.916667,0.582618,0.583498,251.835667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734179
3058149,2011-02-17,PO-ROMA 904805,27.150500,99.924167,79.872500,993.116667,9.958333,-0.017647,-1.755658,673.566667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734185


In [4]:
comb_o3_no2_csv

Unnamed: 0,datetime,station_name,o3,no2,cloudcover,relhum,press,temp,v,u,pblheight,station_type,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
0,2009-12-02,MOLD,5.179480,26.805286,94.967500,95.645000,975.691667,5.841667,3.717417,-1.381458,450.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733743
1,2009-12-03,MOLD,35.838662,25.642563,95.847500,84.225833,980.191667,3.883333,-4.509917,6.030083,1212.025000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733744
2,2009-12-04,MOLD,32.956538,31.113163,85.655000,85.600833,986.575000,4.416667,2.879525,1.145888,253.125000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733745
3,2009-12-05,MOLD,30.951563,29.720614,94.480833,87.855833,975.425000,8.216667,3.869917,1.548700,960.008333,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733746
4,2009-12-06,MOLD,36.966437,19.702457,56.391650,79.767500,972.775000,6.875000,3.432250,6.832333,1323.000000,background,10.0,4035.0,2.21795,114.0,120.0,21.0,4.04,4035.0,19252.0,43.0,63.0,0.0,733747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046282,2011-02-09,PO-ROMA 904805,35.462713,10.916994,16.666667,81.625833,1011.250000,7.141667,0.201127,0.733658,172.271250,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734177
2046283,2011-02-10,PO-ROMA 904805,25.312613,16.660314,25.137008,75.300833,1013.083333,7.475000,0.041537,0.368121,303.423083,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734178
2046284,2011-02-11,PO-ROMA 904805,25.348414,39.811075,99.362500,85.586667,1011.416667,7.916667,0.582618,0.583498,251.835667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734179
2046285,2011-02-17,PO-ROMA 904805,27.150500,20.722675,99.924167,79.872500,993.116667,9.958333,-0.017647,-1.755658,673.566667,background,13.0,27233.0,8.05226,54.0,53.0,18.0,3.71,27887.0,47543.0,62.0,63.0,3.0,734185


In [6]:
comb_o3_no2_csv.loc[comb_o3_no2_csv['station_name'] == 'London Haringey Priory Park South']

Unnamed: 0,datetime,station_name,o3,no2,cloudcover,relhum,press,temp,v,u,pblheight,station_type,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
78662,2012-11-22,London Haringey Priory Park South,33.457787,16.124917,95.288333,69.3925,1003.666667,11.775,6.870417,3.36775,1677.333333,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734829
78663,2012-11-23,London Haringey Priory Park South,35.58805,18.761563,78.950833,89.916667,1007.166667,7.533333,1.39385,0.706848,222.533333,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734830
78664,2012-11-24,London Haringey Priory Park South,6.265507,23.206788,100.0,95.284167,1003.25,8.058333,2.0885,-3.516917,743.258333,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734831
78665,2012-11-25,London Haringey Priory Park South,33.457775,23.3095,65.984167,79.5625,999.483333,9.025,2.177333,3.677167,1297.641667,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734832
78666,2012-11-26,London Haringey Priory Park South,35.086787,36.673275,100.0,93.0525,991.1,7.508333,-1.003083,0.82475,438.766667,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734833
78667,2012-11-27,London Haringey Priory Park South,26.4404,35.496588,86.998333,88.6575,999.058333,6.741667,-5.037583,-1.457513,1091.016667,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734834
78668,2012-11-28,London Haringey Priory Park South,30.325025,18.630825,66.465,83.884167,1006.916667,4.566667,-5.516917,-0.13853,1042.391667,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734835
78669,2012-11-29,London Haringey Priory Park South,22.05455,29.8093,50.930833,85.6175,1005.833333,1.558333,-2.617583,0.860025,367.566667,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734836
78670,2012-11-30,London Haringey Priory Park South,4.76178,22.226225,72.943667,88.734167,1005.583333,0.741667,-0.800782,1.0934,221.135,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734837
78671,2012-12-01,London Haringey Priory Park South,12.656305,9.674953,52.8025,84.264167,1007.5,1.858333,-1.340808,2.332,585.391667,background,13.0,132790.0,78.6076,37.0,60.0,47.0,11.01,145800.0,147838.0,63.0,63.0,3.0,734838


In [24]:
list_of_low_stations = list(comb_o3_no2_csv['station_name'].value_counts().tail(50).index)

In [26]:
comb_o3_no2_csv['station_name']

0                    MOLD
1                    MOLD
2                    MOLD
3                    MOLD
4                    MOLD
                ...      
2046282    PO-ROMA 904805
2046283    PO-ROMA 904805
2046284    PO-ROMA 904805
2046285    PO-ROMA 904805
2046286    PO-ROMA 904805
Name: station_name, Length: 2046287, dtype: object

In [55]:
### Calculate summary statistics of our two datasets, one containing NaNs and the other not!

comb_o3_csv.describe()

Unnamed: 0,o3,cloudcover,relhum,press,temp,v,u,pblheight,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
count,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0,3058151.0
mean,35.16865,60.9373,66.31663,990.7175,14.21402,-0.1723148,0.7015306,850.7266,11.53199,22274.13,11.9114,177.4202,179.5583,50.26418,3.721342,28886.29,43193.38,50.53259,61.3561,1.188939,732675.9
std,16.4411,34.85789,17.03851,35.44624,8.077313,2.630546,2.703231,478.6463,2.728095,38328.42,19.14125,284.2285,271.1163,98.0732,2.274765,47265.82,57762.44,18.64317,4.814191,1.344653,1413.257
min,-2.75682,0.0,5.852917,690.175,-21.775,-20.08,-17.62083,7.47775,0.0,0.0,0.01613,-2.0,-120.0,-57.0,0.63,17.0,405.0,0.0,24.0,0.0,729025.0
25%,24.31013,30.22667,53.67417,984.3417,8.391667,-1.621825,-0.9063333,475.0833,12.0,3165.0,2.34588,26.0,33.0,10.0,2.13,4230.0,11512.0,47.0,62.0,0.0,731813.0
50%,34.58556,69.46512,67.66083,1001.133,14.14167,-0.07176758,0.4741667,816.3467,13.0,9926.0,5.11447,79.0,79.0,22.0,3.0,13681.0,25580.0,60.0,63.0,1.0,732855.0
75%,45.36221,94.55167,80.0675,1011.25,20.05833,1.308792,2.253,1180.308,13.0,24948.0,11.8633,217.0,222.0,47.0,4.6,36102.0,51746.0,63.0,63.0,3.0,733818.0
max,209.017,100.0,100.0,1044.0,41.60833,17.50917,19.28917,3873.5,14.0,316381.0,115.468,3480.0,3116.0,1278.0,11.06,333891.0,333891.0,63.0,63.0,3.0,735537.0


In [59]:
### Calculate summary statistics of our two datasets, one containing NaNs and the other not!

comb_o3_no2_csv.describe()

Unnamed: 0,o3,no2,cloudcover,relhum,press,temp,v,u,pblheight,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
count,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0,2046287.0
mean,34.20262,19.69361,60.51669,65.9658,993.5643,14.42075,-0.2146408,0.6506462,848.495,11.9266,24854.83,12.00131,144.5228,150.2253,37.11355,3.806239,31658.66,45789.44,55.29021,62.14023,1.27339,732573.1
std,16.6571,13.11564,34.95629,16.88787,30.18514,8.053511,2.596136,2.60926,480.6119,2.330991,39576.08,17.82669,197.6501,197.0553,64.3935,2.293502,47615.78,59257.65,13.28277,2.481138,1.41682,1393.742
min,0.0,0.0,0.0,5.852917,743.3417,-18.48333,-20.08,-17.62083,8.019,0.0,0.0,0.085494,-2.0,-120.0,-57.0,0.74,93.0,1206.0,0.0,28.0,0.0,729025.0
25%,22.80644,10.00181,29.41146,53.425,986.5667,8.583333,-1.629583,-0.8954167,468.9813,12.0,4574.0,2.97501,25.0,31.0,9.0,2.14,6326.0,13338.0,55.0,62.0,0.0,731732.0
50%,33.58308,17.25801,68.79583,67.16083,1002.0,14.38333,-0.093675,0.4326667,812.5083,13.0,12889.0,5.54536,74.0,74.0,20.0,3.05,16983.0,29134.0,61.0,63.0,0.0,732736.0
75%,44.61038,26.56698,94.3275,79.54875,1011.667,20.31667,1.253578,2.150333,1181.147,13.0,27582.0,12.7281,196.0,204.0,38.0,4.88,39223.0,52046.0,63.0,63.0,3.0,733687.0
max,199.9949,491.0033,100.0,100.0,1043.833,41.29167,16.1375,18.81167,3873.5,14.0,316381.0,115.468,2035.0,2011.0,923.0,11.06,333891.0,333891.0,63.0,63.0,3.0,734868.0


In [61]:
# I wonder if there is an easy way to compare these two sets of summary statistics?
# This sorts them alphabetically...but that is ok.

# So we are essentially trying to establish here that both datasets have absolute summary stats that are very similar.

comb_o3_no2_csv.describe() - comb_o3_csv.describe()

Unnamed: 0,alt,cloudcover,landcover,max_25km_pop_density,max_5km_pop_density,nightlight_1km,nightlight_max_25km,no2,nox_emi,o3,omi_nox,pblheight,pop_density,press,relhum,station_etopo_alt,station_rel_etopo_alt,temp,time_idx,toar_category,u,v
count,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0,-1011864.0
mean,-32.89741,-0.4206051,0.3946072,2596.062,2772.368,4.757617,0.7841337,,0.08990855,-0.9660298,0.08489691,-2.231642,2580.701,2.846861,-0.3508292,-29.33305,-13.15063,0.2067268,-102.8557,0.08445129,-0.05088442,-0.04232601
std,-86.57838,0.09839612,-0.3971037,1495.207,349.9611,-5.360394,-2.333052,,-1.314558,0.2159952,0.01873679,1.965679,1247.664,-5.261107,-0.1506402,-74.06092,-33.67971,-0.02380197,-19.51477,0.07216684,-0.09397155,-0.03440954
min,0.0,0.0,0.0,801.0,76.0,0.0,4.0,,0.069364,2.75682,0.11,0.54125,0.0,53.16667,0.0,0.0,0.0,3.291667,0.0,0.0,0.0,0.0
25%,-1.0,-0.8152083,0.0,1826.0,2096.0,8.0,0.0,,0.62913,-1.503687,0.01,-6.102042,1409.0,2.225,-0.2491667,-2.0,-1.0,0.1916667,-81.0,0.0,0.01091667,-0.007758333
50%,-5.0,-0.66929,0.0,3554.0,3302.0,1.0,0.0,,0.43089,-1.002487,0.05,-3.838333,2963.0,0.8666667,-0.5,-5.0,-2.0,0.2416667,-119.0,-1.0,-0.0415,-0.02190742
75%,-21.0,-0.2241667,0.0,300.0,3121.0,0.0,0.0,,0.8648,-0.7518375,0.28,0.8387917,2634.0,0.4166667,-0.51875,-18.0,-9.0,0.2583333,-131.0,0.0,-0.1026667,-0.05521333
max,-1445.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,-9.022125,0.0,0.0,0.0,-0.1666667,0.0,-1105.0,-355.0,-0.3166667,-669.0,0.0,-0.4775,-1.371667


In [56]:
corr_o3 = comb_o3_csv.corr()

corr_o3.style.background_gradient(cmap='coolwarm')

Unnamed: 0,o3,cloudcover,relhum,press,temp,v,u,pblheight,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
o3,1.0,-0.25035,-0.497281,-0.12225,0.585652,0.006227,-0.011279,0.431551,-0.086479,-0.093099,-0.089641,0.140491,0.13615,0.130202,-0.130017,-0.085565,-0.08033,-0.119239,-0.070612,-0.080935,0.092188
cloudcover,-0.25035,1.0,0.529099,-0.034098,-0.242105,0.200667,0.114027,-0.075163,0.025545,0.055509,0.04525,-0.015137,-0.019954,-0.026799,0.123762,0.060828,0.069116,-0.019156,-0.033994,0.055521,-0.041249
relhum,-0.497281,0.529099,1.0,-0.009207,-0.551699,0.165354,0.086149,-0.545366,-0.002662,-0.004158,0.001514,0.004423,-0.00271,0.012253,0.051438,-0.00186,-0.006092,-0.072059,-0.080604,0.040383,-0.066266
press,-0.12225,-0.034098,-0.009207,1.0,0.188743,-0.054809,-0.018056,0.042301,0.533085,0.191826,0.212634,-0.847211,-0.8818,-0.545645,0.332546,0.202015,0.194321,0.395038,0.366657,0.103875,-0.07648
temp,0.585652,-0.242105,-0.551699,0.188743,1.0,0.07365,0.016352,0.428007,0.108667,-0.006732,-1e-05,-0.180141,-0.183446,-0.104109,-0.051306,-0.0083,-0.016481,0.115202,0.117222,-0.041588,0.083634
v,0.006227,0.200667,0.165354,-0.054809,0.07365,1.0,0.115415,-0.07466,-0.001761,0.025306,0.023533,0.013384,0.012296,0.009336,0.077626,0.02649,0.031729,-0.012981,-0.019746,0.024071,-0.013801
u,-0.011279,0.114027,0.086149,-0.018056,0.016352,0.115415,1.0,0.257787,0.010449,0.008638,0.016918,-0.003099,-0.00967,0.000458,0.024574,0.011085,0.014348,-0.033137,-0.017877,0.033284,-0.046876
pblheight,0.431551,-0.075163,-0.545366,0.042301,0.428007,-0.07466,0.257787,1.0,0.059344,0.065458,0.053151,-0.049558,-0.05454,-0.050441,0.082917,0.06917,0.077937,0.023691,0.027471,0.043243,-0.040074
landcover,-0.086479,0.025545,-0.002662,0.533085,0.108667,-0.001761,0.010449,0.059344,1.0,0.234724,0.213595,-0.487568,-0.504831,-0.395307,0.286033,0.236792,0.192011,0.48031,0.343327,0.20976,-0.050055
pop_density,-0.093099,0.055509,-0.004158,0.191826,-0.006732,0.025306,0.008638,0.065458,0.234724,1.0,0.755323,-0.175358,-0.180599,-0.142146,0.586112,0.949759,0.805178,0.332491,0.176656,0.533388,-0.081855


In [57]:
corr_o3_no2 = comb_o3_no2_csv.corr()

corr_o3_no2.style.background_gradient(cmap='coolwarm')

Unnamed: 0,o3,no2,cloudcover,relhum,press,temp,v,u,pblheight,landcover,pop_density,nox_emi,alt,station_etopo_alt,station_rel_etopo_alt,omi_nox,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,toar_category,time_idx
o3,1.0,-0.342667,-0.254813,-0.517129,-0.085252,0.626144,0.001077,-0.006774,0.450533,-0.057243,-0.09171,-0.087967,0.088139,0.08346,0.089919,-0.131292,-0.090779,-0.085912,-0.107459,-0.048364,-0.08877,0.093329
no2,-0.342667,1.0,-0.018225,0.076098,0.106662,-0.218934,0.038329,-0.10185,-0.230131,0.103928,0.192109,0.221619,-0.138552,-0.124701,-0.099257,0.248076,0.176655,0.15465,0.244044,0.13056,0.156316,-0.136634
cloudcover,-0.254813,-0.018225,1.0,0.524173,-0.047281,-0.242746,0.20102,0.115902,-0.077349,0.049371,0.068469,0.04921,-0.010094,-0.014964,-0.04499,0.135448,0.07486,0.081615,0.03868,0.004424,0.065556,-0.049137
relhum,-0.517129,0.076098,0.524173,1.0,0.00231,-0.54995,0.162622,0.0808,-0.558986,0.03297,0.016454,0.005742,-0.015182,-0.020315,-0.025568,0.071203,0.01944,0.013044,0.01238,-0.008358,0.048073,-0.065785
press,-0.085252,0.106662,-0.047281,0.00231,1.0,0.149426,-0.060832,-0.013627,0.035817,0.489185,0.178368,0.197706,-0.817675,-0.854364,-0.473153,0.299246,0.192179,0.175094,0.394675,0.403443,0.166295,-0.059137
temp,0.626144,-0.218934,-0.242746,-0.54995,0.149426,1.0,0.075374,0.020579,0.432495,0.05717,-0.032143,-0.015824,-0.146055,-0.149373,-0.050835,-0.08431,-0.034179,-0.041398,0.037562,0.071356,-0.042893,0.100183
v,0.001077,0.038329,0.20102,0.162622,-0.060832,0.075374,1.0,0.10394,-0.086228,0.008517,0.033978,0.028435,0.012489,0.011151,-0.002111,0.092021,0.035848,0.041723,0.007466,-0.002946,0.024944,-0.023832
u,-0.006774,-0.10185,0.115902,0.0808,-0.013627,0.020579,0.10394,1.0,0.254538,0.022182,0.020703,0.020355,-0.011419,-0.01838,-0.010425,0.032405,0.02301,0.02201,0.018997,0.013825,0.044884,-0.055063
pblheight,0.450533,-0.230131,-0.077349,-0.558986,0.035817,0.432495,-0.086228,0.254538,1.0,0.071217,0.067432,0.056799,-0.038884,-0.044813,-0.043634,0.080721,0.071849,0.077643,0.054657,0.03938,0.052633,-0.046502
landcover,-0.057243,0.103928,0.049371,0.03297,0.489185,0.05717,0.008517,0.022182,0.071217,1.0,0.206184,0.198615,-0.402475,-0.423374,-0.330219,0.237677,0.208966,0.149484,0.403242,0.276127,0.234299,-0.055124


In [64]:
### It may be useful to plot a map of all my stations!!! I hope I kept lat/long!

### Fuck, I didn't. Where am I gonna grab these from?

france_df = pd.read_csv('/home/jovyan/lustre_scratch/cas/european_data/per_country/france_env.csv')

In [65]:
france_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,datetime,press,relhum,u,temp,totprecip,v,pblheight,cloudcover,station_name,lat,lon,alt,station_etopo_alt,station_rel_etopo_alt,station_type,landcover,toar_category,pop_density,max_5km_pop_density,max_25km_pop_density,nightlight_1km,nightlight_max_25km,nox_emi,omi_nox,co
0,0,0,1997-01-01,998.275000,76.765000,-3.285750,-11.183333,0.004231,-0.727642,373.900000,85.302000,Chateau Thierry,49.044167,3.397792,66.0,103.0,45.0,background,12.0,0.0,3746.0,8093.0,8093.0,54.0,57.0,2.02326,3.54,
1,1,1,1997-01-02,997.141667,83.644167,-3.035333,-8.600000,0.006876,-1.447260,269.918333,87.178333,Chateau Thierry,49.044167,3.397792,66.0,103.0,45.0,background,12.0,0.0,3746.0,8093.0,8093.0,54.0,57.0,2.02326,3.54,
2,2,2,1997-01-03,987.116667,88.940000,-3.963083,-6.025000,0.057272,-2.030275,369.000000,100.000000,Chateau Thierry,49.044167,3.397792,66.0,103.0,45.0,background,12.0,0.0,3746.0,8093.0,8093.0,54.0,57.0,2.02326,3.54,
3,3,3,1997-01-04,983.775000,88.749167,-1.194583,-5.591667,0.067547,-3.277917,383.733333,100.000000,Chateau Thierry,49.044167,3.397792,66.0,103.0,45.0,background,12.0,0.0,3746.0,8093.0,8093.0,54.0,57.0,2.02326,3.54,
4,4,4,1997-01-05,992.041667,86.235833,-1.804917,-4.575000,0.020020,1.650617,310.735000,75.344167,Chateau Thierry,49.044167,3.397792,66.0,103.0,45.0,background,12.0,0.0,3746.0,8093.0,8093.0,54.0,57.0,2.02326,3.54,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6906833,6569,6569,2014-12-27,994.675000,87.356667,3.789392,1.783333,1.054362,,1298.833333,100.000000,Somain Salengro,50.358334,3.280555,22.0,20.0,4.0,industrial,12.0,0.0,13473.0,13473.0,38730.0,60.0,63.0,26.96070,5.91,
6906834,6570,6570,2014-12-28,1025.750000,75.588333,-2.908550,-3.191667,0.000000,,537.583333,0.772333,Somain Salengro,50.358334,3.280555,22.0,20.0,4.0,industrial,12.0,0.0,13473.0,13473.0,38730.0,60.0,63.0,26.96070,5.91,
6906835,6571,6571,2014-12-29,1033.666667,95.490000,2.306667,0.641667,0.448042,,275.466667,95.864167,Somain Salengro,50.358334,3.280555,22.0,20.0,4.0,industrial,12.0,0.0,13473.0,13473.0,38730.0,60.0,63.0,26.96070,5.91,
6906836,6572,6572,2014-12-30,1034.916667,96.050000,2.194667,0.766667,0.000407,,273.050000,73.020667,Somain Salengro,50.358334,3.280555,22.0,20.0,4.0,industrial,12.0,0.0,13473.0,13473.0,38730.0,60.0,63.0,26.96070,5.91,
