In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from netCDF4 import Dataset

In [None]:
# Root directory that holds the AirNow CSVs, the interpolated forecast files and the generated outputs.
# Set the AQ_DATA_DIR environment variable to run the notebook on another machine; when it is unset the
# original location is used, so existing runs are unaffected.
DATA_DIR = os.environ.get("AQ_DATA_DIR", "/xace/d1/hamer")

In [2]:

# Pollutant this run prepares data for: 'O3' or 'pm25'.
MODE = 'O3'

# Forecast variables pulled from the NETCDF files. The pollutant itself comes first and TIMEOFDAY is
# synthesised from the forecast hour. 'site_lat' and 'site_lon' may be appended to either list to include
# the station coordinates as additional variables.
if MODE == 'pm25':
    VARS = ['pm25', 'RC', 'RGRND', 'RN', 'TEMP2', 'WDIR10', 'WSPD10', 'TIMEOFDAY']
else:
    VARS = ['O3', 'TEMP2', 'RGRND', 'PBL2', 'WDIR10', 'WSPD10', 'NOX', 'NOY', 'TIMEOFDAY']

# Scale factor applied to the pollutant values: O3 is converted from ppm to ppb, anything else is left as is.
mult = 1000 if MODE == 'O3' else 1

In [3]:
def prepare_data(data):
    """Normalise one raw AirNow dataframe.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``stationid`` and ``date_time``; ``lon`` and ``lat`` are
        renamed when present.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``stationid`` is int64, ``date_time`` is a datetime column and
        ``lon``/``lat`` are renamed to ``Longitude``/``Latitude``. Column order is preserved.

    The input frame is left untouched, so the function can be re-run on the same frame safely.
    """
    # assign() builds a new frame instead of writing the converted columns back into the caller's data.
    converted = data.assign(
        stationid=data["stationid"].astype("int64"),
        date_time=pd.to_datetime(data["date_time"]),
    )
    return converted.rename(columns={"lon": "Longitude", "lat": "Latitude"})

In [4]:
# Create a list to load all the AirNow data into
csv_data_current = []

# Load each AirNow file as a pandas dataframe and insert it into the list. The directory listing is sorted
# because os.listdir returns entries in arbitrary order, which would make the row order of the combined
# frame differ from run to run.
print("Loading AirNow data")
airnow_dir = DATA_DIR+'/airnow/'+MODE
for data_file in tqdm(sorted(os.listdir(airnow_dir)), desc='AirNow Files'):
    csv_data_current.append(prepare_data(pd.read_csv(airnow_dir+'/'+data_file, parse_dates=True)))

# Combine the dataframes together into a single large one.
airnow_combined = pd.concat(csv_data_current)

# Pivot the dataframe to have time as the rows and station IDs as the columns. The values are multiplied by
# mult, which converts O3 from ppm into ppb (easier to work with) and is 1 for pm25.
# The first 13 rows are dropped so the table starts at 13:00, the hour the forecasts start at (assumes the
# observations begin at 00:00 with no missing hours before 13:00 -- TODO confirm). Stations with any missing
# value are dropped entirely.
print("Generating AirNow dataframe")

airnow_data = airnow_combined.pivot(index='date_time', columns='stationid', values=MODE).iloc[13:,:].dropna(axis=1)*mult

# Get a list of stations that appear in the AirNow data.
airnow_stations = np.unique(np.array(airnow_data.columns))

# Get a list of stations that appear in the forecast data (we load only the first file since they all have the same
# stations in each of them). The file is opened in a with-block so its handle is released once the IDs are read.
with Dataset(DATA_DIR+"/interpolated/"+MODE+"/forecasts.interp.20190701.12z.nc", "r", format="NETCDF3_CLASSIC") as ncdf_dataset:
    # Each site ID is stored as an array of single bytes; join them and parse the result as an integer.
    ncdf_stations = np.array([int("".join([j.decode("utf-8") for j in i])) for i in ncdf_dataset.variables['site_id']])

all_stations = np.union1d(airnow_stations, ncdf_stations)
np.savetxt('all'+MODE+'.txt',all_stations)

# Generate a list of actually useable stations by finding the intersection of the AirNow and forecast stations.
# This file must contain the intersection, not the union that was written just above.
useable_stations = np.intersect1d(airnow_stations, ncdf_stations)
np.savetxt('useable_stations_'+MODE+'.txt',useable_stations)

# Remove the stations that aren't also in the forecast data
airnow_data = airnow_data[airnow_data.columns.intersection(useable_stations)]

# Fill in missing times with invalid data. The sentinel is scaled by mult so that it matches -999 readings
# that were already present in the raw data and got multiplied above.
airnow_data = airnow_data.reindex(pd.date_range(airnow_data.index[0], airnow_data.index[-1], freq='h'), fill_value=-999.0*mult)

# Save the dataframe as a csv
print("Saving AirNow data")
airnow_data.to_csv(DATA_DIR+"/AirNow_"+MODE+".csv")

Loading AirNow data


AirNow Files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating AirNow dataframe
Saving AirNow data


In [5]:
# Print the shape of the table of distinct (stationid, Latitude, Longitude) rows found in the AirNow files.
station_coords = airnow_combined[['stationid', 'Latitude', 'Longitude']].drop_duplicates()
print(station_coords.to_numpy().shape)

(1980, 3)


In [6]:
# Display the concatenation of all loaded AirNow files (one row per station and timestamp) as a visual check.
airnow_combined

Unnamed: 0,stationid,Latitude,Longitude,O3,date_time
0,370510010,35.00139,-78.99055,0.039,2019-08-01 00:00:00
1,370630015,35.99167,-78.89639,0.033,2019-08-01 00:00:00
2,370630099,35.88917,-78.87444,0.037,2019-08-01 00:00:00
3,370650099,35.98889,-77.58667,0.045,2019-08-01 00:00:00
4,370670022,36.11056,-80.22667,0.047,2019-08-01 00:00:00
...,...,...,...,...,...
891475,370670022,36.11056,-80.22667,0.047,2019-09-30 23:00:00
891476,370670030,36.02583,-80.34167,0.043,2019-09-30 23:00:00
891477,370671008,36.05056,-80.14389,0.041,2019-09-30 23:00:00
891478,370750001,35.25778,-83.79528,0.042,2019-09-30 23:00:00


In [7]:
# Export every AirNow station, including those with gaps: same pivot and 13-row trim as airnow_data, but
# missing readings are replaced by -999 instead of dropping the station, and the result is scaled by mult.
airnow_all = airnow_combined.pivot(index='date_time', columns='stationid', values=MODE).iloc[13:, :]
airnow_all = airnow_all.fillna(-999.0) * mult
airnow_all.to_csv(DATA_DIR+"/AirNow_all_"+MODE+".csv")

In [8]:
# Display the final AirNow table: hourly timestamps as rows, useable station IDs as columns.
airnow_data

stationid,31101,40203,40302,41302,50204,50308,50311,50604,52301,53201,...,530090013,530330080,530570011,530730005,540390020,550030010,550270001,550790026,560030002,560450003
2019-07-01 13:00:00,24.0,24.0,19.0,19.0,15.0,12.0,16.0,14.0,10.0,17.0,...,22.0,22.0,4.0,2.0,15.0,23.0,20.0,19.0,26.0,19.0
2019-07-01 14:00:00,24.0,22.0,21.0,19.0,18.0,13.0,18.0,15.0,11.0,21.0,...,23.0,5.0,10.0,4.0,23.0,25.0,22.0,25.0,23.0,20.0
2019-07-01 15:00:00,23.0,19.0,21.0,20.0,18.0,15.0,19.0,18.0,14.0,22.0,...,23.0,3.0,19.0,12.0,29.0,-999000.0,30.0,29.0,36.0,20.0
2019-07-01 16:00:00,22.0,19.0,21.0,21.0,19.0,17.0,25.0,22.0,20.0,24.0,...,24.0,6.0,20.0,18.0,44.0,-999000.0,39.0,31.0,39.0,25.0
2019-07-01 17:00:00,23.0,20.0,20.0,24.0,20.0,22.0,31.0,22.0,23.0,26.0,...,23.0,7.0,19.0,23.0,56.0,28.0,39.0,42.0,41.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 19:00:00,41.0,38.0,30.0,43.0,25.0,30.0,34.0,24.0,32.0,36.0,...,40.0,27.0,24.0,27.0,34.0,38.0,36.0,30.0,34.0,44.0
2019-12-31 20:00:00,41.0,39.0,30.0,44.0,25.0,31.0,34.0,24.0,32.0,37.0,...,40.0,26.0,25.0,26.0,34.0,39.0,37.0,31.0,38.0,44.0
2019-12-31 21:00:00,41.0,39.0,30.0,45.0,19.0,30.0,35.0,24.0,31.0,37.0,...,40.0,25.0,25.0,24.0,34.0,39.0,37.0,31.0,39.0,44.0
2019-12-31 22:00:00,41.0,39.0,30.0,45.0,14.0,33.0,35.0,23.0,30.0,38.0,...,40.0,26.0,22.0,23.0,33.0,36.0,37.0,30.0,38.0,44.0


In [9]:
# Generate dataframes for each forecast day in the NETCDF forecast data

# Every 12z forecast file is treated as holding 48 hourly rows per variable, the first one valid at 13:00 GMT.
FORECAST_HOURS = 48
FORECAST_START_HOUR = 13

# Get a list of the NETCDF forecast files
forecast_dir = DATA_DIR+'/interpolated/'+MODE
ncdf_files = sorted(a for a in os.listdir(forecast_dir) if a.endswith("12z.nc"))

# Get the Site IDs. We use only the first file to generate them because they are the same across all of the 12z files.
# Each ID is stored as an array of single bytes, so the bytes are joined and parsed as an integer. The with-block
# closes the file as soon as the IDs have been read.
with Dataset(forecast_dir+"/"+ncdf_files[0], "r", format="NETCDF3_CLASSIC") as rootgrp:
    ncdf_siteids = np.array([int("".join([j.decode("utf-8") for j in i])) for i in rootgrp.variables['site_id']])

# We generate forecast tables for each NETCDF file (i.e. each forecast day)
print("Generating forecast data tables")
forecast_days = {}

# We cut out the last two days since they will contain data that won't match up to our airnow data
for current_file in tqdm(ncdf_files[:-2], desc='Forecast Days'):

    # Prepare a dictionary to store data from this file: one (hours x stations) array per variable.
    current_data = {}

    # Load the current NCDF file. Everything is copied into numpy arrays inside the with-block so that the
    # file handle can be released before the (slower) pandas reshaping starts.
    with Dataset(forecast_dir+"/"+current_file, "r", format="NETCDF3_CLASSIC") as rootgrp:
        n_stations = rootgrp.variables[MODE].shape[1]

        for var in VARS:
            if var == 'TIMEOFDAY':
                # Time of day is not included as a physical variable, so we generate it ourselves from the
                # forecast hour and repeat it across all stations.
                # TODO: Compensate for time zone
                hours = (np.arange(FORECAST_HOURS) + FORECAST_START_HOUR) % 24
                current_data[var] = np.tile(hours[:, np.newaxis], (1, n_stations))
            elif var in ('site_lat', 'site_lon'):
                # Latitude and longitude have a single constant value per station, so that row is repeated
                # for every forecast hour.
                current_data[var] = np.tile(np.array(rootgrp.variables[var][:]), (FORECAST_HOURS, 1))
            else:
                # Physical variable: a two dimensional array with the forecast hour as rows and the station
                # as the column (shape of hours x stations), copied into a plain numpy array.
                current_data[var] = np.array(rootgrp.variables[var][:])

            # O3 is in ppm, but we want it in ppb since it's easier to read that way.
            if var == MODE and MODE == 'O3':
                current_data[var] *= mult

    # The arrays for each variable are stacked on top of each other, in the order of VARS.
    current_data_all = np.concatenate([current_data[var] for var in VARS], axis=0)

    # We create a pandas dataframe to store the data from the current forecast day. Its columns are plain
    # integers at first, so they are renamed to the site ID they represent.
    current_table = pd.DataFrame(current_data_all).rename(columns=dict(enumerate(ncdf_siteids)))

    # Determine the forecast year, month, and day represented by the file we're working with.
    # These are conveniently stored as part of the filename (...YYYYMMDD.12z.nc).
    year, month, day = current_file[-15:-11], current_file[-11:-9], current_file[-9:-7]
    forecast_day = '-'.join([year, month, day])

    # Since each forecast file contains 48 hours in it, we start with the starting time and generate a date
    # range for the entire 48 hour period. The lowercase 'h' alias matches the one used for the AirNow index;
    # the uppercase 'H' spelling is deprecated in recent pandas releases.
    start_time = forecast_day+" 13:00:00"
    date_range = pd.date_range(start_time, periods=FORECAST_HOURS, freq="h")

    # We can only work with stations that appear in both the forecast and airnow data, so we remove the ones
    # that aren't airnow stations as well. Two bookkeeping columns are then added:
    #   * 'var' records which variable each row belongs to; the rows were stacked in VARS order, so each
    #     consecutive 48 row block gets the corresponding variable name.
    #   * 'date_time' records the valid time of each row. It is not stored in the NETCDF file, but every block
    #     covers the same 48 hour period, so the date range is repeated once per variable.
    # assign() returns a new frame, so no column is written into a slice of another frame.
    current_table = current_table[current_table.columns.intersection(useable_stations)].assign(
        var=[var for var in VARS for _ in range(FORECAST_HOURS)],
        date_time=list(date_range)*len(VARS),
    )

    # Next, we turn the columns (representing different station IDs) into a single stacked column where each
    # entry says what station ID the value in the 'value' column belongs to. We are left with what looks like
    # a tree structure: the first column indicates the time of the forecast value, the second indicates what kind of
    # variable it is, and the third represents what station the value is from. There is only a single remaining column
    # after this, which contains the value that the unique combination of the previous three columns correspond to.
    current_table = pd.melt(current_table, id_vars=['date_time', 'var'], var_name='stationid')

    # This unusual shape allows us to pivot the entire dataframe in such a way that we can create a multiindex
    # dataframe that allows us to easily find the physical variable forecasts for each point in time for each
    # station ID. The two indices of the new dataframe are the station ID and the forecast time; the columns
    # are the variables.
    current_table = current_table.pivot(index=['stationid', 'date_time'], columns='var', values='value')

    # Finally, we add the table to the dictionary of all forecast data
    forecast_days[forecast_day] = current_table

# Create a list of data for each forecast day
forecast_days_list = list(forecast_days.values())

# Combine all these tables together into a large dataframe using the forecast day as a new outer index level
forecast_data = pd.concat(forecast_days_list, keys=list(forecast_days.keys()), axis=0, names=['forecast_day', 'stationid', 'date_time'])

Generating forecast data tables


Forecast Days:   0%|          | 0/182 [00:00<?, ?it/s]

In [10]:
#scaler = {}
#for variable in VARS:
#    current_variable_data = (forecast_data[variable])
#    if(variable=='O3'):
#        current_variable_data = np.concatenate([current_variable_data, np.array(airnow_data).flatten()])
#    scaler[variable] = StandardScaler().fit(np.array(current_variable_data).reshape(-1,1))
#
#    
#    print(forecast_data[variable])
#    test = scaler[variable].transform(np.array(forecast_data[variable]).reshape(-1,1))
#    print(scaler[variable].transform(np.array(forecast_data[variable]).reshape(-1,1)))
#    print(scaler[variable].inverse_transform(test.reshape(-1,1)))


In [11]:
# Display the AirNow table again, after the forecast tables have been built.
airnow_data

stationid,31101,40203,40302,41302,50204,50308,50311,50604,52301,53201,...,530090013,530330080,530570011,530730005,540390020,550030010,550270001,550790026,560030002,560450003
2019-07-01 13:00:00,24.0,24.0,19.0,19.0,15.0,12.0,16.0,14.0,10.0,17.0,...,22.0,22.0,4.0,2.0,15.0,23.0,20.0,19.0,26.0,19.0
2019-07-01 14:00:00,24.0,22.0,21.0,19.0,18.0,13.0,18.0,15.0,11.0,21.0,...,23.0,5.0,10.0,4.0,23.0,25.0,22.0,25.0,23.0,20.0
2019-07-01 15:00:00,23.0,19.0,21.0,20.0,18.0,15.0,19.0,18.0,14.0,22.0,...,23.0,3.0,19.0,12.0,29.0,-999000.0,30.0,29.0,36.0,20.0
2019-07-01 16:00:00,22.0,19.0,21.0,21.0,19.0,17.0,25.0,22.0,20.0,24.0,...,24.0,6.0,20.0,18.0,44.0,-999000.0,39.0,31.0,39.0,25.0
2019-07-01 17:00:00,23.0,20.0,20.0,24.0,20.0,22.0,31.0,22.0,23.0,26.0,...,23.0,7.0,19.0,23.0,56.0,28.0,39.0,42.0,41.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 19:00:00,41.0,38.0,30.0,43.0,25.0,30.0,34.0,24.0,32.0,36.0,...,40.0,27.0,24.0,27.0,34.0,38.0,36.0,30.0,34.0,44.0
2019-12-31 20:00:00,41.0,39.0,30.0,44.0,25.0,31.0,34.0,24.0,32.0,37.0,...,40.0,26.0,25.0,26.0,34.0,39.0,37.0,31.0,38.0,44.0
2019-12-31 21:00:00,41.0,39.0,30.0,45.0,19.0,30.0,35.0,24.0,31.0,37.0,...,40.0,25.0,25.0,24.0,34.0,39.0,37.0,31.0,39.0,44.0
2019-12-31 22:00:00,41.0,39.0,30.0,45.0,14.0,33.0,35.0,23.0,30.0,38.0,...,40.0,26.0,22.0,23.0,33.0,36.0,37.0,30.0,38.0,44.0


In [12]:
#min(airnow_data[560210100].to_numpy())

In [13]:
# Spot check: all variables for station 60304 in the forecast issued on 2019-07-23.
forecast_data.loc['2019-07-23',60304]

var,NOX,NOY,O3,PBL2,RGRND,TEMP2,TIMEOFDAY,WDIR10,WSPD10
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-07-23 13:00:00,0.005036,0.005626,12.825121,116.739182,371.898132,292.40921,13.0,316.241974,0.623503
2019-07-23 14:00:00,0.003315,0.004039,19.010958,177.407272,502.912476,293.904419,14.0,316.278839,0.83592
2019-07-23 15:00:00,0.001763,0.002574,24.067854,605.025269,669.379822,295.142914,15.0,286.92688,1.324139
2019-07-23 16:00:00,0.001096,0.001954,26.365742,1019.169189,800.927063,296.156036,16.0,310.393799,1.804484
2019-07-23 17:00:00,0.000762,0.001546,26.958151,1440.0177,885.975769,296.842255,17.0,316.744141,1.783401
2019-07-23 18:00:00,0.000627,0.001269,26.749819,1609.611206,921.121155,297.151672,18.0,307.72345,2.008036
2019-07-23 19:00:00,0.000655,0.00128,27.167061,1605.471924,897.389832,297.357941,19.0,298.383575,2.126442
2019-07-23 20:00:00,0.000945,0.001616,27.725958,1579.457275,717.993042,297.388672,20.0,285.469086,2.003363
2019-07-23 21:00:00,0.002777,0.00359,26.796473,1440.341064,495.545288,296.848846,21.0,277.820068,2.019284
2019-07-23 22:00:00,0.004955,0.006023,26.244555,188.310455,394.341797,296.612213,22.0,278.37085,1.858191


In [14]:
#x = forecast_data.loc['2019-06-01',560210100][MODE]
#print(x)
#plt.plot(x)
#plt.show()

In [15]:
#forecast_data.to_csv("/xace/d1/hamer/Forecast_"+MODE+".csv")

#min(airnow_data[560210100].to_numpy())

In [16]:
# Display the full forecast table, indexed by (forecast_day, stationid, date_time) with one column per variable.
forecast_data

Unnamed: 0_level_0,Unnamed: 1_level_0,var,NOX,NOY,O3,PBL2,RGRND,TEMP2,TIMEOFDAY,WDIR10,WSPD10
forecast_day,stationid,date_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-07-01,31101,2019-07-01 13:00:00,0.000257,0.000296,33.891018,354.383392,96.653496,286.246979,13.0,352.508270,5.514773
2019-07-01,31101,2019-07-01 14:00:00,0.000274,0.000322,32.094608,399.077545,139.307571,286.648651,14.0,347.428833,5.562788
2019-07-01,31101,2019-07-01 15:00:00,0.000315,0.000375,30.192698,381.588287,74.190269,286.625183,15.0,350.452484,5.556478
2019-07-01,31101,2019-07-01 16:00:00,0.000333,0.000407,29.272120,309.297760,73.074356,286.339111,16.0,351.686035,5.746549
2019-07-01,31101,2019-07-01 17:00:00,0.000310,0.000394,29.926655,252.776016,131.964478,286.667999,17.0,350.847504,5.844736
...,...,...,...,...,...,...,...,...,...,...,...
2019-12-29,560450003,2019-12-31 08:00:00,0.001781,0.002300,27.971415,181.795319,0.000000,264.614960,8.0,330.069336,4.010539
2019-12-29,560450003,2019-12-31 09:00:00,0.001934,0.002462,27.438204,137.608383,0.000000,264.372650,9.0,331.243195,3.509196
2019-12-29,560450003,2019-12-31 10:00:00,0.002059,0.002561,27.578629,104.542549,0.000000,264.211884,10.0,343.686493,3.058257
2019-12-29,560450003,2019-12-31 11:00:00,0.002174,0.002621,28.027172,87.356682,0.000000,264.444794,11.0,334.226166,2.528249


In [17]:
# Move index level 1 (the station ID) into the columns so each row is one (forecast_day, date_time) pair,
# then write that wide table to disk.
forecast_wide = forecast_data.unstack(level=1)
forecast_wide.to_csv(DATA_DIR+"/Forecast_"+MODE+".csv")

In [18]:
# Preview the wide layout: index level 1 (stationid) moved from the rows into the columns.
forecast_data.unstack(1)

Unnamed: 0_level_0,var,NOX,NOX,NOX,NOX,NOX,NOX,NOX,NOX,NOX,NOX,...,WSPD10,WSPD10,WSPD10,WSPD10,WSPD10,WSPD10,WSPD10,WSPD10,WSPD10,WSPD10
Unnamed: 0_level_1,stationid,31101,40203,40302,41302,50204,50308,50311,50604,52301,53201,...,530090013,530330080,530570011,530730005,540390020,550030010,550270001,550790026,560030002,560450003
forecast_day,date_time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2019-07-01,2019-07-01 13:00:00,0.000257,0.001515,0.001192,0.000238,0.008442,0.019575,0.012591,0.000804,0.000322,0.000137,...,2.582263,0.918071,2.148381,1.152274,0.899693,2.696055,1.709475,1.567075,1.809994,1.330562
2019-07-01,2019-07-01 14:00:00,0.000274,0.001254,0.001155,0.000162,0.006453,0.018001,0.010722,0.000545,0.000176,0.000095,...,2.427305,0.672259,2.565639,1.140864,0.643541,2.984283,2.455909,1.736496,3.307123,0.614147
2019-07-01,2019-07-01 15:00:00,0.000315,0.001028,0.001120,0.000134,0.004599,0.015750,0.008518,0.000391,0.000082,0.000062,...,2.716349,0.332249,2.752189,0.559285,0.333356,2.239125,3.789387,1.774613,4.508421,1.620302
2019-07-01,2019-07-01 16:00:00,0.000333,0.000856,0.001172,0.000117,0.003827,0.012481,0.005801,0.000300,0.000059,0.000049,...,2.914012,0.177305,2.836377,0.875991,0.258781,1.794313,3.149603,1.638359,5.006749,2.336896
2019-07-01,2019-07-01 17:00:00,0.000310,0.000831,0.001170,0.000104,0.003280,0.011103,0.004260,0.000251,0.000052,0.000048,...,3.269623,0.650627,2.766997,1.215768,0.259829,1.172534,2.919388,0.573992,5.132764,2.859793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-29,2019-12-31 08:00:00,0.000928,0.001594,0.000972,0.000892,0.004025,0.002628,0.003339,0.001228,0.001200,0.000636,...,7.173323,3.823772,7.020575,2.517564,4.712719,6.966690,8.462704,9.608441,2.554657,4.010539
2019-12-29,2019-12-31 09:00:00,0.000927,0.001580,0.001146,0.000626,0.004295,0.002734,0.003621,0.001161,0.001153,0.000494,...,6.544968,4.136714,7.224135,2.393618,4.622091,7.172390,8.152199,9.328160,2.649669,3.509196
2019-12-29,2019-12-31 10:00:00,0.000855,0.001636,0.001482,0.000557,0.004890,0.002885,0.004035,0.001097,0.001117,0.000482,...,6.727985,4.243345,6.996727,2.035222,4.620434,7.490269,7.793952,9.047058,2.853716,3.058257
2019-12-29,2019-12-31 11:00:00,0.000788,0.001683,0.001971,0.000531,0.006056,0.003290,0.004817,0.001048,0.001039,0.000527,...,6.985061,4.072632,7.398999,2.442325,4.408671,7.414190,7.533258,8.818333,2.767504,2.528249


In [19]:
# Pickle the AirNow and forecast dataframes together as a single (airnow_data, forecast_data) tuple so that
# we can load them more easily in the LSTM. The with-block guarantees the file is flushed and closed even if
# pickling fails part-way through.

print("Pickling data")
with open(DATA_DIR+"/pickle_files/data_"+MODE+".p", "wb") as pickle_file:
    pickle.dump((airnow_data, forecast_data), pickle_file)

Pickling data


In [20]:
# Display the AirNow table once more after pickling.
airnow_data

stationid,31101,40203,40302,41302,50204,50308,50311,50604,52301,53201,...,530090013,530330080,530570011,530730005,540390020,550030010,550270001,550790026,560030002,560450003
2019-07-01 13:00:00,24.0,24.0,19.0,19.0,15.0,12.0,16.0,14.0,10.0,17.0,...,22.0,22.0,4.0,2.0,15.0,23.0,20.0,19.0,26.0,19.0
2019-07-01 14:00:00,24.0,22.0,21.0,19.0,18.0,13.0,18.0,15.0,11.0,21.0,...,23.0,5.0,10.0,4.0,23.0,25.0,22.0,25.0,23.0,20.0
2019-07-01 15:00:00,23.0,19.0,21.0,20.0,18.0,15.0,19.0,18.0,14.0,22.0,...,23.0,3.0,19.0,12.0,29.0,-999000.0,30.0,29.0,36.0,20.0
2019-07-01 16:00:00,22.0,19.0,21.0,21.0,19.0,17.0,25.0,22.0,20.0,24.0,...,24.0,6.0,20.0,18.0,44.0,-999000.0,39.0,31.0,39.0,25.0
2019-07-01 17:00:00,23.0,20.0,20.0,24.0,20.0,22.0,31.0,22.0,23.0,26.0,...,23.0,7.0,19.0,23.0,56.0,28.0,39.0,42.0,41.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 19:00:00,41.0,38.0,30.0,43.0,25.0,30.0,34.0,24.0,32.0,36.0,...,40.0,27.0,24.0,27.0,34.0,38.0,36.0,30.0,34.0,44.0
2019-12-31 20:00:00,41.0,39.0,30.0,44.0,25.0,31.0,34.0,24.0,32.0,37.0,...,40.0,26.0,25.0,26.0,34.0,39.0,37.0,31.0,38.0,44.0
2019-12-31 21:00:00,41.0,39.0,30.0,45.0,19.0,30.0,35.0,24.0,31.0,37.0,...,40.0,25.0,25.0,24.0,34.0,39.0,37.0,31.0,39.0,44.0
2019-12-31 22:00:00,41.0,39.0,30.0,45.0,14.0,33.0,35.0,23.0,30.0,38.0,...,40.0,26.0,22.0,23.0,33.0,36.0,37.0,30.0,38.0,44.0


In [21]:
# Print both final tables as plain text, each preceded by its label.
for label, frame in (("AirNow Data:", airnow_data), ("Forecast Data:", forecast_data)):
    print(label)
    print(frame)

AirNow Data:
stationid            31101      40203      40302      41302      50204      \
2019-07-01 13:00:00       24.0       24.0       19.0       19.0       15.0   
2019-07-01 14:00:00       24.0       22.0       21.0       19.0       18.0   
2019-07-01 15:00:00       23.0       19.0       21.0       20.0       18.0   
2019-07-01 16:00:00       22.0       19.0       21.0       21.0       19.0   
2019-07-01 17:00:00       23.0       20.0       20.0       24.0       20.0   
...                        ...        ...        ...        ...        ...   
2019-12-31 19:00:00       41.0       38.0       30.0       43.0       25.0   
2019-12-31 20:00:00       41.0       39.0       30.0       44.0       25.0   
2019-12-31 21:00:00       41.0       39.0       30.0       45.0       19.0   
2019-12-31 22:00:00       41.0       39.0       30.0       45.0       14.0   
2019-12-31 23:00:00       41.0       39.0       30.0       44.0       10.0   

stationid            50308      50311      50604  

In [22]:
# Print the column index of the AirNow table, i.e. the station IDs that were kept.
print(airnow_data.columns)

Int64Index([    31101,     40203,     40302,     41302,     50204,     50308,
                50311,     50604,     52301,     53201,
            ...
            530090013, 530330080, 530570011, 530730005, 540390020, 550030010,
            550270001, 550790026, 560030002, 560450003],
           dtype='int64', name='stationid', length=385)
