In [29]:
import pandas as pd
import geopandas as gpd
import networkx as nx
import numpy as np
from datetime import datetime
from dateutil import parser
from sklearn import preprocessing
# pip install azureml-opendatasets
from azureml.opendatasets import NycTlcYellow
import calendar
import numpy.linalg as linalg
import matplotlib.pyplot as plt
import pickle
import momepy
import itertools
# torch stuff
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from tqdm import tqdm
import copy
import os.path
from sklearn.metrics import classification_report
from sklearn.utils.extmath import softmax
from pprint import pprint
from itertools import islice

In [30]:
# NOTE: This is what the NOAA columns mean
# AWND - Average daily wind speed (tenths of meters per second),awnd
# DAPR - Number of days included in the multiday precipitation total (MDPR),dapr
# FRGT - Top of frozen ground layer (cm),frgt
# FRTH - Thickness of frozen ground layer (cm),frth
# GAHT - Difference between river and gauge height (cm),gaht
# MDPR - Multiday precipitation total (tenths of mm. use with DAPR and DWPR. if available),mdpr
# PGTM - Peak gust time (hours and minutes i.e. HHMM),pgtm
# PRCP - Precipitation (tenths of mm),prcp
# SNOW - Snowfall (mm),snow
# SNWD - Snow depth (mm),snwd
# THIC - Thickness of ice on water (tenths of mm),thic
# TMAX - Maximum temperature (tenths of degrees C),tmax
# TMIN - Minimum temperature (tenths of degrees C),tmin
# TOBS - Temperature at the time of observation (tenths of degrees C),tobs
# WDFG - Direction of peak wind gust (degrees),wdfg
# WESD - Water equivalent of snow on the ground (tenths of mm),wesd
# WESF - Water equivalent of snowfall (tenths of mm),wesf
# WSFG - Peak gust wind speed (tenths of meters per second),wsfg
# WT01 - Fog ice fog or freezing fog (may include heavy fog),fog
# WT03 - Thunder,thunder
# WT04 - Ice pellets sleet snow pellets or small hail,ice
# WT05 - Hail (may include small hail),hail
# WT06 - Glaze or rime,glaze
# WT07 - Dust volcanic ash blowing dust blowing sand or blowing obstruction,dust
# WT08 - Smoke or haze,smoke
# WT09 - Blowing or drifting snow,snowdrift
# WT11 - High or damaging winds,highwinds
# WT14 - Drizzle,drizzle
# WT16 - Rain (may include freezing rain drizzle and freezing drizzle),prcp2
# WT18 - Snow snow pellets snow grains or ice crystals,snow2

# NOTE: Before filling NaNs, these are the per-column NaN counts over the 3 years of data
# STATION       0
# NAME          0
# DATE          0
# AWND          3
# PGTM       1095
# PRCP          0
# SNOW          0
# SNWD          0
# TAVG       1095
# TMAX          0
# TMIN          0
# TSUN       1095
# WDF2          2
# WDF5          9
# WSF2          2
# WSF5          9
# WT01        851
# WT02       1081
# WT03       1095
# WT04       1092
# WT05       1095
# WT06       1091
# WT08        904
# WT13       1068
# WT16       1072
# WT18       1074
# WT19       1086
# WT22       1093
# date          0
# year          0

# NOTE: It turns out the WT-- columns flag whether a given type of severe
# weather occurred on that day; when it did not, the value is NaN. Those NaNs
# should therefore be replaced with 0. The other NaNs are sparse, so we forward-fill them.
def updated_preprocess_weather(years=(2013,), csv_path='data/weather.csv'):
    """Load the weather CSV, keep the requested years, fill gaps and min-max scale.

    Processing steps, in order:
      1. Read ``csv_path`` and derive ``date`` (parsed from ``DATE``) and ``year``.
      2. Keep only the rows whose ``year`` is in ``years``.
      3. Replace NaN with 0.0 in every column whose name starts with ``WT``.
      4. Forward-fill AWND, WDF2, WDF5, WSF2 and WSF5.
      5. Drop every column that still contains at least one NaN.
      6. Rescale all numeric columns with ``MinMaxScaler`` (default settings),
         fitted on the selected rows only. The derived ``year`` column is
         numeric, so it is rescaled as well.

    Parameters
    ----------
    years : iterable of int or str, default (2013,)
        Years to keep; every entry is passed through ``int()``.
    csv_path : str or path-like, default 'data/weather.csv'
        Location of the weather CSV. It must contain a ``DATE`` column and the
        five forward-filled columns listed above.

    Returns
    -------
    pandas.DataFrame
        The selected rows (original row labels preserved) with NaN-free
        columns only and numeric columns rescaled.
    """
    # Convert to int because that's how the year is stored in the dataframe
    years = [int(year) for year in years]
    df = pd.read_csv(csv_path)
    df['date'] = pd.to_datetime(df.DATE)
    df['year'] = df.date.dt.year
    # Restrict to the years we want. The explicit copy makes the column
    # assignments below write to an independent frame instead of a slice.
    df = df[df.year.isin(years)].copy()
    # If we want more, we can one hot encode the NaN values
    severe_weather_columns = [col for col in df if col.startswith('WT')]
    df[severe_weather_columns] = df[severe_weather_columns].fillna(0.0)
    # For columns missing only a few values, fill forward seems reasonable.
    # ``.ffill()`` replaces the deprecated ``fillna(method='ffill')``.
    fill_forward_columns = ['AWND', 'WDF2', 'WDF5', 'WSF2', 'WSF5']
    df[fill_forward_columns] = df[fill_forward_columns].ffill()
    # Keep only the columns that have no NaN left
    df = df[df.columns[df.isna().sum() == 0]].copy()

    # Normalize weather data
    df_num = df.select_dtypes(include='number')
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(df_num)
    # Reuse df_num's row labels: assignment aligns on the index, and after the
    # year filter the labels are generally not 0..n-1, so a default RangeIndex
    # would write NaN / misaligned values back into df.
    df_normalized = pd.DataFrame(
        np_scaled, columns=df_num.columns, index=df_num.index
    )
    df[df_normalized.columns] = df_normalized

    return df

In [31]:
# Preprocess the weather table for 2013 through 2015, then display which
# columns are left in the resulting frame.
weather_df = updated_preprocess_weather(years=[2013, 2014, 2015])
weather_df.columns

Index(['STATION', 'NAME', 'DATE', 'AWND', 'PRCP', 'SNOW', 'SNWD', 'TMAX',
       'TMIN', 'WDF2', 'WDF5', 'WSF2', 'WSF5', 'WT01', 'WT02', 'WT03', 'WT04',
       'WT05', 'WT06', 'WT08', 'WT13', 'WT16', 'WT18', 'WT19', 'WT22', 'date',
       'year'],
      dtype='object')