# Pre-Trained LSTM Autoencoder Models

This notebook is used to train LSTM Autoencoder Models and save to S3.    
This notebook will train one model per California Climate Zone, total of 16 models.  

Steps:  
1) Pull compiled dataset from S3  
2) Break dataset to train, validation, and test  
3) Standard Scaler and drop NA's. Store Standard Scaler model locally.  
4) Train models on train data and save locally.  
5) Push Standard Scaler model and each LSTM Autoencoder model to S3.  

In [2]:
# # Run following pip installs and restart notebook
# !pip install xarray
# !pip install geopandas
# !pip install shapely
# !pip install netCDF4
# !conda install --y aiobotocore
# !conda install botocore
# # REMEMBER TO RESTART NOTEBOOK

In [3]:
import matplotlib.patches as mpatches
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
import altair as alt
import datetime

import os
import glob
import time
import sklearn
import numpy as np
import pandas as pd
import geojson
import json
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime
# import plotly.express as px
import geopandas as gpd
from shapely.geometry import Point
from descartes import PolygonPatch


from pylab import rcParams
from matplotlib import rc
from pandas.plotting import register_matplotlib_converters
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format='retina'

register_matplotlib_converters()
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 10, 8

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import TimeseriesGenerator

#Altair puts a limit on plotting only 5000 rows from a pd.DataFrame. This line gets rid of that limit
alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

In [4]:
#Read in Data
s3_file_path = 's3://methane-capstone/data/dt=latest/data-zone-combined.parquet.gzip'
df = pd.read_parquet(s3_file_path)
df['time_utc'] = pd.to_datetime(df['time_utc'])
df

Unnamed: 0,time_utc,BZone,reading_count,methane_mixing_ratio_mean,lat_mean,methane_mixing_ratio_precision_mean,methane_mixing_ratio_bias_corrected_mean,air_pressure_at_mean_sea_level_mean,air_temperature_at_2_metres_mean,air_temperature_at_2_metres_1hour_Maximum_mean,...,eastward_wind_at_10_metres_mean,integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation_mean,lwe_thickness_of_surface_snow_amount_mean,northward_wind_at_100_metres_mean,northward_wind_at_10_metres_mean,precipitation_amount_1hour_Accumulation_mean,snow_density_mean,surface_air_pressure_mean,qa_val_mode,qa_val_mean
0,2018-11-30,1,6,1821.922485,38.500210,6.093073,1841.293945,101782.562500,284.875000,285.187500,...,4.656250,1683648.000,0.0,-5.281250,-4.000000,0.0,100.0,98380.687500,0.4,0.400000
1,2018-12-01,1,5,1752.204590,39.343140,8.593414,1776.230835,,,,...,,,,,,,,,0.4,0.400000
2,2018-12-05,1,12,1831.783081,40.898571,6.090038,1852.775146,101205.625000,285.312500,285.937500,...,0.812500,1365568.000,0.0,-1.437500,-1.250000,0.0,100.0,97178.812500,0.4,0.450000
3,2018-12-06,1,33,1786.043457,39.603600,9.277852,1810.526978,101423.125000,286.428558,285.946442,...,1.241071,1634258.250,0.0,-1.437500,-1.080357,0.0,100.0,97893.234375,0.4,0.454545
4,2018-12-07,1,10,1821.168579,39.555389,5.236246,1843.013550,102004.750000,284.958344,285.562500,...,0.541667,1580195.500,0.0,2.708333,1.888889,0.0,100.0,97986.421875,0.4,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2021-10-31,7,9,1872.528687,33.110569,1.792665,1877.078125,101459.312500,293.312500,293.687500,...,2.312500,1986816.000,0.0,0.375000,0.437500,0.0,100.0,97904.187500,0.4,0.400000
0,2021-10-31,10,23,1881.194580,33.337830,1.630748,1882.654541,101459.312500,293.312500,293.687500,...,2.312500,1986816.000,0.0,0.375000,0.437500,0.0,100.0,97904.187500,0.4,0.452174
0,2021-10-31,12,62,1920.426758,37.471706,1.899056,1924.061890,101465.562500,292.687500,292.312500,...,0.625000,1882560.000,0.0,-0.125000,-0.250000,0.0,100.0,98491.187500,0.4,0.467742
0,2021-10-31,14,107,1903.151978,34.467815,1.392077,1890.494995,101058.203125,299.394531,299.190613,...,0.364844,2009565.625,0.0,3.482812,2.931250,0.0,100.0,93787.328125,1.0,0.708411


In [3]:
#####################################
# Function to load data, given region
#####################################
def load_all_zone_data(describe=True):

    #Read in Data
    s3_file_path = 's3://methane-capstone/data/dt=latest/data-zone-combined.parquet.gzip'
    df = pd.read_parquet(s3_file_path)
    df['time_utc'] = pd.to_datetime(df['time_utc'])
    
#     train_date_threshold = '2021-01-01'
#     validation_date_threshold = '2021-06-01'
    train_days_ago_threshold = 270            #270 days ago to 90 days ago = validation set, before 270 days ago is train data
    validation_days_ago_threshold = 90        #90 days ago to present would be test data

    #look at the date on last row of combined dataframe and set train/valid/test thresholds based on that
    from datetime import timedelta, date                                                                     
    train_date_threshold = str(df.time_utc.iloc[-1] - timedelta(train_days_ago_threshold))[:10]         
    validation_date_threshold  = str(df.time_utc.iloc[-1] - timedelta(validation_days_ago_threshold))[:10]   
    
    df = df.set_index('time_utc')
    train = df.loc[df.index < train_date_threshold]
    validation = df.loc[(df.index >= train_date_threshold) & (df.index < validation_date_threshold)]
    test = df.loc[df.index >= validation_date_threshold]

    #Print time range
    print("start_dt:", df.index.min(), "\nend_dt:", df.index.max(), "\nnumber_days:", df.index.max() - df.index.min(), "\n")
    print(df.shape, "\n")
    print(df.dtypes, "\n")
    print(train.shape, validation.shape, test.shape)
    
    return df, train, validation, test


####################################################################
# Function to generate trainx, trainy and return number of features 
# Window Function
####################################################################
def generate_datasets(data, window_size, describe=False):
    _l = len(data) 
    Xs = []
    Ys = []
    for i in range(0, (_l - window_size)):
        # because this is an autoencoder - our Ys are the same as our Xs. No need to pull the next sequence of values
        Xs.append(data[i:i+window_size])
        Ys.append(data[i:i+window_size])
        
    Xs=np.array(Xs)
    Ys=np.array(Ys)    
    
    if describe:
        print(Xs.shape, Ys.shape)
    
    return (Xs.shape[2], Xs, Ys)

### Data 
    * Standardize
    * Training
    * Loss Calculation

In [7]:
####################################################################
# Function to standard scaler the data
####################################################################
def standardize_data(train, validation, test, feature_cols, zone, describe=False, save=False):

    train_input = train[feature_cols]
    val_input = validation[feature_cols]
    test_input = test[feature_cols]

    scaler = StandardScaler()
    scaler = scaler.fit(train_input)

    train_scaled = scaler.transform(train_input)
    val_scaled = scaler.transform(val_input)
    test_scaled = scaler.transform(test_input)

    train_features = train_scaled
    val_features = val_scaled
    test_features = test_scaled

    if describe:
        print("train:", train_features.shape)
        print("val:", val_features.shape)
        print("test:", test_features.shape)
    
    if save:
        #save standardscaler model locally first (will push to s3 after)
        import datetime
        import pickle
        
        try:
            os.makedirs('/root/methane/pipelines/resources/models/autoencoder/models/pretrained/')
        except:
            print("pipeline logs folder already exist, we're good!")
        
        standard_scaler_name = f'ScalerModel_Zone{zone}'                              
        pickle.dump(scaler, open(f'/root/methane/pipelines/resources/models/autoencoder/models/pretrained/{standard_scaler_name}.pkl','wb'))
   
        
    return train_scaled, val_scaled, test_scaled, scaler


####################################################################
# Function to run multivariate neural network
####################################################################
def lstm_multi(trainX, trainY, valX, valY, window_length, num_features, batch_size, epochs, plot=False):
    
    #build model
    model = keras.Sequential()
    model.add(keras.layers.LSTM(units=128, input_shape = (window_length, num_features)))              ############# UPDATE
    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.RepeatVector(n=window_length))
    model.add(keras.layers.LSTM(units=128, return_sequences=True))                                    ############# UPDATE
    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.TimeDistributed(keras.layers.Dense(units=num_features)))

    #compile model
    model.compile(loss=tf.losses.MeanSquaredError(),
                  optimizer=tf.optimizers.Adam(),
                  metrics=[tf.metrics.MeanSquaredError(),
                           tf.losses.MeanAbsoluteError(),
                           tf.metrics.RootMeanSquaredError()
                          ]
                 )

    #defined early stopping when training
    early_stopping = tf.keras.callbacks.EarlyStopping(
                            monitor='val_loss',
                            min_delta=1e-2,
                            patience=5,
                            verbose=0,
                            mode='auto',
                            baseline=None, 
                            restore_best_weights=True
                        )

    #show model summary
    #model.summary()

    #train and fit model
    history = model.fit(x=trainX,
                        y=trainY,
                        validation_data=(valX, valY),
                        epochs=epochs,
                        batch_size=batch_size, 
                        shuffle=False, 
                        callbacks=[early_stopping])

    if plot:
        plt.title('MAE Loss')
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='validation')
        plt.legend()

    return model, history


def calculate_loss(feature_num, model, dataX):

    #Predict model and calculate MSE of the feature (0th feature = methane)    
    pred = model.predict(dataX)[:, :, feature_num]
    truth = dataX[:, :, feature_num]

    mse_loss = np.mean(np.square(pred -  truth), axis=1)     
    return mse_loss, pred 




### Load Data

In [8]:
df, train, val, test = load_all_zone_data()

start_dt: 2018-11-28 00:00:00 
end_dt: 2021-10-23 00:00:00 
number_days: 1060 days 00:00:00 

(13357, 22) 

BZone                                                                                              int64
reading_count                                                                                      int64
methane_mixing_ratio_mean                                                                        float32
lat_mean                                                                                         float32
methane_mixing_ratio_precision_mean                                                              float32
methane_mixing_ratio_bias_corrected_mean                                                         float32
air_pressure_at_mean_sea_level_mean                                                              float32
air_temperature_at_2_metres_mean                                                                 float32
air_temperature_at_2_metres_1hour_Maximum_mean      

### Track Everything

In [None]:
#select region and features
zones = [x for x in range(1,17)]

# Track predictions and losses for analysis across different features
feature_loss_tracker = {key: {'train':{}, 'val':{}, 'test':{}} for key in zones}

#Track all the data frames, raw and scaled
df_tracker = {}

#Track all the metrics from each model training cycle
model_metrics_tracker = {}

drop = False

#parameters:
feature_cols = ['methane_mixing_ratio_bias_corrected_mean',  'reading_count',
                 'air_pressure_at_mean_sea_level_mean',
                 'eastward_wind_at_100_metres_mean',
                 'northward_wind_at_100_metres_mean',
                 'air_temperature_at_2_metres_mean',
                 'surface_air_pressure_mean',
                 'integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation_mean',
                 'precipitation_amount_1hour_Accumulation_mean' ,
                 'dew_point_temperature_at_2_metres_mean']

feature_number_map = {}
for ind, feature in enumerate(feature_cols, 0):
    feature_number_map[feature] = ind

start=time.time()
    
for zone in zones:
    
    print("Zone #", zone)
    train_zone = train[train['BZone'] == zone]
    val_zone = val[val['BZone'] == zone]
    test_zone = test[test['BZone'] == zone] 
    
    if drop:
        #NEED TO DROP ROWS WITH NA VALUES :(
        train_zone=train_zone.dropna()
        val_zone=val_zone.dropna()
        test_zone=test_zone.dropna()
    
    else:
        train_zone=train_zone.interpolate(method='time')
        val_zone=val_zone.interpolate(method='time')
        test_zone=test_zone.interpolate(method='time')
        train_zone=train_zone.dropna()
        val_zone=val_zone.dropna()
        test_zone=test_zone.dropna()
        
    window_length = 7
    batch_size = 32
    num_features = len(feature_cols)
    epochs = 50

    print("Standard scaler'ing data")
    #standardize data
    train_scaled, val_scaled, test_scaled, scaler = standardize_data(train_zone, val_zone, test_zone, feature_cols, zone, save=True)
    
    
    #Track all data for use later on
    df_tracker[zone] = {'train_zone': train_zone,
                        'val_zone': val_zone,
                        'test_zone': test_zone,
                        'train_scaled': train_scaled,
                        'val_scaled': val_scaled,
                        'test_scaled': test_scaled,
                        'scaler': scaler
                       }

    print("Generating Datasets")
    #generate trainX and trainY
    num_feats_train, trainX, trainY = generate_datasets(train_scaled, window_length)
    num_feats_val, valX, valY = generate_datasets(val_scaled, window_length)
    num_feats_test, testX, testY = generate_datasets(test_scaled, window_length)
    
    assert num_feats_train == num_feats_test == num_feats_val
    
    print("training model")
    #Run LSTM Multivariate model and plot
    model, history =  lstm_multi(trainX, trainY, valX, valY, window_length, num_features, batch_size, epochs, plot=False)

    #Save model locally first, then push to S3 later
    import datetime
    cur_date = datetime.datetime.now().strftime("%Y%m%d")
    model_name = f'LSTMAE_Zone{zone}'                              
    print("saving model named:", model_name)
    model.save(f'/root/methane/pipelines/resources/models/autoencoder/models/pretrained/{model_name}.h5')   
    
    model_metrics_tracker[zone] = history.history

    for feature in feature_cols:

        #Predict MSE's:
        feature_num = feature_number_map[feature]
        print("Loss: ", feature, feature_num)
        
        train_mse_loss, X_train_pred = calculate_loss(feature_num, model, trainX)
        val_mse_loss, X_val_pred = calculate_loss(feature_num, model, valX)
        test_mse_loss, X_test_pred = calculate_loss(feature_num, model, testX)
        
        feature_loss_tracker[zone]['train'].update({feature: {'train_mse_loss': train_mse_loss, 'X_train_pred':X_train_pred }})
        feature_loss_tracker[zone]['val'].update({feature: {'val_mse_loss': val_mse_loss, 'X_val_pred': X_val_pred }})
        feature_loss_tracker[zone]['test'].update({feature: {'test_mse_loss': test_mse_loss, 'X_test_pred':X_test_pred }})

    print()
    print()
    print()

end=time.time()
print("TIME: {time:.2f} secs".format(time=(end-start)))

Zone # 1
Standard scaler'ing data
Generating Datasets
training model
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
saving model named: LSTMAE_Zone1
Loss:  methane_mixing_ratio_bias_corrected_mean 0
Loss:  reading_count 1
Loss:  air_pressure_at_mean_sea_level_mean 2
Loss:  eastward_wind_at_100_metres_mean 3
Loss:  northward_wind_at_100_metres_mean 4
Loss:  air_temperature_at_2_metres_mean 5
Loss:  surface_air_pressure_mean 6
Loss:  integral_wrt_time_of_surface_direct_downwelling_shortwave_flux_in_air_1hour_Accumulation_mean 7
Loss:  precipitation_amount_1hour_Accumulation_mean 8
Loss:  dew_point_temperature_at_2_metres_mean 9



Zone # 2
Standard scaler'ing data
pipeline logs folder already exist, we're good!
Generating Datasets
training model
Epoch 1/50
Epoch 2/50
Epoc

In [11]:
# Build Final Dataframes For Visuals

def get_anomaly_threshold(mse_loss):
    upper,lower = np.percentile(mse_loss,[75,25])
    ANOMALY_THRESHOLD = 5*(upper-lower)
    return ANOMALY_THRESHOLD

# plot MSE for Train and Validation
final_dataframes = {key: {'train':None, 'val': None, 'test': None} for key in zones}
anomaly_thresholds = {key: {feature:None} for key in zones}

for zone in zones:
    for split in ['train', 'val', 'test']:
            
        cur_zone_df = df_tracker[zone][f'{split}_zone']
        scored_df = pd.DataFrame(index=cur_zone_df[window_length:].index)
    
        for feature in feature_cols:
            
            ### WE MIGHT HAVE TO FIGURE OUT THE COLOR BUCKETS HERE ###
            
            train_mse_loss = feature_loss_tracker[zone]['train'][feature][f'train_mse_loss']
            mse_loss = feature_loss_tracker[zone][split][feature][f'{split}_mse_loss']
            
            anom_thresh = get_anomaly_threshold(train_mse_loss)
            anomaly_thresholds[zone][feature] = anom_thresh

            
            scored_df[feature] = cur_zone_df[window_length:][feature]
            scored_df[f'{feature}_loss'] = mse_loss
            scored_df[f'{feature}_threshold'] = anom_thresh
            scored_df[f'{feature}_anomaly'] = scored_df[f'{feature}_loss'] > scored_df[f'{feature}_threshold']

        final_dataframes[zone][split] = scored_df

In [13]:
# Save Dictionaries to Local

#### Dictionaries ####
# feature_loss_tracker
# df_tracker
# model_metrics_tracker

import boto3
import pickle


try:
    os.makedirs('/root/methane/pipelines/resources/models/autoencoder/models/zone_artifacts/')
except:
    print("directories already exist, no need to create new ones")


#save to local
with open(f'/root/methane/pipelines/resources/models/autoencoder/models/zone_artifacts/feature_loss_tracker.pickle', 'wb') as handle:
    pickle.dump(feature_loss_tracker, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(f'/root/methane/pipelines/resources/models/autoencoder/models/zone_artifacts/df_tracker.pickle', 'wb') as handle:
    pickle.dump(df_tracker, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(f'/root/methane/pipelines/resources/models/autoencoder/models/zone_artifacts/model_metrics_tracker.pickle', 'wb') as handle:
    pickle.dump(model_metrics_tracker, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'/root/methane/pipelines/resources/models/autoencoder/models/zone_artifacts/final_dataframes.pickle', 'wb') as handle:
    pickle.dump(final_dataframes, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(f'/root/methane/pipelines/resources/models/autoencoder/models/pretrained/pretrained_anomaly_thresholds.pickle', 'wb') as handle:
    pickle.dump(anomaly_thresholds, handle, protocol=pickle.HIGHEST_PROTOCOL)    
        

# Push Zone Artifacts and Pretrained Models to S3

In [21]:
import subprocess

#save pretrained models to latest folder
subprocess.check_output(['aws','s3','cp', '--recursive', '/root/methane/pipelines/resources/models/autoencoder/models' , 's3://methane-capstone/models/autoencoder/dt=latest/'])


#save pretrained models to archive folder
from datetime import timedelta, date
localtime = time.localtime(time.time())
date=str(localtime.tm_year)+str(localtime.tm_mon)+str(localtime.tm_mday) #define date (for naming backup)
s3_location = f's3://methane-capstone/models/autoencoder/dt=archive/dt={date}/'
subprocess.check_output(['aws','s3','cp', '--recursive', '/root/methane/models/autoencoder/models' , s3_location])

b'Completed 256.0 KiB/~38.1 MiB (2.0 MiB/s) with ~33 file(s) remaining (calculating...)\rCompleted 512.0 KiB/~38.1 MiB (3.6 MiB/s) with ~33 file(s) remaining (calculating...)\rCompleted 768.0 KiB/~38.1 MiB (5.3 MiB/s) with ~33 file(s) remaining (calculating...)\rCompleted 1.0 MiB/48.1 MiB (6.6 MiB/s) with 37 file(s) remaining                     \rCompleted 1.2 MiB/48.1 MiB (8.0 MiB/s) with 37 file(s) remaining                     \rCompleted 1.5 MiB/48.1 MiB (9.6 MiB/s) with 37 file(s) remaining                     \rCompleted 1.8 MiB/48.1 MiB (11.0 MiB/s) with 37 file(s) remaining                    \rCompleted 2.0 MiB/48.1 MiB (12.3 MiB/s) with 37 file(s) remaining                    \rCompleted 2.2 MiB/48.1 MiB (13.8 MiB/s) with 37 file(s) remaining                    \rCompleted 2.5 MiB/48.1 MiB (15.2 MiB/s) with 37 file(s) remaining                    \rCompleted 2.8 MiB/48.1 MiB (16.6 MiB/s) with 37 file(s) remaining                    \rCompleted 3.0 MiB/48.1 MiB (17.9 MiB/s) w