In [7]:
from droughts_modelling.data import DataFunctions
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import models, layers
from sklearn.preprocessing import RobustScaler

## TensorFlow window class method

DataFunctions class

In [8]:
# Load the aggregated dataset through the project's DataFunctions helper (presumably one row per
# location and week, judging by the method name — TODO confirm); the cells below reuse `data`.
data = DataFunctions().light_weekly_aggregate()

To-dos:
1. Train a very small DL model locally on a very small part of the dataset that has been processed with the window generator.
2. Work out what Alex was predicting: was it the next week, or further into the future?
3. Code this up and implement it in the DL trainer file:
   - Hook up the DL trainer file to GCP.
   - Work in the geolocation data.
   - Think about using an autoregressive method to improve performance.
   - Get the Keras wrapper working so we can use pipelines.
4. When it comes to fine-tuning, think about using the autoregressive method from the TensorFlow tutorial.

In [25]:
class WindowGenerator():
    """Slice a row-ordered table into (inputs, labels) windows for Keras.

    Every window covers ``input_width + shift`` consecutive rows of ``data``:
    the first ``input_width`` rows become the model inputs and the last
    ``label_width`` rows become the labels.

    NOTE(review): the table is windowed as one continuous series, so if
    ``data`` stacks several locations a window can straddle two of them —
    confirm whether per-location windowing is required.
    """

    def __init__(self,data,input_width,label_width,shift,label_columns=None):
        """Store the raw data and pre-compute the window geometry.

        Args:
            data: table exposing ``.columns`` (e.g. a pandas DataFrame) that
                can be converted to a 2-D float array.
            input_width: number of consecutive rows fed to the model.
            label_width: number of trailing rows of the window used as labels.
            shift: offset between the end of the inputs and the end of the
                window.
            label_columns: optional list of column names to keep in the
                labels; ``None`` keeps every column.

        Raises:
            KeyError: if a name in ``label_columns`` is not a column of
                ``data``.
        """
        self.data = data

        # Built unconditionally so the attribute always exists, whether or
        # not label columns were requested.
        self.column_indices = {name: i for i, name in enumerate(self.data.columns)}

        self.label_columns = label_columns
        if label_columns is not None:
            # Fail here rather than later, inside the traced dataset map.
            missing = [name for name in label_columns if name not in self.column_indices]
            if missing:
                raise KeyError(f'label_columns not found in data: {missing}')
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}

        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # Labels are the last `label_width` positions of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        """Return a multi-line summary of the window layout."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

    def split_window(self, list_of_consecutive_inputs_w_labels):
        """Split a batch of full windows into an (inputs, labels) pair.

        Args:
            list_of_consecutive_inputs_w_labels: tensor indexed as
                (batch, position in window, column).

        Returns:
            Tuple ``(inputs, labels)``; ``labels`` only holds
            ``label_columns`` when those were given.
        """
        inputs = list_of_consecutive_inputs_w_labels[:, self.input_slice, :]
        labels = list_of_consecutive_inputs_w_labels[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack([labels[:, :, self.column_indices[name]] for name in self.label_columns],axis=-1)

        # Slicing loses the static time dimension; restore it so the
        # dataset's element_spec reports the window widths.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def make_dataset(self, batch_size=32, shuffle=True):
        """Build a batched dataset of (inputs, labels) windows.

        Args:
            batch_size: number of windows per batch (default 32).
            shuffle: whether the windows are shuffled (default True).

        Returns:
            The dataset produced by ``timeseries_dataset_from_array`` with
            ``split_window`` mapped over every batch.
        """
        data = np.array(self.data, dtype=np.float32)
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=shuffle,
            batch_size=batch_size)

        ds = ds.map(self.split_window)

        return ds

In [26]:
# Build a shuffled, batched dataset of 7-row windows: 6 input rows followed by 1 label row,
# with only the 'score_max' column kept in the labels.
preprocessed_data = WindowGenerator(data,input_width=6, label_width=1, shift=1,label_columns=['score_max']).make_dataset()

In [27]:
# Inspect the (inputs, labels) tensor specs of one dataset element.
preprocessed_data.element_spec

(TensorSpec(shape=(None, 6, 21), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 1, 1), dtype=tf.float32, name=None))

In [31]:
# len() of the batched dataset counts batches, so multiplying by the batch size (32, set in
# make_dataset) gives an upper bound on the number of windows — presumably the last batch is
# only partly filled, which is why this can exceed len(data).
len(preprocessed_data) * 32

164736

In [32]:
# Number of rows in the source table, for comparison with the window-count estimate.
len(data)

164724

In [30]:
# Pull a single batch to sanity-check the tensor shapes produced by the window generator.
for example_inputs, example_labels in preprocessed_data.take(1):
    print(f'Inputs shape (batch, time, features): {example_inputs.shape}')
    print(f'Labels shape (batch, time, features): {example_labels.shape}')

Inputs shape (batch, time, features): (32, 6, 21)
Labels shape (batch, time, features): (32, 1, 1)


In [None]:
class DeepLearning:
    """Scale the dataset, window it and train a small LSTM classifier.

    The model ends in a 6-unit softmax and is trained on the ``score_max``
    column of the row that follows each 6-row input window.
    """

    # Width of the softmax output layer and of the one-hot encoded labels.
    NUM_CLASSES = 6

    def __init__(self, data=None):
        """Load (or accept) the dataset and work out the feature columns.

        Args:
            data: optional table to train on. When omitted, the data is
                loaded with ``DataFunctions().light_weekly_aggregate()``.
                It must contain the columns ``fips_``, ``week_num_`` and
                ``score_max``.
        """
        self.data = DataFunctions().light_weekly_aggregate() if data is None else data
        # Everything except the identifiers and the target gets scaled.
        self.features = self.data.drop(columns=['fips_','week_num_','score_max']).columns

    def robust(self):
        """Robust-scale every feature column: ``(x - median) / IQR``.

        The result is stored in ``self.scaled_data``; ``self.data`` is left
        untouched. A column whose inter-quartile range is zero is only
        centred, so constant columns become 0 instead of NaN/inf.
        """
        df = self.data.copy()
        for f in self.features:
            median = np.median(df[f])
            q75, q25 = np.percentile(df[f], [75, 25])
            iqr = q75 - q25
            if iqr == 0:
                # Dividing by 1 keeps the centred values finite.
                iqr = 1.0
            # Vectorised arithmetic instead of a per-element Python lambda.
            df[f] = (df[f] - median) / iqr

        self.scaled_data = df

    def preprocess(self):
        """Scale the data and build the windowed dataset.

        Stores the result in ``self.preprocessed_data``: 6 input rows per
        window and the following row's ``score_max`` as the label.
        """
        self.robust()
        self.preprocessed_data = WindowGenerator(self.scaled_data,input_width=6, label_width=1, shift=1,label_columns=['score_max']).make_dataset()

    def initialize_model(self):
        """Create and compile the LSTM -> softmax classifier in ``self.model``."""
        self.model = models.Sequential()
        self.model.add(layers.LSTM(20))
        self.model.add(layers.Dense(self.NUM_CLASSES,activation='softmax'))
        # A metric object is used for recall because the bare string 'recall'
        # is not resolved by every Keras version.
        self.model.compile(loss='categorical_crossentropy',optimizer='rmsprop',
                           metrics=['accuracy', tf.keras.metrics.Recall(name='recall')])

    def _training_dataset(self):
        """Return the windowed dataset with labels one-hot encoded for the softmax."""
        num_classes = self.NUM_CLASSES

        def encode(inputs, labels):
            # The window generator emits labels shaped (batch, 1, 1), whereas
            # categorical cross-entropy needs (batch, num_classes).
            # Assumes score_max holds integer class ids in [0, num_classes)
            # — TODO confirm; the cast truncates fractional values.
            class_ids = tf.cast(tf.reshape(labels, [-1]), tf.int32)
            return inputs, tf.one_hot(class_ids, depth=num_classes)

        return self.preprocessed_data.map(encode)

    def train_model(self, epochs=1):
        """Build the model, preprocess the data and fit.

        Args:
            epochs: number of passes over the dataset (default 1).
        """
        self.initialize_model()
        self.preprocess()
        # No batch_size here: the dataset is already batched by make_dataset,
        # and Keras does not take batch_size for dataset inputs.
        self.model.fit(self._training_dataset(),epochs=epochs,verbose=1)

In [1]:
from droughts_modelling.DL_trainer import DeepLearning as dl

In [2]:
# Instantiate the packaged trainer and run its training entry point.
dl().train_model()

2021-08-31 22:40:37.195415: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-08-31 22:40:39.914185: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)




## Manual coding method

In [13]:
def split_subsample_sequence(df, length):
    """Cut one random window of ``length`` consecutive rows out of ``df``.

    The rows are taken as a contiguous slice so their original order is
    preserved; drawing them with ``df.sample`` would return shuffled,
    unrelated rows rather than a sequence.

    Args:
        df: DataFrame with a ``score_max`` column.
        length: total number of rows in the window, target row included.

    Returns:
        Tuple ``(X_sample, y_sample)``: a copy of the first ``length - 1``
        rows of the window, and the ``score_max`` of its last row as a float.

    Raises:
        ValueError: if ``length`` is smaller than 1 or larger than ``len(df)``.

    Note: this function is defined again in a later cell of the notebook.
    """
    n_rows = len(df)
    if length < 1 or length > n_rows:
        raise ValueError(f'length must be between 1 and {n_rows}, got {length}')

    # randint excludes the upper bound, so the last valid start is n_rows - length.
    start = np.random.randint(0, n_rows - length + 1)
    df_sample = df.iloc[start:start + length]

    X_sample = df_sample.iloc[:-1].copy()
    # Select the scalar directly: float() on a one-element Series is deprecated.
    y_sample = float(df_sample['score_max'].iloc[-1])

    return X_sample, y_sample

def get_X_y(df, n_sequences, length):
    """Draw ``n_sequences`` (X, y) samples from ``df``.

    Each sample comes from a single call to ``split_subsample_sequence``, so
    ``X[i]`` and ``y[i]`` always belong to the same drawn sequence.

    Args:
        df: source DataFrame handed to ``split_subsample_sequence``.
        n_sequences: number of samples to draw.
        length: number of rows per sample, target row included.

    Returns:
        Tuple ``(X, y)`` of numpy arrays built from the collected samples.
    """
    X, y = [], []
    for _ in range(n_sequences):
        # One call per sample: calling the sampler separately for X and y
        # would pair features with the target of a different draw.
        X_sample, y_sample = split_subsample_sequence(df, length)
        X.append(X_sample)
        y.append(y_sample)

    return np.array(X),np.array(y)

In [14]:
# Keep only the first 1000 rows as a small subset for trying out the manual sampler.
d = data[:1000]

In [None]:
# Draw 2000 samples of 21 rows each from the subset: 20 rows go into X and the
# score_max of the 21st row becomes y.
X,y = get_X_y(d, 2000, 21)

In [17]:
def split_subsample_sequence(df, length, random_state=None):
    """Cut one random window of ``length`` consecutive rows out of ``df``.

    The rows are taken as a contiguous slice so their original order is
    preserved. The draw is not pinned to a fixed seed by default: a fixed
    seed would make every call return the same rows, so repeated calls could
    never produce different sequences.

    Args:
        df: DataFrame with a ``score_max`` column.
        length: total number of rows in the window, target row included.
        random_state: optional seed or ``numpy.random.Generator``. Pass a
            shared Generator to get reproducible yet distinct windows across
            calls; ``None`` (default) draws a fresh random window.

    Returns:
        Tuple ``(X_sample, y_sample)``: a copy of the first ``length - 1``
        rows of the window, and the ``score_max`` of its last row as a float.

    Raises:
        ValueError: if ``length`` is smaller than 1 or larger than ``len(df)``.
    """
    n_rows = len(df)
    if length < 1 or length > n_rows:
        raise ValueError(f'length must be between 1 and {n_rows}, got {length}')

    rng = np.random.default_rng(random_state)
    # integers() excludes the upper bound, so the last valid start is n_rows - length.
    start = int(rng.integers(0, n_rows - length + 1))
    df_sample = df.iloc[start:start + length]

    X_sample = df_sample.iloc[:-1].copy()
    # Select the scalar directly: float() on a one-element Series is deprecated.
    y_sample = float(df_sample['score_max'].iloc[-1])

    return X_sample, y_sample

def get_X_y(df, n_sequences,length):
    """Collect ``n_sequences`` samples drawn by ``split_subsample_sequence``.

    Args:
        df: source DataFrame handed to ``split_subsample_sequence``.
        n_sequences: number of samples to draw.
        length: number of rows per sample, target row included.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: the stacked feature parts and the
        matching targets, in draw order.
    """
    # Each draw yields a (features, target) pair; one call per sample.
    draws = [split_subsample_sequence(df, length) for _ in range(n_sequences)]
    feature_parts = [draw[0] for draw in draws]
    target_parts = [draw[1] for draw in draws]

    return np.array(feature_parts),np.array(target_parts)

def get_X_y_by_geolocation(df, n_sequences, length):
    """Draw ``n_sequences`` samples separately for every ``fips_`` value.

    Args:
        df: DataFrame with a ``fips_`` column identifying the location.
        n_sequences: number of samples to draw per location.
        length: number of rows per sample, target row included.

    Returns:
        Tuple ``(X, y)`` of numpy arrays whose first axis runs over the
        ``fips_`` values in ascending order; each entry holds what
        ``get_X_y`` returned for that location.
    """
    X = []
    y = []
    # A single sorted groupby replaces one full-frame boolean scan per
    # location; groups come out in ascending fips_ order, rows in original order.
    for _, fip_df in df.groupby('fips_', sort=True, observed=True):
        fip_X, fip_y = get_X_y(fip_df,n_sequences,length)
        X.append(fip_X)
        y.append(fip_y)

    return np.array(X),np.array(y)