# Overview

This approach uses spectrograms to create an image for each sensor. Each file is imported and mapped to a 129x237 image per sensor (129 frequency bins by 237 time windows). These images are stacked along the third axis, so each `time_to_eruption` is represented by a 129x237x10 tensor. The full dataset won't fit into memory, so I created a custom generator that reads batches of files at a time. 

The objective of sharing this notebook is to highlight TensorFlow's ability to use a custom generator that reads data from the hard disk in batches instead of loading everything into memory.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import signal
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Input, Conv2D, AveragePooling2D, MaxPooling2D, Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import BatchNormalization

import glob
import os

In [None]:
import tensorflow as tf

# Report how many GPUs TensorFlow can see.  `tf.config.list_physical_devices`
# is the stable spelling of the deprecated `tf.config.experimental` alias.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


We'll use a generator (a Keras `Sequence`) to read the files from the hard disk as necessary. Previously I ran out of memory when simply looping through all of them. Once a file is imported, we can apply a mapping function that turns its sensor columns, each with 60001 readings, into manageable spectrograms.

In [None]:
# Load a single training segment so its layout can be inspected.
example_path = "../input/predict-volcanic-eruptions-ingv-oe/train/2068207140.csv"
df_example = pd.read_csv(example_path)
df_example.head()

# The column names are kept in a plain list for later use.
data_columns = df_example.columns.tolist()

print(f'Index Dataframe Shape: {df_example.shape}')
print('Column Headers:\n')
print(data_columns)
df_example

In [None]:
# Index of the training set: segment ids together with their labels.
train_csv_path = "../input/predict-volcanic-eruptions-ingv-oe/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

# Create list of training file paths

We'll deconstruct `train_df` to do this, so that the order of the file paths matches the order of the labels.

In [None]:
# Build parallel lists of segment file paths and labels straight from the
# columns of `train_df`, so entry i of both lists refers to the same segment.
# Reading the columns directly (instead of `iterrows`) keeps each column's own
# dtype: `iterrows` converts every row to a single dtype, which can turn an
# integer id into a float and produce file names such as "123.0.csv".
train_dir = '../input/predict-volcanic-eruptions-ingv-oe/train'

train_list = [
    os.path.join(train_dir, f"{segment_id}.csv")
    for segment_id in train_df['segment_id']
]
train_labels = train_df['time_to_eruption'].tolist()

print("Length of training list: {}".format(len(train_list)))
print("Length of training labels: {}".format(len(train_labels)))

# File Read to Spectrogram

These are the basic steps that were developed and then incorporated into the custom generator.

In [None]:
def process_path(segment_path):
    """Read one segment CSV and convert every column into a spectrogram.

    Missing readings are replaced by 0.0 before the transform.  Each column
    is passed through ``scipy.signal.spectrogram`` (sampling frequency 100,
    Tukey window, 256-sample segments overlapping by 3 samples, PSD mode) and
    the per-column results are stacked along a new last axis.

    Parameters
    ----------
    segment_path : str or path-like
        Location of the CSV file holding one segment.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frequencies, n_times, n_columns)``; the columns
        keep the order they have in the file.
    """
    input_df = pd.read_csv(segment_path).fillna(0.0)

    spectrograms = [
        # `spectrogram` returns (frequencies, times, power); only the power
        # matrix is kept.
        signal.spectrogram(
            input_df[col].to_numpy(),
            100,
            window=('tukey', .25),
            nperseg=256,
            nfft=256,
            mode='psd',
            noverlap=3,
        )[2]
        for col in input_df.columns
    ]

    return np.stack(spectrograms, axis=2)

In [None]:
# Smoke test: convert the first five training files and look at the batch shape.
sample_specs = list(map(process_path, train_list[:5]))
X = np.array(sample_specs)
X.shape

In [None]:
# Spectrogram of a single sensor column from the example segment.
y = df_example['sensor_2'].values
f, t, Sxx = signal.spectrogram(
    y,
    fs=100,
    window=('tukey', .25),
    nperseg=256,
    nfft=256,
    mode='psd',
    noverlap=3,
)

In [None]:
# Sizes of the frequency axis, the time axis and the power matrix.
print(f"Length of f: {len(f)}")
print(f"Length of t: {len(t)}")
print(f"Shape of Sxx: {Sxx.shape}")
Sxx.dtype

In [None]:
# Draw the spectrogram as a heat map with the colour scale capped at 10.
plt.pcolormesh(t, f, Sxx, vmax=10, shading='gouraud')
plt.xlabel('Time [sec]')
plt.ylabel('Frequency [Hz]')
plt.show()

In [None]:
# One spectrogram panel per column of the example segment, on a 5x2 grid.
fig, axs = plt.subplots(5, 2)
fig.set_size_inches(20, 10)
fig.subplots_adjust(hspace=0.5)

for ax, col in zip(axs.ravel(), data_columns):
    y = df_example[col]
    f, t, Sxx = signal.spectrogram(
        y,
        fs=100,
        window=('tukey', .25),
        nperseg=256,
        nfft=256,
        mode='psd',
        noverlap=3,
    )
    ax.pcolormesh(t, f, Sxx, vmax=100, shading='auto')
    ax.set_title(col)

# Creation of Custom Generator

The generator is defined as a class and incorporates the above methods. Files are read and processed in batches, which makes it memory friendly.

In [None]:
class SpectoGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that turns segment CSV files into spectrogram batches.

    Files are read from disk one batch at a time inside ``__getitem__``.

    Parameters
    ----------
    filenames : sequence of str
        Paths of the segment CSV files, in the order they should be served.
    labels : sequence
        Target values aligned with ``filenames``.  Not read when
        ``to_predict`` is true.
    batch_size : int
        Number of files per batch; the last batch may be smaller.
    to_predict : bool, default False
        When true, ``__getitem__`` returns only the inputs.
    """

    def __init__(self, filenames, labels, batch_size, to_predict=False):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.filenames = filenames
        self.labels = labels
        self.batch_size = batch_size
        self.to_predict = to_predict

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        # The builtin `int` replaces the `np.int` alias, which NumPy removed.
        return int(np.ceil(len(self.filenames) / float(self.batch_size)))

    def __getitem__(self, index):
        """Return batch ``index``: ``X`` when predicting, else ``(X, y)``."""
        batch = slice(index * self.batch_size, (index + 1) * self.batch_size)

        X = np.array([self._process_path(path) for path in self.filenames[batch]])

        if self.to_predict:
            return X

        y = np.array(self.labels[batch])
        return X, y

    def _process_path(self, segment_path):
        """Read one segment CSV and stack a spectrogram per column.

        Missing readings are replaced by 0.0.  The result has shape
        ``(n_frequencies, n_times, n_columns)``.
        """
        input_df = pd.read_csv(segment_path).fillna(0.0)

        spectrograms = [
            # `spectrogram` returns (frequencies, times, power); only the
            # power matrix is kept.
            signal.spectrogram(
                input_df[col].to_numpy(),
                100,
                window=('tukey', .25),
                nperseg=256,
                nfft=256,
                mode='psd',
                noverlap=3,
            )[2]
            for col in input_df.columns
        ]

        return np.stack(spectrograms, axis=2)

In [None]:
# Training batches of 32 files each.
training_generator = SpectoGenerator(
    filenames=train_list,
    labels=train_labels,
    batch_size=32,
)

# Create Model

In [None]:
# CNN over the stacked spectrograms: four Conv2D -> BatchNorm -> MaxPool
# stages whose kernels and pools alternate between the two image axes,
# followed by a single-unit dense output.
model = Sequential([
    Conv2D(filters=16, kernel_size=(2, 8), activation='relu', input_shape=(129, 237, 10)),
    BatchNormalization(axis=3),
    MaxPooling2D(pool_size=(1, 4)),

    Conv2D(filters=32, kernel_size=(8, 2), dilation_rate=2, activation='relu'),
    BatchNormalization(axis=3),
    MaxPooling2D(pool_size=(2, 1)),

    Conv2D(filters=32, kernel_size=(2, 8), dilation_rate=3, activation='relu'),
    BatchNormalization(axis=3),
    MaxPooling2D(pool_size=(1, 2)),

    Conv2D(filters=32, kernel_size=(8, 2), dilation_rate=4, activation='relu'),
    BatchNormalization(axis=3),
    MaxPooling2D(pool_size=(2, 1)),

    Flatten(),
    # NOTE(review): a ReLU on the single output clamps predictions at 0 and
    # passes no gradient while its input is negative - confirm this is intended.
    Dense(1, activation="relu"),
])

model.summary()

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule function for ``LearningRateScheduler``.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.
    lr : float
        Learning rate handed in by the caller.

    Returns
    -------
    float
        ``lr`` unchanged while ``epoch < 15``; afterwards
        ``lr * exp(-0.1 * epoch)``.  A plain Python float is returned (not a
        tensor) so the value is accepted by Keras versions that require a
        float from the schedule function.
    """
    if epoch < 15:
        return lr
    # NOTE(review): the factor depends on `epoch` and multiplies the rate that
    # is passed in; if the caller feeds back the previously returned rate the
    # decay compounds very quickly - confirm this is the intended schedule.
    return float(lr * np.exp(-0.1 * epoch))

# Wrap the schedule function in a callback.  This rebinds the name
# `scheduler` from the function to the callback object.
scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

# Early stopping watches the 'mae' metric with a tolerance of 50000 and a
# patience of 3 epochs.
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='mae',
    min_delta=50000,
    patience=3,
)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mae'],
)

In [None]:
# Train from the disk-backed generator.  The number of steps per epoch is
# taken from the generator itself, so the final partial batch is included and
# the value stays correct if the number of training files changes.
history = model.fit(
    training_generator,
    epochs=40,
    steps_per_epoch=len(training_generator),
    verbose=1,
    callbacks=[scheduler, earlystop],
)

In [None]:
# Persist the trained network to disk.
model.save(filepath='cnn_model_40epochs_XXX.h5')

In [None]:
# Plot the training MAE recorded for each completed epoch.  Only training
# metrics are available because `model.fit` was called without validation data.
mae = history.history['mae']
loss = history.history['loss']

epochs = range(len(mae))

plt.plot(epochs, mae, 'b', label='MAE')
plt.title('Training MAE')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.legend(loc=0)

plt.show()

# Create List of Test Path Files

In [None]:
# The sample submission supplies the segment ids of the test set.
sample_submission_path = '../sample_submission.csv'
test_df = pd.read_csv(sample_submission_path)
test_df

In [None]:
# Build the list of test segment file paths in the row order of `test_df`.
# The ids are read from the column directly (not via `iterrows`) so they keep
# their own dtype and cannot be turned into floats like "123.0".
test_dir = '../test/'

test_list = [
    os.path.join(test_dir, f"{segment_id}.csv")
    for segment_id in test_df['segment_id']
]

# No labels exist for the test set; the generator does not read them in
# prediction mode, so an empty list is passed.
test_labels = []

print("Length of test list: {}".format(len(test_list)))

## Create Test Data Generator

The same generator class can be used to create a test generator. I added a boolean `to_predict` flag which makes it return only the inputs (no labels) for the test set. The generator can then be passed to the model for prediction.

In [None]:
# Prediction-mode generator: batches of 64 files, inputs only.
test_generator = SpectoGenerator(
    filenames=test_list,
    labels=test_labels,
    batch_size=64,
    to_predict=True,
)
pred = model.predict(test_generator)
pred.shape

# Predict Test Values

In [None]:
# Fill the submission frame with the predictions.  The model has a single
# output unit, so `pred` is 2-D with one column; it is flattened explicitly
# to a 1-D array before being assigned as a DataFrame column.
df_submit = test_df.copy()
df_submit['time_to_eruption'] = np.abs(pred).ravel()
df_submit.head(10)

In [None]:
# Sanity check on the range of predicted values.
predicted_times = df_submit['time_to_eruption']
print(f"Minimum event time is: {predicted_times.min()}")
print(f"Maximum event time is: {predicted_times.max()}")

In [None]:
# Write the submission file without the DataFrame index column.
df_submit.to_csv(path_or_buf='./submission.csv', index=False)