In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import time
import os
from tqdm import tqdm

import keras
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, Adam, adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from DataPreparation.dataset_preparation import get_LANL_data
from Utilities.evaluation_utils import save_submission, LossPlot

import tensorflow as tf
import warnings
# Only let TensorFlow log messages at ERROR level through
# (tf.logging is the TensorFlow 1.x logging API).
tf.logging.set_verbosity(tf.logging.ERROR)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Auto-reload imported modules before each cell runs, so edits to the project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# =============================================================
# ====================== Experiment 1 ===========================
# =============================================================

Base Model

Slicing the training set (3305 samples, WITH failures).

Randomly splitting the validation set (1000 samples, no failures).

# 1. Load Dataset

In [5]:
# --- Dataset configuration ---------------------------------------------------
data_dir = '../Data/LANL_Earthquake_prediction/'
method = 'slicing'  # sampling strategy: 'slicing' or 'random'
n = 5000            # sample budget handed to get_LANL_data
seq_len = 150000    # time steps per segment; also the model's input length

# --- Load train / validation arrays and the test segments --------------------
# X_train_moments is returned alongside the data (presumably the statistics
# used for standardization -- confirm in get_LANL_data).
(X_train, y_train,
 X_val, y_val,
 X_test_dict, X_train_moments) = get_LANL_data(data_dir, method, n)

# --- Report what was loaded --------------------------------------------------
print('-------------')
for split in (X_train, y_train, X_val, y_val):
    print(split.shape)
print('Number of test files: %d' % len(X_test_dict))

Reading data...
Splitting & Sampling...
Standardizing...
[DONE]
-------------
(3305, 150000, 1)
(3305,)
(1000, 150000, 1)
(1000,)
Number of test files: 2624


# 2. Build Model

In [6]:
# Base 1D-CNN regressor:
#   Conv1D -> MaxPooling1D -> Conv1D -> GlobalAveragePooling1D -> Dense -> Dense(1)
model = Sequential()

# Conv 1: 32 filters, kernel size 10, applied to the raw (seq_len, 1) signal
model.add(Conv1D(32, 10, activation='relu', input_shape=(seq_len, 1)))

# Max Pooling: shrink the time axis by a factor of 100
model.add(MaxPooling1D(100))

# Conv 2: 64 filters, kernel size 10
model.add(Conv1D(64, 10, activation='relu'))

# Average Pooling: collapse the time axis into a single 64-value vector
model.add(GlobalAveragePooling1D())

model.add(Dense(16, kernel_initializer='normal', activation='relu'))
# Output Layer: one linear unit for the regression target
model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Stop training once val_loss has not improved for 10 consecutive epochs.
earlyStopping = EarlyStopping(monitor='val_loss',
                              patience=10,
                              verbose=0,
                              mode='min')
# Keep only the checkpoint with the lowest val_loss seen so far.
mcp_save = ModelCheckpoint('.mdl_wts.hdf5',
                           save_best_only=True,
                           monitor='val_loss',
                           mode='min')
# Multiply the learning rate by 0.1 after 5 epochs without a val_loss
# improvement of at least min_delta.
# `min_delta` is the current name of the deprecated `epsilon` argument.
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.1,
                                   patience=5,
                                   verbose=1,
                                   min_delta=1e-4,
                                   mode='min')

# Use the canonical Adam class (already imported) rather than the lowercase
# `adam` alias. MAE is both the loss and the reported metric, so
# model.evaluate() returns [loss, mean_absolute_error].
model.compile(loss='mean_absolute_error',
              optimizer=Adam(lr=1e-4),
              metrics=['mean_absolute_error'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 149991, 32)        352       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1499, 32)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1490, 64)          20544     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1040      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 21,953
Trainable params: 21,953
Non-trainable params: 0
_________________________________________________________________


# 3. Train Model

In [7]:
# Callback instance passed to model.fit in the next cell; LossPlot comes from
# Utilities.evaluation_utils (presumably it plots the loss curves live -- confirm).
liveplot = LossPlot()

In [8]:
# Training options, gathered in one place for readability.
# NOTE(review): the recorded run of this cell ended with
# "NameError: name 'plt' is not defined". The notebook itself imports plt, so
# the error presumably originates inside the LossPlot callback module -- confirm.
fit_options = dict(
    batch_size=16,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[earlyStopping, mcp_save, reduce_lr_loss, liveplot],
    verbose=0,
)
model.fit(X_train, y_train, **fit_options)

NameError: name 'plt' is not defined

# 4. Evaluate Model

## A. MAE Scores

In [None]:
# model.evaluate() returns [loss, metric]; the loss is MAE, so index 0 is the MAE.
# The training split is evaluated first, then the validation split.
train_mae, val_mae = (
    model.evaluate(features, targets, verbose=0)
    for features, targets in ((X_train, y_train), (X_val, y_val))
)

print('Train MAE: %.4f' % train_mae[0])
print('Validation MAE: %.4f' % val_mae[0])

## B. Predictions vs Targets

In [None]:
# Model predictions for the validation inputs; compared against y_val in the
# scatter plot that follows.
y_pred = model.predict(X_val)

In [None]:
# Scatter of predicted vs. actual validation targets, with the y = x diagonal
# as the "perfect prediction" reference.
plt.figure(figsize=(6, 6))
plt.scatter(y_val, y_pred)
plt.xlim(0, 20)
plt.ylim(0, 20)
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
# Draw the diagonal once, from (0, 0) to (20, 20). Passing lists of (x, y)
# tuples makes matplotlib treat each column as a separate dataset and draw the
# same line twice. 'C1' keeps the line visually distinct from the scatter points.
plt.plot([0, 20], [0, 20], color='C1')
plt.show()

## C. Save Test Submission

In [17]:
# Collect the test segment ids and their inputs in matching (dict) order.
segment_ids = list(X_test_dict.keys())
X_test = np.asarray(list(X_test_dict.values()))

# Flatten the predictions to a 1-D list. np.ravel (unlike np.squeeze) still
# returns a list when there is exactly one test segment; squeeze would collapse
# it to a bare float and break the zip below.
y_test_pred = np.ravel(model.predict(X_test)).tolist()

# Map each segment id to its predicted target.
prediction_dict = dict(zip(segment_ids, y_test_pred))
# save_submission(prediction_dict, 'noname')

Submission saved to disk: submissions/Step5_submission.csv
