In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from sklearn.metrics import mean_absolute_error, max_error, mean_absolute_percentage_error

#cross validation and hyperparameter optimization 
from sklearn.model_selection import GridSearchCV

#for train and test dataset split
from sklearn.model_selection import train_test_split

#for feedforward neural network model
from sklearn.neural_network import MLPRegressor

#for data plots
import matplotlib.pyplot as plt


# Access to Kaggle LANL-Earthquake prediction dataset
More info:
https://www.kaggle.com/c/LANL-Earthquake-Prediction

In [None]:
# Read the Kaggle training CSV (both columns parsed as float64) and keep only the
# underlying 2-D NumPy array: column 0 is acoustic_data, column 1 is time_to_failure.
dataset = pd.read_csv(
    '../input/LANL-Earthquake-Prediction/train.csv',
    nrows=500000000,
    dtype={'acoustic_data': np.float64, 'time_to_failure': np.float64},
).values

# sanity check: the raw array, then its row count and column count on separate lines
print(dataset)
print(*dataset.shape, sep='\n')

# Utility functions definition
We define some functions for creating the X matrix and Y vector. For the X matrix, we create it starting from the decided features. 
In this case we use the following feature scheme: every segment is split into a series of steps, and for every segment we compute the mean, std, max and min of the acoustic_data values over the entire segment, over the last 100 steps and over the last 10 steps. This yields 12 features per segment. 

In [None]:
def extract_features(z):
    """Summarise each column of ``z`` with four statistics.

    The statistics are taken along axis 0 and laid out side by side in the
    order mean, standard deviation, maximum, minimum. For a 2-D input of shape
    (rows, cols) the result has shape (cols, 4): one row of statistics per
    input column. Other statistics (e.g. quantiles) could be appended as
    further columns.
    """
    col_mean = z.mean(axis=0)
    col_std = z.std(axis=0)
    col_max = z.max(axis=0)
    col_min = z.min(axis=0)
    return np.c_[col_mean, col_std, col_max, col_min]

In [None]:
#function for creating the labels vector
def createY(dataset, last_index=None, n_steps=150, step_length=1000):
    """Build the label vector: one time_to_failure value per full segment.

    The data is cut into consecutive, non-overlapping segments of
    ``n_steps * step_length`` rows; a trailing partial segment is ignored.
    Each segment is labelled with the value in column 1 at the segment's LAST
    row, i.e. the time remaining once the whole segment has been observed.
    This matches the LANL task definition (target = time between the last row
    of a segment and the next failure). Labelling with the first row instead
    would be off by the duration of the segment and wrong for any segment
    that contains a failure.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array whose column 1 holds time_to_failure.
    last_index : unused
        Kept only so existing calls remain valid.
    n_steps, step_length : int
        A segment spans ``n_steps * step_length`` rows.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``dataset.shape[0] // (n_steps * step_length)``.
    """
    seg_len = n_steps * step_length
    segments = dataset.shape[0] // seg_len
    # rows seg_len-1, 2*seg_len-1, ... are the last rows of the full segments
    last_rows = dataset[seg_len - 1:segments * seg_len:seg_len, 1]
    return np.array(last_rows, dtype=np.float64)

In [None]:
#function for creating the training set with the decided features

def create_X(x, n_features, last_index=None, n_steps=150, step_length=1000):
    """Build the feature matrix: one row of summary statistics per full segment.

    ``x`` is cut into consecutive, non-overlapping segments of
    ``n_steps * step_length`` samples (a trailing partial segment is ignored).
    For every segment three windows are summarised with ``extract_features``
    (mean, std, max, min):

    * the entire segment,
    * its last 10 steps  (``10 * step_length`` samples),
    * its last 100 steps (``100 * step_length`` samples),

    giving 4 * 3 = 12 values per segment, in that order. A window longer than
    the segment simply covers the whole segment.

    Every sample of a window contributes to its statistics. Indexing row 1 of
    the per-column statistics of an (n_steps, step_length) matrix would instead
    describe only the samples at offset 1 of each step.

    Parameters
    ----------
    x : array-like
        1-D signal.
    n_features : int
        Number of columns of the result; must be 12 to hold the values above.
    last_index : unused
        Kept only so existing calls remain valid.
    n_steps, step_length : int
        A segment spans ``n_steps * step_length`` samples.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (number of full segments, n_features).
    """
    x = np.asarray(x, dtype=np.float64)
    seg_len = n_steps * step_length
    segments = len(x) // seg_len

    X_train = np.zeros((segments, n_features), dtype=np.float64)
    for i in range(segments):
        seg = x[i * seg_len:(i + 1) * seg_len]
        # extract_features summarises columns, so each window is passed as a single
        # column; row 0 of the result is then that window's (mean, std, max, min).
        X_train[i] = np.r_[
            extract_features(seg.reshape(-1, 1))[0],
            extract_features(seg[-10 * step_length:].reshape(-1, 1))[0],
            extract_features(seg[-100 * step_length:].reshape(-1, 1))[0],
        ]
    return X_train

In [None]:
def plot(y_train, y_pred):
    """Scatter predicted against actual values with a y = x reference line.

    Both axes are fixed to the range 0-20; points outside it are not shown.

    Parameters
    ----------
    y_train : numpy.ndarray
        Actual target values; flattened before plotting.
    y_pred : array-like
        Predicted values, one per actual value.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_train.flatten(), y_pred)
    ax.set_xlim(0, 20)
    ax.set_ylim(0, 20)
    ax.set_xlabel('actual', fontsize=12)
    ax.set_ylabel('predicted', fontsize=12)
    # Perfect-prediction diagonal. The x and y coordinates are passed as two flat
    # sequences so exactly one line is drawn (passing (x, y) point pairs makes
    # matplotlib treat them as 2-D data and draw one line per column). An explicit
    # colour keeps the line distinguishable from the scatter points.
    ax.plot([0, 20], [0, 20], color='tab:orange')
    plt.show()
    
def score(y_train, y_pred):
    """Print three regression metrics comparing predictions with actual values.

    ``y_train`` is flattened first; the maximum error, the mean absolute error
    and the mean absolute percentage error are then printed with three
    decimals each. Nothing is returned.
    """
    actual = y_train.flatten()
    worst_case = max_error(actual, y_pred)
    abs_err = mean_absolute_error(actual, y_pred)
    pct_err = mean_absolute_percentage_error(actual, y_pred)
    print(f'Max Error: {worst_case:0.3f}')
    print(f'Mean Absolute Error: {abs_err:0.3f}')
    print(f'Mean Absolute Percentage Error: {pct_err:0.3f}')

# TRAINING-VALIDATION-TEST Split of the Dataset
We subdivide the dataset into training and test set, and then we use cross validation on the training set for parameter tuning and model optimization

In [None]:
# Build the per-segment labels and features, then hold out 20% of the segments as a test set.

# label vector: one value per segment, taken from column 1 of the dataset by createY
y= createY(dataset)

# feature matrix: built from column 0 of the dataset only, 12 columns per segment
X = create_X(dataset[:,0], n_features=12)

# random split of the segments with a fixed seed so that the split is reproducible;
# test_size=0.2 keeps 20% of the segments for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, test_size=0.2)

# sanity check: show the training features and labels
print(X_train) 
print(y_train)



# Model training
We train 3 different Feedforward Neural Networks. 
We consider the following activation functions:
* ReLU
* Hyperbolic tangent (tanh)
* Logistic sigmoid

and for every activation function we train different architectures. With the 5-Fold cross validation we manage to discover the best architecture for every activation function. After the training phase the output results and the predicted-true labels graphs are shown. 

In [None]:
# Select the hidden-layer layout of an MLPRegressor with ReLU activation by 5-fold
# cross-validation; the candidate layouts come from earlier studies and trials.

# MLPRegressor takes the parameter hidden_layer_sizes, a tuple giving the number of
# neurons in each hidden layer; for example: (10,) is a single hidden layer with 10 neurons;
# (10,50) is 2 hidden layers, the first with 10 neurons, the second with 50 neurons

# candidate hidden-layer configurations
hl_parameters = {'hidden_layer_sizes': [(10,20,20,100), (20,20,20,100) , (20,20,20,20), (50,20,20,100), (100,20,20,100)] }

mlp_cv = MLPRegressor(max_iter=200000, activation="relu", solver='adam', tol=1e-4, random_state=1)

# 5-fold CV over the candidates. GridSearchCV is left at its default refit=True, so after
# the search it retrains the winning configuration on all of X_train by itself.
mlp_grid = GridSearchCV(mlp_cv, hl_parameters, cv=5)
mlp_grid.fit(X_train, y_train)

print ('RESULTS FOR NN\n')

print("Best parameters set found:")
print(mlp_grid.best_params_)

# best_estimator_ is already fitted on the entire training set (see refit above); fitting
# it a second time would only repeat the same seeded training run.
best_mlp = mlp_grid.best_estimator_

# MLPRegressor.score returns the R^2 coefficient, so the "training error" is 1 - R^2 on the
# training set and the "test score" is R^2 on the held-out set.
training_error = 1. -best_mlp.score(X_train, y_train)

test_score = best_mlp.score(X_test, y_test)

print ('RESULTS FOR BEST NN\n')

print ("Best NN training error: %f" % training_error)
print ("Best NN test score: %f" %test_score)


In [None]:
# predict the held-out segments with the model selected in the previous cell
y_pred= best_mlp.predict(X_test)

# predicted-vs-actual scatter plot for the test set
plot(y_test, y_pred)

# print max error, MAE and MAPE on the test set
score(y_test, y_pred)

In [None]:
# Select the hidden-layer layout of an MLPRegressor with tanh activation by 5-fold
# cross-validation; the candidate layouts come from earlier studies and trials.

# MLPRegressor takes the parameter hidden_layer_sizes, a tuple giving the number of
# neurons in each hidden layer; for example: (10,) is a single hidden layer with 10 neurons;
# (10,50) is 2 hidden layers, the first with 10 neurons, the second with 50 neurons

# candidate hidden-layer configurations
hl_parameters = {'hidden_layer_sizes': [(10,20,20,100), (20,20,20,100) , (20,20,20,20), (50,20,20,100), (100,20,20,100)] }

mlp_cv = MLPRegressor(max_iter=200000, activation="tanh", solver='adam', tol=1e-4, random_state=1)

# 5-fold CV over the candidates. GridSearchCV is left at its default refit=True, so after
# the search it retrains the winning configuration on all of X_train by itself.
mlp_grid = GridSearchCV(mlp_cv, hl_parameters, cv=5)
mlp_grid.fit(X_train, y_train)

print ('RESULTS FOR NN\n')

print("Best parameters set found:")
print(mlp_grid.best_params_)

# best_estimator_ is already fitted on the entire training set (see refit above); fitting
# it a second time would only repeat the same seeded training run.
best_mlp = mlp_grid.best_estimator_

# MLPRegressor.score returns the R^2 coefficient, so the "training error" is 1 - R^2 on the
# training set and the "test score" is R^2 on the held-out set.
training_error = 1. -best_mlp.score(X_train, y_train)

test_score = best_mlp.score(X_test, y_test)

print ('RESULTS FOR BEST NN\n')

print ("Best NN training error: %f" % training_error)
print ("Best NN test score: %f" %test_score)


In [None]:
# predict the held-out segments with the model selected in the previous cell
y_pred= best_mlp.predict(X_test)

# predicted-vs-actual scatter plot for the test set
plot(y_test, y_pred)

# print max error, MAE and MAPE on the test set
score(y_test, y_pred)

In [None]:
# Select the hidden-layer layout of an MLPRegressor with logistic activation by 5-fold
# cross-validation; the candidate layouts come from earlier studies and trials.

# MLPRegressor takes the parameter hidden_layer_sizes, a tuple giving the number of
# neurons in each hidden layer; for example: (10,) is a single hidden layer with 10 neurons;
# (10,50) is 2 hidden layers, the first with 10 neurons, the second with 50 neurons

# candidate hidden-layer configurations
hl_parameters = {'hidden_layer_sizes': [(10,20,20,100), (20,20,20,100) , (20,20,20,20), (50,20,20,100), (100,20,20,100)] }

mlp_cv = MLPRegressor(max_iter=200000, activation="logistic", solver='adam', tol=1e-4, random_state=1)

# 5-fold CV over the candidates. GridSearchCV is left at its default refit=True, so after
# the search it retrains the winning configuration on all of X_train by itself.
mlp_grid = GridSearchCV(mlp_cv, hl_parameters, cv=5)
mlp_grid.fit(X_train, y_train)

print ('RESULTS FOR NN\n')

print("Best parameters set found:")
print(mlp_grid.best_params_)

# best_estimator_ is already fitted on the entire training set (see refit above); fitting
# it a second time would only repeat the same seeded training run.
best_mlp = mlp_grid.best_estimator_

# MLPRegressor.score returns the R^2 coefficient, so the "training error" is 1 - R^2 on the
# training set and the "test score" is R^2 on the held-out set.
training_error = 1. -best_mlp.score(X_train, y_train)

test_score = best_mlp.score(X_test, y_test)

print ('RESULTS FOR BEST NN\n')

print ("Best NN training error: %f" % training_error)
print ("Best NN test score: %f" %test_score)


In [None]:
# predict the held-out segments with the model selected in the previous cell
y_pred= best_mlp.predict(X_test)

# predicted-vs-actual scatter plot for the test set
plot(y_test, y_pred)

# print max error, MAE and MAPE on the test set
score(y_test, y_pred)