<h1 align="center"> LANL Earthquake Prediction</h1>

<img style="display: block; margin: 0 auto;" src="https://storage.googleapis.com/kaggle-media/competitions/LANL/nik-shuliahin-585307-unsplash.jpg" width="450" height="200"/>

# Table of Contents
[1. Introduction](#sec1) <br>
[2. Import Libraries](#sec2)<br>
[3. Data Processing](#sec3)<br>
[4. Evaluating Some Models](#sec4)<br>
[5. Hyperparameter Tuning](#sec5)<br>
[6. Building Final Models](#sec6)<br>
[7. Prediction](#sec7)


# 1. Introduction
<a id='sec1'></a>
This competition aims to use seismic signals to predict the timing of laboratory earthquakes. The data is from an experimental set-up. The acoustic_data input signal is used to predict the time remaining before the next laboratory earthquake (time_to_failure).<br>
The training data is a single continuous segment of experimental data. The test folder contains many small segments, which do not represent a continuous segment of the experiment. Predictions cannot be assumed to follow the same regular pattern seen in the training file.

# 2. Import Libraries
<a id='sec2'></a>

In [None]:
import os
import glob
import numpy as np
from random import seed
from random import randint

from tqdm import tqdm
# Fix seeds
from numpy.random import seed
seed(639)

from scipy.signal import hilbert
from scipy.signal import convolve
from scipy.signal import hann

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                              AdaBoostRegressor)
#from sklearn.ensemble import AdaBoostRegressor

from sklearn.ensemble import VotingRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
import seaborn as sns


import pandas as pd

import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
pd.options.display.precision = 15

# 3. Data Processing
<a id='sec3'></a>

In [None]:
# Location of the competition data; train.csv is read with explicit narrow
# dtypes for both of its columns.
INPATH = '../input/LANL-Earthquake-Prediction/'
train_df = pd.read_csv(
    os.path.join(INPATH, 'train.csv'),
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float32,
    },
)


### 3.1. Useful Functions
We'll use the following functions for data processing and feature engineering.

In [None]:
# Recursive STA/LTA, adapted from obspy:
# https://docs.obspy.org/_modules/obspy/signal/trigger.html
def recursive_sta_lta(a, nsta, nlta):
    '''
    Ratio of a short-time to a long-time recursive average of the squared trace.

    a: seismic trace (anything with ``tolist()``, or a plain sequence)
    nsta: Length of short time average window in samples
    nlta: Length of long time average window in samples

    Returns a numpy array with one value per sample; entries at index 0 and
    at every index below ``nlta`` are 0.
    '''
    try:
        a = a.tolist()
    except Exception:
        # no usable tolist(): keep the input as it is
        pass
    n_samples = len(a)

    # Smoothing weights of the two recursive averages (Evans and Allen).
    sta_weight = 1. / nsta
    lta_weight = 1. / nlta
    sta_keep = 1 - sta_weight
    lta_keep = 1 - lta_weight

    short_avg = 0.
    long_avg = 1e-99  # avoid zero division
    ratio = [0.0] * n_samples
    for pos in range(1, n_samples):
        energy = a[pos] ** 2
        short_avg = sta_weight * energy + sta_keep * short_avg
        long_avg = lta_weight * energy + lta_keep * long_avg
        current = short_avg / long_avg
        # The first nlta samples are reported as zero.
        ratio[pos] = current if not pos < nlta else 0.
    return np.array(ratio)

In [None]:
def add_linear_trend(arr):
    """Return the slope of a linear regression of ``arr`` on its positional index (0, 1, 2, ...)."""
    positions = np.arange(len(arr)).reshape(-1, 1)
    fitted = LinearRegression().fit(positions, arr)
    return fitted.coef_[0]

In [None]:
#https://www.kaggle.com/gpreda/lanl-earthquake-eda-and-prediction
def create_features(seg_id, seg, X):
    """Compute summary features of one signal segment and write them into ``X``.

    seg_id: row label of ``X`` that receives the features
    seg: DataFrame with an 'acoustic_data' column (the samples of the segment)
    X: feature DataFrame, modified in place; columns are created on first use

    Returns None.
    """
    # Work in float64 whatever the integer width of the input: the training
    # frame is loaded as int16 while test files are read with pandas' default
    # dtype, and narrow-integer arithmetic (np.diff, np.abs, scalar division)
    # can overflow or silently produce float32 results.
    xc = pd.Series(seg['acoustic_data'].values.astype(np.float64))
    zc = np.fft.fft(xc)

    # Basic distribution statistics
    X.loc[seg_id, 'mean'] = xc.mean()
    X.loc[seg_id, 'std'] = xc.std()
    X.loc[seg_id, 'max'] = xc.max()
    X.loc[seg_id, 'min'] = xc.min()

    # Mean absolute deviation around the mean (Series.mad() was removed in pandas 2.0).
    X.loc[seg_id, 'mad'] = (xc - xc.mean()).abs().mean()
    X.loc[seg_id, 'kurt'] = xc.kurtosis()
    X.loc[seg_id, 'skew'] = xc.skew()
    X.loc[seg_id, 'med'] = xc.median()

    # Statistics of the real and imaginary parts of the FFT
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    X.loc[seg_id, 'Rmean'] = realFFT.mean()
    X.loc[seg_id, 'Rstd'] = realFFT.std()
    X.loc[seg_id, 'Rmax'] = realFFT.max()
    X.loc[seg_id, 'Rmin'] = realFFT.min()
    X.loc[seg_id, 'Imean'] = imagFFT.mean()
    X.loc[seg_id, 'Istd'] = imagFFT.std()
    X.loc[seg_id, 'Imax'] = imagFFT.max()
    X.loc[seg_id, 'Imin'] = imagFFT.min()

    # NOTE: despite its name this is the signed mean of the first differences;
    # the column name and value are kept as they were so existing features do not change.
    X.loc[seg_id, 'mean_change_abs'] = np.mean(np.diff(xc))

    # Statistics of the absolute signal
    abs_xc = np.abs(xc)
    X.loc[seg_id, 'abs_max'] = abs_xc.max()
    X.loc[seg_id, 'abs_min'] = abs_xc.min()
    X.loc[seg_id, 'abs_std'] = abs_xc.std()
    X.loc[seg_id, 'abs_mean'] = abs_xc.mean()

    # Statistics of the first and last `value` samples of the segment
    for value in [10000, 50000]:
        head = xc[:value]
        tail = xc[-value:]
        X.loc[seg_id, 'std_first_%s' % value] = head.std()
        X.loc[seg_id, 'std_last_%s' % value] = tail.std()

        X.loc[seg_id, 'mean_first_%s' % value] = head.mean()
        X.loc[seg_id, 'mean_last_%s' % value] = tail.mean()

        X.loc[seg_id, 'min_first_%s' % value] = head.min()
        X.loc[seg_id, 'min_last_%s' % value] = tail.min()

        X.loc[seg_id, 'max_first_%s' % value] = head.max()
        X.loc[seg_id, 'max_last_%s' % value] = tail.max()

    X.loc[seg_id, 'max_to_min'] = xc.max() / np.abs(xc.min())
    X.loc[seg_id, 'max_min_diff'] = xc.max() - np.abs(xc.min())
    X.loc[seg_id, 'sum'] = xc.sum()

    # Quantiles of the raw signal
    X.loc[seg_id, 'q95'] = np.quantile(xc, 0.95)
    X.loc[seg_id, 'q80'] = np.quantile(xc, 0.80)
    X.loc[seg_id, 'q05'] = np.quantile(xc, 0.05)
    X.loc[seg_id, 'q20'] = np.quantile(xc, 0.20)
    # interquartile range
    X.loc[seg_id, 'iqr'] = np.subtract(*np.percentile(xc, [75, 25]))
    X.loc[seg_id, 'q999'] = np.quantile(xc, 0.999)
    X.loc[seg_id, 'q001'] = np.quantile(xc, 0.001)

    X.loc[seg_id, 'trend'] = add_linear_trend(xc)

    # Mean recursive STA/LTA ratio for two window-length pairs
    X.loc[seg_id, 'recursive_sta_lta1_mean'] = recursive_sta_lta(xc, 500, 10000).mean()
    X.loc[seg_id, 'recursive_sta_lta2_mean'] = recursive_sta_lta(xc, 5000, 100000).mean()

    # Hann-smoothed signal (window built once) and Hilbert envelope
    window = hann(150)
    X.loc[seg_id, 'Hann_window_mean'] = (convolve(xc, window, mode='same') / sum(window)).mean()
    X.loc[seg_id, 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()

    # Cosine-weighted rolling std / mean, summarised per window length.
    # The std features of a window length are written before its mean features.
    for winlen in [10, 100, 1000]:
        roller = xc.rolling(window=winlen, win_type='cosine')
        for stat, rolled in (('std', roller.std()), ('mean', roller.mean())):
            values = rolled.dropna().values
            suffix = '%s_%s' % (stat, winlen)
            X.loc[seg_id, 'ave_roll_' + suffix] = values.mean()
            X.loc[seg_id, 'std_roll_' + suffix] = values.std()
            X.loc[seg_id, 'max_roll_' + suffix] = values.max()
            X.loc[seg_id, 'min_roll_' + suffix] = values.min()
            X.loc[seg_id, 'q01_roll_' + suffix] = np.quantile(values, 0.01)
            X.loc[seg_id, 'q05_roll_' + suffix] = np.quantile(values, 0.05)
            X.loc[seg_id, 'q95_roll_' + suffix] = np.quantile(values, 0.95)
            X.loc[seg_id, 'q99_roll_' + suffix] = np.quantile(values, 0.99)
            X.loc[seg_id, 'ave_change_abs_roll_' + suffix] = np.mean(np.diff(values))

    # Unweighted moving average / std and the band mean +/- 2 * std around it
    for winlen in [500, 1000]:
        plain_roll = xc.rolling(window=winlen)
        ma_mean = plain_roll.mean().mean(skipna=True)
        ma_std = plain_roll.std().mean()
        X.loc[seg_id, 'Moving_average_%s_mean' % winlen] = ma_mean
        X.loc[seg_id, 'MA_%s_std_mean' % winlen] = ma_std
        X.loc[seg_id, 'MA_%s_BB_high_mean' % winlen] = ma_mean + 2 * ma_std
        X.loc[seg_id, 'MA_%s_BB_low_mean' % winlen] = ma_mean - 2 * ma_std


### 3.2. Process the Train Dataset


In [None]:
# Size overview: training table, number of test files, rows in one test file.
n_train_rows, n_train_cols = train_df.shape
print('Train dataset has {} rows and {} columns.'.format(n_train_rows, n_train_cols))

test_dir = os.path.join(INPATH, 'test')
print("There are {} files in the test folder.".format(len(os.listdir(test_dir))))

first_test_csv = glob.glob(os.path.join(test_dir, '*.csv'))[0]
print('Each test segment has {} rows.'.format(pd.read_csv(first_test_csv).shape[0]))

In [None]:
# Number of complete 150000-sample segments in the training signal.
# Floor division is used on purpose: rounding would count a trailing partial
# segment whenever the remainder is at least half a segment long.
train_segments = train_df.shape[0] // 150000
print("Number of segments: ", train_segments)

# One row per segment; feature columns are added later by create_features.
train_X = pd.DataFrame(index=range(train_segments), dtype=np.float32)
train_y = pd.DataFrame(index=range(train_segments), dtype=np.float32, columns=['time_to_failure'])

In [None]:
# Feature engineering on the training signal: cut it into consecutive windows
# of the same length as a test file and describe each window by one feature row.
seg_length = 150000
for seg_id in tqdm(range(train_segments)):
    start = seg_id * seg_length
    stop = start + seg_length
    seg = train_df.iloc[start:stop]
    create_features(seg_id, seg, train_X)
    # The target of a window is the time to failure at its last sample.
    train_y.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1]

In [None]:
# Standardise the training features; `scaler` keeps the fitted statistics.
scaler = StandardScaler()
scaled_train_X = pd.DataFrame(scaler.fit_transform(train_X), columns=train_X.columns)
scaled_train_X.info()

In [None]:
# Sanity check for missing values. The feature check is printed explicitly:
# a bare expression in the middle of a cell is evaluated and then discarded.
print('train_X contains NaN:', bool(train_X.isnull().values.any()))
# Name every target column that has missing values.
for col in train_y:
    if train_y[col].isnull().values.any():
        print(col)


### 3.3. Process the Test Dataset
<a id='sec3.3'></a>

In [None]:
# Check the shape of the submission file
# (the sample submission is read from INPATH; its row count is reused below
# to size the test feature frame)
submission = pd.read_csv(os.path.join(INPATH,'sample_submission.csv'))
submission.shape

In [None]:
X_test = pd.DataFrame(index=range(submission.shape[0]), dtype=np.float32)

# One test file per submission row, listed in submission order, so that row i
# of X_test describes the segment named in row i of `submission`. Taking the
# files in glob order instead would pair predictions with the wrong seg_id
# when they are later written into `submission` positionally.
# NOTE(review): assumes sample_submission.csv has a 'seg_id' column and that
# each test file is named '<seg_id>.csv' - confirm against the data folder.
test_files = [os.path.join(INPATH, 'test', '{}.csv'.format(sid))
              for sid in submission['seg_id']]

# Feature engineering for the test set:
# iterate over all test files and fill the test dataframe with the new features
all_test_files = len(test_files)
for seg_id in tqdm(range(all_test_files)):
    seg = pd.read_csv(test_files[seg_id])
    create_features(seg_id, seg, X_test)
    


### 3.4. Data Selection

In [None]:
# Training target and (scaled) training features as plain arrays.
Y_train = train_y['time_to_failure'].values

selection = list(scaled_train_X.columns)
X_train = scaled_train_X[selection].values

# The models are trained on standardised features, so the test features must
# go through the same scaler that was fitted on the training set.
x_test = scaler.transform(X_test[selection])


In [None]:
# Summary statistics of the selected, scaled training features.
scaled_train_X[selection].describe()

# 4. Evaluating Some Models
<a id='sec4'></a>

In [None]:
#Build the Models
random_state = 1

# Step 1: pair every estimator (default parameters) with its display name, so
# the labels of the summary table cannot drift out of sync with the models.
# Estimators that use randomness get a fixed random_state for reproducibility.
named_models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso Regression', Lasso()),
    ('K Neighbors Regressor', KNeighborsRegressor()),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=random_state)),
    ('Random Forest Regressor', RandomForestRegressor(random_state=random_state)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=random_state)),
    ('Adaboost Regressor', AdaBoostRegressor(random_state=random_state)),
]
model_list = [mdl for _, mdl in named_models]

# Step 2: calculate the cross-validation mean and standard deviation for the estimators
cv_mean, cv_std = [], []

for mdl in model_list:
    print('-----------------------------', mdl)
    cv = cross_val_score(mdl, X_train, y=Y_train, scoring='neg_mean_squared_error', cv=7, n_jobs=-1)

    # Scores are negated MSE values; abs() turns the mean back into an MSE.
    cv_mean.append(abs(cv.mean()))
    cv_std.append(cv.std())

# Step 3: create a dataframe and plot means with error bars
cv_total = pd.DataFrame({'Algorithm': [name for name, _ in named_models],
                         'CV-Means': cv_mean,
                         'CV-Errors': cv_std})

sns.barplot(x='CV-Means', y='Algorithm', data=cv_total, palette='Set1', orient='h',
            xerr=cv_std)
plt.xlabel('Mean Squared Error')
plt.title('Cross Validation Scores');


In [None]:
# Display the cross-validation summary table.
cv_total

# 5. Hyperparameter Tuning
<a id='sec5'></a>
We tune the hyperparameters of the models that achieved the lowest cross-validated mean squared error in the previous section.

#### Ridge

In [None]:
#Hyperparameter search for Ridge
ridge = Ridge()
param_grid = {'alpha': [0.05, 0.5, 0.7]}

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_ridge = GridSearchCV(ridge, param_grid=param_grid, cv=cv,
                          scoring='neg_mean_squared_error', verbose=True, n_jobs=-1)
grid_ridge_result = grid_ridge.fit(X_train, Y_train)

# Summarize results for Ridge. The search is scored with
# 'neg_mean_squared_error', so best_score_ is a negated MSE; the sign is
# flipped to print the value under the label "MSE".
print('-------------------------------------------\n')
print("\nBest Performance: MSE= %f using \n%s" % (-grid_ridge_result.best_score_, grid_ridge_result.best_params_))

# Per-candidate results (negated MSE mean / std and parameter sets), kept for inspection.
means = grid_ridge_result.cv_results_['mean_test_score']
stds = grid_ridge_result.cv_results_['std_test_score']
params = grid_ridge_result.cv_results_['params']

#### Random Forest Regressor

In [None]:
#Hyperparameter search for Random Forest
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# Fixed random_state so repeated runs of the search give the same forests.
rf = RandomForestRegressor(random_state=random_state)
param_grid = {'n_estimators': [100, 200],
              'bootstrap': [True],
              'max_depth': [3, 5],
              'min_samples_leaf': [2, 3],
              'min_samples_split': [2, 3]}

# Plain 7-fold cross-validation, as before; the RepeatedKFold object that used
# to be created here was never passed to the search and has been dropped.
grid_rf = GridSearchCV(rf, param_grid=param_grid, cv=7,
                       scoring='neg_mean_squared_error', verbose=True, n_jobs=-1)
grid_rf_result = grid_rf.fit(X_train, Y_train)

# Summarize results for rf; best_score_ is a negated MSE, so flip the sign.
print('-------------------------------------------\n')
print("\nBest Performance : MSE= %f using \n%s" % (-grid_rf_result.best_score_, grid_rf_result.best_params_))


#### AdaBoost Regressor

In [None]:
# Randomized hyperparameter search for AdaBoost
param_dist = {
    'n_estimators': [50, 100],
    'learning_rate': [0.03, 0.05, 0.1, 0.3],
    'loss': ['linear', 'square', 'exponential'],
}

# Both the estimator and the parameter sampling are seeded, so the same
# 7 candidates are drawn and fitted on every run.
rand_ada = RandomizedSearchCV(AdaBoostRegressor(random_state=random_state),
                              param_distributions=param_dist,
                              cv=10, n_iter=7, n_jobs=-1,
                              scoring='neg_mean_squared_error',
                              random_state=random_state)
rand_ada_result = rand_ada.fit(X_train, Y_train)

# best_score_ is a negated MSE ('neg_mean_squared_error'), so flip the sign.
print('-------------------------------------------\n')
print("\nBest Performance: MSE= %f using \n%s" % (-rand_ada_result.best_score_, rand_ada_result.best_params_))


# 6. Building Final Models
<a id='sec6'></a>

In [None]:
# Combine the three tuned models in a voting ensemble.
estimators = [('rf', grid_rf_result.best_estimator_),
              ('Ridge', grid_ridge_result.best_estimator_),
              ('AdaBoost', rand_ada_result.best_estimator_)]

tuned_voting = VotingRegressor(estimators=estimators, n_jobs=-1)

tuned_voting.fit(X_train, Y_train)

# 10-fold cross-validation of the ensemble, as before; the RepeatedKFold object
# that used to be created here was never passed to cross_val_score.
cv_voting = cross_val_score(tuned_voting, X_train, Y_train, cv=10, scoring='neg_mean_squared_error')

# The scores are negated MSE values, not percentages: flip the sign and label
# the number as an MSE.
print('Tuned Models - Ensemble\n-----------------------')
print('Voting MSE: {}'.format(np.round(-cv_voting.mean(), 2)))


# 7. Prediction
<a id='sec7'></a>
I'll use the Ridge model for the final prediction. 

In [None]:
# Predict the test targets through the fitted Ridge grid-search object.
# (The voting-ensemble prediction below is left disabled.)
#y_pred_ensemble = tuned_voting.predict(x_test)
y_pred_ridge = grid_ridge.predict(x_test)

## 7.1. Submission

In [None]:
# Store the Ridge predictions in the submission table, save it as CSV
# (without the index) and preview the first rows.
submission['time_to_failure'] = y_pred_ridge
submission.to_csv('submission.csv', index=False)
submission.head(4)

## References
[Gabriel Preda](https://www.kaggle.com/gpreda/lanl-earthquake-eda-and-prediction)<br>
[obspy](https://docs.obspy.org/_modules/obspy/signal/trigger.html)