<h2>1. About this notebook</h2>

In this notebook I try a few different models to predict the time to failure during earthquake simulations. I'm using some new features like trend and absolute values with others from public kernels (e.g. quantiles and rolling means).

Update 03/02: Fixed errors in the LightGBM section; added feature importance and visualizations.

For more details about LANL competition you can check my [previous kernel](https://www.kaggle.com/jsaguiar/seismic-data-exploration).

In [None]:
import os
import time
import random
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import NuSVR, SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import lightgbm as lgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
sns.set()
init_notebook_mode(connected=True)

In [None]:
# Read the raw training signal with explicit column dtypes
# (int16 for the acoustic signal, float64 for the target).
data_type = {'acoustic_data': np.int16, 'time_to_failure': np.float64}
train = pd.read_csv('../input/LANL-Earthquake-Prediction/train.csv', dtype=data_type)

# Only the final expression of a cell is rendered, so print the shape
# explicitly and leave the preview as the cell's last expression.
print("Train shape:", train.shape)
train.head(20)

In [None]:
# Keep every 50th row (2% of the data) of both columns so the whole
# recording can be drawn in a single figure.
acoustic_sample = train['acoustic_data'].values[::50]
ttf_sample = train['time_to_failure'].values[::50]

fig, ax1 = plt.subplots(figsize=(16, 8))
ax1.set_title("Trends of acoustic_data and time_to_failure. 2% of data (sampled)")

# Left y-axis: acoustic signal.
ax1.plot(acoustic_sample, color='b')
ax1.set_ylabel('acoustic_data', color='b')
ax1.legend(['acoustic_data'])

# Right y-axis (shared x-axis): time to failure.
ax2 = ax1.twinx()
ax2.plot(ttf_sample, color='g')
ax2.set_ylabel('time_to_failure', color='g')
ax2.legend(['time_to_failure'], loc=(0.875, 0.9))
ax2.grid(False)

# Drop the samples and the raw frame; later cells cannot use `train`.
del acoustic_sample
del ttf_sample
del train

<h2>2. Feature Engineering</h2>

Simple trend feature: fit a linear regression and return the coefficient

In [None]:
def add_trend_feature(arr, abs_values=False):
    """Return the least-squares slope of ``arr`` against its sample index.

    This is the coefficient ``a`` of the ordinary-least-squares line
    ``arr[i] ~ a * i + b``. It is computed in closed form with NumPy, so no
    estimator object has to be built and fitted for every call.

    Parameters
    ----------
    arr : array-like, 1-D
        Values in sample order (position 0 is the first sample).
    abs_values : bool, default False
        If True, the slope is computed on ``abs(arr)``. The absolute value is
        taken after conversion to float64, so narrow integer inputs (e.g.
        int16 containing -32768) cannot overflow.

    Returns
    -------
    numpy.float64
        The fitted slope. A single-sample input has no trend and yields 0.0.

    Raises
    ------
    ValueError
        If ``arr`` is empty or not one-dimensional.
    """
    values = np.asarray(arr, dtype=np.float64)
    if values.ndim != 1:
        raise ValueError("add_trend_feature expects a 1-D array, got ndim={}".format(values.ndim))
    if values.size == 0:
        raise ValueError("add_trend_feature cannot fit a trend on an empty array")
    if abs_values:
        values = np.abs(values)

    # Centre the index so that slope = cov(idx, values) / var(idx).
    idx = np.arange(values.size, dtype=np.float64)
    idx -= idx.mean()
    denom = np.dot(idx, idx)
    if denom == 0.0:
        # Only one sample: the slope is undefined, report "no trend".
        return np.float64(0.0)
    return np.dot(idx, values - values.mean()) / denom

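Zero-crossing rate feature: keep a running count of sign changes in the signal and measure how fast that count grows over a window.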

In [None]:
# Module-level scratch buffer with one slot per sample of a 150,000-sample
# segment. cpt_czr() overwrites it with the running zero-crossing count of the
# most recently processed signal; it is all-NaN until first filled.
cz = pd.Series(index=range(150000), dtype=np.float64)


def add_czr_feature(arrs):
    """Return the average increase per sample over a window of running counts.

    Computed as ``(last - first) / len(arrs)``. Elements are always accessed by
    position: ``Series.rolling(...).apply`` may hand the window over either as
    an ndarray or as a Series that keeps its original index labels, and
    label-based ``arrs[0]`` would fail or pick the wrong element in the latter
    case.

    Parameters
    ----------
    arrs : array-like
        Window of running zero-crossing counts (ndarray, Series or list).

    Returns
    -------
    float
        Count increase across the window divided by the window length.

    Raises
    ------
    IndexError
        If ``arrs`` is empty.
    """
    values = np.asarray(arrs)
    return (values[-1] - values[0]) / len(values)


def cpt_czr(arrs):
    """Store the running zero-crossing count of ``arrs`` in the global ``cz``.

    After the call ``cz[i]`` holds the number of sign changes seen between
    samples ``0..i`` (``cz[0]`` is always 0). A crossing is counted only when
    two consecutive samples have strictly opposite signs; a pair that contains
    a zero is not counted.

    Only the first ``len(arrs)`` entries of ``cz`` are overwritten, so entries
    beyond that keep whatever they held before. If ``arrs`` is longer than
    ``cz`` the buffer is extended label by label.

    Parameters
    ----------
    arrs : array-like
        The signal samples in time order.

    Returns
    -------
    None
        The result is written into the module-level ``cz`` Series.
    """
    values = np.asarray(arrs)
    n = len(values)

    # At least one slot so that cz[0] = 0 is written even for empty input.
    counts = np.zeros(max(n, 1), dtype=np.float64)
    if n > 1:
        prev, curr = values[:-1], values[1:]
        crossed = ((curr > 0) & (prev < 0)) | ((curr < 0) & (prev > 0))
        counts[1:] = np.cumsum(crossed)

    # One bulk write instead of one pandas __setitem__ per sample.
    n_fit = min(len(counts), len(cz))
    cz.iloc[:n_fit] = counts[:n_fit]
    # Signals longer than the buffer: extend it by label.
    for i in range(n_fit, len(counts)):
        cz[i] = counts[i]

Group the training data in chunks of 150,000 examples and extract the following features:

* Aggregations: min, max, mean and std
* Absolute features: max, mean and std
* Quantile features
* Trend features
* Rolling features
* Ratios

In [None]:

# Feature extraction: split the training signal into consecutive chunks of
# `rows` samples and write one row of summary features per chunk into X_train,
# with the matching target in y_train.
#
# NOTE(review): the loop below starts with an unconditional `break`, so its
# body never executes and X_train / y_train are left with an index but no
# columns. `train` is also removed by `del train` in an earlier cell, so
# deleting the `break` alone would raise NameError on the first iteration.
rows = 150_000
# 629145480 is presumably the number of rows in train.csv — TODO confirm.
segments = int(np.floor(629145480 / rows))
X_train = pd.DataFrame(index=range(segments), dtype=np.float64)
y_train = pd.DataFrame(index=range(segments), dtype=np.float64)
for segment in tqdm(range(segments)):
    # Disables the whole loop body (see note above).
    break
    seg = train.iloc[segment*rows:segment*rows+rows]
    x = seg['acoustic_data']   # pd series
    # Target of a segment is the time_to_failure of its last sample.
    y = seg['time_to_failure'].values[-1]  # single value
    # Refresh the global `cz` buffer with this segment's running zero-crossing count.
    cpt_czr(x.values)
    y_train.loc[segment, 'time_to_failure'] = y
    # Plain aggregations of the raw signal.
    X_train.loc[segment, 'ave'] = x.values.mean()
    X_train.loc[segment, 'std'] = x.values.std()
    X_train.loc[segment, 'max'] = x.values.max()
    X_train.loc[segment, 'min'] = x.values.min()
    # Aggregations of the absolute signal.
    X_train.loc[segment, 'abs_max'] = np.abs(x.values).max()
    X_train.loc[segment, 'abs_mean'] = np.abs(x.values).mean()
    X_train.loc[segment, 'abs_std'] = np.abs(x.values).std()
    # Linear trend of the raw and of the absolute signal.
    X_train.loc[segment, 'trend'] = add_trend_feature(x.values)
    X_train.loc[segment, 'abs_trend'] = add_trend_feature(np.abs(x.values))
    # Zero-crossing rate computed over the entire `cz` buffer.
    X_train.loc[segment, 'czr'] = add_czr_feature(cz.values)
    # New features - rolling features
    # NOTE(review): the 'trent_' prefix is presumably a typo for 'trend_'; it is
    # part of the column names, so renaming it would change the feature table.
    for w in [10, 100, 1000]:
        # Rolling statistics with window w; the leading NaN entries of each
        # rolling result are dropped before aggregating.
        x_roll_std = x.rolling(w).std().dropna().values
        x_roll_mean = x.rolling(w).mean().dropna().values
        x_roll_abs_mean = x.abs().rolling(w).mean().dropna().values
        x_roll_abs_max = x.abs().rolling(w).max().dropna().values
        # Windowed zero-crossing rate: add_czr_feature applied to each window of `cz`.
        x_roll_czr = cz.rolling(w).apply(add_czr_feature).dropna().values
        
        # Rolling std: trend, aggregations and quantiles (the only group with a q10 feature).
        X_train.loc[segment, 'trent_roll_std_' + str(w)] = add_trend_feature(x_roll_std)
        X_train.loc[segment, 'ave_roll_std_' + str(w)] = x_roll_std.mean()
        X_train.loc[segment, 'std_roll_std_' + str(w)] = x_roll_std.std()
        X_train.loc[segment, 'max_roll_std_' + str(w)] = x_roll_std.max()
        X_train.loc[segment, 'min_roll_std_' + str(w)] = x_roll_std.min()
        X_train.loc[segment, 'q01_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.01)
        X_train.loc[segment, 'q05_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.05)
        X_train.loc[segment, 'q10_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.10)
        X_train.loc[segment, 'q95_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.95)
        X_train.loc[segment, 'q99_roll_std_' + str(w)] = np.quantile(x_roll_std, 0.99)
        
        # Rolling mean: trend, aggregations and quantiles.
        X_train.loc[segment, 'trent_roll_mean_' + str(w)] = add_trend_feature(x_roll_mean)
        X_train.loc[segment, 'ave_roll_mean_' + str(w)] = x_roll_mean.mean()
        X_train.loc[segment, 'std_roll_mean_' + str(w)] = x_roll_mean.std()
        X_train.loc[segment, 'max_roll_mean_' + str(w)] = x_roll_mean.max()
        X_train.loc[segment, 'min_roll_mean_' + str(w)] = x_roll_mean.min()
        X_train.loc[segment, 'q01_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.01)
        X_train.loc[segment, 'q05_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.05)
        X_train.loc[segment, 'q95_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.95)
        X_train.loc[segment, 'q99_roll_mean_' + str(w)] = np.quantile(x_roll_mean, 0.99)
        
        # Rolling mean of the absolute signal: trend, aggregations and quantiles.
        X_train.loc[segment, 'trent_roll_abs_mean_' + str(w)] = add_trend_feature(x_roll_abs_mean)
        X_train.loc[segment, 'ave_roll_abs_mean_' + str(w)] = x_roll_abs_mean.mean()
        X_train.loc[segment, 'std_roll_abs_mean_' + str(w)] = x_roll_abs_mean.std()
        X_train.loc[segment, 'max_roll_abs_mean_' + str(w)] = x_roll_abs_mean.max()
        X_train.loc[segment, 'min_roll_abs_mean_' + str(w)] = x_roll_abs_mean.min()
        X_train.loc[segment, 'q01_roll_abs_mean_' + str(w)] = np.quantile(x_roll_abs_mean, 0.01)
        X_train.loc[segment, 'q05_roll_abs_mean_' + str(w)] = np.quantile(x_roll_abs_mean, 0.05)
        X_train.loc[segment, 'q95_roll_abs_mean_' + str(w)] = np.quantile(x_roll_abs_mean, 0.95)
        X_train.loc[segment, 'q99_roll_abs_mean_' + str(w)] = np.quantile(x_roll_abs_mean, 0.99)
        
        # Rolling max of the absolute signal: only its trend is kept.
        X_train.loc[segment, 'trent_roll_abs_max_' + str(w)] = add_trend_feature(x_roll_abs_max)
    
        # Rolling zero-crossing rate: trend, aggregations and quantiles.
        X_train.loc[segment, 'trent_roll_czr_' + str(w)] = add_trend_feature(x_roll_czr)
        X_train.loc[segment, 'ave_roll_czr_' + str(w)] = x_roll_czr.mean()
        X_train.loc[segment, 'std_roll_czr_' + str(w)] = x_roll_czr.std()
        X_train.loc[segment, 'max_roll_czr_' + str(w)] = x_roll_czr.max()
        X_train.loc[segment, 'min_roll_czr_' + str(w)] = x_roll_czr.min()
        X_train.loc[segment, 'q01_roll_czr_' + str(w)] = np.quantile(x_roll_czr, 0.01)
        X_train.loc[segment, 'q05_roll_czr_' + str(w)] = np.quantile(x_roll_czr, 0.05)
        X_train.loc[segment, 'q95_roll_czr_' + str(w)] = np.quantile(x_roll_czr, 0.95)
        X_train.loc[segment, 'q99_roll_czr_' + str(w)] = np.quantile(x_roll_czr, 0.99)

In [None]:
# Load the saved feature/target tables. Each CSV has an 'Unnamed: 0' column
# that is dropped right after reading.
X_train = pd.read_csv("../input/feature/X_train.csv").drop(columns='Unnamed: 0')
y_train = pd.read_csv("../input/feature/y_train.csv").drop(columns='Unnamed: 0')

# The "2" tables are the first set followed by the rows stored under
# ../input/feature2 (row labels are not renumbered by the concatenation).
X_train2 = pd.concat([
    X_train,
    pd.read_csv("../input/feature2/X_train.csv").drop(columns='Unnamed: 0'),
])
y_train2 = pd.concat([
    y_train,
    pd.read_csv("../input/feature2/y_train.csv").drop(columns='Unnamed: 0'),
])


In [None]:
# Report the size of both feature tables, then preview the combined one.
# (Only a cell's final expression is rendered, so a bare X_train.head(3)
# in the middle of the cell would produce no output.)
for label, frame in (("Train shape:", X_train), ("Train2 shape:", X_train2)):
    print(label, frame.shape)
X_train2.head(3)

Scale features and helper functions:

In [None]:
# Number of cross-validation folds used by the helper functions below.
num_folds = 5

# Targets as flat 1-D arrays.
target = y_train.values.flatten()
target2 = y_train2.values.flatten()

# Standardise both feature tables. The same scaler object is fitted twice, so
# after this cell `scaler` holds the statistics of X_train2 only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled2 = scaler.fit_transform(X_train2)

def grid_search_cv(estimator, grid, features, target):
    """Grid-search ``estimator`` over ``grid`` and return the best parameters.

    The search uses ``num_folds``-fold cross-validation scored by negative
    mean absolute error. The best score (sign flipped back to a positive MAE),
    the elapsed wall time and the winning parameters are printed before the
    parameter dict is returned.
    """
    started = time.time()
    search = GridSearchCV(estimator, grid, cv=num_folds, scoring='neg_mean_absolute_error')
    search.fit(features, target)
    elapsed = time.time() - started

    winner = search.best_params_
    print("Best CV score: {:.4f}, time: {:.1f}s".format(-search.best_score_, elapsed))
    print(winner)
    return winner

def make_predictions(estimator, features, target, test=None, plot=True, lgb=False,
                     plot_limit=4194):
    """Fit ``estimator`` fold by fold and return out-of-fold (and test) predictions.

    The data is split with a shuffled ``num_folds``-fold KFold (fixed
    random_state). For each fold the estimator is re-fitted on the training
    part and predicts the held-out part; if ``test`` is given, the per-fold
    test predictions are averaged.

    Parameters
    ----------
    estimator : object with ``fit`` / ``predict``
        The same instance is re-fitted for every fold.
    features, target : array-like supporting integer-array indexing
        Training features and targets (e.g. NumPy arrays).
    test : array-like or None, default None
        Optional features to predict with every fold model.
    plot : bool, default True
        If True, draw predicted-vs-actual and prediction/target curves.
    lgb : bool, default False
        If True, fit with LightGBM-style early stopping using the training
        and validation parts as eval sets.
        NOTE(review): the ``early_stopping_rounds`` / ``verbose`` fit keywords
        were removed in LightGBM 4.0; switch to callbacks when upgrading.
    plot_limit : int or None, default 4194
        Only the first ``plot_limit`` samples are plotted (``None`` plots all).
        The default keeps the previously hard-coded value.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        ``oof_predictions`` alone, or ``(oof_predictions, sub_predictions)``
        when ``test`` is given.
    """
    folds = KFold(num_folds, shuffle=True, random_state=2019)
    oof_predictions = np.zeros(features.shape[0])
    if test is not None:
        sub_predictions = np.zeros(test.shape[0])

    for (train_index, valid_index) in folds.split(features, target):
        if lgb:
            estimator.fit(features[train_index], target[train_index],
                          early_stopping_rounds=100, verbose=False,
                          eval_set=[(features[train_index], target[train_index]),
                                    (features[valid_index], target[valid_index])])
        else:
            estimator.fit(features[train_index], target[train_index])
        oof_predictions[valid_index] = estimator.predict(features[valid_index]).flatten()
        if test is not None:
            # Each fold contributes an equal share to the test prediction.
            sub_predictions += estimator.predict(test).flatten() / num_folds

    # Plot out-of-fold predictions vs actual values
    if plot:
        shown = slice(0, plot_limit)
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 5))
        ax1.set_xlabel('actual')
        ax1.set_ylabel('predicted')
        ax2.set_xlabel('train index')
        ax2.set_ylabel('time to failure')

        ax1.scatter(target[shown], oof_predictions[shown], color='brown')
        # Single y = x reference line from (0, 0) to (20, 20).
        ax1.plot([0, 20], [0, 20], color='blue')

        ax2.plot(oof_predictions[shown], color='orange', label='oof prediction')
        ax2.plot(target[shown], color='blue', label='y_train')
        ax2.legend()

    if test is not None:
        return oof_predictions, sub_predictions
    return oof_predictions

<h2>3. Models</h2>

Let's try a few different models and submit the one with the best validation score. The predicted values in the following plots are computed with an out-of-fold scheme.

<h3>Ridge Regression</h3>

The first model will be a linear regression with L2 regularization.



In [None]:
# Sanity check: feature matrix and target of the second set must have the same number of rows.
print(X_train_scaled2.shape, target2.shape, sep="\n")

In [None]:
# Candidate L2 penalties: 100 values on [0.001, 1] followed by 1000 values on [1, 200].
ridge_alphas = np.concatenate([np.linspace(0.001, 1, 100), np.linspace(1, 200, 1000)])
grid = [{'alpha': ridge_alphas}]

# Pick the best alpha by cross-validation, then compute out-of-fold predictions with it.
rr_params = grid_search_cv(Ridge(), grid, X_train_scaled, target)
ridge_model = Ridge(**rr_params)
ridge_oof = make_predictions(ridge_model, X_train_scaled, target)

The linear model produces some large negative predictions. Since time to failure cannot be negative, we can replace negative predictions with zero:

In [None]:
# Clamp negative predictions to zero in place, then report the MAE.
np.maximum(ridge_oof, 0, out=ridge_oof)
ridge_mae = mean_absolute_error(target, ridge_oof)
print("Mean error: {:.4f}".format(ridge_mae))

Same model as above, but fitted on the data set built from segments with 50% overlap. All results below use this overlapping data set.

In [None]:
# Same alpha candidates as for the first feature set: 100 values on
# [0.001, 1] followed by 1000 values on [1, 200].
ridge_alphas = np.concatenate([np.linspace(0.001, 1, 100), np.linspace(1, 200, 1000)])
grid = [{'alpha': ridge_alphas}]

# Tune and evaluate on the second (concatenated) feature set.
rr_params2 = grid_search_cv(Ridge(), grid, X_train_scaled2, target2)
ridge_model2 = Ridge(**rr_params2)
ridge_oof2 = make_predictions(ridge_model2, X_train_scaled2, target2)

The linear model produces some large negative predictions. Since time to failure cannot be negative, we can replace negative predictions with zero:

In [None]:
# Clamp negative predictions to zero in place, then report the MAE.
np.maximum(ridge_oof2, 0, out=ridge_oof2)
ridge_mae2 = mean_absolute_error(target2, ridge_oof2)
print("Mean error: {:.4f}".format(ridge_mae2))

<h3>Kernel Ridge</h3>

This model combines regularized linear regression with a given kernel (radial basis in this case).

In [None]:
# Search the RBF kernel coefficient (gamma) together with the regularisation strength (alpha).
grid = [dict(gamma=np.linspace(1e-8, 0.1, 10),
             alpha=[0.0005, 0.001, 0.02, 0.08, 0.1])]
kr_params2 = grid_search_cv(KernelRidge(kernel='rbf'), grid, X_train_scaled2, target2)

# Out-of-fold predictions with the selected hyperparameters.
kr_model2 = KernelRidge(kernel='rbf', **kr_params2)
kr_oof2 = make_predictions(kr_model2, X_train_scaled2, target2)

<h3>Gradient Boosting</h3>

The last model is a gradient boosting decision tree. It's not possible to use GridSearchCV with early stopping (lightgbm), so I am using a custom function for random search.

In [None]:
# Settings shared by every candidate; they override any sampled value.
fixed_params = {
    'objective': 'regression_l1',
    'boosting': 'gbdt',
    'verbosity': -1,
    'random_seed': 19,
    'n_estimators': 20000,
}

# Search space: one value per key is drawn at random for each candidate.
# NOTE(review): LightGBM only applies `subsample` (bagging_fraction) when a
# bagging frequency > 0 is also set; as written the sampled `subsample` value
# likely has no effect — confirm and add `subsample_freq` if bagging is wanted.
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01, 0.005],
    'num_leaves': list(range(8, 92, 4)),
    'max_depth': [3, 4, 5, 6, 8, 12, 16, -1],
    'feature_fraction': [0.8, 0.85, 0.9, 0.95, 1],
    'subsample': [0.8, 0.85, 0.9, 0.95, 1],
    'lambda_l1': [0, 0.1, 0.2, 0.4, 0.6, 0.9],
    'lambda_l2': [0, 0.1, 0.2, 0.4, 0.6, 0.9],
    'min_data_in_leaf': [10, 20, 40, 60, 100],
    'min_gain_to_split': [0, 0.001, 0.01, 0.1],
}

# Dedicated, seeded generator: the same candidates are drawn on every run
# without touching the global `random` state.
search_rng = random.Random(19)

# Reset the search result explicitly so a re-run can never pick up values
# left in the kernel by a previous execution.
best_score = float('inf')
best_params = None
best_nrounds = None
dataset = lgb.Dataset(X_train2, label=y_train2)  # no need to scale features

for i in range(20):
    params = {k: search_rng.choice(v) for k, v in param_grid.items()}
    params.update(fixed_params)
    result = lgb.cv(params, dataset, nfold=5, early_stopping_rounds=100,
                    stratified=False)

    # Score of the last recorded boosting round for this candidate.
    final_l1 = result['l1-mean'][-1]
    if final_l1 < best_score:
        best_score = final_l1
        best_params = params
        best_nrounds = len(result['l1-mean'])

In [None]:
# Summarise the winning candidate of the random search.
print("Best mean score: {:.4f}, num rounds: {}".format(best_score, best_nrounds))
print(best_params)

# Out-of-fold predictions on the first feature set (unscaled values), using early stopping.
gbm = lgb.LGBMRegressor(**best_params)
gb_oof = make_predictions(gbm, X_train.values, target, lgb=True)

Now let's have a look at the <b>feature importance</b>:

In [None]:
def plot_feature_importance(features, target, columns):
    """Plot the mean gain-based LightGBM feature importance across CV folds.

    A fresh ``LGBMRegressor(**best_params)`` is fitted with early stopping on
    each fold of a shuffled ``num_folds``-fold split. The per-fold gain
    importances are averaged per feature and shown as a horizontal Plotly bar
    chart, sorted in ascending order of gain.

    Parameters
    ----------
    features, target : array-like supporting integer-array indexing
        Training features and targets (e.g. NumPy arrays).
    columns : sequence of str
        Feature names, in the column order of ``features``.
    """
    folds = KFold(num_folds, shuffle=True, random_state=2019)

    # Collect one frame per fold and concatenate once at the end; this avoids
    # growing a DataFrame inside the loop and concatenating with an empty frame.
    fold_frames = []
    for (train_index, valid_index) in folds.split(features, target):
        reg = lgb.LGBMRegressor(**best_params)
        reg.fit(features[train_index], target[train_index],
                early_stopping_rounds=100, verbose=False,
                eval_set=[(features[train_index], target[train_index]),
                          (features[valid_index], target[valid_index])])
        fold_frames.append(pd.DataFrame({
            "feature": list(columns),
            "gain": reg.booster_.feature_importance(importance_type='gain'),
        }))
    importance_frame = pd.concat(fold_frames, axis=0)

    mean_importance = (
        importance_frame
        .groupby('feature').mean()
        .reset_index()
        .sort_values(by='gain', ascending=True)
    )
    trace = go.Bar(y=mean_importance.feature, x=mean_importance.gain,
                   orientation='h', marker=dict(color='rgb(49,130,189)'))

    # NOTE(review): `titlefont` is a legacy Plotly attribute; newer releases
    # expect the font under `title` — confirm against the installed version.
    layout = go.Layout(
        title='Feature importance', height=1200, width=800,
        showlegend=False,
        xaxis=dict(
            title='Importance by gain',
            titlefont=dict(size=14, color='rgb(107, 107, 107)'),
            domain=[0.25, 1]
        ),
    )

    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)


plot_feature_importance(X_train.values, target, X_train.columns)