In [1]:
import gc
import os
import time
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from scipy.signal import hann
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from scipy.signal import hilbert
from scipy.signal import convolve
from sklearn.svm import NuSVR, SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold,StratifiedKFold, RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")
from typing import TypeVar, List, Dict, Tuple
# TypeVar whose name is the dotted path of the pandas DataFrame class.
# NOTE(review): nothing in the visible cells uses it — confirm before relying on it.
PandasDataFrame = TypeVar('pandas.core.frame.DataFrame')

# Refs: https://www.kaggle.com/byfone/basic-feature-feat-catboost
# Refs: https://www.kaggle.com/kernels/scriptcontent/13873316/download

In [3]:
# Pick the data root: a local LANL folder when IS_LOCAL is set, otherwise the
# shared input directory.
IS_LOCAL = False
PATH = "../input/LANL/" if IS_LOCAL else "../input/"
# The listing itself is discarded; the call still fails fast when PATH is missing.
os.listdir(PATH)

# Training features. The first CSV column is dropped right after loading
# (presumably an index column saved with the file — TODO confirm).
train_X = pd.read_csv(os.path.join(PATH,'../input/basic-features-randomized/scaled_train_X_rand_shuffle.csv'))
train_X = train_X.drop(columns=train_X.columns[0])

# Training targets, loaded and trimmed the same way as the features.
train_y = pd.read_csv(os.path.join(PATH,'../input/basic-features-randomized/train_y_rand2_shuffle.csv'))
train_y = train_y.drop(columns=train_y.columns[0])

# Last expression of the cell: render the first rows of the feature frame.
train_X.head()

Unnamed: 0,mean,std,max,min,Rmean,Rstd,Rmax,Rmin,Imean,Istd,Imax,Imin,Rmean_last_5000,Rstd__last_5000,Rmax_last_5000,Rmin_last_5000,Rmean_last_15000,Rstd_last_15000,Rmax_last_15000,Rmin_last_15000,mean_change_abs,mean_change_rate,abs_max,abs_min,std_first_50000,std_last_50000,std_first_10000,std_last_10000,avg_first_50000,avg_last_50000,avg_first_10000,avg_last_10000,min_first_50000,min_last_50000,min_first_10000,min_last_10000,max_first_50000,max_last_50000,max_first_10000,max_last_10000,...,q01_roll_std_759,q05_roll_std_759,q95_roll_std_759,q99_roll_std_759,av_change_abs_roll_std_759,av_change_rate_roll_std_759,abs_max_roll_std_759,ave_roll_mean_759,std_roll_mean_759,max_roll_mean_759,min_roll_mean_759,q01_roll_mean_759,q05_roll_mean_759,q95_roll_mean_759,q99_roll_mean_759,av_change_abs_roll_mean_759,av_change_rate_roll_mean_759,abs_max_roll_mean_759,ave_roll_std_1000,std_roll_std_1000,max_roll_std_1000,min_roll_std_1000,q01_roll_std_1000,q05_roll_std_1000,q95_roll_std_1000,q99_roll_std_1000,av_change_abs_roll_std_1000,av_change_rate_roll_std_1000,abs_max_roll_std_1000,ave_roll_mean_1000,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
0,-0.866159,-0.269957,-0.151219,0.106208,-0.690624,-0.242498,-0.866159,0.325043,-0.208961,-0.270338,-0.376071,0.376071,-0.236758,-0.267913,-0.331788,0.364167,-0.729032,-0.281551,-0.318025,0.325043,0.120638,0.107414,-0.158886,0.0,-0.292581,-0.129222,-0.636064,-0.269932,-1.191343,-0.398134,-1.358672,-0.259505,0.30603,-0.170096,0.753611,0.345998,-0.325824,0.097631,-0.673641,-0.375425,...,-1.370965,-1.288314,-0.895859,-0.186812,-0.035866,-0.21201,-0.132986,-0.864653,-0.191683,-0.195095,0.125786,-0.162078,-0.637024,-1.026392,-0.602866,0.22465,-0.202246,-0.19386,-0.582107,-0.207736,-0.134641,-1.349465,-1.220609,-1.265844,-0.845943,-0.186879,-0.032153,0.264048,-0.134641,-0.864453,-0.202793,-0.209165,0.114181,-0.182046,-0.669546,-0.991285,-0.693224,-0.159263,0.254931,-0.209165
1,-1.359823,-0.149174,-0.141383,0.148828,-0.225349,-0.177262,-1.359823,0.004611,-0.224443,-0.154322,-0.125572,0.125572,0.011912,-0.189235,-0.173442,0.279324,0.189519,-0.142955,0.062292,0.004611,-0.027405,-1.37482,-0.149948,0.0,0.025787,-0.30426,-0.494023,-0.222812,-1.9897,-0.730608,-1.629616,-0.479359,-0.06029,0.398384,0.539884,0.301323,0.116376,-0.391122,-0.556455,-0.337728,...,-0.489757,-0.532032,-0.319132,-0.082173,-0.033981,-1.218251,-0.108121,-1.361888,0.070308,-0.218407,0.014229,-0.637405,-1.476834,-1.244975,-0.72071,-0.075411,-1.185362,-0.216948,-0.234896,-0.123798,-0.106817,-1.060932,-0.593264,-0.519711,-0.338419,-0.084915,-0.03055,-1.271445,-0.106817,-1.361585,0.109248,-0.213951,-0.070868,-0.700897,-1.533654,-1.262518,-0.781938,0.080959,-1.278397,-0.213951
2,-0.910674,-0.096915,-0.22007,0.22751,0.395018,-0.131183,-0.910674,0.0837,0.255525,-0.097339,-0.068314,0.068314,0.193654,-0.128482,-0.202969,0.22907,0.281613,-0.093124,-0.027335,0.0837,-0.471534,1.58138,-0.221448,0.0,-0.174216,0.045298,-0.365792,0.193018,-0.243173,-1.348183,-0.82042,-0.141645,0.288305,0.044534,0.411648,-0.127553,-0.161416,-0.023119,-0.361146,0.076938,...,0.779945,0.684723,0.182893,-0.119915,0.043069,1.971829,-0.190323,-0.91617,0.001522,-0.213859,-0.020491,-0.427636,-0.907629,-0.712774,-0.500066,0.126192,1.950223,-0.212443,0.037385,-0.130342,-0.182806,0.64415,0.674383,0.624823,0.084665,-0.136731,0.078407,2.296023,-0.182806,-0.917983,0.027625,-0.205975,-0.056389,-0.483923,-0.925578,-0.71282,-0.534806,0.044926,2.29897,-0.205975
3,-1.20844,-0.252248,-0.138104,0.198004,-0.535532,-0.244028,-1.20844,0.352303,0.332939,-0.250578,-0.280843,0.280843,-0.097595,-0.220148,-0.289157,0.282102,-0.139797,-0.265328,-0.166044,0.352303,0.416724,-0.111127,-0.146969,0.0,-0.066788,-0.367231,-0.557934,-0.211106,-1.4549,-1.028206,-1.60153,-1.206595,-0.007115,0.433189,0.561257,0.220909,0.122046,-0.316371,-0.517393,-0.026728,...,-1.062233,-1.044394,-0.697624,-0.143581,-0.032073,-1.120191,-0.199076,-1.206632,-0.166964,-0.182302,0.085279,-0.407552,-0.944954,-1.154691,-0.597852,-0.530191,-1.096603,-0.181189,-0.49626,-0.207264,-0.194952,-0.993794,-1.107091,-1.048331,-0.644325,-0.10581,-0.03238,-0.703565,-0.194952,-1.205603,-0.179014,-0.162375,0.060149,-0.465056,-0.939802,-1.139559,-0.589724,-0.375464,-0.708218,-0.162375
4,-0.091303,-0.049959,0.199594,-0.395392,0.860293,-0.070145,-0.091303,-0.033622,-0.255409,-0.056957,-0.040162,0.040162,0.110735,-0.184604,-0.233829,0.212317,0.131113,-0.032995,-0.132706,-0.033622,-0.545555,-0.3093,0.296926,0.0,-0.088781,0.225578,0.151646,0.957975,-0.737176,0.368894,-0.932432,-0.054869,0.252854,-1.057621,-0.229532,-2.075363,-0.184093,0.712886,0.439623,1.660209,...,0.284801,0.017844,-0.040275,-0.181253,0.151932,-0.033113,0.086451,-0.093199,-0.151842,0.000216,0.019052,0.03207,-0.105144,-0.289865,-0.25435,0.072275,-0.017637,-0.000428,-0.073037,-0.019526,0.068833,1.023482,0.348283,-0.049371,-0.090806,-0.194214,-0.321057,-0.47179,0.068833,-0.093225,-0.161436,-0.013498,0.068978,0.002854,-0.09703,-0.253533,-0.275,0.079243,-0.474514,-0.013498


In [4]:
# Submission template and test features, both indexed by segment id.
submission = pd.read_csv(
    os.path.join(PATH,'LANL-Earthquake-Prediction/sample_submission.csv'),
    index_col='seg_id',
)
# NOTE(review): these test features are read from a different dataset than the
# training features loaded earlier — confirm both frames share the same columns,
# column order and scaling before predicting.
test_X = pd.read_csv(
    os.path.join(PATH,'lanlfeatures198/feature_extraction_ds_test.csv'),
    index_col='seg_id',
)
# Last expression of the cell: show both shapes side by side.
submission.shape, test_X.shape

((2624, 1), (2624, 198))

In [None]:
"""Use cross-validation (e.g., KFold)"""
def fold_generator(x, y, groups=None, num_folds=10, shuffle=True, seed=2019):
    """Yield ``(train_index, test_index)`` pairs from a K-fold split.

    Parameters
    ----------
    x, y : passed straight to ``KFold.split``.
    groups : passed straight to ``KFold.split`` as its third argument.
    num_folds : int
        Number of folds requested from ``KFold``.
    shuffle : bool
        Whether ``KFold`` shuffles before splitting.
    seed : int or None
        Random state for the shuffle. It is forwarded only when ``shuffle``
        is true.

    Yields
    ------
    tuple
        One ``(train_index, test_index)`` pair per fold, exactly as produced
        by ``KFold.split``.
    """
    # A seed is meaningless without shuffling, and scikit-learn >= 0.24 raises
    # ValueError when random_state is set together with shuffle=False.
    random_state = seed if shuffle else None
    folds = KFold(n_splits=num_folds, shuffle=shuffle, random_state=random_state)
    yield from folds.split(x, y, groups)

In [None]:
"""Perform transforms and return final estimator"""
def make_pipeline(estimator):
    """Wrap ``estimator`` in a two-step pipeline.

    The first step, named ``'scaler'``, is a fresh ``StandardScaler``; the
    second, named ``'model'``, is the estimator that was passed in. The
    assembled ``Pipeline`` is returned.
    """
    # (name, transformer-or-estimator) pairs, in execution order.
    steps = [
        ('scaler', StandardScaler()),
        ('model', estimator),
    ]
    return Pipeline(steps)

In [None]:
"""Search the hyperparameters and return the estimator with the best parameters"""
def search_cv(x, y, pipeline, grid, max_iter=None, num_folds=10, shuffle=True, seed=2019):
    """Tune ``pipeline`` over ``grid`` with cross-validation and return the best estimator.

    Parameters
    ----------
    x, y : training features and targets, passed to the fold generator and to
        the search object's ``fit``.
    pipeline : estimator to tune.
    grid : dict
        Parameter grid (for the exhaustive search) or parameter distributions
        (for the randomized search).
    max_iter : int or None
        ``None`` runs an exhaustive ``GridSearchCV``; otherwise a
        ``RandomizedSearchCV`` tries ``max_iter`` parameter settings.
    num_folds : int
        Number of cross-validation folds.
    shuffle : bool
        Whether the folds are shuffled.
    seed : int or None
        Seed used for the folds and for the randomized parameter sampling,
        so repeated runs try the same candidates.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.

    Side effects
    ------------
    Prints the best mean absolute error, the elapsed time in seconds, and the
    best parameter set.
    """
    started = time.time()

    # Index pairs for every fold; `shuffle` and `seed` are honoured here.
    cv = fold_generator(x, y, num_folds=num_folds, shuffle=shuffle, seed=seed)
    if max_iter is None:
        # Exhaustive search over every combination in the grid.
        search = GridSearchCV(pipeline, grid, cv=cv,
                              scoring='neg_mean_absolute_error')
    else:
        # Randomized search: only `max_iter` sampled settings are evaluated.
        search = RandomizedSearchCV(pipeline, grid, n_iter=max_iter, cv=cv,
                                    scoring='neg_mean_absolute_error',
                                    random_state=seed)
    search.fit(x, y)

    elapsed = time.time() - started
    # The scorer is the negated MAE, so flip the sign back for display.
    print("Best CV score: {:.4f}, time: {:.1f}s".format(-search.best_score_, elapsed))
    print(search.best_params_)
    return search.best_estimator_

In [None]:
"""Train, make predictions & plot results"""
def make_predictions(x, y, pipeline, num_folds=10, shuffle=True, test=None, plot=True):
    """Produce out-of-fold predictions and, optionally, averaged test predictions.

    For every fold, ``pipeline`` is refit on the training rows and used to
    predict the held-out rows. When ``test`` is given, each fold's model also
    predicts ``test`` and the results are averaged over ``num_folds``.

    Parameters
    ----------
    x, y : training features and targets; both are indexed with ``.iloc`` and
        ``x`` must expose ``.shape``.
    pipeline : estimator with ``fit`` and ``predict``. It is refit in place on
        every fold, so after the call it holds the last fold's fit.
    num_folds : int
        Number of cross-validation folds.
    shuffle : bool
        Whether the folds are shuffled.
    test : optional feature set to predict with every fold's model; must
        expose ``.shape``.
    plot : bool
        When true, calls ``plot_predictions(y, oof_prediction)``.

    Returns
    -------
    ``oof_prediction`` alone when ``test`` is ``None``, otherwise the tuple
    ``(oof_prediction, sub_prediction)``.
    """
    has_test = test is not None
    oof_prediction = np.zeros(x.shape[0])
    sub_prediction = np.zeros(test.shape[0]) if has_test else None

    # `shuffle` is forwarded so the caller's choice actually reaches the splitter.
    for tr_idx, val_idx in fold_generator(x, y, num_folds=num_folds, shuffle=shuffle):
        pipeline.fit(x.iloc[tr_idx], y.iloc[tr_idx])
        oof_prediction[val_idx] = pipeline.predict(x.iloc[val_idx])

        if has_test:
            # Each fold contributes an equal share of the final test prediction.
            sub_prediction += pipeline.predict(test) / num_folds

    if plot:
        plot_predictions(y, oof_prediction)
    if has_test:
        return oof_prediction, sub_prediction
    return oof_prediction

In [None]:
def plot_predictions(y, oof_predictions):
    """Plot out-of-fold predictions against the actual values.

    Draws a figure with two panels:

    * left: scatter of actual (x axis) versus predicted (y axis) values, with
      the identity line from (0, 0) to (20, 20) as a reference;
    * right: the actual values and the predictions plotted as two labelled
      lines against their position/index.

    Parameters
    ----------
    y : actual target values.
    oof_predictions : out-of-fold predictions for the same rows.

    Returns
    -------
    None. The figure is created through ``plt.subplots``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: predicted vs actual.
    ax1.set_title('out-of-fold predictions vs actual')
    ax1.set_xlabel('actual')
    ax1.set_ylabel('predicted')
    ax1.set_ylim([-5, 20])
    ax1.scatter(y, oof_predictions, color='brown')
    # Identity line, given as flat x and y coordinates so it is drawn once.
    ax1.plot([0, 20], [0, 20], color='blue')

    # Right panel: both series over the training rows.
    ax2.set_title('actual and predicted time to failure')
    ax2.set_xlabel('train index')
    ax2.set_ylabel('time to failure')
    ax2.set_ylim([-2, 18])
    ax2.plot(y, color='blue', label='y_train')
    ax2.plot(oof_predictions, color='orange', label='oof_prediction')
    # Without a legend the line labels are never displayed.
    ax2.legend()

In [None]:
# Candidate hyper-parameters for the random forest. The `model__` prefix
# routes each entry to the pipeline step named 'model'.
# NOTE(review): max_features='auto' and criterion='mae' are only accepted by
# older scikit-learn releases — confirm the installed version.
grid = dict(
    model__max_depth=[4, 6, 8, 10, 12],
    model__max_features=['auto', 'sqrt', 'log2'],
    model__min_samples_leaf=[2, 4, 8, 12, 14, 16, 20],
    model__min_samples_split=[2, 4, 6, 8, 12, 16, 20],
)

# Scaler + forest pipeline, then a randomized search limited to 10 settings;
# `rf_pipe` is rebound to the best estimator the search returns.
rf_pipe = make_pipeline(RandomForestRegressor(n_estimators=50, criterion='mae'))
rf_pipe = search_cv(
    train_X,
    train_y,
    rf_pipe,
    grid,
    max_iter=10,
)

# Out-of-fold predictions for the tuned pipeline (plots by default).
rf_oof = make_predictions(train_X, train_y, rf_pipe)

In [None]:
# Repeat the cross-validated fit, this time also predicting the test features;
# plotting is switched off for this pass.
rf_oof, rf_sub = make_predictions(
    train_X,
    train_y,
    rf_pipe,
    test=test_X,
    plot=False,
)

In [None]:
# `rf_sub` follows the row order of `test_X`, while `submission` was read from a
# separate file. Both frames are indexed by seg_id, so wrap the predictions in a
# Series keyed by `test_X.index` and let pandas align rows by label instead of by
# position. Item assignment is used because attribute assignment only works
# when the column already exists.
submission['time_to_failure'] = pd.Series(rf_sub, index=test_X.index)
# Write the seg_id index as the first column of the output file.
submission.to_csv('submission.csv', index=True)