In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test', 'train.csv', 'sample_submission.csv']


In [28]:
import time
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from tqdm import tqdm
import scipy as sp
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
from numpy import inf
pd.options.display.precision = 15

In [29]:
import os
import gc
from joblib import Parallel, delayed

In [30]:
def classic_sta_lta(x, Ns, Nl):
    sta = np.cumsum(x ** 2)
    sta = np.require(sta, dtype=np.float)
    lta = sta.copy()
    lta[Nl:-Ns] = lta[Nl:-Ns] - lta[:-Nl-Ns]
    lta /= Nl
    sta[Nl+Ns-1:] = sta[Nl+Ns-1:] - sta[Nl-1:-Ns]
    sta /= Ns
    sta[:Nl - 1 + Ns] = 0
    dtiny = np.finfo(0.0).tiny
    idx = lta < dtiny
    lta[idx] = dtiny
    return sta / lta

Taken from [Abhishek's Kernel](https://www.kaggle.com/abhishek/quite-a-few-features-1-51). Features used here are different though.

In [31]:
class FeatureGenerator(object):
    def __init__(self, dtype, n_jobs=1, chunk_size=None):
        self.chunk_size = chunk_size
        self.dtype = dtype
        self.filename = None
        self.n_jobs = n_jobs
        self.test_files = []
        if self.dtype == 'train':
            self.filename = '../input/train.csv'
            self.total_data = int(629145481 / self.chunk_size)
        else:
            submission = pd.read_csv('../input/sample_submission.csv')
            for seg_id in submission.seg_id.values:
                self.test_files.append((seg_id, '../input/test/' + seg_id + '.csv'))
            self.total_data = int(len(submission))

    def read_chunks(self):
        if self.dtype == 'train':
            iter_df = pd.read_csv(self.filename, iterator=True, chunksize=self.chunk_size,
                                  dtype={'acoustic_data': np.float64, 'time_to_failure': np.float64})
            for counter, df in enumerate(iter_df):
                x = df.acoustic_data.values
                y = df.time_to_failure.values[-1]
                seg_id = 'train_' + str(counter)
                del df
                yield seg_id, x, y
        else:
            for seg_id, f in self.test_files:
                df = pd.read_csv(f, dtype={'acoustic_data': np.float64})
                x = df.acoustic_data.values[-self.chunk_size:]
                del df
                yield seg_id, x, -999

    def features(self, x, y, seg_id):
        feature_dict = dict()
        feature_dict['target'] = y
        feature_dict['seg_id'] = seg_id

        # create features here
        feature_dict['mean'] = np.mean(x)
        feature_dict['max'] = np.max(x)
        feature_dict['min'] = np.min(x)
        feature_dict['std'] = np.std(x)
        feature_dict['var'] = np.var(x)
        feature_dict['quantile_03'] = np.quantile(x, 0.03)
        feature_dict['skew'] = sp.stats.skew(x)
        feature_dict['kurtosis'] = sp.stats.kurtosis(x)
        feature_dict['moment_3'] = sp.stats.moment(x, 3)
        
        pct_change = pd.Series(x).pct_change()
        pct_change[pct_change == -inf] = 0
        pct_change[pct_change == inf] = 0
        feature_dict['pct_change_mean'] = pct_change.mean()
        rate_change = pd.Series(x).pct_change().pct_change()
        rate_change[rate_change == -inf] = 0
        rate_change[rate_change == inf] = 0
        feature_dict['rate_change_max'] = rate_change.max()
        feature_dict['rate_change_mean'] = rate_change.mean()
        feature_dict['classic_sta_lta_mean'] = classic_sta_lta(x, 100, 5000).mean()
        
        window_size = 10
        x_roll_std = pd.Series(x).rolling(window_size).std().dropna().values
        feature_dict['q03_roll_std_' + str(window_size)] = np.quantile(x_roll_std, 0.03)
        window_size = 150
        x_roll_std = pd.Series(x).rolling(window_size).std().dropna().values
        feature_dict['q03_roll_std_' + str(window_size)] = np.quantile(x_roll_std, 0.03)
        
        return feature_dict
    
    def generate(self):
        feature_list = []
        res = Parallel(n_jobs=self.n_jobs,
                       backend='threading')(delayed(self.features)(x, y, s)
                                            for s, x, y in tqdm(self.read_chunks(), total=self.total_data))
        for r in res:
            feature_list.append(r)
        return pd.DataFrame(feature_list)
    
training_fg = FeatureGenerator(dtype='train', n_jobs=10, chunk_size=150000)
training_data = training_fg.generate()

test_fg = FeatureGenerator(dtype='test', n_jobs=10, chunk_size=150000)
test_data = test_fg.generate()
        

4195it [03:47, 18.44it/s]                          
100%|██████████| 2624/2624 [02:06<00:00, 20.66it/s]


In [32]:
X = training_data.drop(['target', 'seg_id'], axis=1)
X_test = test_data.drop(['target', 'seg_id'], axis=1)
test_segs = test_data.seg_id
y = training_data.target

In [33]:
folds = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros((len(X), 1))
test_preds = np.zeros((len(X_test), 1))

In [34]:
params = {
    "learning_rate": 0.01,
    "max_depth": 3,
    "n_estimators": 10000,
    "min_child_weight": 4,
    "colsample_bytree": 1,
    "subsample": 0.9,
    "nthread": 12,
    "random_state": 42
}

In [35]:
for fold_, (trn_, val_) in enumerate(folds.split(X)):
    print("Current Fold: {}".format(fold_))
    trn_x, trn_y = X.iloc[trn_], y.iloc[trn_]
    val_x, val_y = X.iloc[val_], y.iloc[val_]

    clf = xgb.XGBRegressor(**params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric='mae',
        verbose=150,
        early_stopping_rounds=100
    )
    val_pred = clf.predict(val_x, ntree_limit=clf.best_ntree_limit)
    test_fold_pred = clf.predict(X_test, ntree_limit=clf.best_ntree_limit)
    print("MAE = {}".format(mean_absolute_error(val_y, val_pred)))
    oof_preds[val_, :] = val_pred.reshape((-1, 1))
    test_preds += test_fold_pred.reshape((-1, 1))
test_preds /= 5

oof_score = mean_absolute_error(y, oof_preds)
print("Mean MAE = {}".format(oof_score))

Current Fold: 0
[0]	validation_0-mae:5.16502	validation_1-mae:5.14112
Multiple eval metrics have been passed: 'validation_1-mae' will be used for early stopping.

Will train until validation_1-mae hasn't improved in 100 rounds.
[150]	validation_0-mae:2.13414	validation_1-mae:2.19956
[300]	validation_0-mae:1.96297	validation_1-mae:2.09104
Stopping. Best iteration:
[270]	validation_0-mae:1.97196	validation_1-mae:2.08829

MAE = 2.0882864178712217
Current Fold: 1
[0]	validation_0-mae:5.16701	validation_1-mae:5.13132
Multiple eval metrics have been passed: 'validation_1-mae' will be used for early stopping.

Will train until validation_1-mae hasn't improved in 100 rounds.
[150]	validation_0-mae:2.14262	validation_1-mae:2.16153
[300]	validation_0-mae:1.97079	validation_1-mae:2.03739
Stopping. Best iteration:
[268]	validation_0-mae:1.97963	validation_1-mae:2.03706

MAE = 2.03706235715146
Current Fold: 2
[0]	validation_0-mae:5.20054	validation_1-mae:4.99898
Multiple eval metrics have been pass

In [36]:
print(clf.feature_importances_)

[0.01522332 0.01312999 0.01295322 0.0183396  0.0111583  0.01447534
 0.01787995 0.6591948  0.15391296 0.01395802 0.01176884 0.02993136
 0.01381253 0.01426176 0.        ]


In [37]:
print(training_data.columns)

Index(['classic_sta_lta_mean', 'kurtosis', 'max', 'mean', 'min', 'moment_3',
       'pct_change_mean', 'q03_roll_std_10', 'q03_roll_std_150', 'quantile_03',
       'rate_change_max', 'rate_change_mean', 'seg_id', 'skew', 'std',
       'target', 'var'],
      dtype='object')


In [38]:
feature_importance = pd.concat([pd.Series(list(set(list(training_data)) - set(['seg_id', 'target']))), pd.Series(clf.feature_importances_)], axis = 1, keys = ['feature', 'importance'])

In [39]:
feature_importance.sort_values(by = ['importance'], ascending = False, inplace = True)

In [40]:
feature_importance

Unnamed: 0,feature,importance
7,min,0.659194827079773
8,var,0.153912961483002
11,max,0.029931357130408
3,q03_roll_std_10,0.018339600414038
6,rate_change_max,0.017879946157336
0,quantile_03,0.015223322436213
5,q03_roll_std_150,0.014475340023637
13,std,0.014261762611568
9,rate_change_mean,0.013958019204438
12,moment_3,0.013812527060509


In [41]:
submission = pd.DataFrame(columns=['seg_id', 'time_to_failure'])
submission.seg_id = test_segs
submission.time_to_failure = test_preds
submission.to_csv('submission.csv', index=False)