In [340]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
from scipy.signal import argrelextrema
from tqdm import tqdm
from sklearn.svm import SVR,NuSVR
from sklearn.model_selection import GridSearchCV
from multiprocessing import Pool
from scipy import stats
from joblib import Parallel, delayed
from sklearn.neural_network import MLPRegressor
from tsfresh.feature_extraction import feature_calculators

subset_size = [629145481, 
               100147179,
               18227196][0]

In [311]:
train=pd.read_csv("../input/train.csv",nrows=subset_size, dtype={"acoustic_data": np.int16, "time_to_failure": np.float32})

In [341]:
def exps_to_data(train):
    diff = train['time_to_failure'].diff().abs()
    diff[0] = 0.001
    exps_indices = np.array(diff[diff > 0.1].index)
    
    l = 0
    r = 150000
    jump = 35345
    while r < train.shape[0]:
        if all(np.logical_or(exps_indices <= l, exps_indices >= r)):
            if train.iloc[l: r, :]['time_to_failure'].values[-1]>0.5:
                yield train.iloc[l: r, :]
            l += jump
            r += jump
        else:
            l += 1
            r += 1
segments = sum(1 for i in exps_to_data(train))
print(segments)

454


In [327]:
col_names = ['mean','max','variance','min', 'stdev', 'q1', 'q5','q95', 'q99',
             'A0',
'fft0_mean','fft0_max','fft0_min','fft0_q1','fft0_q5','fft0_q25','fft0_q50','fft0_q75','fft0_q95','fft0_q99','fft0_std',
'fft1_mean','fft1_max','fft1_min','fft1_q1','fft1_q5','fft1_q25','fft1_q50','fft1_q75','fft1_q95','fft1_q99','fft1_std',
'fft2_mean','fft2_max','fft2_min','fft2_q1','fft2_q5','fft2_q25','fft2_q50','fft2_q75','fft2_q95','fft2_q99','fft2_std',
'fft3_mean','fft3_max','fft3_min','fft3_q1','fft3_q5','fft3_q25','fft3_q50','fft3_q75','fft3_q95','fft3_q99','fft3_std',
             'q05_rolling_std_100', 'num_peak_1', 'autocorrelation_1'
            ]    

def preprocess(seg, segment, X1, Y1=None):
        if not Y1 is None:
            Y1.loc[segment, 'time_to_failure'] = seg['time_to_failure'].values[-1]

        x = seg['acoustic_data'].values
        X1.loc[segment, 'mean'] = x.mean()
        X1.loc[segment, 'stdev'] = x.std()
        X1.loc[segment, 'variance'] = X1.loc[segment, 'stdev']**2
        X1.loc[segment, 'max'] = x.max()
        X1.loc[segment, 'min'] = x.min()
        X1.loc[segment][['q1','q5','q95','q99']] =  np.quantile(x, [0.01,0.05,0.95,0.99]).transpose()
        X1.loc[segment, 'q05_rolling_std_100'] = np.quantile(seg['acoustic_data'].rolling(100).std().dropna().values, 0.05)
        
        X1.loc[segment, 'autocorrelation_1'] = feature_calculators.autocorrelation(x, 1)
        X1.loc[segment, 'num_peak_1'] = feature_calculators.number_peaks(x, 1)
        
        fft = np.abs(np.fft.fft(x)[:11000])
        X1.loc[segment, 'A0'] = fft[0]
        fft_lines = [1500,2500,4100,7900,11000]
        for i in range(len(fft_lines)-1):
            cur = fft[fft_lines[i]:fft_lines[i+1]]
            X1.loc[segment,f'fft{i}_mean'] = cur.mean()
            X1.loc[segment,f'fft{i}_max'] = cur.max()
            X1.loc[segment,f'fft{i}_min'] = cur.min()
            X1.loc[segment,f'fft{i}_q1'] = np.quantile(cur, 0.5)
            X1.loc[segment,f'fft{i}_q5'] = np.quantile(cur, 0.5)
            X1.loc[segment,f'fft{i}_q25'] = np.quantile(cur, 0.5)
            X1.loc[segment,f'fft{i}_q50'] = np.quantile(cur, 0.5)
            X1.loc[segment,f'fft{i}_q75'] = np.quantile(cur, 0.5)
            X1.loc[segment,f'fft{i}_q95'] = np.quantile(cur, 0.5)
            X1.loc[segment,f'fft{i}_q99'] = np.quantile(cur, 0.5)
            X1.loc[segment,f'fft{i}_std'] = cur.std()
cnt_segments = sum([1 for _ in exps_to_data(train)])

In [328]:
X1 = pd.DataFrame(index=range(segments), dtype=np.float64, columns=col_names)
Y1 = pd.DataFrame(index=range(segments), dtype=np.float64, columns=['time_to_failure'])

_=[preprocess(seg, segment, X1, Y1) for segment, seg in tqdm(enumerate(exps_to_data(train)),total=cnt_segments)]


  0%|          | 0/52 [00:00<?, ?it/s][A
  2%|▏         | 1/52 [00:00<00:06,  8.08it/s][A
  8%|▊         | 4/52 [00:00<00:04, 10.02it/s][A
 13%|█▎        | 7/52 [00:00<00:03, 12.14it/s][A
 19%|█▉        | 10/52 [00:00<00:02, 14.11it/s][A
 25%|██▌       | 13/52 [00:00<00:03, 12.60it/s][A
 31%|███       | 16/52 [00:00<00:02, 14.65it/s][A
 37%|███▋      | 19/52 [00:01<00:01, 16.52it/s][A
 42%|████▏     | 22/52 [00:01<00:01, 18.15it/s][A
 48%|████▊     | 25/52 [00:01<00:01, 19.36it/s][A
 54%|█████▍    | 28/52 [00:01<00:01, 20.40it/s][A
 60%|█████▉    | 31/52 [00:01<00:01, 21.00it/s][A
 65%|██████▌   | 34/52 [00:01<00:00, 21.71it/s][A
 71%|███████   | 37/52 [00:01<00:00, 22.31it/s][A
 77%|███████▋  | 40/52 [00:01<00:00, 22.85it/s][A
 83%|████████▎ | 43/52 [00:02<00:00, 23.17it/s][A
 88%|████████▊ | 46/52 [00:02<00:00, 23.46it/s][A
 94%|█████████▍| 49/52 [00:02<00:00, 23.50it/s][A
100%|██████████| 52/52 [00:02<00:00, 23.43it/s][A

In [329]:
# segments = [x for x in exps_to_data(train)]
# X1 = pd.DataFrame(index=range(len(segments)), dtype=np.float64)
# for seg_id, seg in enumerate(segments):
#     x = seg['acoustic_data'].values
#     X1.loc[seg_id,'time_to_failure'] = seg['time_to_failure'].values[-1]
#     std_res = seg['acoustic_data'].rolling(1000).std().dropna().values
#     X1.loc[seg_id, 'q5'] = np.quantile(std_res, 0.05)



In [330]:
# sc=StandardScaler()
# X2 = pd.DataFrame(sc.fit_transform(X1), columns = X1.columns)
# plt.plot(X2)
# plt.show()

In [331]:
sc=StandardScaler()
sc.fit(X1)
scX = pd.DataFrame(sc.transform(X1), columns = X1.columns)

In [332]:
parameters = [{'hidden_layer_sizes': [(100,), (100,100,), (100,100,100,)], 'activation': ['logistic', 'tanh', 'relu']}]
model = GridSearchCV(MLPRegressor(), parameters, cv=5, scoring='neg_mean_absolute_error', n_jobs=4)
model.fit(scX, Y1.values.flatten())



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100, 100, 100)], 'activation': ['logistic', 'tanh', 'relu']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [333]:
model.cv_results_



{'mean_fit_time': array([0.63217359, 2.41135283, 3.61105242, 0.55295267, 1.82050338,
        3.374157  , 0.47811394, 1.48810306, 2.08771238]),
 'std_fit_time': array([0.10002107, 0.27053276, 0.17376128, 0.21799129, 0.23695676,
        1.3635432 , 0.06865193, 0.06488646, 0.5870199 ]),
 'mean_score_time': array([0.00264935, 0.00194688, 0.00208597, 0.00455799, 0.00316343,
        0.00210915, 0.00308304, 0.00165997, 0.00330353]),
 'std_score_time': array([1.10658628e-03, 1.71539502e-04, 1.12528255e-04, 5.79634793e-03,
        2.72444792e-03, 1.19738985e-04, 2.88030609e-03, 3.73977154e-05,
        2.70761684e-03]),
 'param_activation': masked_array(data=['logistic', 'logistic', 'logistic', 'tanh', 'tanh',
                    'tanh', 'relu', 'relu', 'relu'],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_hidden_layer_sizes': masked_array(data=[(100,), (100, 100), (100, 100, 10

In [334]:
sub=pd.read_csv("../input/sample_submission.csv",index_col='seg_id')
xtest=pd.DataFrame(columns=X1.columns,dtype=np.float64,index=sub.index)

In [335]:
_ = Parallel(n_jobs=20, require='sharedmem', verbose=0)(
    delayed(preprocess)(pd.read_csv('../input/test/' + seg_id + '.csv'), seg_id, xtest) for seg_id in tqdm(xtest.index))


  0%|          | 0/2624 [00:00<?, ?it/s][A
  0%|          | 5/2624 [00:00<01:03, 40.96it/s][A
  0%|          | 8/2624 [00:00<01:17, 33.76it/s][A
  0%|          | 10/2624 [00:00<01:35, 27.47it/s][A
  0%|          | 13/2624 [00:00<01:40, 26.03it/s][A
  1%|          | 15/2624 [00:00<01:54, 22.81it/s][A
  1%|          | 17/2624 [00:00<02:01, 21.43it/s][A
  1%|          | 20/2624 [00:00<01:59, 21.76it/s][A
  1%|          | 23/2624 [00:00<01:56, 22.34it/s][A
  1%|          | 26/2624 [00:01<02:00, 21.54it/s][A
  1%|          | 29/2624 [00:01<02:00, 21.53it/s][A
  1%|          | 32/2624 [00:01<02:12, 19.61it/s][A
  1%|▏         | 34/2624 [00:01<02:17, 18.82it/s][A
  1%|▏         | 36/2624 [00:01<02:27, 17.60it/s][A
  1%|▏         | 38/2624 [00:01<02:24, 17.84it/s][A
  2%|▏         | 40/2624 [00:01<02:27, 17.53it/s][A
  2%|▏         | 42/2624 [00:02<02:22, 18.08it/s][A
  2%|▏         | 44/2624 [00:02<02:24, 17.89it/s][A
  2%|▏         | 47/2624 [00:02<02:12, 19.44it/s][A
  2

 15%|█▌        | 399/2624 [00:20<01:47, 20.66it/s][A
 15%|█▌        | 402/2624 [00:20<01:49, 20.30it/s][A
 15%|█▌        | 405/2624 [00:20<01:55, 19.13it/s][A
 16%|█▌        | 408/2624 [00:20<01:46, 20.77it/s][A
 16%|█▌        | 411/2624 [00:20<01:43, 21.29it/s][A
 16%|█▌        | 414/2624 [00:20<01:46, 20.72it/s][A
 16%|█▌        | 417/2624 [00:20<01:46, 20.66it/s][A
 16%|█▌        | 420/2624 [00:21<01:56, 18.86it/s][A
 16%|█▌        | 422/2624 [00:21<01:58, 18.59it/s][A
 16%|█▌        | 424/2624 [00:21<01:56, 18.92it/s][A
 16%|█▋        | 427/2624 [00:21<01:53, 19.34it/s][A
 16%|█▋        | 430/2624 [00:21<01:46, 20.54it/s][A
 17%|█▋        | 433/2624 [00:21<01:52, 19.41it/s][A
 17%|█▋        | 435/2624 [00:21<01:56, 18.87it/s][A
 17%|█▋        | 438/2624 [00:22<01:49, 20.00it/s][A
 17%|█▋        | 441/2624 [00:22<01:46, 20.52it/s][A
 17%|█▋        | 444/2624 [00:22<01:50, 19.75it/s][A
 17%|█▋        | 447/2624 [00:22<01:57, 18.53it/s][A
 17%|█▋        | 449/2624 [0

 30%|███       | 798/2624 [00:40<01:34, 19.39it/s][A
 30%|███       | 800/2624 [00:40<01:33, 19.41it/s][A
 31%|███       | 802/2624 [00:40<01:33, 19.46it/s][A
 31%|███       | 804/2624 [00:40<01:37, 18.72it/s][A
 31%|███       | 807/2624 [00:40<01:29, 20.31it/s][A
 31%|███       | 810/2624 [00:41<01:30, 20.07it/s][A
 31%|███       | 813/2624 [00:41<01:25, 21.13it/s][A
 31%|███       | 816/2624 [00:41<01:29, 20.25it/s][A
 31%|███       | 819/2624 [00:41<01:27, 20.67it/s][A
 31%|███▏      | 822/2624 [00:41<01:26, 20.92it/s][A
 31%|███▏      | 825/2624 [00:41<01:27, 20.48it/s][A
 32%|███▏      | 828/2624 [00:41<01:28, 20.39it/s][A
 32%|███▏      | 831/2624 [00:42<01:24, 21.20it/s][A
 32%|███▏      | 834/2624 [00:42<01:26, 20.64it/s][A
 32%|███▏      | 837/2624 [00:42<01:28, 20.30it/s][A
 32%|███▏      | 840/2624 [00:42<01:29, 20.04it/s][A
 32%|███▏      | 843/2624 [00:42<01:30, 19.70it/s][A
 32%|███▏      | 845/2624 [00:42<01:30, 19.60it/s][A
 32%|███▏      | 847/2624 [0

 46%|████▌     | 1196/2624 [01:00<01:11, 20.05it/s][A
 46%|████▌     | 1199/2624 [01:00<01:08, 20.74it/s][A
 46%|████▌     | 1202/2624 [01:00<01:10, 20.16it/s][A
 46%|████▌     | 1205/2624 [01:01<01:08, 20.76it/s][A
 46%|████▌     | 1208/2624 [01:01<01:08, 20.75it/s][A
 46%|████▌     | 1211/2624 [01:01<01:12, 19.47it/s][A
 46%|████▌     | 1213/2624 [01:01<01:16, 18.40it/s][A
 46%|████▋     | 1216/2624 [01:01<01:11, 19.70it/s][A
 46%|████▋     | 1219/2624 [01:01<01:09, 20.35it/s][A
 47%|████▋     | 1222/2624 [01:01<01:07, 20.73it/s][A
 47%|████▋     | 1225/2624 [01:02<01:10, 19.96it/s][A
 47%|████▋     | 1228/2624 [01:02<01:15, 18.59it/s][A
 47%|████▋     | 1230/2624 [01:02<01:19, 17.47it/s][A
 47%|████▋     | 1232/2624 [01:02<01:19, 17.53it/s][A
 47%|████▋     | 1235/2624 [01:02<01:12, 19.28it/s][A
 47%|████▋     | 1237/2624 [01:02<01:15, 18.29it/s][A
 47%|████▋     | 1239/2624 [01:02<01:15, 18.45it/s][A
 47%|████▋     | 1242/2624 [01:02<01:09, 20.00it/s][A
 47%|████▋

 60%|██████    | 1576/2624 [01:20<00:50, 20.66it/s][A
 60%|██████    | 1579/2624 [01:20<00:54, 19.35it/s][A
 60%|██████    | 1582/2624 [01:20<00:51, 20.23it/s][A
 60%|██████    | 1585/2624 [01:20<00:52, 19.97it/s][A
 61%|██████    | 1588/2624 [01:20<00:50, 20.40it/s][A
 61%|██████    | 1591/2624 [01:20<00:52, 19.55it/s][A
 61%|██████    | 1594/2624 [01:21<00:50, 20.29it/s][A
 61%|██████    | 1597/2624 [01:21<00:55, 18.65it/s][A
 61%|██████    | 1599/2624 [01:21<00:58, 17.42it/s][A
 61%|██████    | 1601/2624 [01:21<00:59, 17.30it/s][A
 61%|██████    | 1604/2624 [01:21<00:54, 18.74it/s][A
 61%|██████    | 1607/2624 [01:21<00:51, 19.79it/s][A
 61%|██████▏   | 1610/2624 [01:21<00:50, 20.19it/s][A
 61%|██████▏   | 1613/2624 [01:21<00:48, 20.86it/s][A
 62%|██████▏   | 1616/2624 [01:22<00:52, 19.37it/s][A
 62%|██████▏   | 1618/2624 [01:22<00:55, 18.05it/s][A
 62%|██████▏   | 1620/2624 [01:22<00:56, 17.82it/s][A
 62%|██████▏   | 1622/2624 [01:22<00:54, 18.28it/s][A
 62%|█████

 75%|███████▌  | 1975/2624 [01:40<00:37, 17.33it/s][A
 75%|███████▌  | 1977/2624 [01:40<00:36, 17.74it/s][A
 75%|███████▌  | 1980/2624 [01:40<00:33, 19.44it/s][A
 76%|███████▌  | 1983/2624 [01:40<00:31, 20.64it/s][A
 76%|███████▌  | 1986/2624 [01:40<00:31, 20.02it/s][A
 76%|███████▌  | 1989/2624 [01:40<00:31, 20.15it/s][A
 76%|███████▌  | 1992/2624 [01:41<00:30, 20.60it/s][A
 76%|███████▌  | 1995/2624 [01:41<00:30, 20.33it/s][A
 76%|███████▌  | 1998/2624 [01:41<00:31, 20.01it/s][A
 76%|███████▋  | 2001/2624 [01:41<00:31, 19.98it/s][A
 76%|███████▋  | 2004/2624 [01:41<00:29, 20.72it/s][A
 76%|███████▋  | 2007/2624 [01:41<00:31, 19.61it/s][A
 77%|███████▋  | 2009/2624 [01:42<00:33, 18.62it/s][A
 77%|███████▋  | 2011/2624 [01:42<00:36, 16.79it/s][A
 77%|███████▋  | 2013/2624 [01:42<00:35, 17.31it/s][A
 77%|███████▋  | 2016/2624 [01:42<00:32, 18.90it/s][A
 77%|███████▋  | 2019/2624 [01:42<00:31, 19.46it/s][A
 77%|███████▋  | 2022/2624 [01:42<00:29, 20.53it/s][A
 77%|█████

 90%|█████████ | 2373/2624 [02:00<00:12, 20.47it/s][A
 91%|█████████ | 2376/2624 [02:00<00:12, 19.95it/s][A
 91%|█████████ | 2379/2624 [02:00<00:12, 19.86it/s][A
 91%|█████████ | 2381/2624 [02:00<00:12, 19.85it/s][A
 91%|█████████ | 2383/2624 [02:00<00:12, 18.81it/s][A
 91%|█████████ | 2385/2624 [02:01<00:13, 17.50it/s][A
 91%|█████████ | 2387/2624 [02:01<00:13, 17.77it/s][A
 91%|█████████ | 2390/2624 [02:01<00:12, 19.26it/s][A
 91%|█████████ | 2392/2624 [02:01<00:12, 19.01it/s][A
 91%|█████████▏| 2395/2624 [02:01<00:11, 19.35it/s][A
 91%|█████████▏| 2397/2624 [02:01<00:11, 19.13it/s][A
 91%|█████████▏| 2400/2624 [02:01<00:11, 20.24it/s][A
 92%|█████████▏| 2403/2624 [02:01<00:10, 20.78it/s][A
 92%|█████████▏| 2406/2624 [02:02<00:10, 20.73it/s][A
 92%|█████████▏| 2409/2624 [02:02<00:10, 20.48it/s][A
 92%|█████████▏| 2412/2624 [02:02<00:10, 20.36it/s][A
 92%|█████████▏| 2415/2624 [02:02<00:10, 19.69it/s][A
 92%|█████████▏| 2417/2624 [02:02<00:11, 18.27it/s][A
 92%|█████

In [336]:
sctestx = pd.DataFrame(sc.transform(xtest), columns = xtest.columns)

In [337]:
pred = model.predict(sctestx)
print(pred.shape)

(2624,)


In [338]:
sub['time_to_failure'] = pred
sub.head()

Unnamed: 0_level_0,time_to_failure
seg_id,Unnamed: 1_level_1
seg_00030f,5.524024
seg_0012b5,3.347173
seg_00184e,3.778955
seg_003339,9.521089
seg_0042cc,6.488661


In [339]:
sub.to_csv("submittedoutput.csv")