In [40]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.signal import convolve
from scipy.signal import hilbert
from scipy.stats import kurtosis
from scipy.stats import skew
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Window functions live in scipy.signal.windows; recent SciPy releases (1.13+)
# no longer re-export them from scipy.signal, so try the supported location
# first and fall back to the historical one for old installations.
try:
    from scipy.signal.windows import hann
except ImportError:
    from scipy.signal import hann

In [41]:
def calc_change_rate(x):
    """Return the mean relative change between consecutive samples of ``x``.

    The relative change at position ``i`` is ``(x[i + 1] - x[i]) / x[i]``.
    Changes that are exactly zero, NaN or +/-inf (the latter two arise when
    ``x[i]`` is zero) are discarded before averaging.

    Parameters
    ----------
    x : array-like
        One-dimensional numeric data: a pandas Series (any index), a NumPy
        array or a plain sequence.

    Returns
    -------
    float
        Mean of the retained relative changes, or NaN when nothing is left
        to average (fewer than two samples, or every change was filtered).
    """
    values = np.asarray(x, dtype=np.float64)
    if values.size < 2:
        return np.nan

    # Zeros in the denominator are expected in raw signals; the resulting
    # inf/NaN entries are filtered right below, so silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        change = np.diff(values) / values[:-1]

    change = change[np.isfinite(change) & (change != 0)]
    if change.size == 0:
        return np.nan
    return np.mean(change)

In [42]:
def classic_sta_lta(x, length_sta, length_lta):
    """Compute the ratio of a short-term to a long-term average of ``x ** 2``.

    Both averages are derived from the cumulative sum of the squared signal:
    from index ``length`` onwards the value is the sum of the last ``length``
    squared samples divided by ``length``; before that it is the running sum
    divided by ``length``.

    Parameters
    ----------
    x : array-like
        One-dimensional numeric signal (NumPy array or pandas Series).
    length_sta : int
        Window length, in samples, of the short-term average (STA).
    length_lta : int
        Window length, in samples, of the long-term average (LTA).

    Returns
    -------
    numpy.ndarray
        ``sta / lta`` as float64, same length as ``x``. The first
        ``length_lta - 1`` entries are zero.
    """
    energy = np.cumsum(np.asarray(x) ** 2)

    # np.float was removed in NumPy 1.24; use the explicit 64-bit float type.
    sta = energy.astype(np.float64)

    # Independent copy for the LTA so the in-place updates do not interfere.
    lta = sta.copy()

    # Turn the cumulative sums into windowed sums, then into averages.
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Zero the STA for the leading length_lta - 1 samples.
    sta[:length_lta - 1] = 0

    # Avoid division by zero by raising too-small LTA values to the tiniest
    # positive normal float.
    dtiny = np.finfo(0.0).tiny
    lta[lta < dtiny] = dtiny

    return sta / lta

In [43]:
def add_trend_feature(arr, abs_values=False):
    """Return the slope of a least-squares line fitted through ``arr``.

    The regressor is the sample position ``0 .. len(arr) - 1``; the target is
    ``arr`` itself, or ``np.abs(arr)`` when ``abs_values`` is true.
    """
    target = np.abs(arr) if abs_values else arr
    # scikit-learn expects a 2-D design matrix: one column holding the positions.
    positions = np.arange(len(arr)).reshape(-1, 1)
    model = LinearRegression().fit(positions, target)
    return model.coef_[0]

In [44]:
# Load the sample submission with seg_id as the index. Its index is reused
# below as the row index of the feature table and to locate each segment file.
submission = pd.read_csv('data/sample_submission.csv', index_col='seg_id')
submission

Unnamed: 0_level_0,time_to_failure
seg_id,Unnamed: 1_level_1
seg_00030f,0
seg_0012b5,0
seg_00184e,0
seg_003339,0
seg_0042cc,0
...,...
seg_ff4236,0
seg_ff7478,0
seg_ff79d9,0
seg_ffbd6a,0


In [45]:
# Empty float64 feature table: one row per seg_id from the sample submission,
# no columns yet (they are added by the feature-extraction cell).
X= pd.DataFrame(dtype=np.float64, index=submission.index)
X

seg_00030f
seg_0012b5
seg_00184e
seg_003339
seg_0042cc
...
seg_ff4236
seg_ff7478
seg_ff79d9
seg_ffbd6a
seg_ffe7cc


In [46]:
def _segment_features(x):
    """Compute every engineered feature for a single segment.

    Parameters
    ----------
    x : pandas.Series
        The ``acoustic_data`` samples of one segment, with a default
        positional index.

    Returns
    -------
    dict
        Feature name -> scalar value. Insertion order defines the column
        order of the resulting feature table, so do not reorder the
        assignments below.
    """
    feats = {}
    abs_x = np.abs(x)

    # --- whole-segment statistics ---------------------------------------
    feats['mean'] = x.mean()
    feats['std'] = x.std()
    feats['max'] = x.max()
    feats['min'] = x.min()

    feats['mean_change_abs'] = np.mean(np.diff(x))
    feats['mean_change_rate'] = calc_change_rate(x)
    feats['abs_max'] = abs_x.max()
    feats['abs_min'] = abs_x.min()

    # --- statistics over the first / last 50000 and 10000 samples -------
    edges = {
        'first_50000': x[:50000],
        'last_50000': x[-50000:],
        'first_10000': x[:10000],
        'last_10000': x[-10000:],
    }
    for prefix, method in (('std', 'std'), ('avg', 'mean'), ('min', 'min'), ('max', 'max')):
        for edge_name, edge in edges.items():
            feats['{}_{}'.format(prefix, edge_name)] = getattr(edge, method)()

    feats['max_to_min'] = x.max() / np.abs(x.min())
    feats['max_to_min_diff'] = x.max() - np.abs(x.min())
    feats['count_big'] = int((abs_x > 500).sum())
    feats['sum'] = x.sum()

    for edge_name, edge in edges.items():
        feats['mean_change_rate_' + edge_name] = calc_change_rate(edge)

    # --- quantiles of the raw and of the absolute signal ----------------
    quantiles = (('q95', 0.95), ('q99', 0.99), ('q05', 0.05), ('q01', 0.01))
    for q_name, q in quantiles:
        feats[q_name] = np.quantile(x, q)
    for q_name, q in quantiles:
        feats['abs_' + q_name] = np.quantile(abs_x, q)

    feats['trend'] = add_trend_feature(x)
    feats['abs_trend'] = add_trend_feature(x, abs_values=True)
    feats['abs_mean'] = abs_x.mean()
    feats['abs_std'] = abs_x.std()

    # Mean absolute deviation around the mean. Series.mad() was removed in
    # pandas 2.0, so it is spelled out explicitly.
    feats['mad'] = (x - x.mean()).abs().mean()
    feats['kurt'] = x.kurtosis()
    feats['skew'] = x.skew()
    feats['med'] = x.median()

    # --- signal-processing features -------------------------------------
    feats['Hilbert_mean'] = np.abs(hilbert(x)).mean()
    hann_window = hann(150)
    # The builtin sum() is kept on purpose so the normalisation constant is
    # accumulated in the same order as before.
    feats['Hann_window_mean'] = (convolve(x, hann_window, mode='same') / sum(hann_window)).mean()

    sta_lta_lengths = (
        (500, 10000), (5000, 100000), (3333, 6666), (10000, 25000),
        (50, 1000), (100, 5000), (333, 666), (4000, 10000),
    )
    for number, (length_sta, length_lta) in enumerate(sta_lta_lengths, start=1):
        feats['classic_sta_lta{}_mean'.format(number)] = classic_sta_lta(x, length_sta, length_lta).mean()

    # --- moving averages and bands --------------------------------------
    # The 700-sample moving-average mean is only an intermediate value for the
    # band features; it is intentionally not stored as a feature itself.
    moving_average_700_mean = x.rolling(window=700).mean().mean(skipna=True)
    for span in (300, 3000, 30000):
        feats['exp_Moving_average_{}_mean'.format(span)] = x.ewm(span=span).mean().mean(skipna=True)

    no_of_std = 3
    feats['MA_700MA_std_mean'] = x.rolling(window=700).std().mean()
    feats['MA_700MA_BB_high_mean'] = moving_average_700_mean + no_of_std * feats['MA_700MA_std_mean']
    feats['MA_700MA_BB_low_mean'] = moving_average_700_mean - no_of_std * feats['MA_700MA_std_mean']
    feats['MA_400MA_std_mean'] = x.rolling(window=400).std().mean()
    # The 400-sample bands are centred on the 700-sample moving average,
    # exactly as in the previous implementation.
    feats['MA_400MA_BB_high_mean'] = moving_average_700_mean + no_of_std * feats['MA_400MA_std_mean']
    feats['MA_400MA_BB_low_mean'] = moving_average_700_mean - no_of_std * feats['MA_400MA_std_mean']
    feats['MA_1000MA_std_mean'] = x.rolling(window=1000).std().mean()

    feats['iqr'] = np.subtract(*np.percentile(x, [75, 25]))
    feats['q999'] = np.quantile(x, 0.999)
    feats['q001'] = np.quantile(x, 0.001)
    feats['ave10'] = stats.trim_mean(x, 0.1)

    # --- statistics of rolling std / rolling mean -----------------------
    for window in (10, 100, 1000):
        suffix = str(window)
        x_roll_std = x.rolling(window).std().dropna().values
        x_roll_mean = x.rolling(window).mean().dropna().values

        feats['ave_roll_std_' + suffix] = x_roll_std.mean()
        feats['std_roll_std_' + suffix] = x_roll_std.std()
        feats['max_roll_std_' + suffix] = x_roll_std.max()
        feats['min_roll_std_' + suffix] = x_roll_std.min()
        feats['q01_roll_std_' + suffix] = np.quantile(x_roll_std, 0.01)
        feats['q05_roll_std_' + suffix] = np.quantile(x_roll_std, 0.05)
        feats['q95_roll_std_' + suffix] = np.quantile(x_roll_std, 0.95)
        feats['q99_roll_std_' + suffix] = np.quantile(x_roll_std, 0.99)
        feats['av_change_abs_roll_std_' + suffix] = np.mean(np.diff(x_roll_std))

        # NOTE(review): np.nonzero(...)[0] yields the *positions* of the
        # non-zero relative changes, so this feature is the mean index, not
        # the mean change rate. It is left untouched because the saved model
        # presumably was trained on the same definition - confirm against the
        # training notebook before changing it. errstate only silences the
        # divide-by-zero warnings; the value is unaffected.
        with np.errstate(divide='ignore', invalid='ignore'):
            feats['av_change_rate_roll_std_' + suffix] = np.mean(
                np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        feats['abs_max_roll_std_' + suffix] = np.abs(x_roll_std).max()

        feats['ave_roll_mean_' + suffix] = x_roll_mean.mean()
        feats['std_roll_mean_' + suffix] = x_roll_mean.std()
        feats['max_roll_mean_' + suffix] = x_roll_mean.max()
        feats['min_roll_mean_' + suffix] = x_roll_mean.min()
        feats['q01_roll_mean_' + suffix] = np.quantile(x_roll_mean, 0.01)
        feats['q05_roll_mean_' + suffix] = np.quantile(x_roll_mean, 0.05)
        feats['q95_roll_mean_' + suffix] = np.quantile(x_roll_mean, 0.95)
        feats['q99_roll_mean_' + suffix] = np.quantile(x_roll_mean, 0.99)

        feats['av_change_abs_roll_mean_' + suffix] = np.mean(np.diff(x_roll_mean))
        feats['abs_max_roll_mean_' + suffix] = np.abs(x_roll_mean).max()

    return feats


# Build one feature row per test segment. Rows are collected in a list and
# turned into a DataFrame once, instead of growing X cell by cell with .loc
# (and dropping a helper column on every iteration).
rows = []
for segment in tqdm(X.index):
    seg = pd.read_csv('data/test/{}.csv'.format(segment))
    x = pd.Series(seg['acoustic_data'].values)
    rows.append(_segment_features(x))

# Same row index (seg_id) and float64 dtype as the table created above.
X = pd.DataFrame(rows, index=X.index, dtype=np.float64)

  X.loc[segment, 'av_change_rate_roll_std_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
  X.loc[segment, 'av_change_rate_roll_std_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
100%|██████████| 2624/2624 [10:50<00:00,  4.03it/s]


In [47]:
# Preview the first rows of the engineered feature table.
X.head()

Unnamed: 0_level_0,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,abs_min,std_first_50000,std_last_50000,...,ave_roll_mean_1000,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,abs_max_roll_mean_1000
seg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
seg_00030f,4.49178,4.89369,115.0,-75.0,2.7e-05,0.005003,115.0,0.0,5.350451,4.793876,...,4.491468,0.231891,5.495,3.774,3.889,4.099,4.867,5.0,1.778523e-06,5.495
seg_0012b5,4.171153,5.922839,152.0,-140.0,-1.3e-05,-0.016036,152.0,0.0,6.249515,4.147562,...,4.173633,0.230914,5.009,3.342,3.644,3.79,4.541,4.739,-1.946309e-07,5.009
seg_00184e,4.61026,6.94699,248.0,-193.0,-2e-05,0.037691,248.0,0.0,9.793473,5.225913,...,4.612455,0.247219,6.234,3.544,4.013,4.215,4.966,5.082,8.053691e-08,6.234
seg_003339,4.531473,4.114147,85.0,-93.0,4.7e-05,0.064439,93.0,0.0,3.664088,3.48084,...,4.532571,0.224909,5.446,3.889,4.032,4.184,4.911,5.051,3.899329e-06,5.446
seg_0042cc,4.12834,5.797164,177.0,-147.0,-7e-06,-0.010527,177.0,0.0,5.321133,7.486142,...,4.125805,0.274025,5.027,3.357,3.534,3.662,4.57,4.87,-2.939597e-06,5.027


In [48]:
# NOTE(review): `title` is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel (Restart & Run All).
# Define it in an earlier cell or remove this cell.
title.head()

Unnamed: 0,title
0,seg_69787a.csv
1,seg_d146a6.csv
2,seg_df99d4.csv
3,
4,


In [49]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column. From here on X is the NumPy array returned
# by the scaler, so the seg_id index and the column names are no longer attached.
# NOTE(review): the scaler is fitted on these test features; presumably the
# model was trained on features scaled with training-set statistics - confirm
# and, if so, load and reuse that fitted scaler instead.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [50]:
# X is now the array produced by the scaler; wrap it in a DataFrame purely for
# display (row and column labels are therefore positional integers).
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,125,126,127,128,129,130,131,132,133,134
0,1.360263,-0.204790,-0.183328,0.271522,0.457118,0.488846,-0.188242,0.0,-0.095469,-0.297402,...,1.358749,-0.123290,0.047740,0.229347,0.639452,1.286196,1.147464,0.610019,0.709554,0.047740
1,0.089337,-0.087078,-0.049495,0.048400,-0.268446,-0.026120,-0.068265,0.0,-0.031544,-0.453902,...,0.099269,-0.128903,-0.078660,0.090380,0.155712,0.176446,-0.033307,0.038710,-0.054818,-0.078660
2,1.829903,0.030062,0.297747,-0.133530,-0.389373,1.288950,0.243025,0.0,0.220436,-0.192787,...,1.838182,-0.035257,0.239940,0.155360,0.884284,1.702801,1.506041,0.789511,0.051778,0.239940
3,1.517602,-0.293952,-0.291841,0.209734,0.819900,1.943635,-0.259580,0.0,-0.215371,-0.615345,...,1.521625,-0.163389,0.034996,0.266341,0.921798,1.591467,1.306832,0.721654,1.531125,0.034996
4,-0.080370,-0.101453,0.040932,0.024371,-0.147518,0.108705,0.012800,0.0,-0.097553,0.354512,...,-0.090256,0.118693,-0.073978,0.095205,-0.061477,-0.283256,0.071730,0.325459,-1.118180,-0.073978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2619,-0.209460,-0.183201,-0.342481,0.357338,0.215264,-0.454763,-0.330917,0.0,-0.099611,-0.312649,...,-0.201982,-0.029864,-0.110390,0.121583,0.098453,-0.150373,-0.120235,0.023387,0.670556,-0.110390
2620,-0.040123,-0.074639,-0.006090,-0.185020,-0.147518,0.099655,0.113321,0.0,-0.207415,-0.609683,...,-0.037550,0.273150,-0.018841,0.019610,-0.557064,-0.689086,-0.076771,-0.051036,1.611722,-0.018841
2621,-1.585667,0.318355,0.522007,-0.511121,0.215264,-1.528370,0.444067,0.0,0.546524,0.024365,...,-1.591656,-0.147554,-0.020141,-0.124504,-0.756483,-1.281671,-1.605254,-1.057941,1.658520,-0.020141
2622,0.765311,0.745192,0.710096,-0.566044,-0.026591,-0.579639,0.612683,0.0,0.156550,1.101475,...,0.768731,-0.038707,0.131746,-0.004516,0.440033,0.654106,0.586054,0.820156,0.951346,0.131746


In [51]:
import pickle

In [52]:
# Load the previously saved model from the working directory.
# SECURITY: pickle.load can execute arbitrary code contained in the file -
# only load model files that come from a trusted source.
# NOTE(review): judging by the file name this is an SVR model; it must have
# been trained on features built and scaled the same way as X - confirm.
filename = "SVRModel.pkl"
with open(filename, 'rb') as file:
    pickle_model = pickle.load(file)

In [53]:
# Predict one value per row of the scaled feature matrix.
y_predict = pickle_model.predict(X) 
y_predict

array([ 4.86314659,  5.987392  ,  6.16338093, ...,  3.37816682,
        2.55060664, 12.7082222 ])

In [55]:
# Tabular view of the predictions (positional index, single column 0).
pd.DataFrame(y_predict)

Unnamed: 0,0
0,4.863147
1,5.987392
2,6.163381
3,10.248204
4,8.596133
...,...
2619,5.753354
2620,7.255379
2621,3.378167
2622,2.550607


In [56]:
# Positional assignment: the prediction order follows the rows of X, and X was
# created with submission.index, so row i of y_predict belongs to seg_id i.
submission['time_to_failure'] = y_predict

In [57]:
# Show the filled-in submission table.
submission

Unnamed: 0_level_0,time_to_failure
seg_id,Unnamed: 1_level_1
seg_00030f,4.863147
seg_0012b5,5.987392
seg_00184e,6.163381
seg_003339,10.248204
seg_0042cc,8.596133
...,...
seg_ff4236,5.753354
seg_ff7478,7.255379
seg_ff79d9,3.378167
seg_ffbd6a,2.550607


In [58]:
# Write the submission (seg_id index included) as CSV text to the working directory.
# NOTE(review): the target file name has no .csv extension - confirm this is intended.
submission.to_csv('submission_1')