In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from scipy import stats
from scipy.signal import hilbert,convolve,hann
     
from sklearn.linear_model import LinearRegression

In [2]:
import sys

In [3]:
import tensorflow as tf

In [4]:
import keras
import keras.backend as K
from keras.models import Sequential,Model
from keras.layers import LSTM,Dense

Using TensorFlow backend.


In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
pd.set_option('precision',30)

In [7]:
train_df=pd.read_csv('F:\\Qplus\\Kaggle\\train.csv',chunksize=5000000,iterator=True,
                     dtype={'acoustic_data':np.int16,'time_to_failure':np.float32})

In [8]:
def create_many_features(xc,seg_id=0):
    X=pd.DataFrame(index=[seg_id,])
    zc=np.fft.fft(xc)
    
    X.loc[seg_id,'mean']=xc.mean()
    X.loc[seg_id,'std']=xc.std()
    X.loc[seg_id,'max']=xc.max()
    X.loc[seg_id,'min']=xc.min()
    
    #FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    X.loc[seg_id, 'Rmean'] = realFFT.mean()
    X.loc[seg_id, 'Rstd'] = realFFT.std()
    X.loc[seg_id, 'Rmax'] = realFFT.max()
    X.loc[seg_id, 'Rmin'] = realFFT.min()
    X.loc[seg_id, 'Imean'] = imagFFT.mean()
    X.loc[seg_id, 'Istd'] = imagFFT.std()
    X.loc[seg_id, 'Imax'] = imagFFT.max()
    X.loc[seg_id, 'Imin'] = imagFFT.min()
    
    X.loc[seg_id, 'Rmean_last_5000'] = realFFT[-5000:].mean()
    X.loc[seg_id, 'Rstd__last_5000'] = realFFT[-5000:].std()
    X.loc[seg_id, 'Rmax_last_5000'] = realFFT[-5000:].max()
    X.loc[seg_id, 'Rmin_last_5000'] = realFFT[-5000:].min()
    X.loc[seg_id, 'Rmean_last_15000'] = realFFT[-15000:].mean()
    X.loc[seg_id, 'Rstd_last_15000'] = realFFT[-15000:].std()
    X.loc[seg_id, 'Rmax_last_15000'] = realFFT[-15000:].max()
    X.loc[seg_id, 'Rmin_last_15000'] = realFFT[-15000:].min()
    
    X.loc[seg_id, 'std_first_50000'] = xc[:50000].std()
    X.loc[seg_id, 'std_last_50000'] = xc[-50000:].std()
    X.loc[seg_id, 'std_first_10000'] = xc[:10000].std()
    X.loc[seg_id, 'std_last_10000'] = xc[-10000:].std()
    
    X.loc[seg_id, 'avg_first_50000'] = xc[:50000].mean()
    X.loc[seg_id, 'avg_last_50000'] = xc[-50000:].mean()
    X.loc[seg_id, 'avg_first_10000'] = xc[:10000].mean()
    X.loc[seg_id, 'avg_last_10000'] = xc[-10000:].mean()
    
    X.loc[seg_id, 'min_first_50000'] = xc[:50000].min()
    X.loc[seg_id, 'min_last_50000'] = xc[-50000:].min()
    X.loc[seg_id, 'min_first_10000'] = xc[:10000].min()
    X.loc[seg_id, 'min_last_10000'] = xc[-10000:].min()
    
    X.loc[seg_id, 'max_first_50000'] = xc[:50000].max()
    X.loc[seg_id, 'max_last_50000'] = xc[-50000:].max()
    X.loc[seg_id, 'max_first_10000'] = xc[:10000].max()
    X.loc[seg_id, 'max_last_10000'] = xc[-10000:].max()
    
    X.loc[seg_id, 'abs_max'] = np.abs(xc).max()
    X.loc[seg_id, 'abs_min'] = np.abs(xc).min()
    X.loc[seg_id, 'avg_diff'] = np.mean(np.diff(xc))
    #X.loc[seg_id, 'avg_diff_rate'] = np.mean(np.nonzero((np.diff(xc) / xc[:-1]))[0]) #seems do not help
    
    X.loc[seg_id, 'q95'] = np.quantile(xc, 0.95)
    X.loc[seg_id, 'q99'] = np.quantile(xc, 0.99)
    X.loc[seg_id, 'q05'] = np.quantile(xc, 0.05)
    X.loc[seg_id, 'q01'] = np.quantile(xc, 0.01)
    
    X.loc[seg_id, 'abs_q95'] = np.quantile(np.abs(xc), 0.95)
    X.loc[seg_id, 'abs_q99'] = np.quantile(np.abs(xc), 0.99)
    X.loc[seg_id, 'abs_q05'] = np.quantile(np.abs(xc), 0.05)
    X.loc[seg_id, 'abs_q01'] = np.quantile(np.abs(xc), 0.01)
    
    X.loc[seg_id, 'trend'] = add_trend_feature(xc)
    X.loc[seg_id, 'abs_trend'] = add_trend_feature(xc, abs_values=True)
    X.loc[seg_id, 'abs_mean'] = np.abs(xc).mean()
    X.loc[seg_id, 'abs_std'] = np.abs(xc).std()
    
    X.loc[seg_id, 'mad'] = xc.mad()
    X.loc[seg_id, 'kurt'] = xc.kurtosis()
    X.loc[seg_id, 'skew'] = xc.skew()
    X.loc[seg_id, 'med'] = xc.median()
    
    X.loc[seg_id, 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()
    X.loc[seg_id, 'Hann_window_mean'] = (convolve(xc, hann(150), mode='same') / sum(hann(150))).mean()
    X.loc[seg_id, 'classic_sta_lta1_mean'] = classic_sta_lta(xc, 500, 10000).mean()
    X.loc[seg_id, 'classic_sta_lta2_mean'] = classic_sta_lta(xc, 5000, 100000).mean()
    X.loc[seg_id, 'classic_sta_lta3_mean'] = classic_sta_lta(xc, 3333, 6666).mean()
    X.loc[seg_id, 'classic_sta_lta4_mean'] = classic_sta_lta(xc, 10000, 25000).mean()
    
    X.loc[seg_id, 'Moving_average_700_mean'] = xc.rolling(window=700).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_1500_mean'] = xc.rolling(window=1500).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_3000_mean'] = xc.rolling(window=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_6000_mean'] = xc.rolling(window=6000).mean().mean(skipna=True)
    
    ewma = pd.Series.ewm
    X.loc[seg_id, 'exp_Moving_average_300_mean'] = (ewma(xc, span=300).mean()).mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_3000_mean'] = ewma(xc, span=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_30000_mean'] = ewma(xc, span=6000).mean().mean(skipna=True)
    
    no_of_std = 2
    X.loc[seg_id, 'MA_700MA_std_mean'] = xc.rolling(window=700).std().mean()
    X.loc[seg_id,'MA_700MA_BB_high_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']).mean()
    X.loc[seg_id,'MA_700MA_BB_low_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']).mean()
    
    X.loc[seg_id, 'MA_400MA_std_mean'] = xc.rolling(window=400).std().mean()
    X.loc[seg_id,'MA_400MA_BB_high_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']).mean()
    X.loc[seg_id,'MA_400MA_BB_low_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']).mean()
    X.loc[seg_id, 'MA_1000MA_std_mean'] = xc.rolling(window=1000).std().mean()
    
    X.loc[seg_id, 'iqr'] = np.subtract(*np.percentile(xc, [75, 25]))
    X.loc[seg_id, 'q999'] = np.quantile(xc,0.999)
    X.loc[seg_id, 'q001'] = np.quantile(xc,0.001)
    X.loc[seg_id, 'ave10'] = stats.trim_mean(xc, 0.1)
    
    for windows in [10, 100, 1000]:
        x_roll_std = xc.rolling(windows).std().dropna().values
        x_roll_mean = xc.rolling(windows).mean().dropna().values
        
        X.loc[seg_id, 'ave_roll_std_' + str(windows)] = x_roll_std.mean()
        X.loc[seg_id, 'std_roll_std_' + str(windows)] = x_roll_std.std()
        X.loc[seg_id, 'max_roll_std_' + str(windows)] = x_roll_std.max()
        X.loc[seg_id, 'min_roll_std_' + str(windows)] = x_roll_std.min()
        X.loc[seg_id, 'q01_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.01)
        X.loc[seg_id, 'q05_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.05)
        X.loc[seg_id, 'q95_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.95)
        X.loc[seg_id, 'q99_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_std_' + str(windows)] = np.mean(np.diff(x_roll_std))
        X.loc[seg_id, 'av_change_rate_roll_std_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_std_' + str(windows)] = np.abs(x_roll_std).max()
        
        X.loc[seg_id, 'ave_roll_mean_' + str(windows)] = x_roll_mean.mean()
        X.loc[seg_id, 'std_roll_mean_' + str(windows)] = x_roll_mean.std()
        X.loc[seg_id, 'max_roll_mean_' + str(windows)] = x_roll_mean.max()
        X.loc[seg_id, 'min_roll_mean_' + str(windows)] = x_roll_mean.min()
        X.loc[seg_id, 'q01_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.01)
        X.loc[seg_id, 'q05_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.05)
        X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.95)
        X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean(np.diff(x_roll_mean))
        X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()
    return X

In [9]:
#generate features for a list of acoustic data
def gen_features_list(acd_list):
    train_features=pd.DataFrame()
    for acd in acd_list:
        train_features=train_features.append(create_many_features(acd),ignore_index=True)
    #print(train_features.shape)
    return train_features

In [10]:
def split_segments(acd_segments,ttf_segments,split_length):
    length=len(acd_segments)
    #split each complete segements into many small segments
    split_segments_index=np.array_split(np.arange(length),length//split_length)
    print('the length of this segments is %d'%len(split_segments_index))
    acd_split=[]
    ttf_split=pd.Series()
    for indices in split_segments_index:
        acd_split.append(pd.Series(acd_segments[indices]))
        ttf_split=ttf_split.append(pd.Series(ttf_segments[indices][-1]),ignore_index=True)
    #print(len(acd_split))
    return acd_split,ttf_split

In [11]:
def add_trend_feature(arr,abs_values=False):
    index=np.arange(len(arr))
    if abs_values:
        arr=np.abs(arr)
    lr=LinearRegression()
    lr.fit(index.reshape(-1,1),arr)
    return lr.coef_[0]

In [12]:
def classic_sta_lta(x,length_sta,length_lta):
    sta=np.cumsum(x**2)
    sta=np.require(sta,dtype=np.float)
    
    lta=sta.copy()
    
    sta[length_sta:]=sta[length_sta:]-sta[:-length_sta]
    sta/=length_sta
    
    lta[length_lta:]=lta[length_lta:]-lta[:-length_lta]
    lta/=length_lta
    
    sta[:length_sta-1]=0
    
    dtiny=np.finfo(0.0).tiny
    idx=lta<dtiny
    lta[idx]=dtiny
    return sta/lta
    
    
    

In [13]:

train_X=pd.DataFrame()
train_y=pd.Series()
acd_to_be_extended=np.array([])
ttf_to_be_extended=np.array([])
last=math.inf
split_length=100000
for chunk in train_df:
    acd=chunk.acoustic_data.values
    ttf=chunk.time_to_failure.values
    #split_index=[]   #record the index for segmentation
    split_index=np.array([])
    if ttf[0]>last:
        #split_index.append[0]
        acd_split,ttf_split=split_segments(acd_to_be_extended,ttf_to_be_extended,split_length)      
        train_X=train_X.append(gen_features_list(acd_split),ignore_index=True)
        train_y=train_y.append(ttf_split,ignore_index=True)
        acd_to_be_extended=np.array([])
        ttf_to_be_extended=np.array([])
        
    find_split=ttf[1:]>ttf[:-1]
    #split_index.append(np.where(find_split))
    split_index=np.append(split_index,np.where(find_split))
    length=len(split_index.tolist())
    if length!=0:     #which means a segment split exists
        print(length)
        print(split_index)
        acd_to_be_extended=np.append(acd_to_be_extended,
                                     acd[:int(split_index[0]+1)])
        ttf_to_be_extended=np.append(ttf_to_be_extended,ttf[:int(split_index[0]+1)])
        acd_split,ttf_split=split_segments(acd_to_be_extended,ttf_to_be_extended,split_length)
        train_X=train_X.append(gen_features_list(acd_split),ignore_index=True)
        train_y=train_y.append(ttf_split,ignore_index=True)
        #print(train_X.describe())
        #print(train_y.describe())
        acd_to_be_extended=np.array([])
        ttf_to_be_extended=np.array([])
        
        for i in range(length-1):
            acd_to_be_extended=acd[int(split_index[i])+1:int(split_index[i+1])+1]
            ttf_to_be_extended=ttf[int(split_index[i])+1:int(split_index[i+1])+1]
            acd_split,ttf_split=split_segments(acd_to_be_extended,ttf_to_be_extended,split_length)
            train_X=train_X.append(gen_features_list(acd_split),ignore_index=True)
            train_y=train_y.append(ttf_split,ignore_index=True)
            acd_to_be_extended=np.array([])
            ttf_to_be_extended=np.array([])
        acd_to_be_extended=acd[int(split_index[-1]):]
        ttf_to_be_extended=ttf[int(split_index[-1]):]
        
    else:
        acd_to_be_extended=np.append(acd_to_be_extended,acd)
        ttf_to_be_extended=np.append(ttf_to_be_extended,ttf)
    last=ttf[-1]

1
[656573.]
the length of this segments is 56
1
[85877.]
the length of this segments is 444
1
[4677355.]
the length of this segments is 545
1
[3772452.]
the length of this segments is 340
1
[2641819.]
the length of this segments is 488
1
[3652629.]
the length of this segments is 310
1
[829584.]
the length of this segments is 271
1
[2838916.]
the length of this segments is 620
1
[3276286.]
the length of this segments is 304
1
[377847.]
the length of this segments is 371
1
[4368879.]
the length of this segments is 439
1
[1811622.]
the length of this segments is 424
1
[800224.]
the length of this segments is 339
1
[3777114.]
the length of this segments is 329
1
[568143.]
the length of this segments is 567
1
[1985672.]
the length of this segments is 364


In [14]:
train_X.shape

(6211, 145)

In [15]:
train_X.describe()

Unnamed: 0,mean,std,max,min,Rmean,Rstd,Rmax,Rmin,Imean,Istd,...,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
count,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,...,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0,6211.0
mean,4.521794057319453,6.430987478526413,139.4979874416358,-126.21059410722911,4.723393978425362,2091.84537677106,452852.09063298145,-19064.76116424869,-0.0,1439.2035016272375,...,0.2454099723768594,5.493421510223812,3.601909515375932,3.957356815327635,4.12593319111254,4.912264981484449,5.0869960924166575,6.803709e-09,49572.74316446682,5.493421510223812
std,0.2622870601602583,8.657797853724391,228.15484357098848,221.69000005880275,6.335885461568504,1871.6949297379567,26392.86221816743,23951.327619581265,9.5e-15,1937.9838589709427,...,0.1600870438933056,2.601413582855254,1.9723592326287411,0.4974187051902145,0.2861494528753491,0.281985540624572,0.6329006192720822,3.9102465569e-06,134.12704402464195,2.601413582855254
min,3.5865115814696487,2.7633798446846334,20.0,-5515.0,-81.99999999999999,1376.742713799496,359225.0,-527625.6707328672,-5.039e-13,617.6815522581195,...,0.1447614466950285,4.112,-48.778,-6.466919999999999,1.462,3.879,3.971,-3.9589961117e-05,49083.852842218956,4.112
25%,4.350461710507792,4.384949846821973,78.0,-137.0,2.0000000000000053,1740.8889970177931,435588.0,-21297.22662057976,-1.4e-15,981.3005400829768,...,0.2123099989544767,5.082,3.525,3.799,3.952,4.729,4.874,-2.5619818979e-06,49486.210439533825,5.082
50%,4.525091236314553,5.490830675311534,108.0,-95.0,4.9999999999999645,1890.3335684476433,453228.0,-16235.764832525365,0.0,1229.4698044279087,...,0.2320018160439164,5.286,3.764,3.984,4.137,4.912,5.057,8.07542446e-08,49567.69445932794,5.286
75%,4.697557280828992,6.84416560903011,151.0,-67.0,6.9999999999999964,2099.5701920674614,470510.0,-12101.91559536314,1.4e-15,1531.6301757459037,...,0.2542263592429742,5.5145,3.972,4.168,4.311,5.09,5.242,2.5409760277e-06,49653.66373753123,5.5145
max,5.48997683891063,187.73210864520124,5444.0,-10.0,121.00000000000036,41999.190200526486,549920.0,-3946.711260560193,3.402e-13,41982.61385978724,...,4.303318311510358,73.842,4.86,5.056,5.121,6.884,20.912,3.9175882442e-05,50362.366165046486,73.842


In [17]:
test_X_seg_ids=pd.read_csv('F:\\Qplus\\Kaggle\\sample_submission.csv',dtype={'time_to_failure':np.float32},index_col='seg_id')

In [19]:
test_X_seg_ids.head()

Unnamed: 0_level_0,time_to_failure
seg_id,Unnamed: 1_level_1
seg_00030f,0.0
seg_0012b5,0.0
seg_00184e,0.0
seg_003339,0.0
seg_0042cc,0.0


In [20]:
test_X=pd.DataFrame()
for seg_id in test_X_seg_ids.index.values:
    #print(type(seg_id))
    seg_data=pd.read_csv('F:\\Qplus\\Kaggle\\test\\'+seg_id+'.csv',dtype={'acoustic_data':np.float32})
    test_X=test_X.append(create_many_features(seg_data.acoustic_data,seg_id=seg_id),ignore_index=True)

In [21]:
test_X.shape

(2624, 145)

In [22]:
Merged_X=pd.concat((train_X,test_X),ignore_index=True)

In [23]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(Merged_X)
train_X_scaled=scaler.transform(train_X)

In [24]:
test_X_scaled=scaler.transform(test_X)

In [25]:
train_X_transformed=train_X_scaled.reshape(train_X_scaled.shape[0],train_X_scaled.shape[1],1)

In [26]:
train_X_transformed.shape

(6211, 145, 1)

In [27]:
LSTM_model=Sequential()
LSTM_model.add(LSTM(100,input_shape=(train_X_transformed.shape[1],train_X_transformed.shape[2])))
LSTM_model.add(Dense(1))

Instructions for updating:
Colocations handled automatically by placer.


In [28]:
LSTM_model.compile(optimizer='adam',loss='mae')
history=LSTM_model.fit(train_X_transformed,train_y,epochs=200,batch_size=64,verbose=True)

Instructions for updating:
Use tf.cast instead.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200

In [29]:
test_X_scaled = test_X_scaled.reshape(test_X_scaled.shape[0], test_X_scaled.shape[1], 1)
y_predict = LSTM_model.predict(test_X_scaled)

In [30]:
submission = pd.read_csv('F:\\Qplus\\Kaggle\sample_submission.csv', index_col='seg_id')

In [31]:
submission['time_to_failure'] = y_predict

In [32]:
submission.to_csv('F:\\Qplus\\Kaggle\\submission.csv',index=True)