In [1]:
# Stretch the notebook container to the full browser width so wide outputs are not clipped.
# `display`/`HTML` are imported from the public `IPython.display` module; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

import gc
import pickle as pickle

from sklearn.model_selection import KFold
from itertools import product
import tensorflow as tf
from sklearn.preprocessing import StandardScaler,MinMaxScaler 
import multiprocessing as mp
import importlib
from sklearn.model_selection import train_test_split


from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout,Flatten,GRU,Conv1D,TimeDistributed,MaxPooling1D,Flatten,CuDNNGRU,CuDNNLSTM
from keras.callbacks import EarlyStopping
from keras.layers.normalization import BatchNormalization
from keras.layers import Bidirectional
from keras.regularizers import l2

from keras import backend as K
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [3]:
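# One-off data preparation, run in a shell outside this notebook: skip the CSV header line
# and cut train.csv into files of 150000 rows each (presumably then placed under train/ — TODO confirm):
#   tail -n +2 train.csv | split -l 150000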


from os import listdir
from os.path import isfile, join

# Directory holding the pre-split training segment files.
TRAIN_SPLITS='train'
# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting the names makes
# the seeded train/validation split that consumes this list reproducible across machines.
splits = sorted(f for f in listdir(TRAIN_SPLITS) if isfile(join(TRAIN_SPLITS, f)))

In [4]:
# Column names assigned to the training segment CSVs, which are read without a header row.
columns = ['acoustic_data','time_to_failure']



In [5]:
# Hold out 10% of the segment file names for validation. The split is done on file names,
# not on rows, and the fixed random_state keeps the shuffle repeatable for a given input order.
train_data, val_data = train_test_split(splits, test_size=0.1, random_state=42)

In [6]:
# Number of samples per segment fed to the network (matches the 150000-row file split).
TIMESTEPS = 150000
# Target number of segment files per batch.
BATCH_SIZE = 16
# Batches per epoch: ceiling division written with integers, -(-a // b) == ceil(a / b).
NUMBER_OF_BATCHES = -(-len(train_data) // BATCH_SIZE)
NUMBER_OF_VALIDATION_STEPS = -(-len(val_data) // BATCH_SIZE)

In [7]:
# Pre-chunk the file names into one group per batch. np.array_split produces near-equal
# chunks, so individual batches can hold fewer than BATCH_SIZE files.
train_data_batch = np.array_split(train_data, NUMBER_OF_BATCHES)
val_data_batch = np.array_split(val_data, NUMBER_OF_VALIDATION_STEPS)

In [9]:
# Default fraction of rows whose signal value is replaced by Gaussian noise.
NOISE=0.75

def add_noise(dff, pct=NOISE):
    """Return a copy of ``dff`` with a fraction of ``acoustic_data`` replaced by Gaussian noise.

    The noise is drawn from a normal distribution with the mean and standard deviation
    of the frame's own ``acoustic_data`` column.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame with an ``acoustic_data`` column. It is NOT modified; earlier versions
        mutated it in place, which also corrupted arrays previously taken from it.
    pct : float
        Fraction of rows to overwrite. Rows are drawn without replacement, so exactly
        ``int(len(dff) * pct)`` rows (capped at ``len(dff)``) are replaced.

    Returns
    -------
    pandas.DataFrame
        The noisy copy.
    """
    noisy = dff.copy()

    # Cap at the frame length: sampling without replacement cannot draw more rows than exist.
    n_noisy = min(len(noisy), int(len(noisy) * pct))
    if n_noisy <= 0:
        # Nothing to replace (empty frame or pct too small); also avoids sampling from nothing.
        return noisy

    mu = noisy['acoustic_data'].mean()
    sigma = noisy['acoustic_data'].std()

    # replace=False: every selected row is distinct, so the requested fraction is honoured.
    indices = np.random.choice(noisy.index.values, n_noisy, replace=False)
    noisy.loc[indices, 'acoustic_data'] = np.random.normal(mu, sigma, n_noisy)
    return noisy


def get_batch(list_of_files, valid=False):
    """Load one batch of training segments from the ``TRAIN_SPLITS`` directory.

    Each file is read as a header-less CSV, its ``acoustic_data`` column is standardised
    with a scaler fitted on that file alone, and the ``time_to_failure`` of the file's
    last row is used as the regression target.

    Parameters
    ----------
    list_of_files : iterable of str
        File names (relative to ``TRAIN_SPLITS``) that make up the batch.
    valid : bool
        Currently not read. It was used to skip noise augmentation (``add_noise``) for
        validation batches; that augmentation is disabled at the moment.

    Returns
    -------
    tuple(list, list)
        ``(batch, target)``: one standardised signal array and one scalar target per file.
    """
    batch = []
    target = []

    for file in list_of_files:
        # Build the path from the shared constant instead of a second hard-coded 'train/'.
        path = os.path.join(TRAIN_SPLITS, file)
        df = pd.read_csv(path, float_precision='round_trip', header=None)
        df.columns = columns
        # A fresh scaler per file: every segment is standardised independently.
        df[['acoustic_data']] = StandardScaler().fit_transform(df[['acoustic_data']].astype('float'))
        batch.append(df['acoustic_data'].values)
        target.append(df['time_to_failure'].values[-1])
    return (batch, target)

In [10]:
from keras.utils import Sequence

class MY_Generator(Sequence):
    """Keras ``Sequence`` that turns pre-chunked lists of segment files into batches.

    Parameters
    ----------
    list_of_files : indexable
        Element ``idx`` holds the file names belonging to batch ``idx``.
    steps : int
        Number of batches reported by ``len()``.
    name : str
        Label of the generator; ``'val'`` marks it as the validation generator.
    """

    def __init__(self, list_of_files, steps, name):
        self.list_of_files = list_of_files
        self.steps = steps
        self.name = name

    def __len__(self):
        """Return the number of batches, i.e. the ``steps`` value given at construction."""
        return self.steps

    def __getitem__(self, idx):
        """Load batch ``idx`` from disk and return it as ``(inputs, targets)``.

        ``inputs`` is reshaped to ``(-1, TIMESTEPS, 1)`` and ``targets`` to ``(-1, 1)``.
        """
        # The validation flag is forwarded to get_batch as its second positional argument.
        is_validation = self.name == 'val'
        signals, labels = get_batch(self.list_of_files[idx], is_validation)

        inputs = np.array(signals).reshape(-1, TIMESTEPS, 1)
        targets = np.array(labels).reshape(-1, 1)
        return (inputs, targets)

In [12]:
# Start from a clean Keras/TensorFlow graph so re-running this cell does not pile up layers.
K.clear_session()

# --- Hyper-parameters -------------------------------------------------------------------
# TIMESTEPS (samples per segment) comes from the configuration cell above.
dropout = 0.33
kernel_size = 5
filters = 3
strides = 5
pool_size = 2
regularizer = l2(0.05)
N_CONV_BLOCKS = 3

# --- Architecture -----------------------------------------------------------------------
# Three identical Conv1D -> MaxPooling1D -> BatchNormalization blocks. Each block shortens the
# time axis by strides * pool_size = 10x, so the GRU sees 150 steps instead of 150000.
my_model = Sequential()
for block_idx in range(N_CONV_BLOCKS):
    # Only the first layer of a Sequential model needs the input shape.
    conv_kwargs = {'input_shape': (TIMESTEPS, 1)} if block_idx == 0 else {}
    my_model.add(
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu',
               kernel_regularizer=regularizer,
               strides=strides, **conv_kwargs)
    )
    my_model.add(MaxPooling1D(pool_size=pool_size))
    my_model.add(BatchNormalization())

my_model.add(GRU(units=8, dropout=dropout, recurrent_dropout=dropout))

# Single linear output: the predicted time to failure.
my_model.add(Dense(1))

my_model.compile(loss='mae', optimizer='adam', metrics=['mean_absolute_error'])
my_model.summary()

# --- Checkpointing ----------------------------------------------------------------------
# NOTE(review): the file name embeds val_loss, which includes the L2 penalty, while the
# checkpoint itself monitors val_mean_absolute_error — confirm this mismatch is intended.
filepath = "weights.{epoch:02d}-{val_loss:.2f}.hdf5"

callbacks = [
    #EarlyStopping(monitor='val_loss', patience=30, verbose=0),
    ModelCheckpoint(filepath, monitor='val_mean_absolute_error', verbose=1,
                    save_best_only=True, mode='min')
]

my_training_batch_generator = MY_Generator(train_data_batch, NUMBER_OF_BATCHES, 'train')
my_validation_batch_generator = MY_Generator(val_data_batch, NUMBER_OF_VALIDATION_STEPS, 'val')

# --- Training (disabled) ----------------------------------------------------------------
# Training is switched off in this run; previously saved weights are loaded below instead.
# To retrain, uncomment:
#
# history = my_model.fit_generator(generator=my_training_batch_generator,
#                                  epochs=1000,
#                                  validation_data=my_validation_batch_generator,
#                                  callbacks=callbacks,
#                                  shuffle=True,
#                                  use_multiprocessing=True,
#                                  workers=8)
#
# import matplotlib.pyplot as plt
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

# --- Load pre-trained weights -----------------------------------------------------------
# The name follows the checkpoint pattern above (presumably epoch 515, val_loss 2.17).
PRETRAINED_WEIGHTS = 'weights.515-2.17.hdf5'
my_model.load_weights(PRETRAINED_WEIGHTS)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 30000, 3)          18        
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 15000, 3)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 15000, 3)          12        
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 3000, 3)           48        
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1500, 3)           0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 1500, 3)           12        
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 300, 3)            48        
__________

In [14]:
# Sanity check: number of segment files held out for validation.
len(val_data)

420

In [16]:
# Directory holding the test segment files.
TEST_SPLITS='test'
# os.listdir returns entries in arbitrary order; sorting keeps the prediction order, and
# therefore the row order of the submission file, identical between runs and machines.
test_splits = sorted(f for f in listdir(TEST_SPLITS) if isfile(join(TEST_SPLITS, f)))

In [24]:
# Predict the time to failure for every test segment, one file at a time.
ids = []    # segment ids: file names up to the first '.'
preds = []  # model outputs, aligned index-for-index with `ids`

for i, test_file in enumerate(test_splits):
    # Lightweight progress log: one line every 100 files.
    if i % 100 == 0:
        print(i)
    # Build the path from the shared constant instead of a second hard-coded 'test/'.
    path = os.path.join(TEST_SPLITS, test_file)
    # The first line of each test file is treated as a header row (header=0).
    df = pd.read_csv(path, float_precision='round_trip', header=0)
    df.columns = ['acoustic_data']
    # Same per-segment standardisation that get_batch applies to the training segments.
    df[['acoustic_data']] = StandardScaler().fit_transform(df[['acoustic_data']].astype('float'))
    ids.append(test_file.split(".")[0])
    # predict returns shape (1, 1) for a single segment; [0][0] extracts the scalar.
    preds.append(my_model.predict(df['acoustic_data'].values.reshape(1,TIMESTEPS,1))[0][0])

    

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600


In [25]:
# Assemble the submission from the parallel `ids` / `preds` lists. Building from a dict names
# both columns explicitly (no reliance on the positional column 0), fails if the two lists
# differ in length, and also works when no test files were found.
submission = pd.DataFrame({'seg_id': ids, 'time_to_failure': preds})

submission.to_csv('submission.csv', index=False)

In [27]:
# Summary statistics of the predicted time_to_failure values in the submission frame.
submission["time_to_failure"].describe()

count    2624.000000
mean        5.103442
std         2.708231
min         0.005568
25%         2.771998
50%         4.416964
75%         7.524231
max        10.567600
Name: time_to_failure, dtype: float64

In [None]:
# NOTE(review): same describe() call as the preceding executed cell — TODO drop one of the two.
submission["time_to_failure"].describe()

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Peek at the first few predictions only; echoing the whole list floods the cell output.
preds[:10]

In [None]:
# NOTE(review): `res` is not defined in any visible cell of this notebook, so on a fresh kernel
# this line raises NameError — looks like a leftover from a removed experiment.
# TODO: delete this cell or restore the cell that defines `res`.
res[0].get()[0]

In [None]:
# Spot-check: pick one test segment at random and show summary statistics of its raw values.
sample_file = np.random.choice(test_splits)
path = 'test/%s' % sample_file

df = pd.read_csv(path, float_precision='round_trip', header=[0])

df.describe()