In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from os import listdir
from os.path import isfile

In [2]:
# Pandas precision
pd.set_option('display.precision', 9)
# 629145480 number of rows

In [6]:
df_train = pd.DataFrame(dtype=np.float, columns=['mean', 'std', 'min', 'max', 'sum', 'abs_mean', 'abs_std', 'abs_max', 'abs_sum'])

In [7]:
def generate_features(chunk):
    mean = chunk['acoustic_data'].mean()
    std = chunk['acoustic_data'].std()
    min = chunk['acoustic_data'].min()
    max = chunk['acoustic_data'].max()
    sum = chunk['acoustic_data'].sum()
    abs_sum = chunk['acoustic_data'].abs().sum()
    abs_max = chunk['acoustic_data'].abs().max()
    abs_mean = chunk['acoustic_data'].abs().mean()
    abs_std = chunk['acoustic_data'].abs().std()
    return [mean, std, min, max, sum, abs_mean, abs_std, abs_max, abs_sum]

In [8]:
i = 0
for chunk in pd.read_csv('data/train.csv', chunksize=150000):
    time_to_failure = chunk['time_to_failure'].values[-1]
    df_train.loc[i, ['mean', 'std', 'min', 'max', 'sum', 'abs_mean', 'abs_std', 'abs_max', 'abs_sum']] = generate_features(chunk)
    df_train.loc[i, 'time_to_failure'] = time_to_failure
    i += 1

In [9]:
df_train.describe()

Unnamed: 0,mean,std,min,max,sum,abs_mean,abs_std,abs_max,abs_sum,time_to_failure
count,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0,4195.0
mean,4.519475158,6.547788191,-149.190941597,163.522288439,677807.531823599,5.547366504,5.750164895,170.04624553,831985.0017,5.683670383
std,0.256049486,8.50393949,265.087983548,272.930331447,39087.639872457,1.517037556,8.339211436,296.887014915,227746.1406,3.673246303
min,3.596313333,2.802720142,-5515.0,23.0,207622.0,4.147706667,2.589085218,23.0,218998.0,0.006397657
25%,4.349496667,4.478637142,-154.0,92.0,652414.0,5.061843333,3.862810034,94.0,759276.5,2.635348205
50%,4.522146667,5.618797775,-111.0,123.0,678274.0,5.380853333,4.781513433,127.0,807128.0,5.358795935
75%,4.69335,6.880903553,-79.0,170.0,704002.5,5.748553333,5.887947258,175.0,862283.0,8.177499733
max,5.391993333,153.703569356,-15.0,5444.0,808799.0,32.762073333,150.432368254,5515.0,4914311.0,16.103195567


In [10]:
df_train.head()

Unnamed: 0,mean,std,min,max,sum,abs_mean,abs_std,abs_max,abs_sum,time_to_failure
0,4.884113333,5.101106131,-98.0,104.0,732617.0,5.576566667,4.333324674,104.0,836485.0,1.430797186
1,4.725766667,6.588823782,-154.0,181.0,708865.0,5.734166667,5.732776966,181.0,860125.0,1.391498893
2,4.906393333,6.967397034,-106.0,140.0,735959.0,6.152646667,5.895944714,140.0,922897.0,1.353196095
3,4.90224,6.922305187,-199.0,197.0,735336.0,5.93396,6.0612136,199.0,890094.0,1.313797802
4,4.90872,7.30111019,-126.0,145.0,736308.0,6.110586667,6.329485314,145.0,916588.0,1.274399509


In [19]:
df_train.isna().sum()

time_to_failure    0
mean               0
std                0
min                0
max                0
sum                0
abs_mean           0
abs_std            0
abs_max            0
abs_sum            0
dtype: int64

In [11]:
X_train = df_train.drop(columns=['time_to_failure']).values
y_train = df_train['time_to_failure'].values

In [12]:
trainX_mean = np.mean(X_train, axis=0)
trainX_std = np.std(X_train, axis=0)
trainY_mean = np.mean(y_train)
trainY_std = np.std(y_train)

In [13]:
X_train = (X_train - trainX_mean) / trainX_std
y_train = (y_train - trainY_mean) / trainY_std

In [30]:
model = keras.Sequential([
    layers.Dense(100, activation=tf.nn.relu, input_shape=[X_train.shape[1]]),
    layers.Dense(100, activation=tf.nn.relu),
    layers.Dense(1)
])

optimizer = keras.optimizers.RMSprop(0.001)

model.compile(loss='mean_squared_error',
              optimizer=optimizer,
              metrics=['mean_absolute_error', 'mean_squared_error'])

In [31]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 100)               1000      
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 11,201
Trainable params: 11,201
Non-trainable params: 0
_________________________________________________________________


In [32]:
class PrintDot(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch % 100 == 0:
            print('')
        print('.', end='')

EPOCHS = 1000
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
history = model.fit(
    X_train, y_train,
    epochs=EPOCHS, validation_split=0.2, verbose=0,
    callbacks=[early_stop, PrintDot()]
)


.........................

In [33]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

Unnamed: 0,val_loss,val_mean_absolute_error,val_mean_squared_error,loss,mean_absolute_error,mean_squared_error,epoch
20,0.864863439,0.763490512,0.864863439,0.530630706,0.563312793,0.530630706,20
21,0.815388281,0.736974715,0.815388281,0.533234711,0.561719187,0.533234711,21
22,0.953771421,0.796143156,0.953771421,0.530009271,0.561830006,0.530009271,22
23,0.810728639,0.728193186,0.810728639,0.530871931,0.563422362,0.530871931,23
24,0.872694661,0.760971309,0.872694661,0.529627145,0.560147264,0.529627145,24


In [34]:
path = 'data/test/'
files = [f[:-4] for f in listdir(path) if isfile(path + f)]

In [35]:
predictions = pd.DataFrame(index=files, dtype=np.float, columns=['time_to_failure'])
predictions.index.name = 'seg_id'

In [36]:
for f in files:
    df = pd.read_csv(path+f+'.csv')
    df_test = pd.DataFrame(np.array(generate_features(df)).reshape(1,-1), columns=['mean', 'std', 'min', 'max', 'sum', 'abs_mean', 'abs_std', 'abs_max', 'abs_sum'])
    X_test = df_test.values
    X_test = (X_test - trainX_mean) / trainX_std
    y = model.predict(X_test)[0]
    predictions.loc[f, 'time_to_failure'] = y

In [37]:
predictions.to_csv('submission.csv')