In [1]:
# Print the path of the `python` executable that the shell resolves first on PATH.
!which python

/home/ubuntu/anaconda3/envs/kaggle/bin/python


In [2]:
import keras
import keras.backend as K
from keras.layers import LSTM,Dropout,Dense,TimeDistributed,Conv1D,MaxPooling1D,Flatten
from keras.models import Sequential
import tensorflow as tf
import gc
from numba import jit
from IPython.display import display, clear_output
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sys
sns.set_style("whitegrid")

Using TensorFlow backend.


In [3]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [4]:
#Both numpy and scipy have utilities for FFT, which is an endlessly useful algorithm
from numpy.fft import *
from scipy import fftpack

In [5]:
#FFT to filter out HF components and get main signal profile
def low_pass(s, threshold=1e4):
    """Remove every frequency component above ``threshold`` from a signal.

    The frequency axis is built with a sample spacing of ``2e-2 / n_samples``,
    i.e. the whole input is treated as spanning 2e-2 time units (presumably
    seconds, which would make ``threshold`` a value in Hz -- confirm against
    the data description).

    Parameters
    ----------
    s : array-like, 1-D
        Input signal (numpy array, pandas Series, list, ...).
    threshold : float
        Components whose frequency is strictly greater than this are zeroed.

    Returns
    -------
    numpy.ndarray
        Real-valued filtered signal with the same number of samples as ``s``.
    """
    samples = np.asarray(s)
    n_samples = samples.size
    fourier = rfft(samples)
    frequencies = rfftfreq(n_samples, d=2e-2/n_samples)
    fourier[frequencies > threshold] = 0
    # Pass the length explicitly: irfft defaults to an even output length,
    # which silently drops one sample for odd-length inputs.
    return irfft(fourier, n=n_samples)

In [6]:
###Filter out low frequencies from the signal to get HF characteristics
def high_pass(s, threshold=1e7):
    """Remove every frequency component below ``threshold`` from a signal.

    The frequency axis is built with a sample spacing of ``2e-2 / n_samples``,
    i.e. the whole input is treated as spanning 2e-2 time units (presumably
    seconds, which would make ``threshold`` a value in Hz -- confirm against
    the data description).

    Parameters
    ----------
    s : array-like, 1-D
        Input signal (numpy array, pandas Series, list, ...).
    threshold : float
        Components whose frequency is strictly lower than this are zeroed.

    Returns
    -------
    numpy.ndarray
        Real-valued filtered signal with the same number of samples as ``s``.
    """
    samples = np.asarray(s)
    n_samples = samples.size
    fourier = rfft(samples)
    frequencies = rfftfreq(n_samples, d=2e-2/n_samples)
    fourier[frequencies < threshold] = 0
    # Pass the length explicitly: irfft defaults to an even output length,
    # which silently drops one sample for odd-length inputs.
    return irfft(fourier, n=n_samples)

In [7]:
def phase_indices(signal_num):
    """Return the three consecutive indices starting at ``3 * signal_num``.

    The result is the tuple ``(3*signal_num, 3*signal_num + 1, 3*signal_num + 2)``.
    """
    first = 3*signal_num
    return first, first + 1, first + 2

In [8]:
%%time 
# Read the training parquet file and convert it to a pandas DataFrame.
# Later cells address its columns by position (`train_set.iloc[:, k]`).
train_set = pq.read_pandas('../../data/train.parquet').to_pandas()

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


CPU times: user 1min 28s, sys: 12.2 s, total: 1min 40s
Wall time: 7.75 s


In [9]:
%%time
# Training metadata; later cells read its `signal_id`, `id_measurement` and `target` columns.
meta_train = pd.read_csv('../../data/metadata_train.csv')

CPU times: user 7.28 ms, sys: 3.39 ms, total: 10.7 ms
Wall time: 9.42 ms


In [10]:
# @jit('float32(float32[:,:], int32)')

In [11]:
def feature_extractor(x, n_part=1000):
    """Summarise a signal by the peak-to-peak range of consecutive chunks.

    The signal is cut into chunks of ``ceil(len(x) / n_part)`` samples (the
    last chunk may be shorter) and ``max - min`` of every chunk is stored.

    Parameters
    ----------
    x : array-like
        1-D signal, or a multi-dimensional array with a single column
        (e.g. shape ``(n, 1)``).
    n_part : int
        Length of the returned vector. Slots that receive no chunk stay 0.

    Returns
    -------
    numpy.ndarray
        Float64 vector of shape ``(n_part,)``. An empty input yields all zeros.

    Raises
    ------
    ValueError
        If ``x`` has more than one column.
    """
    # Work in float64: with narrow integer dtypes (e.g. int8) ``max - min``
    # is computed in the input dtype and wraps around silently.
    values = np.asarray(x, dtype=np.float64)
    output = np.zeros((n_part,))
    if values.size == 0:
        return output
    if values.ndim > 1:
        if values.size != values.shape[0]:
            raise ValueError("feature_extractor expects a single signal (1-D or one column)")
        values = values.reshape(-1)
    length = len(values)
    pool = int(np.ceil(length/n_part))
    for j, start in enumerate(range(0, length, pool)):
        # Slicing past the end is safe, so the last (shorter) chunk needs no special case.
        chunk = values[start:start+pool]
        output[j] = chunk.max() - chunk.min()
    return output

In [None]:
# Raw-signal features: one 400-value peak-to-peak profile per training signal.
x_train = []
y_train = []
# Walk the metadata rows once instead of re-filtering `meta_train` with a
# boolean mask twice per signal (which made the loop quadratic in the number of rows).
for signal_id, target in tqdm(zip(meta_train.signal_id, meta_train.target), total=len(meta_train)):
    y_train.append(target)
    # `signal_id` is used as the positional column index into `train_set`.
    # The column is passed as a 1-D array so each chunk range is a scalar.
    raw_signal = train_set.iloc[:, signal_id].values
    x_train.append(abs(feature_extractor(raw_signal, n_part=400)))

100%|██████████| 8712/8712 [01:31<00:00, 97.47it/s] 


In [None]:
# Flatten the collected targets into a 1-D label vector.
y_train = np.array(y_train).ravel()
# Stack the per-signal feature vectors into a 2-D matrix with one row per entry of `x_train`.
X_train = np.array(x_train).reshape((-1, len(x_train[0])))

In [None]:
# Display the shape of the stacked raw-feature matrix as a quick sanity check.
X_train.shape

(8712, 400)

In [None]:
# Filtered-signal features for every training signal:
#   lp - low-pass filtered signal
#   hp - high-pass filtered signal
#   dc - sum of the absolute low-pass signals of the three columns given by phase_indices(measurement id)
x_train_lp = []
x_train_hp = []
x_train_dc = []
# Low-pass results for the measurement currently being processed, keyed by column.
# Caching them avoids repeating the same three FFTs (and the `dc` features)
# for every signal that belongs to the same measurement.
cached_meas_id = None
cached_lp = {}
dc_features = None
# Iterate the metadata rows directly rather than looking each signal up with a boolean mask.
for signal_id, meas_id in tqdm(zip(meta_train.signal_id, meta_train.id_measurement), total=len(meta_train)):
    clear_output(wait=True)
    if meas_id != cached_meas_id:
        phase_cols = list(phase_indices(meas_id))
        phase_lp = [low_pass(train_set.iloc[:, col]) for col in phase_cols]
        cached_lp = dict(zip(phase_cols, phase_lp))
        dc = np.abs(phase_lp[0]) + np.abs(phase_lp[1]) + np.abs(phase_lp[2])
        dc_features = abs(feature_extractor(dc, n_part=400))
        cached_meas_id = meas_id
    # `signal_id` is used as the positional column index into `train_set`.
    # Fall back to a direct computation if the signal is not one of the cached phase columns.
    if signal_id in cached_lp:
        lp = cached_lp[signal_id]
    else:
        lp = low_pass(train_set.iloc[:, signal_id])
    hp = high_pass(train_set.iloc[:, signal_id])
    x_train_lp.append(abs(feature_extractor(lp, n_part=400)))
    x_train_hp.append(abs(feature_extractor(hp, n_part=400)))
    # Copy so rows that share a measurement do not alias the same array.
    x_train_dc.append(dc_features.copy())

In [None]:
# Turn each list of per-signal feature vectors into a 2-D array
# (one row per entry, one column per feature).
x_train, x_train_lp, x_train_hp, x_train_dc = (
    np.array(features).reshape(-1, features[0].shape[0])
    for features in (x_train, x_train_lp, x_train_hp, x_train_dc)
)

In [None]:
# Sum of the element-wise difference between the raw-signal features and the low-pass features.
(x_train-x_train_lp).sum()

In [None]:
# Persist the four training feature arrays as .npy files in the current working directory.
np.save('x_train.npy', x_train)
np.save('x_train_lp.npy', x_train_lp)
np.save('x_train_hp.npy', x_train_hp)
np.save('x_train_dc.npy', x_train_dc)

In [None]:
# Reload the four training feature arrays from the .npy files in the current working directory.
x_train, x_train_lp, x_train_hp, x_train_dc = (
    np.load(npy_path)
    for npy_path in ('x_train.npy', 'x_train_lp.npy', 'x_train_hp.npy', 'x_train_dc.npy')
)

In [33]:
# Drop the raw training frame and run a garbage-collection pass so its memory can be reclaimed.
del train_set
gc.collect()

39024

In [None]:
%%time
# Read the test parquet file and convert it to a pandas DataFrame.
# Later cells address its columns by position (`test_set.iloc[:, k]`).
test_set = pq.read_pandas('../../data/test.parquet').to_pandas()

In [None]:
%%time
# Test metadata; later cells read its `signal_id` and `id_measurement` columns.
meta_test = pd.read_csv('../../data/metadata_test.csv')

In [None]:
# Raw-signal features for every test signal.
# A test signal id is shifted by this offset to get its positional column in `test_set`.
column_offset = 8712
x_test = []
for signal_id in tqdm(meta_test.signal_id.values):
    column = signal_id - column_offset
    clear_output(wait=True)
    raw_signal = test_set.iloc[:, column].values
    x_test.append(abs(feature_extractor(raw_signal, n_part=400)))

In [None]:
# Filtered-signal features for every test signal:
#   lp - low-pass filtered signal
#   hp - high-pass filtered signal
#   dc - sum of the absolute low-pass signals of the three columns given by phase_indices(measurement id)
x_test_lp = []
x_test_hp = []
x_test_dc = []
# A test signal id is shifted by this offset to get its positional column in `test_set`.
column_offset = 8712
# Low-pass results for the measurement currently being processed, keyed by column.
# Caching them avoids repeating the same three FFTs (and the `dc` features)
# for every signal that belongs to the same measurement.
cached_meas_id = None
cached_lp = {}
dc_features = None
# Iterate the metadata rows directly rather than looking each signal up with a boolean mask.
for signal_id, meas_id in tqdm(zip(meta_test.signal_id, meta_test.id_measurement), total=len(meta_test)):
    clear_output(wait=True)
    column = signal_id - column_offset
    if meas_id != cached_meas_id:
        phase_cols = [phase - column_offset for phase in phase_indices(meas_id)]
        phase_lp = [low_pass(test_set.iloc[:, col]) for col in phase_cols]
        cached_lp = dict(zip(phase_cols, phase_lp))
        dc = np.abs(phase_lp[0]) + np.abs(phase_lp[1]) + np.abs(phase_lp[2])
        dc_features = abs(feature_extractor(dc, n_part=400))
        cached_meas_id = meas_id
    # Fall back to a direct computation if the signal is not one of the cached phase columns.
    if column in cached_lp:
        lp = cached_lp[column]
    else:
        lp = low_pass(test_set.iloc[:, column])
    hp = high_pass(test_set.iloc[:, column])
    x_test_lp.append(abs(feature_extractor(lp, n_part=400)))
    x_test_hp.append(abs(feature_extractor(hp, n_part=400)))
    # Copy so rows that share a measurement do not alias the same array.
    x_test_dc.append(dc_features.copy())

In [None]:
# Turn each list of per-signal feature vectors into a 2-D array
# (one row per entry, one column per feature).
x_test, x_test_lp, x_test_hp, x_test_dc = (
    np.array(features).reshape(-1, features[0].shape[0])
    for features in (x_test, x_test_lp, x_test_hp, x_test_dc)
)

In [None]:
# Drop the raw test frame and run a garbage-collection pass so its memory can be reclaimed.
del test_set
gc.collect()

In [82]:
# Combine the four feature views (raw, low-pass, high-pass, dc) depth-wise,
# so every time step carries four channels.
train_channels = (x_train, x_train_lp, x_train_hp, x_train_dc)
test_channels = (x_test, x_test_lp, x_test_hp, x_test_dc)
train = np.dstack(train_channels)
test = np.dstack(test_channels)

In [83]:
# Make sure the labels are a flat 1-D vector before training.
y_train = np.array(y_train).ravel()

In [88]:
def keras_auc(y_true, y_pred):
    """AUC metric for Keras built on ``tf.metrics.auc`` (TensorFlow 1.x API).

    Takes the second element of the pair returned by ``tf.metrics.auc`` and
    runs TensorFlow's local-variables initializer in the Keras session before
    returning that element.
    """
    _, auc_op = tf.metrics.auc(y_true, y_pred)
    session = K.get_session()
    session.run(tf.local_variables_initializer())
    return auc_op

In [89]:
n_signals = 1 # Initial value: one signal per instance. Reassigned to 4 in the model-definition cell below.
n_outputs = 1 # Binary classification: a single output unit

In [90]:
# Training settings.
verbose = True
epochs = 15
batch_size = 16
# Input layout: each sample is split into n_steps windows of n_length time steps,
# and every time step carries n_signals channels.
n_signals = 4
n_steps = 40
n_length = 10
train = train.reshape((train.shape[0], n_steps, n_length, n_signals))
# define model: per-window Conv1D feature extraction (TimeDistributed) feeding an
# LSTM over the windows, then a dense head with a sigmoid output.
model = Sequential([
    TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu'), input_shape=(None, n_length, n_signals)),
    TimeDistributed(Conv1D(filters=64, kernel_size=3, activation='relu')),
    TimeDistributed(Dropout(0.5)),
    TimeDistributed(MaxPooling1D(pool_size=2)),
    TimeDistributed(Flatten()),
    LSTM(100),
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dense(n_outputs, activation='sigmoid'),
])

In [91]:
# Binary cross-entropy loss with the Adam optimizer; `keras_auc` is reported as the metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[keras_auc])

In [92]:
# fit network
# Trains for `epochs` passes with mini-batches of `batch_size` samples.
# No validation data is passed here, so the reported metrics come from the training data only.
model.fit(train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f44599206a0>

In [93]:
# Store the trained weights in `model2.hdf5` in the current working directory.
model.save_weights('model2.hdf5')

In [94]:
# Give the test features the same (samples, n_steps, n_length, n_signals) layout used for training.
n_test_samples = test.shape[0]
X_test = test.reshape((n_test_samples, n_steps, n_length, n_signals))

In [95]:
# Model outputs for the test tensor; the final layer is a sigmoid, so each value lies in [0, 1].
preds = model.predict(X_test)

In [96]:
# Binarise the sigmoid outputs: 1 where the prediction exceeds 0.5, otherwise 0.
threshpreds = np.where(preds > 0.5, 1, 0)

In [98]:
# Load the submission template and fill in the predicted labels.
sub = pd.read_csv('../../data/sample_submission.csv')
# Use item assignment: attribute-style assignment (`sub.target = ...`) only sets a
# column when it already exists, otherwise it creates a plain attribute.
# The predictions are flattened so a 1-D vector is stored in the column
# (the model has a single output unit, so `threshpreds` is expected to be (n, 1)).
sub['target'] = np.ravel(threshpreds)

In [102]:
# Write the submission file without the DataFrame index column.
sub.to_csv('../../submissions/third_sub.csv',index=False, )