In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import warnings
import gc

import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences every warning for the whole session, including
# deprecation and numeric warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
# List the entries of the input directory and of the working directory.
print(os.listdir("../input"))
print(os.listdir("./"))

In [None]:
# Batch size handed to train_data_gen when the training generators are created.
BATCH_SIZE = 64
# Number of consecutive rows merged into one window by read_wave_data.
MERGE_SIZE = 400

In [None]:
# Training metadata; later cells read its 'signal_id' and 'target' columns.
metadata_train = pd.read_csv('../input/metadata_train.csv')

In [None]:
def read_wave_data(parquet_path,col_nums,end_col_num, merge_size=800):
    """Read a wide parquet file in column chunks and reduce each signal to windowed ranges.

    Columns are loaded in chunks ``[col_nums[i], col_nums[i + 1])`` (the last
    chunk ends at ``end_col_num``).  Within a chunk the rows are split into
    consecutive windows of ``merge_size`` rows and every column is reduced to
    ``(max - min) / 256`` per window.  Trailing rows that do not fill a whole
    window are dropped.

    Parameters
    ----------
    parquet_path : str
        Path handed to ``pq.read_pandas``.
    col_nums : sequence of int
        First column number of every chunk; column names are ``str(number)``.
    end_col_num : int
        Exclusive upper bound of the last chunk.
    merge_size : int, default 800
        Number of rows per window.

    Returns
    -------
    pandas.DataFrame
        One row per window (index named ``group_id``), one column per signal,
        dtype ``float16``.

    Raises
    ------
    ValueError
        If ``col_nums`` is empty or ``merge_size`` is not positive.
    """
    if len(col_nums) == 0:
        raise ValueError('col_nums must contain at least one start column')
    if merge_size <= 0:
        raise ValueError('merge_size must be a positive integer')

    chunk_diffs = []
    last_chunk = len(col_nums) - 1
    for i, start in tqdm(enumerate(col_nums), total=len(col_nums)):
        end = end_col_num if i == last_chunk else col_nums[i + 1]
        columns = [str(j) for j in range(start, end)]
        tmp_df = pq.read_pandas(parquet_path, columns=columns).to_pandas()

        # Only complete windows are aggregated; a partial trailing window is dropped.
        n_groups = len(tmp_df) // merge_size
        group_id = np.repeat(np.arange(n_groups), merge_size)
        grouped = tmp_df.iloc[:n_groups * merge_size].groupby(group_id)

        # Widen before subtracting: with narrow integer storage (e.g. int8) the
        # difference max - min can exceed the dtype's range and wrap around.
        window_max = grouped.max().astype('float64')
        window_min = grouped.min().astype('float64')
        chunk_diffs.append(((window_max - window_min) / 256).rename_axis('group_id'))

    # Concatenate once at the end instead of re-copying the result for every chunk.
    return pd.concat(chunk_diffs, axis=1).astype('float16')

In [None]:
train_parquet_path = '../input/train.parquet'
# Exclusive upper bound for column numbers: the last listed signal_id plus one.
# NOTE(review): assumes the metadata rows are sorted by signal_id — confirm.
end_col_num = metadata_train['signal_id'].values[-1] + 1
# Start column of every read chunk: every 500th signal_id.
col_nums = metadata_train['signal_id'].values[::500].tolist()
train_diff = read_wave_data(train_parquet_path,col_nums,end_col_num,merge_size=MERGE_SIZE)
print(train_diff.shape)

In [None]:
def train_data_gen(metadata_train, train_diff, batch_size=128, is_reverse=False):
    """Yield an endless stream of class-balanced ``(x_train, y_train)`` batches.

    Every batch holds ``batch_size // 2`` rows sampled from ``target == 1`` and
    ``batch_size // 2`` rows sampled from ``target == 0`` (so an odd
    ``batch_size`` yields ``batch_size - 1`` samples), shuffled together.

    Parameters
    ----------
    metadata_train : pandas.DataFrame
        Must provide ``signal_id`` and ``target`` columns.
    train_diff : pandas.DataFrame
        One column per signal, named ``str(signal_id)``.
    batch_size : int, default 128
        Nominal number of samples per batch.
    is_reverse : bool, default False
        When true, every sequence is reversed along its length.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``x_train`` of shape ``(n, len(train_diff), 1)`` and the matching
        ``y_train`` of shape ``(n,)``.

    Notes
    -----
    Reseeds NumPy's global random state with ``1`` when iteration starts.
    """
    np.random.seed(1)

    # Loop-invariant work, done once instead of on every batch.
    positives = metadata_train[metadata_train['target'] == 1]
    negatives = metadata_train[metadata_train['target'] == 0]
    # drop_duplicates keeps the first row per signal_id, i.e. the same label a
    # "filter, then take .values[0]" lookup would return.
    target_by_id = (metadata_train
                    .drop_duplicates('signal_id')
                    .set_index('signal_id')['target'])
    half = batch_size // 2

    while True:
        true_sample = positives.sample(half)
        neg_sample = negatives.sample(half)

        sample_signal_id = np.concatenate([true_sample['signal_id'].values,
                                           neg_sample['signal_id'].values])
        np.random.shuffle(sample_signal_id)

        x_train = []
        for signal_id in sample_signal_id:
            diffs = train_diff[str(signal_id)].values
            if is_reverse:
                diffs = diffs[::-1]
            # Add a trailing channel axis: (length,) -> (length, 1).
            x_train.append(diffs[:, np.newaxis])

        x_train = np.array(x_train)
        y_train = target_by_id.loc[sample_signal_id].values
        yield x_train, y_train

In [None]:
# Hold out 20% of the labelled signals for validation (fixed random_state).
# NOTE(review): this rebinds metadata_train, so re-running the cell splits the
# already-reduced training set again.
metadata_train, metadata_val = train_test_split(metadata_train, test_size=0.2, random_state=42)
print(metadata_train.shape)
print(metadata_val.shape)

In [None]:
# Validation inputs: one (sequence_length, 1) array per signal, in metadata_val row order.
val_signal_ids = metadata_val['signal_id'].values
x_val = np.array([train_diff[str(signal_id)].values[:, np.newaxis]
                  for signal_id in val_signal_ids])
# Index the labels by signal_id once instead of re-filtering the whole frame for
# every signal; drop_duplicates keeps the first row per signal_id, which is the
# row a "filter, then take .values[0]" lookup would return.
target_by_signal = metadata_val.drop_duplicates('signal_id').set_index('signal_id')['target']
y_val = target_by_signal.loc[val_signal_ids].values
print(x_val.shape)
print(y_val.shape)

In [None]:
from keras.models import Sequential
from keras import layers

from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
import keras.models as models
import keras.backend as K

In [None]:
from sklearn.metrics import confusion_matrix

def mcc(y_true, y_pred):
    """Matthews correlation coefficient for binary 0/1 labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels, encoded as 0 (negative) and
        1 (positive).

    Returns
    -------
    float
        Value in ``[-1, 1]``.  Returns ``0.0`` when the coefficient is
        undefined (any marginal count is zero), instead of NaN.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when one class is missing.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # sklearn layout: rows are true labels, columns are predicted labels,
    # so the flattened order is TN, FP, FN, TP.  Floats avoid integer overflow
    # in the four-way product below.
    tn, fp, fn, tp = (float(count) for count in cm.ravel())
    denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    if denominator == 0:
        return 0.0
    return (tp * tn - fp * fn) / denominator

In [None]:
def matthews_corr_coeff(y_true, y_pred):
    """Matthews correlation coefficient expressed with Keras backend ops.

    Both inputs are clipped to ``[0, 1]`` and rounded to hard 0/1 labels before
    the confusion counts are summed.  ``K.epsilon()`` is added to the
    denominator so the ratio stays finite when a marginal count is zero.
    """
    predicted_positive = K.round(K.clip(y_pred, 0, 1))
    actual_positive = K.round(K.clip(y_true, 0, 1))

    predicted_negative = 1 - predicted_positive
    actual_negative = 1 - actual_positive

    # Confusion-matrix cells as sums of element-wise products.
    tp = K.sum(actual_positive * predicted_positive)
    tn = K.sum(actual_negative * predicted_negative)
    fp = K.sum(actual_negative * predicted_positive)
    fn = K.sum(actual_positive * predicted_negative)

    numerator = tp * tn - fp * fn
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + K.epsilon()
    return numerator / denominator

In [None]:
# Model input length: the number of rows in train_diff.
length_of_sequence = train_diff.shape[0]
# Input dropout rate passed to the LSTM layers.
drop_out_rate = 0.2
# Recurrent-state dropout rate passed to the LSTM layers.
recurrent_dropout = 0.5
# Generator batches drawn per epoch by fit_generator.
STEPS_PER_EPOCH = 100
# Maximum number of training epochs.
EPOCHS = 50

In [None]:
# Create Model: three Conv1D + MaxPooling1D stages, a fourth Conv1D, one LSTM
# and a single sigmoid output unit.
model = Sequential([
    layers.Conv1D(32, 8,
                  padding='same',
                  input_shape=(length_of_sequence, 1),
                  activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(64, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(128, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(256, 8, padding='same', activation='relu'),
    # The LSTM returns only its final state, which feeds the output unit.
    layers.LSTM(64,
                dropout=drop_out_rate,
                recurrent_dropout=recurrent_dropout),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy', matthews_corr_coeff])
model.summary()

In [None]:
weight_path = "{}_weights.best.hdf5".format('lstm_model')
# Stop when val_loss has not improved for 10 epochs.
early = EarlyStopping(monitor="val_loss",
                      mode="min",
                      patience=10) # probably needs to be more patient, but kaggle time is limited
# The model is compiled with optimizer='rmsprop', whose default learning rate is
# 0.001.  A floor of min_lr=0.001 therefore clamps every reduction back to the
# starting rate and turns this callback into a no-op; the floor must sit below it.
lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)
# Keep only the weights of the epoch with the lowest val_loss.
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min', save_weights_only=True)

callbacks_list = [checkpoint, early, lr]

In [None]:
# Class-balanced batch generator over the training split.
train_gen = train_data_gen(metadata_train, train_diff, batch_size=BATCH_SIZE)

In [None]:
# Train from the generator; the in-memory validation arrays are passed as
# validation_data and the callbacks monitor the resulting val_loss.
history = model.fit_generator(
                train_gen,
                steps_per_epoch=STEPS_PER_EPOCH,
                epochs=EPOCHS,
                validation_data=(x_val,y_val),
                callbacks=callbacks_list)

In [None]:
# Restore the weights written by the ModelCheckpoint callback (lowest val_loss).
model.load_weights('lstm_model_weights.best.hdf5')

In [None]:
# Sigmoid outputs of the model for the validation signals.
y_val_pred = model.predict(x_val)

In [None]:
# Flatten the predictions to 1-D, then binarise them at 0.5 in place.
y_val_pred = y_val_pred.flatten()
at_or_above_half = y_val_pred >= 0.5
below_half = y_val_pred < 0.5
y_val_pred[at_or_above_half] = 1
y_val_pred[below_half] = 0
# Number of signals predicted positive.
y_val_pred.sum()

In [None]:
# Sum of the validation labels, for comparison with the predicted count above.
y_val.sum()

In [None]:
# Matthews correlation coefficient of the thresholded validation predictions.
mcc(y_val,y_val_pred)

Pseudo-labeling: add the predicted labels to the test data, then train again on the combined train and test sets.

In [None]:
metadata_test = pd.read_csv('../input/metadata_test.csv')
# Reload the full training metadata: metadata_train was rebound to the
# training split by the earlier train_test_split cell.
metadata_train = pd.read_csv('../input/metadata_train.csv')

In [None]:
test_parquet_path = '../input/test.parquet'
# Exclusive upper bound for column numbers: the last listed signal_id plus one.
# NOTE(review): assumes the metadata rows are sorted by signal_id — confirm.
end_col_num = metadata_test['signal_id'].values[-1] + 1
# Start column of every read chunk: every 500th signal_id.
col_nums = metadata_test['signal_id'].values[::500].tolist()
test_diff = read_wave_data(test_parquet_path,col_nums,end_col_num,merge_size=MERGE_SIZE)
print(test_diff.shape)

In [None]:
# Stack every test_diff column as a (sequence_length, 1) array, in column order.
x_test = np.array([test_diff[column_name].values[:, np.newaxis]
                   for column_name in test_diff.columns])
print(x_test.shape)

In [None]:
# Predict the test signals with the first model and binarise at 0.5 in place.
y_test = model.predict(x_test).flatten()
at_or_above_half = y_test >= 0.5
below_half = y_test < 0.5
y_test[at_or_above_half] = 1
y_test[below_half] = 0
print(y_test.sum())

In [None]:
# Attach the thresholded predictions as pseudo-labels.  pd.Series aligns on the
# default integer index, so this relies on the metadata_test rows being in the
# same order as the test_diff columns — TODO confirm.
metadata_test['target'] = pd.Series(y_test)

In [None]:
# Stack the labelled training rows and the pseudo-labelled test rows.
# NOTE(review): no ignore_index is passed, so both frames keep their own index
# labels and those labels repeat in metadata_all.
metadata_all = pd.concat([metadata_train, metadata_test])
print(metadata_all.shape)
metadata_all.head()

In [None]:
# Column-wise concat: the train and test signal columns side by side.
all_diff = pd.concat([train_diff,test_diff],axis=1)
print(all_diff.shape)

In [None]:
# Split the combined (train + pseudo-labelled test) metadata 80/20.
# This rebinds metadata_train and metadata_val from the first training round.
metadata_train, metadata_val = train_test_split(metadata_all, test_size=0.2, random_state=42)
print(metadata_train.shape)
print(metadata_val.shape)

In [None]:
# Validation inputs: one (sequence_length, 1) array per signal, in metadata_val row order.
val_signal_ids = metadata_val['signal_id'].values
x_val = np.array([all_diff[str(signal_id)].values[:, np.newaxis]
                  for signal_id in val_signal_ids])
# Index the labels by signal_id once instead of re-filtering the whole frame for
# every signal; drop_duplicates keeps the first row per signal_id, which is the
# row a "filter, then take .values[0]" lookup would return.
target_by_signal = metadata_val.drop_duplicates('signal_id').set_index('signal_id')['target']
y_val = target_by_signal.loc[val_signal_ids].values
print(x_val.shape)
print(y_val.shape)

In [None]:
# Create Model: same convolutional front-end as the first model, followed by
# two stacked LSTMs and a single sigmoid output unit.
model_2 = Sequential([
    layers.Conv1D(32, 8,
                  padding='same',
                  input_shape=(length_of_sequence, 1),
                  activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(64, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(128, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(256, 8, padding='same', activation='relu'),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    layers.LSTM(64,
                return_sequences=True,
                dropout=drop_out_rate,
                recurrent_dropout=recurrent_dropout),
    layers.LSTM(128,
                dropout=drop_out_rate,
                recurrent_dropout=recurrent_dropout),
    layers.Dense(1, activation='sigmoid'),
])

model_2.compile(optimizer='rmsprop',
                loss='binary_crossentropy',
                metrics=['accuracy', matthews_corr_coeff])
model_2.summary()

In [None]:
weight_path = "{}_weights.best.hdf5".format('lstm_model_2')
# model_2 is compiled with optimizer='rmsprop', whose default learning rate is
# 0.001.  A floor of min_lr=0.001 therefore clamps every reduction back to the
# starting rate and turns this callback into a no-op; the floor must sit below it.
lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)
# Keep only the weights of the epoch with the lowest val_loss.
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min', save_weights_only=True)

callbacks_list = [checkpoint, lr]

In [None]:
# Draw training batches from the training split only.  Sampling from
# metadata_all would also draw the rows held out in metadata_val, leaking the
# validation set into training and making val_loss (used for checkpointing)
# and the reported MCC optimistic.
train_gen = train_data_gen(metadata_train, all_diff, batch_size=BATCH_SIZE)

In [None]:
# Train model_2 from the generator; the in-memory validation arrays are passed
# as validation_data and the callbacks monitor the resulting val_loss.
history = model_2.fit_generator(
                train_gen,
                steps_per_epoch=STEPS_PER_EPOCH,
                epochs=EPOCHS,
                validation_data=(x_val,y_val),
                callbacks=callbacks_list)

In [None]:
# Restore the weights written by the ModelCheckpoint callback (lowest val_loss).
model_2.load_weights('lstm_model_2_weights.best.hdf5')

In [None]:
# Predict the validation signals with model_2 and binarise at 0.5 in place.
y_val_pred = model_2.predict(x_val).flatten()
at_or_above_half = y_val_pred >= 0.5
below_half = y_val_pred < 0.5
y_val_pred[at_or_above_half] = 1
y_val_pred[below_half] = 0
# Predicted positives, sum of the validation labels, and the resulting MCC.
print(y_val_pred.sum())
print(y_val.sum())
print(mcc(y_val,y_val_pred))

In [None]:
# Score the test signals with both models.
y_test_1 = model.predict(x_test)
y_test_2 = model_2.predict(x_test)

# Equal-weight average of the two probability vectors, binarised at 0.5 in place.
y_test = 0.5 * y_test_1.flatten() + 0.5 * y_test_2.flatten()
at_or_above_half = y_test >= 0.5
below_half = y_test < 0.5
y_test[at_or_above_half] = 1
y_test[below_half] = 0
print(y_test.sum())

In [None]:
# Submission template; its 'target' column is overwritten in a later cell.
submit_df = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Convert the thresholded 0/1 predictions to booleans.
y_test = np.array(y_test,dtype='bool')

In [None]:
# pd.Series aligns on the index, so a length mismatch would silently produce
# NaN or dropped rows instead of an error — fail loudly instead.
assert len(y_test) == len(submit_df), 'prediction count does not match the submission template'
# Apply the bool cast to the stored column; calling .astype() on its own
# discards the result and leaves the column unchanged.
submit_df['target'] = pd.Series(y_test).astype('bool')
submit_df.head()

In [None]:
# Write the submission file without the index column.
submit_df.to_csv('submission.csv',index=False)