In [36]:
# Load the libraries
import pandas as pd
import pyarrow.parquet as pq # Used to read the data
import os 
import numpy as np
from keras.layers import *
from keras.models import Model
from sklearn.model_selection import train_test_split 
from keras import backend as K 
from keras import optimizers
import tensorflow as tf
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from keras.callbacks import *
%matplotlib inline
import matplotlib.pyplot as plt


In [2]:
# # select how many folds will be created
# N_SPLITS = 5
# # it is just a constant with the measurements data size
# sample_size = 800000

In [54]:
# load the training set metadata, defines which signals are in which order in the data
train_meta = pd.read_csv('../vsb-power-line-fault-detection/metadata_train.csv')
# set index, it makes the data access much faster
train_meta = train_meta.set_index(['id_measurement', 'phase'])
train_meta.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,signal_id,target
id_measurement,phase,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
0,1,1,0
0,2,2,0
1,0,3,1
1,1,4,1


In [55]:
# load the test set metadata, defines which signals are in which order in the data
test_meta = pd.read_csv('../vsb-power-line-fault-detection/metadata_test.csv')
# set index, it makes the data access much faster
#test_meta = test_meta.set_index(['id_measurement', 'phase'])
test_meta.head()

Unnamed: 0,signal_id,id_measurement,phase
0,8712,2904,0
1,8713,2904,1
2,8714,2904,2
3,8715,2905,0
4,8716,2905,1


In [5]:
# Load the training dataset
df_train_pre = pd.read_csv("../code/my_train.csv.gz", compression="gzip")
df_train_pre.shape

(1393920, 20)

In [6]:
# Load the test dataset
df_test_pre = pd.read_csv("../code/my_test.csv.gz", compression="gzip")
df_test_pre.shape

(3253920, 20)

In [7]:
# Drop the extra column in each of the datasets above
df_train_pre.drop("Unnamed: 0", axis=1, inplace=True)
df_test_pre.drop("Unnamed: 0", axis=1, inplace=True)

In [8]:
# Change the shape of the dataset into a 3D format for the LSTM
# The sequence size is 160
X = df_train_pre.values.reshape(8712, 160, 19)
X.shape

(8712, 160, 19)

In [9]:
# Change the shape of the dataset into a 3D format for the LSTM
# The sequence size is 160
X_test = df_test_pre.values.reshape(20337, 160, 19)
X_test.shape

(20337, 160, 19)

In [10]:
# Get the labels for the training dataset
y = train_meta["target"]
y.shape

(8712,)

In [11]:
eval_preds = np.zeros(X.shape[0])
label_predictions = []

In [82]:
# Define the one-way LSTM model
def create_model(input_data):
    input_shape = input_data.shape
    inp = Input(shape=(input_shape[1], input_shape[2],), name="input_signal")
    x = LSTM(128, return_sequences=True, name="lstm1")(inp)
    x = LSTM(64, return_sequences=False, name="lstm2")(x)   
    x = Dense(128, activation="relu", name="dense1")(x)
    x = Dense(64, activation="relu", name="dense2")(x)
    x = Dense(1, activation='sigmoid', name="output")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics = ['mae', 'acc'])
    return model

In [83]:
splits = list(StratifiedKFold(n_splits=2, shuffle=True, random_state=2019).split(X, y))
preds_val = []
y_val = []
for idx, (train_idx, val_idx) in enumerate(splits):
    K.clear_session()
    print("Beginning fold {}".format(idx+1))
    train_X, train_y, val_X, val_y = X[train_idx], y[train_idx], X[val_idx], y[val_idx]
    model = create_model(train_X)
    monitor = ModelCheckpoint('best_model.h5', save_best_only=True, verbose=1, monitor='val_loss', mode='min')
    model.fit(train_X, train_y,validation_data=[val_X, val_y],callbacks=[monitor],batch_size=128,epochs=50)
    

Beginning fold 1
Train on 4355 samples, validate on 4357 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.16567, saving model to best_model.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.16567 to 0.15331, saving model to best_model.h5
Epoch 3/50

Epoch 00003: val_loss did not improve from 0.15331
Epoch 4/50

Epoch 00004: val_loss did not improve from 0.15331
Epoch 5/50

Epoch 00005: val_loss improved from 0.15331 to 0.13796, saving model to best_model.h5
Epoch 6/50

Epoch 00006: val_loss improved from 0.13796 to 0.13667, saving model to best_model.h5
Epoch 7/50

Epoch 00007: val_loss improved from 0.13667 to 0.13619, saving model to best_model.h5
Epoch 8/50

Epoch 00008: val_loss did not improve from 0.13619
Epoch 9/50

Epoch 00009: val_loss improved from 0.13619 to 0.12998, saving model to best_model.h5
Epoch 10/50

Epoch 00010: val_loss did not improve from 0.12998
Epoch 11/50

Epoch 00011: val_loss improved from 0.12998 to 0.12607, saving model to best_model.h5


Epoch 00031: val_loss did not improve from 0.11881
Epoch 32/50

Epoch 00032: val_loss did not improve from 0.11881
Epoch 33/50

Epoch 00033: val_loss did not improve from 0.11881
Epoch 34/50

Epoch 00034: val_loss did not improve from 0.11881
Epoch 35/50

Epoch 00035: val_loss did not improve from 0.11881
Epoch 36/50

Epoch 00036: val_loss did not improve from 0.11881
Epoch 37/50

Epoch 00037: val_loss did not improve from 0.11881
Epoch 38/50

Epoch 00038: val_loss did not improve from 0.11881
Epoch 39/50

Epoch 00039: val_loss did not improve from 0.11881
Epoch 40/50

Epoch 00040: val_loss did not improve from 0.11881
Epoch 41/50

Epoch 00041: val_loss did not improve from 0.11881
Epoch 42/50

Epoch 00042: val_loss did not improve from 0.11881
Epoch 43/50

Epoch 00043: val_loss did not improve from 0.11881
Epoch 44/50

Epoch 00044: val_loss did not improve from 0.11881
Epoch 45/50

Epoch 00045: val_loss did not improve from 0.11881
Epoch 46/50

Epoch 00046: val_loss did not improve f


Epoch 00013: val_loss did not improve from 0.11176
Epoch 14/50

Epoch 00014: val_loss did not improve from 0.11176
Epoch 15/50

Epoch 00015: val_loss did not improve from 0.11176
Epoch 16/50

Epoch 00016: val_loss improved from 0.11176 to 0.10951, saving model to best_model.h5
Epoch 17/50

Epoch 00017: val_loss did not improve from 0.10951
Epoch 18/50

Epoch 00018: val_loss did not improve from 0.10951
Epoch 19/50

Epoch 00019: val_loss improved from 0.10951 to 0.10897, saving model to best_model.h5
Epoch 20/50

Epoch 00020: val_loss did not improve from 0.10897
Epoch 21/50

Epoch 00021: val_loss did not improve from 0.10897
Epoch 22/50

Epoch 00022: val_loss did not improve from 0.10897
Epoch 23/50

Epoch 00023: val_loss did not improve from 0.10897
Epoch 24/50

Epoch 00024: val_loss did not improve from 0.10897
Epoch 25/50

Epoch 00025: val_loss did not improve from 0.10897
Epoch 26/50

Epoch 00026: val_loss did not improve from 0.10897
Epoch 27/50

Epoch 00027: val_loss improved fr

In [None]:
# The function converged at 28 epochs with early stopping.
# with model checkpoint at 29th epoch the validation loss stopped improving. in the first fold
# 2nd fold at epoch 45
#3rd fold at epoch 49
# 15 mins approx for each fold

In [57]:
# load a saved model
from keras.models import load_model
# Test data, prediction

preds = []
for i in range(1):
    model = load_model('best_model.h5')
    pred = model.predict(X_test, verbose=1)
    pred_3 = []
    for pred_scalar in pred:
        for i in range(3):
            pred_3.append(pred_scalar)
    preds.append(pred_3)
threshold = 0.5
preds_test = (np.squeeze(np.mean(preds, axis=0)) > threshold).astype(np.int)
# submission['target'] = preds
# submission.to_csv('submission_{}.csv'.format(seed), index=False)
# submission.head()



In [71]:
preds

[[array([0.1041708], dtype=float32),
  array([0.1041708], dtype=float32),
  array([0.1041708], dtype=float32),
  array([0.05182594], dtype=float32),
  array([0.05182594], dtype=float32),
  array([0.05182594], dtype=float32),
  array([0.16023049], dtype=float32),
  array([0.16023049], dtype=float32),
  array([0.16023049], dtype=float32),
  array([0.01475626], dtype=float32),
  array([0.01475626], dtype=float32),
  array([0.01475626], dtype=float32),
  array([0.00903603], dtype=float32),
  array([0.00903603], dtype=float32),
  array([0.00903603], dtype=float32),
  array([0.00253165], dtype=float32),
  array([0.00253165], dtype=float32),
  array([0.00253165], dtype=float32),
  array([0.00270239], dtype=float32),
  array([0.00270239], dtype=float32),
  array([0.00270239], dtype=float32),
  array([0.00298837], dtype=float32),
  array([0.00298837], dtype=float32),
  array([0.00298837], dtype=float32),
  array([0.00362104], dtype=float32),
  array([0.00362104], dtype=float32),
  array([0.0036

In [60]:
sum(preds_test)/3

616.0

In [33]:
pd_pred = []
for p in pd_p:
    if p == 1:
        pd_pred.append(pd)


In [72]:
first_results = pd.concat([test_meta,pd.DataFrame(preds_test[0:20336])], axis = 1)

In [73]:
first_results.columns

Index(['signal_id', 'id_measurement', 'phase', 0], dtype='object')

In [74]:
first_results[first_results[0] == 1]

Unnamed: 0,signal_id,id_measurement,phase,0
108,8820,2940,0,1.0
109,8821,2940,1,1.0
110,8822,2940,2,1.0
111,8823,2941,0,1.0
112,8824,2941,1,1.0
113,8825,2941,2,1.0
114,8826,2942,0,1.0
115,8827,2942,1,1.0
116,8828,2942,2,1.0
126,8838,2946,0,1.0


In [None]:
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support