In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

import matplotlib.pyplot as plt
import random
import math
from scipy.signal import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Input, Dense, Conv1D, MaxPooling1D, Softmax, Add, Flatten, Activation
from keras.models import Model
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, label_ranking_average_precision_score, label_ranking_loss, coverage_error 
import imblearn
from sklearn import utils

In [None]:
# Read the four header-less CSV files in one pass; the order of the paths
# matches the order of the target names on the left-hand side.
data_mit_train, data_mit_test, data_pt_abnormal, data_pt_normal = [
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "../input/heartbeat/mitbih_train.csv",
        "../input/heartbeat/mitbih_test.csv",
        "../input/heartbeat/ptbdb_abnormal.csv",
        "../input/heartbeat/ptbdb_normal.csv",
    )
]

In [None]:
# Stack the parts of each dataset row-wise into a single frame per dataset.
# Row labels of the parts are kept as-is, so index values repeat.
data_mit = pd.concat(objs=[data_mit_train, data_mit_test], axis="index")
data_pt = pd.concat(objs=[data_pt_abnormal, data_pt_normal], axis="index")

In [None]:
# Show the first rows of the combined MIT-BIH frame.
data_mit.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the MIT-BIH frame.
data_mit.info()

In [None]:
# Rows per distinct value in column 187 (the last column, used below as the class label).
data_mit[187].value_counts()

In [None]:
# Print column dtypes, non-null counts and memory usage of the PTB frame.
data_pt.info()

In [None]:
# Rows per distinct value in column 187 (the last column, used below as the class label).
data_pt[187].value_counts()

In [None]:
# Convert the MIT-BIH frame to an array: every column but the last is the
# signal, the last column is the label (cast to int).
data_mit_numpy = data_mit.to_numpy()
data_mit_x, data_mit_y = data_mit_numpy[:, :-1], data_mit_numpy[:, -1].astype(int)

In [None]:
# Convert the PTB frame to an array: every column but the last is the
# signal, the last column is the label (cast to int).
data_pt_numpy = data_pt.to_numpy()
data_pt_x, data_pt_y = data_pt_numpy[:, :-1], data_pt_numpy[:, -1].astype(int)

**Get the rows corresponding to the categories**

In [None]:
# Row positions of each MIT-BIH label value 0..4, as flat integer index arrays.
C0, C1, C2, C3, C4 = (np.flatnonzero(data_mit_y == label) for label in range(5))

In [None]:
# Row positions of each PTB label value 0 and 1, as flat integer index arrays.
P0, P1 = (np.flatnonzero(data_pt_y == label) for label in range(2))

In [None]:
# Overlay one example beat per class: the second row (position 1) of every
# MIT-BIH class and of both PTB classes, against the 187 sample positions.
x = np.arange(187)
plt.plot(x, data_mit_x[C0[1]])
plt.plot(x, data_mit_x[C1[1]])
plt.plot(x, data_mit_x[C2[1]])
plt.plot(x, data_mit_x[C3[1]])
plt.plot(x, data_mit_x[C4[1]])
plt.plot(x, data_pt_x[P0[1]])
plt.plot(x, data_pt_x[P1[1]])

In [None]:
def stretch(data):
    """Randomly stretch or compress a 1-D signal in time, keeping its length.

    The signal is resampled (``scipy.signal.resample``) to a random length
    between roughly 5/6 and 7/6 of the original. A shorter result is
    zero-padded at the end; a longer result is truncated.

    Parameters
    ----------
    data : 1-D array-like
        The signal to warp.

    Returns
    -------
    numpy.ndarray
        1-D array with the same number of samples as ``data``.
    """
    n_samples = len(data)
    new_len = int(n_samples * (1 + (random.random() - 0.5) / 3))
    warped = resample(data, new_len)
    if new_len < n_samples:
        padded = np.zeros(shape=(n_samples, ))
        padded[:new_len] = warped
        return padded
    return warped[:n_samples]


def amplify(data):
    """Apply a random, value-dependent gain to a signal.

    With ``alpha`` drawn uniformly from [-0.5, 0.5), each sample ``v`` is
    multiplied by ``1 + alpha * (1 - v)``; samples equal to 1 are unchanged.

    Parameters
    ----------
    data : numpy.ndarray
        The signal to rescale.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``data``.
    """
    alpha = (random.random() - 0.5)
    factor = -alpha * data + (1 + alpha)
    return data * factor


def augment(data):
    """Create four randomly augmented variants of one signal.

    Each variant is, with roughly equal probability, a stretched copy, an
    amplified copy, or a stretched-then-amplified copy of ``data``.

    Parameters
    ----------
    data : 1-D numpy.ndarray
        The signal to augment.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(4, len(data))``, one variant per row.
    """
    n_variants = 4
    result = np.zeros(shape=(n_variants, len(data)))
    # Fill every allocated row: a row left untouched would stay all-zero and
    # be handed to the caller as if it were a real augmented sample.
    for i in range(n_variants):
        draw = random.random()
        if draw < 0.33:
            result[i, :] = stretch(data)
        elif draw < 0.66:
            result[i, :] = amplify(data)
        else:
            result[i, :] = amplify(stretch(data))
    return result

In [None]:
# Visual check on the row at position 1: original beat, a stretched copy and
# an amplified copy; then build the full set of augmented variants for it.
plt.plot(data_mit_x[1])
plt.plot(stretch(data_mit_x[1]))
plt.plot(amplify(data_mit_x[1]))
result = augment(data_mit_x[1])

* **augment class 3**

In [None]:
# Augment every class-3 row, flatten the per-row variants into a 2-D array of
# 187-sample signals, label them 3, and append them to the original data.
result = np.apply_along_axis(augment, 1, data_mit_x[C3]).reshape(-1, 187)
result_label = np.full(result.shape[0], 3, dtype=int)
data_mit_x_augment = np.concatenate((data_mit_x, result), axis=0)
data_mit_y_augment = np.concatenate((data_mit_y, result_label))

In [None]:
# Distinct label values and how many rows carry each, after augmentation.
unique, counts = np.unique(data_mit_y_augment, return_counts=True)
print(unique, counts)

* **train and test datasets split**

https://www.kaggle.com/shahules/tackling-class-imbalance 

https://www.kaggle.com/c/ieee-fraud-detection/discussion/100268

处理数据不平衡的几个方法：
* 欠采样
* 过采样
* 混合采样
* cost-sensitive(waiting to try)
* 集成学习方法(waiting to try)

过采样和欠采样建议使用过采样，因为欠采样可能会损失部分信息。

In [None]:
# Hold out 10% of the augmented MIT-BIH rows with a fixed seed.
# NOTE(review): the class-3 augmentation runs before this split, so variants
# derived from the same beat can land on both sides — confirm this is acceptable.
(
    data_mit_x_augment_train,
    data_mit_x_augment_test,
    data_mit_y_augment_train,
    data_mit_y_augment_test,
) = train_test_split(
    data_mit_x_augment,
    data_mit_y_augment,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Hold out 10% of the PTB rows with a fixed seed.
(
    data_pt_x_train,
    data_pt_x_test,
    data_pt_y_train,
    data_pt_y_test,
) = train_test_split(
    data_pt_x,
    data_pt_y,
    test_size=0.1,
    random_state=42,
)

# **up-sample 对样本数少的样本上采样**

1. divide by the class

In [None]:
# Row positions of each label value 0..4 inside the training split.
C0_augment, C1_augment, C2_augment, C3_augment, C4_augment = (
    np.flatnonzero(data_mit_y_augment_train == label) for label in range(5)
)

# Training labels, one array per class.
data_mit_y_augment_train_C0 = np.take(data_mit_y_augment_train, C0_augment)
data_mit_y_augment_train_C1 = np.take(data_mit_y_augment_train, C1_augment)
data_mit_y_augment_train_C2 = np.take(data_mit_y_augment_train, C2_augment)
data_mit_y_augment_train_C3 = np.take(data_mit_y_augment_train, C3_augment)
data_mit_y_augment_train_C4 = np.take(data_mit_y_augment_train, C4_augment)

# Training signals, one 2-D array per class.
data_mit_x_augment_train_C0 = np.take(data_mit_x_augment_train, C0_augment, axis=0)
data_mit_x_augment_train_C1 = np.take(data_mit_x_augment_train, C1_augment, axis=0)
data_mit_x_augment_train_C2 = np.take(data_mit_x_augment_train, C2_augment, axis=0)
data_mit_x_augment_train_C3 = np.take(data_mit_x_augment_train, C3_augment, axis=0)
data_mit_x_augment_train_C4 = np.take(data_mit_x_augment_train, C4_augment, axis=0)



In [None]:
# Number of class-0 rows in the training split.
C0_augment.shape

In [None]:
# Shape of the class-0 training signals (rows, samples per signal).
data_mit_x_augment_train_C0.shape

combine x and y-label

In [None]:
# Append each class's label vector as a last column to its signal matrix, so
# signals and labels stay aligned while rows are resampled.
data_mit_augment_train_C0 = np.column_stack((data_mit_x_augment_train_C0, data_mit_y_augment_train_C0))
data_mit_augment_train_C1 = np.column_stack((data_mit_x_augment_train_C1, data_mit_y_augment_train_C1))
data_mit_augment_train_C2 = np.column_stack((data_mit_x_augment_train_C2, data_mit_y_augment_train_C2))
data_mit_augment_train_C3 = np.column_stack((data_mit_x_augment_train_C3, data_mit_y_augment_train_C3))
data_mit_augment_train_C4 = np.column_stack((data_mit_x_augment_train_C4, data_mit_y_augment_train_C4))

2. up-sample the 4 minority classes

In [None]:
# Draw rows with replacement from classes 1-4 until each has as many rows as
# class 0. The same fixed seed is used for every class.
data_mit_augment_train_C1_resample = utils.resample(
    data_mit_augment_train_C1, replace=True, n_samples=len(C0_augment), random_state=27
)
data_mit_augment_train_C2_resample = utils.resample(
    data_mit_augment_train_C2, replace=True, n_samples=len(C0_augment), random_state=27
)
data_mit_augment_train_C3_resample = utils.resample(
    data_mit_augment_train_C3, replace=True, n_samples=len(C0_augment), random_state=27
)
data_mit_augment_train_C4_resample = utils.resample(
    data_mit_augment_train_C4, replace=True, n_samples=len(C0_augment), random_state=27
)

3. combine the datasets after resampling

In [None]:
# Stack class 0 (left as-is) on top of the four up-sampled classes.
data_mit_augment_resample_train = np.concatenate(
    (
        data_mit_augment_train_C0,
        data_mit_augment_train_C1_resample,
        data_mit_augment_train_C2_resample,
        data_mit_augment_train_C3_resample,
        data_mit_augment_train_C4_resample,
    ),
    axis=0,
)
print(data_mit_augment_resample_train.shape)

4. split into x and y

In [None]:
# Separate the balanced training set back into signals and integer labels.
# This rebinds the training arrays produced by the earlier train/test split.
data_mit_x_augment_train, data_mit_y_augment_train = (
    data_mit_augment_resample_train[:, :-1],
    data_mit_augment_resample_train[:, -1].astype(int),
)

print(data_mit_x_augment_train.shape, data_mit_y_augment_train.shape, sep="\n")

In [None]:
# One shape per output line: MIT-BIH train/test signals and labels, then the
# same four arrays for PTB.
print(
    data_mit_x_augment_train.shape,
    data_mit_x_augment_test.shape,
    data_mit_y_augment_train.shape,
    data_mit_y_augment_test.shape,
    data_pt_x_train.shape,
    data_pt_x_test.shape,
    data_pt_y_train.shape,
    data_pt_y_test.shape,
    sep="\n",
)

In [None]:
# Give the signals a trailing axis of size 1 (used as the Conv1D input depth
# below) and turn the label vectors into single-column 2-D arrays.
data_mit_x_augment_train_expand = data_mit_x_augment_train[:, :, np.newaxis]
data_mit_x_augment_test_expand = data_mit_x_augment_test[:, :, np.newaxis]
data_mit_y_augment_train_expand = data_mit_y_augment_train[:, np.newaxis]
data_mit_y_augment_test_expand = data_mit_y_augment_test[:, np.newaxis]

In [None]:
# Give the signals a trailing axis of size 1 (used as the Conv1D input depth
# below) and turn the label vectors into single-column 2-D arrays.
data_pt_x_train_expand = data_pt_x_train[:, :, np.newaxis]
data_pt_x_test_expand = data_pt_x_test[:, :, np.newaxis]
data_pt_y_train_expand = data_pt_y_train[:, np.newaxis]
data_pt_y_test_expand = data_pt_y_test[:, np.newaxis]

In [None]:
# Shape of the MIT-BIH training signals after the trailing axis was added.
data_mit_x_augment_train_expand.shape

In [None]:
# Shape of the PTB training signals after the trailing axis was added.
data_pt_x_train_expand.shape

**OneHotEncoder labels**

In [None]:
# One-hot encode the MIT-BIH labels. The encoder is fitted on the training
# labels only and then applied to the test labels.
# OneHotEncoder returns a SciPy sparse matrix by default; convert to a dense
# ndarray so the labels can be passed to Keras directly and `.argmax(axis=1)`
# yields a plain 1-D array rather than an `np.matrix`.
ohe = OneHotEncoder()
data_mit_y_augment_train_expand_ohe = ohe.fit_transform(data_mit_y_augment_train_expand).toarray()
data_mit_y_augment_test_expand_ohe = ohe.transform(data_mit_y_augment_test_expand).toarray()

**PTB dataset: the MIT-BIH encoder is not reused; a separate OneHotEncoder is fitted**

In [None]:
# One-hot encode the PTB labels with their own encoder, fitted on the training
# labels only and then applied to the test labels.
# OneHotEncoder returns a SciPy sparse matrix by default; convert to a dense
# ndarray so the labels can be passed to Keras directly and `.argmax(axis=1)`
# yields a plain 1-D array rather than an `np.matrix`.
ohe_pt = OneHotEncoder()
data_pt_y_train_expand_ohe = ohe_pt.fit_transform(data_pt_y_train_expand).toarray()
data_pt_y_test_expand_ohe = ohe_pt.transform(data_pt_y_test_expand).toarray()

In [None]:
# Peek at one entry (row 0, column 4) of the encoded MIT-BIH training labels.
data_mit_y_augment_train_expand_ohe[0, 4]

In [None]:
# Peek at one entry (row 0, column 1) of the encoded PTB training labels.
data_pt_y_train_expand_ohe[0, 1]

In [None]:
# Mini-batch size for training, plus the three dimensions of the MIT-BIH
# training tensor (rows, samples per signal, trailing depth axis).
batch_size = 200
obj_nums, feature_nums, depth = data_mit_x_augment_train_expand.shape
print("dataset_size:", obj_nums, "feature_nums", feature_nums, "depth:", depth)

In [None]:
# Residual 1-D CNN for the 5-class MIT-BIH task, built with the functional API.
# Every weighted layer gets an explicit name so that a weight file written from
# this model can be matched with `load_weights(..., by_name=True)` regardless of
# how many layers were created earlier in the session (auto-generated layer
# names depend on that history).
inp = Input(shape=(feature_nums, depth))
# Stem convolution: no padding argument, unlike the 'same'-padded block convolutions.
C = Conv1D(filters=32, kernel_size=5, strides=1, name="conv_stem")(inp)

# Residual block 1: conv -> relu -> conv, add the block input, relu, max-pool.
C11 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block1_conv1")(C)
A11 = Activation("relu")(C11)
C12 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block1_conv2")(A11)
S11 = Add()([C12, C])
A12 = Activation("relu")(S11)
M11 = MaxPooling1D(pool_size=5, strides=2)(A12)


# Residual block 2.
C21 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block2_conv1")(M11)
A21 = Activation("relu")(C21)
C22 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block2_conv2")(A21)
S21 = Add()([C22, M11])
# The activation must consume this block's own sum (S21); feeding S11 here
# would leave C21/C22/S21 disconnected from the model output.
A22 = Activation("relu")(S21)
M21 = MaxPooling1D(pool_size=5, strides=2)(A22)


# Residual block 3.
C31 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block3_conv1")(M21)
A31 = Activation("relu")(C31)
C32 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block3_conv2")(A31)
S31 = Add()([C32, M21])
A32 = Activation("relu")(S31)
M31 = MaxPooling1D(pool_size=5, strides=2)(A32)


# Residual block 4.
C41 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block4_conv1")(M31)
A41 = Activation("relu")(C41)
C42 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block4_conv2")(A41)
S41 = Add()([C42, M31])
A42 = Activation("relu")(S41)
M41 = MaxPooling1D(pool_size=5, strides=2)(A42)


# Residual block 5.
C51 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block5_conv1")(M41)
A51 = Activation("relu")(C51)
C52 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block5_conv2")(A51)
S51 = Add()([C52, M41])
A52 = Activation("relu")(S51)
M51 = MaxPooling1D(pool_size=5, strides=2)(A52)

F1 = Flatten()(M51)

# Classification head: two 32-unit dense layers, then a 5-way softmax.
D1 = Dense(32, name="mit_dense1")(F1)
A6 = Activation("relu")(D1)
# NOTE(review): there is no activation between D2 and D3, so the two layers
# compose into a single linear map — confirm this is intended.
D2 = Dense(32, name="mit_dense2")(A6)
D3 = Dense(5, name="mit_logits")(D2)
A7 = Softmax()(D3)

model = Model(inputs=inp, outputs=A7)

model.summary()

In [None]:
def exp_decay(epoch):
    """Step-wise exponential learning-rate schedule.

    Reads the module-level ``obj_nums`` and ``batch_size``. With ``obj_nums``
    samples per epoch this counts how many whole blocks of 10000 batches fit
    into ``epoch`` epochs, and multiplies the base rate of 0.001 by
    ``exp(-0.75)`` once per such block.

    Parameters
    ----------
    epoch : int
        Epoch index supplied by the scheduler callback.

    Returns
    -------
    float
        Learning rate to use for that epoch.
    """
    base_rate, decay = 0.001, 0.75
    completed_blocks = (epoch * obj_nums) // (10000 * batch_size)
    return base_rate * math.exp(-decay * completed_blocks)

# Keras callback that asks exp_decay for the learning rate of each epoch.
callback = LearningRateScheduler(exp_decay)

In [None]:
# Adam optimizer with an explicit learning rate and moment decay rates.
adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

In [None]:
# Train against one-hot targets with cross-entropy and report accuracy.
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

In [None]:
# Checkpoint callback: with save_best_only=True the file is only overwritten
# when the monitored quantity (left at the Keras default here) improves.
modelcheckpoint = ModelCheckpoint("/kaggle/working/best_mit.hdf5", save_best_only=True, verbose=1)

In [None]:
# Train for 10 epochs with the learning-rate schedule and checkpointing.
# NOTE(review): the held-out test split doubles as validation data, so the
# checkpoint selection sees the same rows the model is later scored on.
model.fit(
    x=data_mit_x_augment_train_expand,
    y=data_mit_y_augment_train_expand_ohe,
    batch_size=batch_size,
    epochs=10,
    verbose=2,
    callbacks=[callback, modelcheckpoint],
    validation_data=(data_mit_x_augment_test_expand, data_mit_y_augment_test_expand_ohe),
)

In [None]:
# Write the model's current in-memory weights (the state after the last
# training epoch) to disk; this is a separate file from the ModelCheckpoint
# output "best_mit.hdf5".
model.save_weights('/kaggle/working/best_mit.h5')

In [None]:
# Class probabilities for the MIT-BIH test split, predicted in batches of 1000.
y_pred = model.predict(data_mit_x_augment_test_expand, batch_size=1000)

In [None]:
# argmax over axis 1 maps one-hot targets and predicted probabilities back to
# class indices before the per-class report is computed.
print(classification_report(data_mit_y_augment_test_expand_ohe.argmax(axis=1), y_pred.argmax(axis=1)))

In [None]:
# Confusion matrix of true vs. predicted class indices on the MIT-BIH test split.
cnf_matrix = confusion_matrix(data_mit_y_augment_test_expand_ohe.argmax(axis=1), y_pred.argmax(axis=1))

In [None]:
# Show the raw confusion-matrix counts.
print(cnf_matrix)

In [None]:
# Mini-batch size for training, plus the three dimensions of the PTB
# training tensor (rows, samples per signal, trailing depth axis).
batch_size = 200
obj_nums_ptb, feature_nums_ptb, depth_ptb = data_pt_x_train_expand.shape
print("dataset_size:", obj_nums_ptb, "feature_nums", feature_nums_ptb, "depth:", depth_ptb)

In [None]:
# Residual 1-D CNN for the 2-class PTB task, intended for transfer learning:
# a frozen convolutional stack plus a fresh, trainable classification head.
#
# * `trainable` is a layer attribute, so the Conv1D layers themselves are
#   created with trainable=False; setting it on their output tensors freezes
#   nothing. The other backbone layers (activation, add, pooling) have no weights.
# * The convolution layer names (conv_stem, blockK_convJ) are the keys that
#   `load_weights(..., by_name=True)` matches against a weight file. The head
#   uses ptb_* names so its differently shaped weights are never matched.
# * Frozen convolutions are only useful once pretrained weights have been
#   loaded into them.
inp = Input(shape=(feature_nums_ptb, depth_ptb))
# Stem convolution: no padding argument, unlike the 'same'-padded block convolutions.
C = Conv1D(filters=32, kernel_size=5, strides=1, name="conv_stem", trainable=False)(inp)

# Residual block 1: conv -> relu -> conv, add the block input, relu, max-pool.
C11 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block1_conv1", trainable=False)(C)
A11 = Activation("relu")(C11)
C12 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block1_conv2", trainable=False)(A11)
S11 = Add()([C12, C])
A12 = Activation("relu")(S11)
M11 = MaxPooling1D(pool_size=5, strides=2)(A12)


# Residual block 2.
C21 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block2_conv1", trainable=False)(M11)
A21 = Activation("relu")(C21)
C22 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block2_conv2", trainable=False)(A21)
S21 = Add()([C22, M11])
# The activation must consume this block's own sum (S21); feeding S11 here
# would leave C21/C22/S21 disconnected from the model output.
A22 = Activation("relu")(S21)
M21 = MaxPooling1D(pool_size=5, strides=2)(A22)


# Residual block 3.
C31 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block3_conv1", trainable=False)(M21)
A31 = Activation("relu")(C31)
C32 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block3_conv2", trainable=False)(A31)
S31 = Add()([C32, M21])
A32 = Activation("relu")(S31)
M31 = MaxPooling1D(pool_size=5, strides=2)(A32)


# Residual block 4.
C41 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block4_conv1", trainable=False)(M31)
A41 = Activation("relu")(C41)
C42 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block4_conv2", trainable=False)(A41)
S41 = Add()([C42, M31])
A42 = Activation("relu")(S41)
M41 = MaxPooling1D(pool_size=5, strides=2)(A42)


# Residual block 5.
C51 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block5_conv1", trainable=False)(M41)
A51 = Activation("relu")(C51)
C52 = Conv1D(filters=32, kernel_size=5, strides=1, padding='same', name="block5_conv2", trainable=False)(A51)
S51 = Add()([C52, M41])
A52 = Activation("relu")(S51)
M51 = MaxPooling1D(pool_size=5, strides=2)(A52)

F1_PTB = Flatten()(M51)

# Trainable classification head: two 32-unit dense layers, then a 2-way softmax.
D1_PTB = Dense(32, name="ptb_dense1")(F1_PTB)
A6_PTB = Activation("relu")(D1_PTB)
# NOTE(review): there is no activation between D2_PTB and D3_PTB, so the two
# layers compose into a single linear map — confirm this is intended.
D2_PTB = Dense(32, name="ptb_dense2")(A6_PTB)
D3_PTB = Dense(2, name="ptb_logits")(D2_PTB)
A7_PTB = Softmax()(D3_PTB)

model_ptb = Model(inputs=inp, outputs=A7_PTB)

model_ptb.summary()

In [None]:
# Compile with a fresh optimizer instance: `adam` has already been used to
# compile and train `model`, and an optimizer's internal state (step counter,
# moment estimates) belongs to the variables it has been updating.
model_ptb.compile(optimizer=Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
                  loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# model_ptb.load_weights("../input/heartbeat-categorization-dataset-output-mit-train/best_mit.h5")
# Load the weights saved from the MIT-BIH model. With by_name=True only layers
# whose name also appears in the weight file are filled; layers without a match
# keep their current weights and no error is raised.
model_ptb.load_weights("/kaggle/working/best_mit.h5", by_name=True)

In [None]:
# Train the PTB model for 10 epochs, validating on the PTB test split.
# NOTE(review): `callback` wraps exp_decay, which reads the global `obj_nums`
# (the MIT-BIH training size), not `obj_nums_ptb` — confirm the schedule is
# meant to be shared between the two trainings.
model_ptb.fit(
    x=data_pt_x_train_expand,
    y=data_pt_y_train_expand_ohe,
    batch_size=batch_size,
    epochs=10,
    verbose=2,
    callbacks=[callback],
    validation_data=(data_pt_x_test_expand, data_pt_y_test_expand_ohe),
)

In [None]:
# Class probabilities for the PTB test split, predicted in batches of 1000.
y_pred_ptb = model_ptb.predict(data_pt_x_test_expand, batch_size=1000)

In [None]:
# argmax over axis 1 maps one-hot targets and predicted probabilities back to
# class indices before the per-class report is computed.
print(classification_report(data_pt_y_test_expand_ohe.argmax(axis=1), y_pred_ptb.argmax(axis=1)))