In [1]:
import codecs, glob, os
import numpy as np
import pandas as pd

import paddle.nn.functional as F
import paddle
import paddle.nn as nn
from paddle.io import DataLoader, Dataset
import paddle.optimizer as optim
from paddlenlp.data import Pad

import scipy.io as sio

In [2]:
# Read every training recording in path-sorted order; each .mat file keeps
# its signal under the 'ecgdata' key.
train_mat = [
    sio.loadmat(mat_path)['ecgdata']
    for mat_path in sorted(glob.glob('./train/*.mat'))
]

# The recordings under ./val are read the same way and are used later as the
# prediction set.
test_mat = [
    sio.loadmat(mat_path)['ecgdata']
    for mat_path in sorted(glob.glob('./val/*.mat'))
]

# Reference labels; the 'tag' column is cast to float32.
train_df = pd.read_csv('trainreference.csv').astype({'tag': np.float32})

In [3]:
class MyDataset(Dataset):
    """Pairs each recording in ``mat`` with the entry of ``label`` at the same index.

    Args:
        mat: Indexable collection of 2-D arrays (each is sliced as ``[:, :mat_dim]``).
        label: Indexable collection of labels, aligned with ``mat``.
        mat_dim: How many leading columns of each recording to keep.
    """

    def __init__(self, mat, label, mat_dim=5000):
        super().__init__()
        self.mat, self.label, self.mat_dim = mat, label, mat_dim

    def __len__(self):
        """Number of recordings held by the dataset."""
        return len(self.mat)

    def __getitem__(self, index):
        """Return ``(signal_tensor, label)`` for the sample at ``index``."""
        recording = self.mat[index]
        signal = paddle.to_tensor(recording[:, :self.mat_dim])
        return signal, self.label[index]



In [4]:
class TextCNN_bak(paddle.nn.Layer):
    """2-D convolutional TextCNN-style binary classifier (backup variant).

    One ``Conv2D`` branch is built per entry of ``kernel_size``. Every kernel
    spans the full last axis (``seq_len``) of the input, so each branch yields
    one feature per vertical position; those are max-pooled to a single value
    per filter, concatenated across branches and mapped to one logit.

    Args:
        kernel_num: Number of filters in every convolution branch.
        kernel_size: Kernel heights, one convolution branch per entry.
        dropout: Probability used to build the ``nn.Dropout`` layer.
        seq_len: Size of the input's last axis; used as the kernel width.
            Defaults to 3000, the value that was previously hard-coded.
    """

    def __init__(self, kernel_num=30, kernel_size=[3, 4, 5], dropout=0.5, seq_len=3000):
        # Fixed: this used to call super(TextCNN, self).__init__(), which
        # raises TypeError because TextCNN_bak is not a subclass of TextCNN.
        super(TextCNN_bak, self).__init__()
        self.kernel_num = kernel_num
        self.kernel_size = kernel_size

        self.convs = nn.LayerList([
            nn.Conv2D(1, self.kernel_num, (kernel_size_, seq_len))
            for kernel_size_ in self.kernel_size
        ])
        # Assign the layer directly instead of first storing the float under
        # the same attribute name.
        self.dropout = nn.Dropout(dropout)
        # One pooled vector of size kernel_num per branch (was hard-coded to 3).
        self.linear = nn.Linear(len(self.kernel_size) * self.kernel_num, 1)

    def forward(self, x):
        """Return one logit per sample.

        Assumes ``x`` is shaped (batch, 1, height, seq_len) so that the
        convolution output has a trailing axis of size 1 -- TODO confirm
        against the caller.
        """
        # (batch, kernel_num, height - k + 1, 1) -> drop the trailing axis.
        convs = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over the whole remaining axis -> (batch, kernel_num) per branch.
        pool_out = [nn.MaxPool1D(block.shape[2])(block).squeeze(2) for block in convs]
        pool_out = paddle.concat(pool_out, 1)
        # NOTE(review): self.dropout is constructed but never applied here;
        # left as in the original -- confirm whether that is intended.
        logits = self.linear(pool_out)

        return logits

class TextCNN(nn.Layer):
    """Multi-scale 1-D CNN that outputs one logit per recording.

    Four parallel ``Conv1D`` branches (12 input channels -> 10 filters each,
    kernel sizes 50 / 200 / 500 / 1000) are each followed by ReLU and a
    ``MaxPool1D(200)``. The pooled maps are concatenated along the length
    axis, flattened and passed through two linear layers.

    ``fc1`` expects 900 flattened features. For an input length of 5000 the
    branches pool down to 24 + 24 + 22 + 20 = 90 positions x 10 filters = 900;
    other input lengths will not match ``fc1``.
    """

    def __init__(self):
        super(TextCNN, self).__init__()
        # Attribute names are kept as-is so previously saved state dicts
        # still load.
        self.conv1 = nn.Conv1D(12, 10, 50)
        self.conv2 = nn.Conv1D(12, 10, 200)
        self.conv3 = nn.Conv1D(12, 10, 500)
        self.conv4 = nn.Conv1D(12, 10, 1000)
        self.pooling = nn.MaxPool1D(200)
        self.fc1 = nn.Linear(900, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a batch shaped (batch, 12, length) to logits shaped (batch, 1)."""
        batch_size = x.shape[0]

        branches = [
            self.pooling(F.relu(conv(x)))
            for conv in (self.conv1, self.conv2, self.conv3, self.conv4)
        ]

        # Join the branches along the length axis, then flatten per sample.
        out = paddle.concat(branches, 2)
        out = paddle.reshape(out, [batch_size, -1])
        out = F.relu(self.fc1(out))
        # Fixed: the functional dropout defaults to training=True, so it kept
        # dropping units after model.eval(). Tie it to the layer's mode so
        # evaluation and inference are deterministic.
        out = F.dropout(out, p=0.5, training=self.training)
        out = self.fc2(out)

        return out

In [5]:
# Training hyper-parameters.
BATCH_SIZE = 30
EPOCHS = 200
LEARNING_RATE = 1e-4

# Build one network instance up front.
model = TextCNN()

# Record and display the device string Paddle reports as current.
device = paddle.device.get_device()
print(device)

W1127 17:49:23.307835   180 device_context.cc:404] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W1127 17:49:23.313158   180 device_context.cc:422] device: 0, cuDNN Version: 7.6.


gpu:0


In [11]:
# paddle.summary(model, (64, 1, 9, 3000))

In [6]:
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified splitter. shuffle=False is the library default; it is
# spelled out so it is clear the folds follow the existing sample order.
skf = StratifiedKFold(n_splits=10, shuffle=False)

In [7]:
# Cross-validated training: for every fold a fresh TextCNN is trained for
# EPOCHS epochs, and the weights with the best validation accuracy seen so far
# are written to model_{fold_idx}.pdparams.

# Built once instead of twice per fold.
all_signals = np.array(train_mat)
all_labels = train_df['tag'].values

# get_device() returns strings such as 'gpu:0', so the previous check
# `device == 'gpu'` could never be true; match on the prefix instead.
use_gpu = device.startswith('gpu')

for fold_idx, (tr_idx, val_idx) in enumerate(skf.split(train_mat, all_labels)):
    Train_Loader = DataLoader(
        MyDataset(all_signals[tr_idx], paddle.to_tensor(all_labels[tr_idx])),
        batch_size=BATCH_SIZE, shuffle=True)
    # Validation order does not need to be random; a fixed order keeps the
    # per-batch-averaged metrics reproducible from epoch to epoch.
    Val_Loader = DataLoader(
        MyDataset(all_signals[val_idx], paddle.to_tensor(all_labels[val_idx])),
        batch_size=BATCH_SIZE, shuffle=False)
    model = TextCNN()

    optimizer = optim.Adam(parameters=model.parameters(), learning_rate=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss()

    Test_best_Acc = 0
    for epoch in range(EPOCHS):
        Train_Loss, Test_Loss = [], []
        Train_Acc, Test_Acc = [], []

        # ---- training pass ----
        model.train()
        for x, y in Train_Loader:
            if use_gpu:
                x = x.cuda()
                y = y.cuda()
            pred = model(x)
            loss = criterion(pred, y)
            Train_Loss.append(loss.item())

            # Threshold the sigmoid probability at 0.5 for the accuracy metric.
            pred = (paddle.nn.functional.sigmoid(pred) > 0.5).astype(int)
            Train_Acc.append((pred.numpy() == y.numpy()).mean())
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

        # ---- validation pass (no gradients are needed) ----
        model.eval()
        with paddle.no_grad():
            for x, y in Val_Loader:
                if use_gpu:
                    x = x.cuda()
                    y = y.cuda()

                pred = model(x)
                Test_Loss.append(criterion(pred, y).item())
                pred = (paddle.nn.functional.sigmoid(pred) > 0.5).astype(int)
                Test_Acc.append((pred.numpy() == y.numpy()).mean())

        # Metrics are means of per-batch values, as before.
        val_acc = np.mean(Test_Acc)

        if epoch % 10 == 0:
            print(
                "Epoch: [{}/{}] TrainLoss/TestLoss: {:.4f}/{:.4f} TrainAcc/TestAcc: {:.4f}/{:.4f}".format(
                    epoch + 1, EPOCHS,
                    np.mean(Train_Loss), np.mean(Test_Loss),
                    np.mean(Train_Acc), val_acc,
                )
            )

        # Checkpoint whenever validation accuracy strictly improves.
        if Test_best_Acc < val_acc:
            print(f'Fold {fold_idx} Acc imporve from {Test_best_Acc} to {val_acc} Save Model...')
            paddle.save(model.state_dict(), f"model_{fold_idx}.pdparams")
            Test_best_Acc = val_acc

Epoch: [1/200] TrainLoss/TestLoss: 1.1079/0.9065 TrainAcc/TestAcc: 0.5292/0.4722
Fold 0 Acc imporve from 0 to 0.47222222222222215 Save Model...
Fold 0 Acc imporve from 0.47222222222222215 to 0.5222222222222223 Save Model...
Fold 0 Acc imporve from 0.5222222222222223 to 0.5944444444444444 Save Model...
Fold 0 Acc imporve from 0.5944444444444444 to 0.6499999999999999 Save Model...
Fold 0 Acc imporve from 0.6499999999999999 to 0.6555555555555556 Save Model...
Fold 0 Acc imporve from 0.6555555555555556 to 0.7444444444444445 Save Model...
Fold 0 Acc imporve from 0.7444444444444445 to 0.7611111111111111 Save Model...
Epoch: [11/200] TrainLoss/TestLoss: 0.5972/0.5881 TrainAcc/TestAcc: 0.6972/0.6889
Fold 0 Acc imporve from 0.7611111111111111 to 0.7888888888888889 Save Model...
Epoch: [21/200] TrainLoss/TestLoss: 0.4133/0.5311 TrainAcc/TestAcc: 0.8208/0.7444
Fold 0 Acc imporve from 0.7888888888888889 to 0.8333333333333335 Save Model...
Epoch: [31/200] TrainLoss/TestLoss: 0.3384/0.5149 TrainAcc/

In [8]:
# Ensemble inference: average the sigmoid outputs of every fold checkpoint,
# each evaluated tta_count times, into one probability per test recording.
n_folds = 10   # must match the number of model_{fold_idx}.pdparams checkpoints
tta_count = 20
test_perd = np.zeros(len(test_mat))

# get_device() returns strings such as 'gpu:0', so compare on the prefix
# (the previous `device == 'gpu'` check could never be true).
use_gpu = device.startswith('gpu')

# The loader does not depend on the fold, so build it once. The labels are
# dummies that only satisfy the dataset interface.
Test_Loader = DataLoader(MyDataset(test_mat, paddle.to_tensor([0] * len(test_mat))),
                         batch_size=BATCH_SIZE, shuffle=False)

# Use a network created here rather than whatever `model` an earlier cell
# happened to leave behind; its weights are overwritten per fold below.
model = TextCNN()

for fold_idx in range(n_folds):
    layer_state_dict = paddle.load(f"model_{fold_idx}.pdparams")
    model.set_state_dict(layer_state_dict)
    model.eval()

    with paddle.no_grad():
        # NOTE(review): no input augmentation is applied, so the repeated
        # passes only differ if the network is stochastic at inference time.
        for tta in range(tta_count):
            test_pred_list = []
            for x, _ in Test_Loader:
                if use_gpu:
                    x = x.cuda()

                pred = model(x)
                test_pred_list.append(
                    paddle.nn.functional.sigmoid(pred).numpy()
                )

            # Stack the per-batch outputs and keep their single column.
            test_perd += np.vstack(test_pred_list)[:, 0]

test_perd /= tta_count * n_folds

In [9]:
# Build the submission file: one row per test recording with its 0/1 tag.
#
# Sort the FULL paths first and strip directory/extension afterwards. The
# recordings themselves were loaded in full-path order; sorting the bare
# names instead can give a different order (e.g. 'a-b.mat' < 'a.mat' but
# 'a' < 'a-b'), which would pair predictions with the wrong file names.
test_path = [
    os.path.splitext(os.path.basename(mat_path))[0]
    for mat_path in sorted(glob.glob('./val/*.mat'))
]

# Threshold the averaged probability at 0.5 to obtain the predicted tag.
test_answer = pd.DataFrame({
    'name': test_path,
    'tag': (test_perd > 0.5).astype(int)
})
# to_csv returns None when given a path, so keep the frame and write separately.
test_answer.to_csv('answer.csv', index=False)

!rm -rf  answer.csv.zip
!zip answer.csv.zip answer.csv

  adding: answer.csv (deflated 80%)
