In [1]:
import pandas as pd
import numpy as np
import os
import shutil
import random
import torch
import torchvision
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
from sklearn.metrics import f1_score


In [2]:
class Data(Dataset):
    def __init__(self, X, Y = None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.Y = torch.tensor(Y, dtype=torch.float32)

    def __getitem__(self, index):

      return self.X[index] , self.Y[index]



    def __len__(self):
        return len(self.X) #same amount of data points

In [3]:
#load the files google colab mode
"""

from google.colab import drive
drive.mount('/content/drive/')
path_drive = r'drive/MyDrive/Fault Detection/'

!unzip drive/MyDrive/Fault_Detection/Data_diff

"""

Mounted at /content/drive/
Archive:  drive/MyDrive/Fault_Detection/Data_diff.zip
  inflating: Features.npy            
  inflating: Features_fine.npy       
  inflating: Features_val.npy        
  inflating: Target.npy              
  inflating: Target_fine.npy         


In [3]:
X_train = np.load(r'Features.npy')
Y_train = np.load(r'Target.npy')

X_fine = np.load(r'Features_fine.npy')
Y_fine = np.load(r'Target_fine.npy')

X_train = np.concatenate((X_train, X_fine, X_fine, X_fine, X_fine, X_fine, X_fine, X_fine, X_fine), axis = 0)
Y_train = np.concatenate((Y_train, Y_fine, Y_fine, Y_fine, Y_fine, Y_fine, Y_fine, Y_fine, Y_fine), axis = 0)

print(X_train.shape)
print(Y_train.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'Features.npy'

In [6]:
def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Calculate F1 score. Can work with gpu tensors
    source : - https://gist.github.com/EdisonLeeeee/803c2f91effa9f3fd4e1b3f4870d9842
    The original implmentation is written by Michal Haltuf on Kaggle.

    Returns
    -------
    torch.Tensor
        `ndim` == 1. 0 <= val <= 1

    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2

    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)


    tp = (y_true * y_pred).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum().to(torch.float32)

    epsilon = 1e-7

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)

    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    #f1.requires_grad = is_training
    return 1-f1

In [17]:
random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)

dataset = Data(X_train, Y_train)


learning_rate = 0.0001
batch_size = 64
amount_train = 0.8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

training_data_size = len(X_train)
train_size = int(amount_train*training_data_size)
test_size = training_data_size - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

data_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
data_test = DataLoader(test_dataset, batch_size=batch_size, shuffle =False)

In [8]:
class NN(nn.Module):
  def __init__(self):
      super(NN, self).__init__()

      self.lc1 = nn.Linear(40, 512)
      self.lc2 = nn.Linear(512, 1024)
      self.lc3 = nn.Linear(1024, 256)
      self.lc4 = nn.Linear(256, 1)

      self.dropout = nn.Dropout(p=0.5)

  def forward(self, X):

      X = X.view(-1,40)

      X = F.relu(self.lc1(X))
      X = self.dropout(X)

      X = F.relu(self.lc2(X))
      X = self.dropout(X)

      X = F.relu(self.lc3(X))
      X = self.dropout(X)

      X = torch.sigmoid(self.lc4(X))

      X = X.flatten()
      return X

In [18]:
import torch.optim.lr_scheduler as lr_scheduler
DNN = NN().to(device)

optimizer = torch.optim.Adam(DNN.parameters(), lr = learning_rate)
epochs = 50

scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.96)

In [19]:
def train(epochs, DNN, loss_fn, optimizer,
          data_train, data_test, device, scheduler):
    print('Starting training..')
    for e in range(0, epochs):
        print('='*20)
        print(f'Starting epoch {e + 1}/{epochs}')
        print('='*20)

        train_loss = 0.
        val_loss = 0.

        DNN.train()
        # set model to training phase

        for train_step, (X ,labels) in enumerate(data_train):

            X = X.to(device)

            labels = labels.to(device)
            # set the gradients to zero (you can do so by accessing the optimizer)
            optimizer.zero_grad()
            # compute outputs
            output = DNN(X).to(device)


            # compute loss  (you have defined the loss_fn above!)
            loss = f1_loss(output, labels, True)

            #  use backward() so that the whole graph is differentiated w.r.t. the loss
            loss.backward()
            # performs a parameter update based on the current gradient (Note: you need to do it through the optimizer)
            optimizer.step()

            train_loss += loss.item()

            if e % 2 == 0 and train_step % 8000 == 0 and train_step > 0:

                print('Evaluating at step', train_step)

                accuracy = 0

                f_1 = 0

                DNN.eval()          # set model to eval phase

                for val_step, (X ,labels) in enumerate(data_test):
                    X = X.to(device)
                    labels = labels.to(device)

                    outputs = DNN(X).to(device)
                             # compute outputs

                    loss = f1_loss(outputs, labels, True)
                    loss = loss.to(device)
                        # compute loss
                    val_loss += loss.item()

                    preds = (outputs > 0.5).float()
                    accuracy += sum((preds.cpu() == labels.cpu()).numpy())
                    f_1 += f1_score(labels.cpu().numpy(), preds.cpu().numpy())


                val_loss /= (val_step + 1)
                accuracy = accuracy/(val_step *64)

                f_1  /= (val_step + 1)
                print(f'F1 Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}, F1 Score: {f_1:.4f}')

                DNN.train()



        train_loss /= (train_step + 1)


        scheduler.step()

        print(f'Training Loss: {train_loss:.4f}')
    print('Training complete..')

In [20]:
train(epochs, DNN, loss_fn=None, optimizer=optimizer,
      data_train=data_train, data_test=data_test, device = device,
      scheduler = scheduler
      )

Starting training..
Starting epoch 1/50
Evaluating at step 8000
F1 Loss: 0.3168, Accuracy: 0.5700, F1 Score: 0.6834
Training Loss: 0.3306
Starting epoch 2/50
Training Loss: 0.3103
Starting epoch 3/50
Evaluating at step 8000
F1 Loss: 0.3030, Accuracy: 0.6134, F1 Score: 0.6969
Training Loss: 0.3069
Starting epoch 4/50
Training Loss: 0.3046
Starting epoch 5/50
Evaluating at step 8000
F1 Loss: 0.3005, Accuracy: 0.6239, F1 Score: 0.6995
Training Loss: 0.3039
Starting epoch 6/50
Training Loss: 0.3030
Starting epoch 7/50
Evaluating at step 8000
F1 Loss: 0.2992, Accuracy: 0.6279, F1 Score: 0.7010
Training Loss: 0.3024
Starting epoch 8/50
Training Loss: 0.3011
Starting epoch 9/50
Evaluating at step 8000
F1 Loss: 0.2999, Accuracy: 0.6176, F1 Score: 0.7002
Training Loss: 0.3006
Starting epoch 10/50
Training Loss: 0.3002
Starting epoch 11/50
Evaluating at step 8000
F1 Loss: 0.2979, Accuracy: 0.6311, F1 Score: 0.7021
Training Loss: 0.2998
Starting epoch 12/50
Training Loss: 0.2993
Starting epoch 13

In [21]:
#save models
#torch.save(DNN.state_dict(), r'drive/MyDrive/Fault_Detection/fault_dnn_dict_v9.pt')
#torch.save(DNN, r'drive/MyDrive/Fault_Detection/fault_dnn_v9.pt')

In [None]:
###DNN.load_state_dict(torch.load(r'drive/MyDrive/Fault_Detection/fault_dnn_dict.pt'))

<All keys matched successfully>

In [22]:


X_val = np.load('Features_val.npy')

Y_val = np.zeros(len(X_val))

data_val = Data(X_val, Y_val)

val_loader = DataLoader(data_val, batch_size=batch_size, shuffle =False)

DNN.eval()

target = np.array([])
for step, (X ,labels) in enumerate(val_loader):

    X = X.to(device)


    outputs = DNN(X).to(device)
                             # compute outputs
    preds = (outputs > 0.5).float()

    target = np.append(target, preds.to('cpu').numpy(), axis = 0)


sub_file = pd.read_csv(r'SampleSubmission.csv')


In [23]:
sub_file['data_rate_t+1_trend'] = target

sub_file.to_csv('sub_9.csv', index = False)