This kernel uses 1D convolutions on signals from power lines to identify partial faults

In [None]:
import pandas as pd
import pyarrow.parquet as pq
import os

os.listdir('../input/vsb-power-line-fault-detection')

Read the parquet file. The full length of each signal is 800000. We will halve it to 400000 readings to create the pipeline.

In [None]:
# Load only the first 5000 signal columns (named "0" .. "4999") of the training
# parquet file; each column holds one signal.
subset_train = pq.read_pandas('../input/vsb-power-line-fault-detection/train.parquet',columns=[str(i) for i in range(5000)]).to_pandas()

In [None]:
# Keep rows 200000..599999 (400,000 readings per signal).
# NOTE(review): this rebinds `subset_train`, so the cell is not idempotent --
# running it a second time slices the already-sliced frame.
subset_train = subset_train.iloc[200000:600000,:]
subset_train.info()

Now read the metadata file.

In [None]:
subset_train.head()

In [None]:
metadata_train = pd.read_csv('../input/vsb-power-line-fault-detection/metadata_train.csv')
metadata_train.info()

In [None]:
metadata_train.head()

Import plotting libraries and create some basic plots.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Bar chart of how many signals fall in each target class.
# The column is passed as `x=` explicitly: recent seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=metadata_train['target'])

This is a plot of the target values. As expected, a faulty power line is a kind of rare event. Let's visualize some negative and positive (faulty) signals.

In [None]:
fig = plt.figure(figsize=(10,8))

# 4 x 3 grid: one row per signal column, showing (left to right) the raw trace,
# a box plot and a histogram of that signal.
for row, column in enumerate(['5', '1', '3', '4']):
    signal = subset_train[column]
    first_panel = 3 * row + 1   # subplot indices are 1-based and run row by row

    plt.subplot(4, 3, first_panel)
    plt.plot(signal)
    plt.subplot(4, 3, first_panel + 1)
    plt.boxplot(signal)
    plt.subplot(4, 3, first_panel + 2)
    plt.hist(signal)

In [None]:
fig = plt.figure(figsize=(22,10))

# 3 x 6 grid: signals '0'..'5' in order, three consecutive panels per signal
# (raw trace, box plot, histogram), i.e. two signals per grid row.
for position, column in enumerate(['0', '1', '2', '3', '4', '5']):
    signal = subset_train[column]
    first_panel = 3 * position + 1   # subplot indices are 1-based and run row by row

    plt.subplot(3, 6, first_panel)
    plt.plot(signal)
    plt.subplot(3, 6, first_panel + 1)
    plt.boxplot(signal)
    plt.subplot(3, 6, first_panel + 2)
    plt.hist(signal)

At least from these couple of plots, we notice that the faulty signals (last 2) have relatively more outliers than the non-faulty ones. We will analyze this further with more data.

Let's separate the positive and negative signals for further analysis. I'm going to reduce the sample sizes to make sure we stay within memory limits.

In [None]:
import numpy as np
# Temporarily shrink the data to build the pipeline: take every 8th reading of
# the first 25,000 rows, which leaves 3,125 readings per signal.
S_decimation = subset_train.iloc[:25000:8]

# One row per signal; the string column labels become an integer index so the
# frame can be joined onto the metadata by index.
small_subset_train = S_decimation.T
small_subset_train.index = small_subset_train.index.astype(np.int32)

# Right join: keep exactly the signals that were loaded.
train_dataset = metadata_train.join(small_subset_train, how='right')

# Uncomment the following to train on the full dataset
#subset_train = subset_train.transpose()
#subset_train.index = subset_train.index.astype(np.int32)
#train_dataset = metadata_train.join(subset_train, how='right')

In [None]:
# Rows labelled as faulty (target == 1), with the first three columns dropped.
is_faulty = train_dataset['target'] == 1
positive_samples = train_dataset.loc[is_faulty].iloc[:, 3:]
positive_samples.info()

In [None]:
positive_samples.head()

In [None]:
positive_samples.shape

Now let's visualize the positive (faulty) signals using a boxplot for several of them.

In [None]:
# Box plots of three faulty signals (positional rows 201, 150 and 110) in a
# 1 x 5 grid; only the first three panels are used.
# `iloc[k, 1:]` skips column 0, which is presumably the target label -- TODO confirm.
plt.figure(figsize=(10,4))
plt.subplot(151)
plt.boxplot(positive_samples.iloc[201,1:])
plt.subplot(152)
plt.boxplot(positive_samples.iloc[150,1:])
plt.subplot(153)
plt.boxplot(positive_samples.iloc[110,1:])



We see that the data values differ a lot. Let's normalize the data first, this will also be needed for training some type of models later.

In [None]:
# Normalize the data set
from sklearn.preprocessing import StandardScaler
# Column 0 is split off as the label, the remaining columns are the readings
# (presumably column 0 is `target` -- TODO confirm against the metadata columns).
y_train_pos = positive_samples.iloc[:, 0]
X_train_pos = positive_samples.iloc[:, 1:]
scaler = StandardScaler()         # standardizes features by removing the mean and scaling to unit variance
scaler.fit(X_train_pos.T)            # fit on the transpose so that each signal (row) gets its own mean/std
X_train_pos = scaler.transform(X_train_pos.T).T

Let's visualize the boxplots again using this normalized data.

In [None]:
X_train_pos

In [None]:
# Box plots of the first five standardized faulty signals, side by side (1 x 5 grid).
plt.figure(figsize=(10,4))
plt.subplot(151)
plt.boxplot(X_train_pos[0,:])
plt.subplot(152)
plt.boxplot(X_train_pos[1,:])
plt.subplot(153)
plt.boxplot(X_train_pos[2,:])
plt.subplot(154)
plt.boxplot(X_train_pos[3,:])
plt.subplot(155)
plt.boxplot(X_train_pos[4,:])

Again we notice that there are a lot of outliers in the positive (faulty) signals.

Now let's extract the negative (non-faulty) samples and visualize the same boxplots, and see if we can notice any apparent difference.

In [None]:
# Rows labelled as non-faulty (target == 0), with the first three columns dropped.
negative_samples = train_dataset[train_dataset['target']==0]
negative_samples = negative_samples.iloc[:,3:]

# Column 0 is split off as the label, the remaining columns are the readings.
y_train_neg = negative_samples.iloc[:, 0]
X_train_neg = negative_samples.iloc[:, 1:]
# Re-fit the same `scaler` object on the transposed negatives, so every signal
# (row) is standardized with its own mean/std; the earlier fit is overwritten.
scaler.fit(X_train_neg.T)
X_train_neg = scaler.transform(X_train_neg.T).T

# Box plots of the first five standardized non-faulty signals (1 x 5 grid).
plt.figure(figsize=(10,4))
plt.subplot(151)
plt.boxplot(X_train_neg[0,:])
plt.subplot(152)
plt.boxplot(X_train_neg[1,:])
plt.subplot(153)
plt.boxplot(X_train_neg[2,:])
plt.subplot(154)
plt.boxplot(X_train_neg[3,:])
plt.subplot(155)
plt.boxplot(X_train_neg[4,:])

In [None]:
negative_samples.head()

In [None]:
negative_samples.shape


In [None]:
positive_samples.shape

In [None]:
X_train_neg

The negative (non-faulty) signals have far fewer outliers, and their magnitudes also seem to be very low. The number of outliers looks like a promising feature.

Now let's create the train/validation split for training a Conv1D model.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of each class for validation. Both classes are split with the
# same options (fixed random_state, shuffling enabled).
split_options = dict(test_size=0.2, random_state=0, shuffle=True)

X_train_pos, X_valid_pos, y_train_pos, y_valid_pos = train_test_split(
    X_train_pos, y_train_pos, **split_options
)

X_train_neg, X_valid_neg, y_train_neg, y_valid_neg = train_test_split(
    X_train_neg, y_train_neg, **split_options
)

In [None]:
X_train_pos.shape, X_train_neg.shape

As we know, the positive samples are fewer, so we will only select a subset of negative samples for training.

In [None]:
# Combine positive and negative samples for training...
def combine_positive_and_negative_samples(pos_samples, neg_samples, y_pos, y_neg, seed=None):
    """Stack positive and negative samples with their labels and shuffle the rows.

    The function uses every row it is given; any class balancing (e.g. passing
    only a slice of the negatives) is the caller's responsibility.

    Parameters
    ----------
    pos_samples, neg_samples : 2-D array-like
        Feature rows of the positive / negative class (same number of columns).
    y_pos, y_neg : 1-D array-like
        Labels matching ``pos_samples`` / ``neg_samples`` row for row.
    seed : int or None, optional
        When given, rows are shuffled with ``np.random.default_rng(seed)`` so the
        result is reproducible. When ``None`` (default) the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_pos + n_neg, n_features + 1)``; the last column is
        the label, rows are in random order.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of feature rows.
    """
    X_combined = np.concatenate((pos_samples, neg_samples))
    y_combined = np.concatenate((y_pos, y_neg))
    if X_combined.shape[0] != y_combined.shape[0]:
        raise ValueError(
            "number of samples ({0}) and labels ({1}) differ".format(
                X_combined.shape[0], y_combined.shape[0]))

    # Append the labels as the last column so features and labels are shuffled together.
    combined_samples = np.hstack((X_combined, y_combined.reshape(-1, 1)))
    if seed is None:
        np.random.shuffle(combined_samples)
    else:
        np.random.default_rng(seed).shuffle(combined_samples)
    return combined_samples

# Only use 500 negative samples, to create a balanced dataset with the positive samples...
# (only the first 500 negative training rows are passed in; all positives are used).
train_samples = combine_positive_and_negative_samples(X_train_pos, X_train_neg[:500, :], y_train_pos, y_train_neg[:500])
# The helper appends the label as the last column: split it off again.
X_train = train_samples[:,:-1]
y_train = train_samples[:,-1]
X_train.shape, y_train.shape

In [None]:
train_samples.shape

In [None]:
# Create the validation set the same way as the training set: all held-out
# positives plus at most the first 500 held-out negatives, shuffled together.

validation_samples = combine_positive_and_negative_samples(X_valid_pos, X_valid_neg[:500,:], y_valid_pos, y_valid_neg[:500])
# The helper appends the label as the last column: split it off again.
X_valid = validation_samples[:,:-1]
y_valid = validation_samples[:,-1]
X_valid.shape, y_valid.shape

In [None]:
X_train.shape, X_valid.shape

A 1-D ConvNet would be an interesting model to try out on this signal. Earlier we saw that there are a lot of outliers in faulty signals. Because the underlying signal level varies over time, these outliers have to be judged relative to the local mean signal value. A 1-D ConvNet can analyze the signal in various windows of increasing lengths and create high-level features out of that to classify on.

In [None]:
# Reshape training and validation data to (N, 1, 3125, 1) for the PyTorch
# Conv2d-based model defined in this notebook: one input channel, 3125 readings
# along the height axis, width 1. 3125 is the number of readings left per signal
# after the decimation step (25000 rows, every 8th kept).
X_train = X_train.reshape(-1,1,3125, 1)
X_valid = X_valid.reshape(-1,1,3125, 1)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
# Cast features and labels to float32. The arrays are later wrapped with
# torch.from_numpy, which keeps the NumPy dtype, and the training loop feeds
# them to the model and loss without any further cast.
X_valid = X_valid.astype(np.float32)
y_valid = y_valid.astype(np.float32)
X_train = X_train.astype(np.float32)
y_train = y_train.astype(np.float32)
X_train.dtype

In [None]:
# Write the prepared arrays to .npy files in the working directory
# (presumably so later cells can be re-run without repeating the preprocessing).
np.save('X_valid.npy',X_valid)
np.save('y_valid.npy',y_valid)
np.save('X_train.npy',X_train)
np.save('y_train.npy',y_train)

In [None]:
import numpy as np
# Reload the arrays written by the previous cell; this overwrites the
# in-memory X_*/y_* variables with the on-disk copies.
X_valid = np.load('X_valid.npy')
y_valid = np.load('y_valid.npy')
X_train = np.load('X_train.npy')
y_train = np.load('y_train.npy')

In [None]:
np.mean(X_train, axis = 0), np.max(X_train, axis = 0), np.min(X_train, axis = 0), np.std(X_train, axis = 0)

In [None]:
np.sum(y_valid), np.sum(y_train)

In [None]:
def feature_normalize(data):
    """Standardize ``data`` along axis 0 to zero mean and unit variance.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis enumerates samples.

    Returns
    -------
    numpy.ndarray
        ``(data - mean) / std`` computed per position along axis 0, with the
        same shape as ``data``. Positions whose standard deviation is exactly
        zero are only centred (they become 0) instead of producing NaN/inf from
        a division by zero.
    """
    mu = np.mean(data,axis=0)
    std = np.std(data,axis=0)
    # Replace zero deviations by one; ones_like keeps the dtype of `std`, so
    # float32 input still yields float32 output.
    safe_std = np.where(std == 0, np.ones_like(std), std)
    return (data - mu)/safe_std

# Standardize both splits with statistics estimated on the training split only,
# so validation data goes through exactly the same transformation as training
# data instead of being scaled by its own mean/std.
train_mu = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)
# Zero-variance positions are only centred; ones_like keeps the float32 dtype.
train_std = np.where(train_std == 0, np.ones_like(train_std), train_std)
X_valid = (X_valid - train_mu) / train_std
X_train = (X_train - train_mu) / train_std

In [None]:
import torch
import torchvision
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
print(torch.__version__)

In [None]:
class torch_Dataset(Data.Dataset):
    """Map-style dataset over a pair of NumPy arrays (subclasses ``Data.Dataset``).

    ``x`` and ``y`` are converted with ``torch.from_numpy`` and stored as
    ``self.x`` / ``self.y``; item ``i`` is the tuple ``(x[i], y[i])``.
    """

    def __init__(self, x, y):
        self.x = torch.from_numpy(x)
        self.y = torch.from_numpy(y)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.y)

In [None]:
def training_loader(train_data, batch_size, shuffle):
    """Wrap ``train_data`` in a ``torch.utils.data.DataLoader``.

    ``batch_size`` and ``shuffle`` are forwarded unchanged; every other
    DataLoader option keeps its default.
    """
    loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=shuffle,
    )
    return loader


In [None]:
# Wrap the NumPy splits as datasets and build one loader per split.
# batch_size=1 yields one signal per step; validate() in this notebook reduces
# each batch with torch.max(outputs) and therefore relies on single-sample batches.
Train_dataset = torch_Dataset(X_train, y_train)
test_dataset = torch_Dataset(X_valid, y_valid)
train_loader = training_loader(Train_dataset, batch_size=1, shuffle=True)
test_loader = training_loader(test_dataset, batch_size=1, shuffle=True)  


In [None]:
class CNN(nn.Module):
    """Convolutional binary classifier for inputs of shape (N, 1, 3125, 1).

    All kernels, strides and pools have width 1, so the network only convolves
    along the height axis. Three conv -> batch-norm -> ReLU -> max-pool ->
    dropout stages are followed by a fourth conv + ReLU, a flatten to 1152
    features (64 channels x 18 positions for a 3125-long input) and three fully
    connected layers. The output is a sigmoid probability of shape (N, 1).
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 1 -> 32 channels
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(16, 1), stride=(2, 1), padding=0)
        self.bn1 = nn.BatchNorm2d(32)
        self.rl1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d((2, 1))
        self.do1 = nn.Dropout(0.2)

        # Stage 2: 32 -> 128 channels
        self.conv2 = nn.Conv2d(32, 128, kernel_size=(16, 1), stride=(2, 1), padding=0)
        self.bn2 = nn.BatchNorm2d(128)
        self.rl2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d((2, 1))
        self.do2 = nn.Dropout(0.2)

        # Stage 3: 128 -> 256 channels
        self.conv3 = nn.Conv2d(128, 256, kernel_size=(10, 1), stride=(2, 1), padding=0)
        self.bn3 = nn.BatchNorm2d(256)
        self.rl3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d((2, 1))
        self.do3 = nn.Dropout(0.2)

        # Final convolution: 256 -> 64 channels, no normalization or pooling
        self.conv6 = nn.Conv2d(256, 64, kernel_size=(10, 1), stride=(2, 1), padding=0)
        self.rl6 = nn.ReLU()

        # Classifier head
        self.fc1 = nn.Linear(1152, 512)
        self.rl7 = nn.ReLU()
        self.do7 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(512, 128)
        self.rl8 = nn.ReLU()
        self.do8 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` of shape (N, 1, 3125, 1)."""
        conv_stages = (
            (self.conv1, self.bn1, self.rl1, self.pool1, self.do1),
            (self.conv2, self.bn2, self.rl2, self.pool2, self.do2),
            (self.conv3, self.bn3, self.rl3, self.pool3, self.do3),
        )
        for conv, norm, activation, pool, dropout in conv_stages:
            x = dropout(pool(activation(norm(conv(x)))))

        x = self.rl6(self.conv6(x))
        # Flatten to the 1152 features expected by fc1.
        x = x.view(-1, 1152)

        x = self.do7(self.rl7(self.fc1(x)))
        x = self.do8(self.rl8(self.fc2(x)))
        return torch.sigmoid(self.fc3(x))


In [None]:
    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"training on {device} device.")

    # Model, optimizer and loss. BCELoss expects probabilities, which matches
    # the sigmoid applied at the end of CNN.forward.
    model = CNN().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    loss_fn = nn.BCELoss()

    print(model)

In [None]:
def training_loop(n_epochs, optimizer, model, loss_fn, train_loader, dev):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader``.

    Parameters
    ----------
    n_epochs : int
        Number of epochs to run.
    optimizer : torch.optim.Optimizer
        Optimizer bound to ``model``'s parameters.
    model : torch.nn.Module
        Network to train; it is switched to training mode.
    loss_fn : callable
        Loss taking ``(outputs, labels)``.
    train_loader : iterable with ``len``
        Yields ``(inputs, labels)`` batches.
    dev : torch.device
        Device the batches are moved to.

    Returns
    -------
    None
        The mean training loss is printed after epoch 1 and after every even epoch.
    """
    # Imported locally so the function does not depend on a later cell having
    # imported the module at notebook level.
    import datetime

    # Make sure dropout / batch-norm run in training mode.
    model.train()
    # Avoid a ZeroDivisionError in the progress message for an empty loader.
    n_batches = max(len(train_loader), 1)

    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            imgs = imgs.to(device=dev)
            labels = labels.to(device=dev)
            outputs = model(imgs)
            # Element-wise losses such as BCELoss need identically shaped
            # arguments, but the model emits (batch, 1) while the labels arrive
            # as (batch,). Align them only when the element counts agree, so
            # losses with class-index targets are left untouched.
            if labels.shape != outputs.shape and labels.numel() == outputs.numel():
                labels = labels.reshape(outputs.shape)
            optimizer.zero_grad()
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            loss_train += loss.item()
        if epoch == 1 or epoch % 2 == 0:
            print('{0} Epoch {1:3d}, Training loss {2:.6f}'.format(
                datetime.datetime.now(), epoch,
                loss_train / n_batches))
    return

In [None]:
    import warnings
    import datetime
    # NOTE(review): this silences every warning for the rest of the session,
    # including PyTorch's warning about mismatched input/target shapes in the
    # loss -- consider narrowing or removing the filter.
    warnings.filterwarnings('ignore') 
    # Train the model created above for 20 epochs on `train_loader`.
    training_loop(
        n_epochs = 20, #100,
        optimizer = optimizer,
        model = model,
        loss_fn = loss_fn,
        train_loader = train_loader,
        dev = device
    )

In [None]:

# Persist the trained weights first, and only then report that they were written.
torch.save(model.state_dict(), 'torch_chkp.pt')
print('Trained model written to ', 'torch_chkp.pt')
# Rebuild the architecture and restore the weights from disk. The fresh CNN()
# lives on the CPU, so the checkpoint tensors are mapped to the CPU as well.
loaded_model = CNN()  # <1>
loaded_model.load_state_dict(torch.load('torch_chkp.pt', map_location='cpu'))

In [None]:
def validate(model, train_loader, val_loader, dev, verbose=False):
    """Compute classification accuracy of ``model`` on two loaders.

    A sample is predicted positive when the model output exceeds 0.5. The
    comparison is done per sample, so batches of any size are handled.

    Parameters
    ----------
    model : torch.nn.Module
        Network producing one probability per sample.
    train_loader, val_loader : iterable
        Yield ``(inputs, labels)`` batches.
    dev : torch.device
        Device the batches are moved to.
    verbose : bool, optional
        When True, print the predictions and labels of every batch.
        Defaults to False to avoid flooding the notebook output.

    Returns
    -------
    dict
        Accuracy as a fraction in [0, 1], keyed by ``"train dataset"`` and
        ``"test dataset  "``. An empty loader yields 0.0.
    """
    accdict = {}
    # Evaluate with dropout disabled and batch-norm using its running
    # statistics; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        for name, loader in [("train dataset", train_loader), ("test dataset  ", val_loader)]:
            correct = 0
            total = 0
            with torch.no_grad():
                for imgs, labels in loader:
                    imgs = imgs.float()
                    imgs = imgs.to(device=dev)
                    labels = labels.to(device=dev).reshape(-1)
                    outputs = model(imgs).reshape(-1)

                    # Threshold every sample on its own (not the batch maximum).
                    predicted = (outputs > 0.5).to(labels.dtype)
                    total += labels.shape[0]
                    correct += int((predicted == labels).sum())
                    if verbose:
                        print("predict value:", predicted, "real value:", labels)

            accuracy = correct / total if total else 0.0
            print("Accuracy {0}: {1:.2f}(%)".format(name, 100 * accuracy))
            accdict[name] = accuracy
    finally:
        model.train(was_training)
    return accdict

In [None]:
def fit(model, train_loader, val_loader, dev):
    """Run gradient-free forward passes over both loaders and return an empty dict.

    Despite its name this function does not train anything: every batch is cast
    to float, moved to ``dev`` and pushed through ``model``; the maximum of the
    outputs is computed and discarded. No metric is accumulated, so the returned
    dictionary is always empty.
    """
    accdict = {}
    with torch.no_grad():
        for loader in (train_loader, val_loader):
            for imgs, labels in loader:
                batch = imgs.float().to(device=dev)
                labels = labels.to(device=dev)
                scores = model(batch)
                _ = torch.max(scores)  # <1> computed but unused
    return accdict

In [None]:
validate(model, train_loader, test_loader, device)

In [None]:
import time
import collections
# Restore the checkpoint into a fresh network and move it to the evaluation device.
loaded_model = CNN()  # <1>
loaded_model.load_state_dict(torch.load('torch_chkp.pt', map_location=device))
loaded_model = loaded_model.to(device)

all_acc_dict = collections.OrderedDict()
time_start = time.time()
# Evaluate the reloaded model (previously it was loaded but never used).
all_acc_dict["baseline"] = validate(loaded_model, train_loader, test_loader, device)
time_end = time.time()
timetotal = time_end - time_start
# validate() walks both loaders, so the throughput must count the samples of both.
runTotal = len(train_loader.dataset) + len(test_loader.dataset)
fps = float(runTotal / timetotal)
print("FPS=%.2f, total frames = %.0f , time=%.4f seconds" %(fps,runTotal, timetotal))