# VSB Power Grid Fault Detection

In [None]:
from IPython.display import Image

# Header illustration for the notebook, hosted on Wikimedia Commons.
HEADER_IMAGE_URL = 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e0/Three_Phase_Electric_Power_Transmission.jpg/1200px-Three_Phase_Electric_Power_Transmission.jpg'
Image(url=HEADER_IMAGE_URL)

Data Source: https://www.kaggle.com/c/vsb-power-line-fault-detection

Useful read: https://en.wikipedia.org/wiki/Three-phase_electric_power


### Importing Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow.parquet as pq #reading parquet files 
import seaborn as sns
from sklearn.manifold import TSNE

### Taking the first 2001 signals (due to computational limitations) for EDA and visualization

In [None]:
# Root folder that contains the competition input data.
INIT_DIR = '../input'
# Number of signals (parquet columns) to load. 2001 = 3 * 667, i.e. a whole
# number of three-phase measurements -- presumably chosen for that reason.
SIZE = 2001

In [None]:
# Resolve the competition folder once from INIT_DIR so the signal file and the
# metadata file are always read from the same place.
data_dir = os.path.join(INIT_DIR, 'vsb-power-line-fault-detection')

# Only the first SIZE columns of the parquet file are loaded (one column per signal).
train = pq.read_pandas(
    os.path.join(data_dir, 'train.parquet'),
    columns=[str(i) for i in range(SIZE)],
).to_pandas()
metadata = pd.read_csv(os.path.join(data_dir, 'metadata_train.csv'))

In [None]:
# Preview the first rows of the freshly loaded signal frame.
train.head()

In [None]:
# (rows, columns) of the loaded signal frame.
train.shape

In [None]:
# Preview the first rows of the metadata table.
metadata.head()

In [None]:
# (rows, columns) of the full metadata table.
metadata.shape

In [None]:
# Keep the first SIZE metadata rows (positional slice).
# NOTE(review): assumes these rows line up with the SIZE signal columns that
# were loaded -- confirm the metadata is ordered by signal_id.
train_metadata = metadata.iloc[:SIZE]

In [None]:
# (rows, columns) of the metadata subset kept for the loaded signals.
train_metadata.shape

#### Observation:

As each column represents one signal, it is more convenient to transpose the dataframe so that every signal becomes a row.

### Transposing the dataframe as each column represents one data point.

In [None]:
# Flip the frame so that every signal becomes a row.
# Caution: running this cell a second time flips it back.
train = train.transpose()

In [None]:
# Check the first two rows after the transpose.
train.head(2)

### Adding signal id to the main data frame

In [None]:
# Attach the signal ids by position: a plain Python list is used so that
# pandas does not try to align on the (differently labelled) metadata index.
train['signal_id'] = train_metadata['signal_id'].tolist()

In [None]:
# Check that the signal_id column was added.
train.head(2)

### Merging Metadata and Signal Data based on signal_id

In [None]:
# Join the per-signal metadata onto the signal rows using the shared signal_id key.
train = pd.merge(train, train_metadata, on='signal_id')

In [None]:
# Check the first two rows of the merged frame.
train.head(2)

### Checking for null values in the dataframe

In [None]:
# Total number of missing cells in the merged frame (per-column counts, then summed).
train.isna().sum().sum()

#### Observation: 

There are no null values.

### Plotting count vs target plots to check data imbalance

In [None]:
# Left panel: overall target counts. Right panel: the same counts split by phase.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
ax1, ax2 = axes
sns.countplot(data=train, x="target", ax=ax1)
sns.countplot(data=train, x="target", hue="phase", ax=ax2);

#### Observation:

The data is highly imbalanced: signals with target 1 are far fewer than signals with target 0.

### Percentage of positive and negative target values

In [None]:
# https://www.w3resource.com/graphics/matplotlib/piechart/matplotlib-piechart-exercise-2.php
# Note: this sets the default figure size for every later plot that does not pass its own.
plt.rcParams["figure.figsize"] = (40,6.5)
data = train['target'].value_counts()
# Build the labels from the counted values so every wedge is named after the
# class it actually represents, whatever order value_counts() returns.
labels = ['Target {}'.format(value) for value in data.index]
colors = ["#1f77b4", "#ff7f0e"]
# The title describes what is plotted: the split of signals by target.
title = 'Share of signals by target'
# Pull the second wedge slightly out of the pie.
explodes = [0, 0.1]
plt.pie(data,explode=explodes, labels=labels, colors=colors, shadow=True, startangle=20, autopct='%.1f%%')
plt.title(title, bbox={'facecolor':'0.8', 'pad':5})
plt.show()

In [None]:
# Count how many signals fall in each class and report the share of positives.
target_count = train.target.value_counts()
negatives, positives = target_count[0], target_count[1]
print("negative(target=0) target: {}".format(negatives))
print("positive(target=1) target: {}".format(positives))
print("positive data {:.3}%".format((positives/(negatives+positives))*100))

#### Observation:

Data is imbalanced and the faulty signals are only 6.3% of the total signals

### Checking whether the target differs between the phases of the same measurement

In [None]:
# Show which measurement and phase each loaded signal belongs to.
train[['id_measurement', 'phase']]

In [None]:
# Add up the targets of every measurement; with three 0/1 phases per
# measurement, any total other than 0 or 3 means the phases disagree.
target_sum = train.groupby("id_measurement")[["target"]].sum()
target_mismatch = target_sum[~target_sum["target"].isin([0, 3])]
print("Target values not all postive or negative for same signal: {}".format(len(target_mismatch)))
target_mismatch

#### Checking the target for id_measurement == 67, where the target value differs between phases

In [None]:
# Inspect the rows recorded for measurement 67.
train[train['id_measurement'] == 67]

#### Observation:

Target values can differ between the phases of the same measurement.

### Finding the Unique values of id_measurement in our dataset

In [None]:
# Number of distinct measurements among the loaded signals.
n_measurements = train['id_measurement'].nunique()
print("id_measurement have {} unique values".format(n_measurements))

#### Observation:

The number of unique id_measurement values is as expected: (total signals) / 3, since each measurement has three phases.

### Basic description of the id_measurement column 

In [None]:
# Summary statistics of how many rows each id_measurement has.
train.id_measurement.value_counts().describe()

#### Observation:

Values are all as expected.

The maximum and minimum count per measurement are both 3, as each measurement has three phases.

The count equals (total signals) / 3, because each measurement contributes three signals.

### Printing unique values of phase column

In [None]:
# Distinct phase labels present in the loaded subset.
phase_values = train.phase.unique()
print("phase have {} unique values {} in train".format(len(phase_values), phase_values))

In [None]:
# Pass the column by keyword: seaborn no longer treats a positional vector as
# `x`, and this matches the call style used by the target count plots.
sns.countplot(x='phase', data=train);

In [None]:
# https://www.w3resource.com/graphics/matplotlib/piechart/matplotlib-piechart-exercise-2.php
# Sort by phase value so the wedge order is stable, and derive each label from
# the value it counts instead of hard-coding the names.
data = train['phase'].value_counts().sort_index()
labels = ['Phase {}'.format(phase) for phase in data.index]
colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]
title = 'Count of signals distributed by phase'
plt.pie(data, labels=labels, colors=colors, shadow=True, startangle=90, autopct='%.1f%%')
plt.title(title, bbox={'facecolor':'0.8', 'pad':5})
plt.show()

#### Observation:
The phase column has only three values (0, 1 and 2), one for each of the three phases of a measurement.

## Plotting 2d plots using t-SNE using different values of perplexity and learning rate

#### Plotting the t-SNE plots only for the first 500 signals (about a quarter of the loaded data) due to computational limitations

#### 1. Using perplexity: 30 and learning rate: 200

In [None]:
# Embed only the recorded samples. The merged frame also carries signal_id,
# id_measurement, phase and the target itself; feeding those to t-SNE would
# leak the label into the embedding and mixes int/str column names.
tsne_subset = train.iloc[:500]
X_signal = tsne_subset.drop(columns=list(train_metadata.columns)).to_numpy()
y = tsne_subset['target'].to_numpy()

tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
X_embedding = tsne.fit_transform(X_signal)

# Keep the 2-D coordinates next to the target so the points can be coloured by class.
for_tsne = np.hstack((X_embedding, y.reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dimension_x','Dimension_y','Score'])
colors = {0:'red', 1:'blue', 2:'green'}
plt.scatter(for_tsne_df['Dimension_x'], for_tsne_df['Dimension_y'], c=for_tsne_df['Score'].apply(lambda x: colors[x]))
plt.title('t-SNE (perplexity=30, learning_rate=200)')
plt.xlabel('Dimension_x')
plt.ylabel('Dimension_y')
plt.show()

del(tsne)

#### 2. Using perplexity: 50 and learning rate:200

In [None]:
# Embed only the recorded samples. The merged frame also carries signal_id,
# id_measurement, phase and the target itself; feeding those to t-SNE would
# leak the label into the embedding and mixes int/str column names.
tsne_subset = train.iloc[:500]
X_signal = tsne_subset.drop(columns=list(train_metadata.columns)).to_numpy()
y = tsne_subset['target'].to_numpy()

tsne = TSNE(n_components=2, perplexity=50, learning_rate=200, random_state=42)
X_embedding = tsne.fit_transform(X_signal)

# Keep the 2-D coordinates next to the target so the points can be coloured by class.
for_tsne = np.hstack((X_embedding, y.reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dimension_x','Dimension_y','Score'])
colors = {0:'red', 1:'blue', 2:'green'}
plt.scatter(for_tsne_df['Dimension_x'], for_tsne_df['Dimension_y'], c=for_tsne_df['Score'].apply(lambda x: colors[x]))
plt.title('t-SNE (perplexity=50, learning_rate=200)')
plt.xlabel('Dimension_x')
plt.ylabel('Dimension_y')
plt.show()

del(tsne)

#### 3. Using Perplexity:100 and learning rate: 150

In [None]:
# Embed only the recorded samples. The merged frame also carries signal_id,
# id_measurement, phase and the target itself; feeding those to t-SNE would
# leak the label into the embedding and mixes int/str column names.
tsne_subset = train.iloc[:500]
X_signal = tsne_subset.drop(columns=list(train_metadata.columns)).to_numpy()
y = tsne_subset['target'].to_numpy()

tsne = TSNE(n_components=2, perplexity=100, learning_rate=150, random_state=42)
X_embedding = tsne.fit_transform(X_signal)

# Keep the 2-D coordinates next to the target so the points can be coloured by class.
for_tsne = np.hstack((X_embedding, y.reshape(-1,1)))
for_tsne_df = pd.DataFrame(data=for_tsne, columns=['Dimension_x','Dimension_y','Score'])
colors = {0:'red', 1:'blue', 2:'green'}
plt.scatter(for_tsne_df['Dimension_x'], for_tsne_df['Dimension_y'], c=for_tsne_df['Score'].apply(lambda x: colors[x]))
plt.title('t-SNE (perplexity=100, learning_rate=150)')
plt.xlabel('Dimension_x')
plt.ylabel('Dimension_y')
plt.show()

del(tsne)

### Observation:

The points are not well separated in two dimensions, as these t-SNE plots show.

## Plotting signals 

#### Plotting normal Signal

In [None]:
# Row 1 is used as the example of a normal signal (target 0).
train.loc[1, 'target']

In [None]:
# The merged row also holds the metadata columns (signal_id, id_measurement,
# phase, target); drop them so that only the recorded samples are drawn.
normal_signal = train.loc[1].drop(labels=list(train_metadata.columns))

plt.figure(figsize=(24, 8))
plt.plot(normal_signal.values, alpha=0.7)
plt.ylim([-100, 100])
plt.title('Normal signal (row 1)')
plt.xlabel('Sample index')
plt.ylabel('Signal value');

#### Plotting Faulty signal

In [None]:
# Row 201 is used as the example of a faulty signal (target 1).
train.loc[201, 'target']

In [None]:
# The merged row also holds the metadata columns (signal_id, id_measurement,
# phase, target); drop them so that only the recorded samples are drawn.
faulty_signal = train.loc[201].drop(labels=list(train_metadata.columns))

plt.figure(figsize=(24, 8))
plt.plot(faulty_signal.values, alpha=0.7)
plt.ylim([-100, 100])
plt.title('Faulty signal (row 201)')
plt.xlabel('Sample index')
plt.ylabel('Signal value');

#### Observation:

Faulty signal has more noise

### Plotting all three phases of a signal

#### Plotting all three phases of a normal signal

In [None]:
# Rows 0-2: target and measurement id of the example normal measurement.
train.loc[0:2, ['target', 'id_measurement']]

In [None]:
# Metadata columns (signal_id, id_measurement, phase, target) sit in the same
# row as the samples; they are removed before plotting.
meta_cols = list(train_metadata.columns)

plt.figure(figsize=(24, 8))
for row in (0, 1, 2):
    samples = train.loc[row].drop(labels=meta_cols).values
    plt.plot(samples, alpha=0.7, label='phase {}'.format(train.loc[row, 'phase']))
plt.ylim([-100, 100])
plt.title('All three phases of a normal measurement')
plt.xlabel('Sample index')
plt.ylabel('Signal value')
plt.legend();

#### Plotting all three phases of a faulty signal

In [None]:
# Rows 3-5: target and measurement id of the example faulty measurement.
train.loc[3:5, ['target', 'id_measurement']]

In [None]:
# Metadata columns (signal_id, id_measurement, phase, target) sit in the same
# row as the samples; they are removed before plotting.
meta_cols = list(train_metadata.columns)

plt.figure(figsize=(24, 8))
for row in (3, 4, 5):
    samples = train.loc[row].drop(labels=meta_cols).values
    plt.plot(samples, alpha=0.7, label='phase {}'.format(train.loc[row, 'phase']))
plt.ylim([-100, 100])
plt.title('All three phases of a faulty measurement')
plt.xlabel('Sample index')
plt.ylabel('Signal value')
plt.legend();

#### Observation:

The faulty signal has more noise than the normal signal.
Hence, noise can be a very useful feature for fault detection.

## Flatiron

#### reference: https://www.kaggle.com/miklgr500/flatiron

The idea of flatiron is similar to a high-pass filter: it subtracts a slowly moving baseline from the signal and lets the high-frequency components pass, which is useful for noise extraction.

In [None]:
def flatiron(x, alpha=50, beta=1):
    """Flatten a signal by subtracting a running baseline from every sample.

    The baseline starts at ``x[0]`` and is updated at each step as
    ``zero = zero*(alpha-beta)/alpha + beta*x[i]/alpha``; the output sample is
    ``x[i] - zero``. The slow trend is removed and fast changes are kept.

    Parameters
    ----------
    x : array-like
        Input signal. Integer input is accepted and is converted to float.
    alpha, beta : numbers
        Weights of the baseline update (see the formula above). ``alpha`` must
        be non-zero.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``x``. The first element is always
        0, and an empty input gives an empty output.
    """
    # Work in floating point: with an integer input, np.zeros_like would give
    # an integer output buffer and every filtered value would be truncated
    # (and a narrow integer baseline could overflow).
    x = np.asarray(x, dtype=float)
    new_x = np.zeros_like(x)
    if x.size == 0:
        return new_x
    zero = x[0]
    for i in range(1, len(x)):
        zero = zero*(alpha-beta)/alpha + beta*x[i]/alpha
        new_x[i] = x[i] - zero
    return new_x

#### Plotting normal signal with flattened normal signal

In [None]:
#Flattening a Normal signal
# Only the recorded samples are filtered; the metadata columns stored in the
# same row (signal_id, id_measurement, phase, target) are removed first.
meta_cols = list(train_metadata.columns)
normal_sample_filt = [
    flatiron(train.loc[row].drop(labels=meta_cols).values)
    for row in (0, 1, 2)
]

In [None]:
# Inspect the three filtered arrays of the normal measurement.
normal_sample_filt

In [None]:
#Code to plot normal signal with flattened normal signal
# Metadata columns stored in the same row are removed before plotting the raw signal.
meta_cols = list(train_metadata.columns)
f, ax = plt.subplots(1, 2, figsize=(24, 8))

# Left: the three raw phases of the normal measurement.
for row in (0, 1, 2):
    ax[0].plot(train.loc[row].drop(labels=meta_cols).values, alpha=0.7)
ax[0].set_title('Normal signal')
ax[0].set_ylim([-100, 100])

# Right: the same three phases after flatiron.
for filtered in normal_sample_filt:
    ax[1].plot(filtered, alpha=0.7)
ax[1].set_title('filtered Normal signal')
ax[1].set_ylim([-100, 100])

# Free the filtered arrays; re-run the flattening cell before running this one again.
del(normal_sample_filt)

#### Observation:

We are able to flatten the signal and are able to visualize the noise in the signal more easily

#### Plotting faulty signal with flattened faulty signal

In [None]:
#Flattening a Faulty signal
# Only the recorded samples are filtered; the metadata columns stored in the
# same row (signal_id, id_measurement, phase, target) are removed first.
meta_cols = list(train_metadata.columns)
fault_sample_filt = [
    flatiron(train.loc[row].drop(labels=meta_cols).values)
    for row in (3, 4, 5)
]

In [None]:
# Inspect the three filtered arrays of the faulty measurement.
fault_sample_filt

In [None]:
#Code to plot faulty signal with flattened faulty signal
# Metadata columns stored in the same row are removed before plotting the raw signal.
meta_cols = list(train_metadata.columns)
f, ax = plt.subplots(1, 2, figsize=(24, 8))

# Left: the three raw phases of the faulty measurement.
for row in (3, 4, 5):
    ax[0].plot(train.loc[row].drop(labels=meta_cols).values, alpha=0.7)
ax[0].set_title('fault signal')
ax[0].set_ylim([-100, 100])

# Right: the same three phases after flatiron.
for filtered in fault_sample_filt:
    ax[1].plot(filtered, alpha=0.7)
ax[1].set_title('filtered fault signal')
ax[1].set_ylim([-100, 100])

# Free the filtered arrays; re-run the flattening cell before running this one again.
del(fault_sample_filt)

#### Observation:

Faulty signal has more noise than normal signal