In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode
# Initialise Plotly's offline notebook mode so Plotly figures can render inline.
# NOTE(review): no Plotly figure is created in the visible part of this notebook;
# confirm whether this call (and its import) is still needed.
init_notebook_mode()

## Train metadata

In [None]:
# Load the training metadata CSV into a DataFrame; the path is relative to the
# notebook's working directory.
train = pd.read_csv('../input/metadata_train.csv')

In [None]:
# Preview the first 10 rows of the training metadata.
train.head(10)

In [None]:
# (number of rows, number of columns) of the training metadata.
train.shape

In [None]:
# Column labels of the training metadata.
train.columns

In [None]:
# Print the dtype of each metadata column, one per line, in a fixed order.
for column_name in ('signal_id', 'id_measurement', 'phase', 'target'):
    print(train[column_name].dtype)

In [None]:
# Number of training rows for each distinct value of `phase`.
train.phase.value_counts()

Check for missing values

In [None]:
# One boolean per column: True if that column contains at least one missing value.
train.isna().any()

Check distribution of target

In [None]:
# Number of training rows for each distinct value of `target` (class balance).
train.target.value_counts()

In [None]:
# Bar chart of the number of training rows per distinct `target` value.
train['target'].value_counts().plot.bar()

Check how many ids have at least 1 fault

In [None]:
# Sum every column per id_measurement, keep the groups whose summed `target`
# is positive, and count them (i.e. ids with at least one row where target > 0,
# assuming target is never negative - TODO confirm).
train.groupby(["id_measurement"]).sum().query("target > 0").shape[0]

How many ids in total?

In [None]:
# Number of distinct id_measurement values in the training metadata.
train['id_measurement'].unique().shape[0]

In [None]:
# Summarise how many measurement ids have a positive summed `target`.
# Each quantity is computed once (the original repeated the same groupby/sum
# and the unique-id count inside the format call), and only the `target`
# column is aggregated since it is the only one the comparison needs.
target_sum_per_id = train.groupby("id_measurement")["target"].sum()
n_ids_with_fault = int((target_sum_per_id > 0).sum())
n_ids_total = train['id_measurement'].unique().shape[0]
print('{} out of {} ids contain a fault in at least one of three phases. This is {:.0f}%'.format(
      n_ids_with_fault,
      n_ids_total,
      (n_ids_with_fault*100)/n_ids_total))

In [None]:
# Bar chart: for each summed-`target` value per id_measurement, how many ids
# have it (presumably the number of faulty phases per id, assuming target is
# 0/1 - TODO confirm).
train.groupby(["id_measurement"]).sum()['target'].value_counts().plot.bar()

Looks like if a fault exists, all three lines usually have a fault simultaneously

In [None]:
# For each summed-`target` value per id_measurement, the number of ids that
# have it (presumably the number of faulty phases per id, assuming target is
# 0/1 - TODO confirm).
train.groupby(["id_measurement"]).sum()['target'].value_counts()

## Test metadata

In [None]:
# Load the test metadata CSV into a DataFrame; the path is relative to the
# notebook's working directory.
test = pd.read_csv('../input/metadata_test.csv')

In [None]:
# Preview the first rows of the test metadata (pandas' default is 5 rows).
test.head()

In [None]:
# (number of rows, number of columns) of the test metadata.
test.shape

In [None]:
# Number of test rows for each distinct value of `phase`.
test.phase.value_counts()

## Train signal data

In [None]:
# Load the training signals from Parquet into a DataFrame.
# NOTE(review): later cells index columns by string ('0', '1', ...), so this
# presumably holds one column per signal_id - confirm against the metadata.
train_sig = pd.read_parquet('../input/train.parquet')

In [None]:
# Preview the first rows of the signal table (pandas' default is 5 rows).
train_sig.head()

In [None]:
# (number of rows, number of columns) of the signal table.
train_sig.shape

Signals with no fault

In [None]:
# Overlay signal columns '0', '1' and '2' on a single 12x10 axis, using the
# DataFrame index as the x axis.
fig, ax = plt.subplots(figsize=(12, 10))
for i in range(3):
    # seaborn >= 0.12 accepts x/y as keywords only (positional use raises a
    # TypeError); the keyword form also works on older releases. Passing `ax`
    # draws on the axis created above instead of relying on the implicit
    # "current axes".
    sns.lineplot(x=train_sig.index, y=train_sig[str(i)], ax=ax)

Signals with faults

In [None]:
# Overlay signal columns '3', '4' and '5' on a single 12x10 axis, using the
# DataFrame index as the x axis.
fig, ax = plt.subplots(figsize=(12, 10))
for i in range(3, 6):
    # seaborn >= 0.12 accepts x/y as keywords only (positional use raises a
    # TypeError); the keyword form also works on older releases. Passing `ax`
    # draws on the axis created above instead of relying on the implicit
    # "current axes".
    sns.lineplot(x=train_sig.index, y=train_sig[str(i)], ax=ax)

Presumably the fault lies at the beginning of the signals in the second plot, but there is not a huge amount of difference between the two graphs. It looks like a significant amount of interference in the power lines in the first plot has led to noisy signals, much like:

![samples](https://storage.googleapis.com/kaggle-forum-message-attachments/445388/10942/samples.png)

From [this](https://www.kaggle.com/c/vsb-power-line-fault-detection/discussion/75771) discussion and other [EDA kernels posted](https://www.kaggle.com/go1dfish/basic-eda), it looks like this isn't an isolated case. It would be worth looking at these signals again after wavelet transformations and denoising as per Tomas' problem description thread.