In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from pylab import rcParams

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every
# file found beneath it (directories themselves are not printed).
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Exploring the dataset

In [None]:
# Read the healthy-gearbox CSV (file name: healthy30hz_stdev_100.csv) into a
# DataFrame, then leave the frame as the last expression so the notebook
# renders it.
healthy_csv_path = "../input/gearbox-fault-diagnosis-elaborated-datasets/gearbox-fault-diagnosis-elaborated-datasets/stdev/healthy30hz_stdev_100.csv"
healthyDataset = pd.read_csv(healthy_csv_path)
healthyDataset

In [None]:
# Read the broken-gearbox CSV (file name: broken30hz_stdev_100.csv) into a
# DataFrame, then leave the frame as the last expression so the notebook
# renders it.
broken_csv_path = "../input/gearbox-fault-diagnosis-elaborated-datasets/gearbox-fault-diagnosis-elaborated-datasets/stdev/broken30hz_stdev_100.csv"
brokenDataset = pd.read_csv(broken_csv_path)
brokenDataset

## Join the healthy & broken datasets into a single DataFrame

In [None]:
# Stack the healthy and broken frames row-wise into one DataFrame.
# ignore_index=True renumbers the rows 0..n-1. Without it each source frame
# keeps its own row labels (both were read with the default index), so the
# combined frame would carry duplicated labels, which makes label-based
# selection and index alignment on `dataset` ambiguous or error-prone.
dataset = pd.concat([healthyDataset, brokenDataset], axis=0, ignore_index=True)

# Summary statistics of the numeric columns of the combined frame.
dataset.describe()

## Row count per load level (%)

In [None]:
# Apply the seaborn theme BEFORE the figure is created. The theme is stored in
# matplotlib's rcParams, which are consulted when figures/axes are built, so
# setting it afterwards made the first run of this cell look different from a
# re-run (grid style and tick-label scale only took effect the second time).
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

# Two side-by-side panels sharing the y-axis so the row counts are directly
# comparable between the two gearbox conditions.
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
fig.suptitle('%Load count of rows', fontsize=20)

# (panel title, source frame) for each panel, left to right.
panels = [
    ('Broken gearbox', brokenDataset),
    ('Healthy gearbox', healthyDataset),
]
for ax, (panel_title, frame) in zip(axes, panels):
    # Count the rows for each distinct value of the 'load' column.
    sns.countplot(ax=ax, x='load', data=frame)
    ax.set_title(panel_title, fontsize=15)
    ax.set_xlabel('Load %', fontsize=15)
    ax.set_ylabel('Count of rows', fontsize=15)

# countplot returns the Axes it drew on; keep the original names bound to the
# two panels in case other cells refer to them.
broken, healthy = axes

plt.show()

## Comparative histograms
### Overall (all loads)

In [None]:
# Columns whose distributions are compared between broken and healthy rows.
signal_columns = ['a1', 'a2', 'a3', 'a4']

# Size the figure and grid from the number of plotted columns (4 inches of
# height per panel). The grid was previously hard-coded to 28 rows, which
# produced a 112-inch-tall figure with 24 empty slots below the 4 real panels.
plt.figure(figsize=(15, 4 * len(signal_columns)))
gs = gridspec.GridSpec(len(signal_columns), 1)

for i, cn in enumerate(signal_columns):
    ax = plt.subplot(gs[i])
    # Overlay the two distributions on the same axes: rows with failure == 1
    # first, then rows with failure == 0 (matching the legend order below).
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases
    # (histplot/displot replace it); kept to avoid depending on a newer version.
    sns.distplot(dataset.loc[dataset['failure'] == 1, cn], bins=50, ax=ax)
    sns.distplot(dataset.loc[dataset['failure'] == 0, cn], bins=50, ax=ax)
    ax.set_xlabel('')
    ax.legend(['broken', 'healthy'])
    ax.set_title('histogram for ' + str(cn))
plt.show()

### Separated by load: 0%, 50%, 90%

In [None]:
# Columns to plot and the load levels (values of the 'load' column) compared
# within each panel.
signal_columns = ['a1', 'a2', 'a3', 'a4']
load_levels = [0, 50, 90]

# Size the figure and grid from the number of plotted columns (4 inches of
# height per panel). The grid was previously hard-coded to 28 rows, which
# produced a 112-inch-tall figure with 24 empty slots below the 4 real panels.
plt.figure(figsize=(15, 4 * len(signal_columns)))
gs = gridspec.GridSpec(len(signal_columns), 1)

for i, cn in enumerate(signal_columns):
    ax = plt.subplot(gs[i])
    # One overlaid distribution per load level, drawn in the same order as the
    # legend labels. `dataset` holds both healthy and broken rows, so each
    # curve pools the two conditions.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases
    # (histplot/displot replace it); kept to avoid depending on a newer version.
    for level in load_levels:
        sns.distplot(dataset.loc[dataset['load'] == level, cn], bins=50, ax=ax)
    ax.set_xlabel('')
    ax.legend([f'{level}%' for level in load_levels])
    # The title was previously set twice in a row, so the second call silently
    # discarded this more specific text; only one call is kept.
    ax.set_title('histogram for ' + str(cn) + ': healthy & broken')
plt.show()

## Correlation matrix

In [None]:
# Set the default matplotlib figure size (width, height in inches); this is a
# global setting, so it also applies to figures created after this cell.
rcParams['figure.figsize'] = (15, 8)

# Pairwise correlation between the selected columns, drawn as an annotated
# heatmap on a red-yellow-green colour scale.
columns = ['failure','a1', 'a2', 'a3', 'a4', 'load']
correlation_matrix = dataset[columns].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='RdYlGn',
)

# Keep a handle on the figure that the heatmap was drawn into.
fig = plt.gcf()
plt.show()