In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

## Fault type identification
There are 10 classes, each linked to a bearing condition (nine defect types plus the healthy baseline):
*   **Ball_007_1**: Ball defect (0.007 inch)
*   **Ball_014_1**: Ball defect (0.014 inch)
*   **Ball_021_1**: Ball defect (0.021 inch)
*   **IR_007_1**:   Inner race fault (0.007 inch)
*   **IR_014_1**:   Inner race fault (0.014 inch)
*   **IR_021_1**:   Inner race fault (0.021 inch)
*   **Normal_1**:   Normal
* **OR_007_6_1**:   Outer race fault (0.007 inch, data collected from 6 O'clock position)
* **OR_014_6_1**:   Outer race fault (0.014 inch, 6 O'clock)
* **OR_021_6_1**:   Outer race fault (0.021 inch, 6 O'clock)

## Get the data
The file we will read is the result of preprocessing the raw data files (folder `/kaggle/input/cwru-bearing-datasets/raw/`).

Each time-series segment contains 2048 points. Given the 48 kHz sampling frequency, each segment covers about 0.043 seconds (2048 / 48000).

In [None]:
# Read the preprocessed time-domain feature table and preview its first rows
features_csv = "../input/cwru-bearing-datasets/feature_time_48k_2048_load_1.csv"
data_time = pd.read_csv(features_csv)
data_time.head()

## Encode fault types with numbers

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Fit an ordinal encoder on the `fault` label and store the resulting numeric
# code next to it in a new `fault_code` column.
ord_enc = OrdinalEncoder()
fault_labels = data_time[["fault"]]
data_time["fault_code"] = ord_enc.fit_transform(fault_labels)

# Show the text label side by side with its code
data_time[["fault", "fault_code"]]

In [None]:
# Distinct encoded labels of the bearing defects
# (attribute access `data_time.fault_code` would give the same column)
data_time['fault_code'].unique()

In [None]:
# Number of rows available for each (code, label) pair
label_columns = ['fault_code', 'fault']
data_time[label_columns].value_counts()

In [None]:
# Display the rows labelled with the 'Ball_007_1' defect
is_ball_007 = data_time['fault'] == "Ball_007_1"
data_time[is_ball_007]

# Exploratory data analysis (EDA)

In [None]:
# Drop the text label column `fault`; the ordinal `fault_code` column stays as
# the only label in the resulting frame.
# The column is named via `columns=` because the positional-axis form
# `drop('fault', 1)` was deprecated in pandas 1.3 and is rejected by pandas 2.x.
data_time_labeled = data_time.drop(columns='fault')

## Correlation matrix

In [None]:
# Default figure size (inches) used for the heatmap below
rcParams['figure.figsize'] = (12, 10)

# Annotated heatmap of the pairwise correlations between the columns
corr_matrix = data_time_labeled.corr()
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn')
fig = plt.gcf()  # handle to the current figure
plt.show()

## 1st task: Which faults are better classified with the accelerometer signal?


In [None]:
# Group the fault labels by defect location.
# Every group also includes the healthy baseline 'Normal_1' for comparison.
ball_defects = [
    'Normal_1',
    'Ball_007_1',
    'Ball_014_1',
    'Ball_021_1',
]
IR_defects = [
    'Normal_1',
    'IR_007_1',
    'IR_014_1',
    'IR_021_1',
]
OR_defects = [
    'Normal_1',
    'OR_007_6_1',
    'OR_014_6_1',
    'OR_021_6_1',
]

In [None]:
# Two smaller column selections, easier to read in a pair plot.
# Each keeps the `fault` label so that plots can be coloured by defect.
basic_columns = ['max', 'min', 'mean', 'sd', 'rms', 'fault']
advanced_columns = ['skewness', 'kurtosis', 'crest', 'form', 'fault']

features_basic_subset = data_time[basic_columns]
features_advanced_subset = data_time[advanced_columns]

In [None]:
# Work on the basic feature selection
defect_subset = features_basic_subset

# Keep, for each defect location, only the rows whose label is in that group
fault_label = defect_subset['fault']
ball_defects_data = defect_subset[fault_label.isin(ball_defects)]
IR_defects_data = defect_subset[fault_label.isin(IR_defects)]
OR_defects_data = defect_subset[fault_label.isin(OR_defects)]

# Row count per label in the ball-defect group
ball_defects_data['fault'].value_counts()

In [None]:
# Default figure size (inches).
# NOTE(review): pairplot appears to size its own figure from `height`/`aspect`,
# so this setting may only affect later plots — confirm.
rcParams['figure.figsize'] = (6, 5)

# Pairwise scatter plots of the inner-race group, coloured by fault label
sns.pairplot(data=IR_defects_data, hue='fault', palette='Dark2')

## 2nd task: Which features are more relevant for the fault identification?
Select a group of columns in **data_time** dataframe, including the defect label (column `fault`). Then explore how the selected features are able to discriminate the fault type. 

In [None]:
# Any combination of feature columns can be explored here, for example the
# predefined features_basic_subset or features_advanced_subset.
# This cell uses a custom selection which, according to the author, is the
# best set judging by the correlation matrix.
custom_columns = ['max', 'mean', 'kurtosis', 'crest', 'form', 'fault']
features_custom_subset = data_time[custom_columns]
defect_subset = features_custom_subset

In [None]:
# Keep, for each defect location, only the rows whose label is in that group
fault_label = defect_subset['fault']
ball_defects_data = defect_subset[fault_label.isin(ball_defects)]
IR_defects_data = defect_subset[fault_label.isin(IR_defects)]
OR_defects_data = defect_subset[fault_label.isin(OR_defects)]

In [None]:
# Default figure size (inches).
# NOTE(review): pairplot appears to size its own figure from `height`/`aspect`,
# so this setting may only affect later plots — confirm.
rcParams['figure.figsize'] = (8, 6)

# Pairwise scatter plots of the inner-race group, coloured by fault label
sns.pairplot(data=IR_defects_data, hue='fault', palette='Dark2')