In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import stats

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# Basic Exploratory Data Analysis (EDA)

In [None]:
# Load the merged bearing measurements; the first CSV column becomes the dataframe index
csv_path = "../input/nasa-bearing-dataset-dataset-for-set-no-2/merged_dataset_BearingTest_2.csv"
dataset = pd.read_csv(csv_path, index_col=0)

# Summary statistics of every column
dataset.describe()

In [None]:
# The dataframe index (CSV column 0) is the timestamp, i.e. the name of the file in the raw dataset.
# The last rows show the bearing failure (acceleration -> 0).
dataset.tail(5)

In [None]:
# Plot the raw (unscaled) acceleration signal of the four bearings.
# Every *_index array is read from its own series; the four series are columns
# of the same dataframe, so they all carry the same index values.
bearing1 = dataset['Bearing 1']
bearing1_index = bearing1.index.values
bearing2 = dataset['Bearing 2']
bearing2_index = bearing2.index.values
bearing3 = dataset['Bearing 3']
bearing3_index = bearing3.index.values
bearing4 = dataset['Bearing 4']
bearing4_index = bearing4.index.values

figure(figsize=(15, 4), dpi=80)

# One fixed color per bearing, identical to the normalized plots further down
# (bearing 3 -> blue, bearing 4 -> orange), so the figures can be compared directly.
bearing1.plot(color='green', label='Bearing 1')
bearing2.plot(color='yellow', label='Bearing 2')
bearing3.plot(color='blue', label='Bearing 3')
bearing4.plot(color='orange', label='Bearing 4')
plt.xlabel('Timestamp')
plt.ylabel('Acceleration')
plt.legend(loc="upper left")
plt.title('Time series for all 4 accelerometers', fontweight ="bold")
plt.show()

# Normalize the dataset

In [None]:
from sklearn import preprocessing

# Min-max scaling with the scaler's default settings: every column is rescaled
# independently so that its minimum maps to 0 and its maximum maps to 1.
scaler = preprocessing.MinMaxScaler()
scaled_values = scaler.fit_transform(dataset)

# Wrap the scaled array back into a dataframe with the original labels
dataset_scaled = pd.DataFrame(
    scaled_values,
    columns=dataset.columns,
    index=dataset.index,
)
dataset_scaled.describe()

# Exploratory Data Analysis (EDA) & features selection
## Extract each bearing's acceleration data

In [None]:
# Pull each bearing's normalized signal out of the scaled dataframe.
# Each *_index array comes from its own series (all four share the dataframe index).
bearing1 = dataset_scaled['Bearing 1']
bearing1_index = bearing1.index.values

bearing2 = dataset_scaled['Bearing 2']
bearing2_index = bearing2.index.values

bearing3 = dataset_scaled['Bearing 3']
bearing3_index = bearing3.index.values

bearing4 = dataset_scaled['Bearing 4']
bearing4_index = bearing4.index.values

#### Then plot the 4 signals (normalized) together

In [None]:
figure(figsize=(15, 4), dpi=80)

# (series, line color, legend label) for each normalized signal, in drawing order
normalized_signals = [
    (bearing1, 'green', 'Bearing 1'),
    (bearing2, 'yellow', 'Bearing 2'),
    (bearing3, 'blue', 'Bearing 3'),
    (bearing4, 'orange', 'Bearing 4'),
]
for signal, line_color, line_label in normalized_signals:
    signal.plot(color=line_color, label=line_label)

plt.xlabel('Timestamp')
plt.ylabel('Acceleration')
plt.legend(loc="upper left")
plt.title('Time series for all 4 accelerometers (normalized signals)', fontweight ="bold")
plt.show()

## Features selection
We're going to carry out a simplified analysis in only two dimensions, so we have to select 2 out of the 4 signals.
 - We'll take the 2 curves that have the most *distinctive* shapes. For this we assume that each non-selected signal has a shape similar to one of the selected ones
 - From the figure above we can select **Bearings 1 and 3**:
     - `Bearing 4` shows a similar shape to `Bearing 1`
     - `Bearing 2` lies somewhere between `Bearing 1` and `Bearing 3` (this would hold exactly if its signal could be obtained as a linear combination of curves 1 and 3)

Let's remove signals 2 and 4 and plot again.

In [None]:
figure(figsize=(15, 4), dpi=80)

# Only the two signals kept after feature selection, in drawing order
selected_signals = [
    (bearing1, 'green', 'Bearing 1'),
    (bearing3, 'blue', 'Bearing 3'),
]
for signal, line_color, line_label in selected_signals:
    signal.plot(color=line_color, label=line_label)

plt.xlabel('Timestamp')
plt.ylabel('Acceleration')
plt.legend(loc="upper left")
plt.title('Time series for accelerometers 1 and 3 (normalized signals)', fontweight ="bold")
plt.show()

**Bearing 3** shows overall greater values and a larger number of outliers, so we select it as the second dimension.

In [None]:
# Show the two selected signals against each other in the 2-dimensional space
sample_size = len(dataset)

# Series -> numpy arrays, limited to the first sample_size points
dim1_arr = np.array(bearing1)[:sample_size]
dim2_arr = np.array(bearing3)[:sample_size]

# 200 x 200 meshgrid spanning the unit square
grid_axis = np.linspace(0, 1, 200)
xx, yy = np.meshgrid(grid_axis, grid_axis)

# Scatter plot of bearing 1 (x axis) versus bearing 3 (y axis)
figure(figsize=(7, 6), dpi=80)
plt.scatter(dim1_arr, dim2_arr, marker='x')
plt.xlim((0,1))
plt.ylim((0,1))
plt.xlabel('Dimension X (bearing 1)')
plt.ylabel('Dimension Y (bearing 3)')
scatter_title = 'Scatter plot: '+str(sample_size)+' points from the beginning'
plt.title(scatter_title, fontweight ="bold")
plt.show()

Hence the **reduced dataset** consists of the columns `Bearing 1` and `Bearing 3` of the whole (normalized) dataset:

In [None]:
# Keep only the two selected features. An explicit copy makes dataset_reduced an
# independent dataframe, so modifying it later can neither touch dataset_scaled
# nor raise pandas' SettingWithCopyWarning.
dataset_reduced = dataset_scaled[['Bearing 1','Bearing 3']].copy()

# KNN algorithm for outlier identification

In [None]:
!pip install pyod

In [None]:
from pyod.models.knn import KNN 
from pyod.utils.data import get_outliers_inliers
import matplotlib.font_manager

In [None]:
# Setting the percentage of outliers -- ATTENTION: key parameter!
outlier_fraction = 0.25
# THIS IS ONE OF THE TUNABLE PARAMETERS. The larger it is, the more conservative you are.
# That is, the larger it is, the more anomalous points you will have outside your cluster.

# Increasing the outlier fraction yields more anomalous points (you are more conservative); lowering it yields fewer (you are less conservative). The choice of one value or another depends on how far in advance you need to know about the anomaly.

In [None]:
# Build the KNN detector with the expected share of outliers, then fit it
# on the two selected (normalized) features
knn_settings = {'contamination': outlier_fraction}
clf = KNN(**knn_settings)
clf.fit(dataset_reduced)

## Compute scores and threshold for labeling outliers

In [None]:
# Sign-flipped detector scores for every point of the reduced dataset.
# NOTE(review): PyOD documents higher raw scores as more abnormal, so after the
# sign flip the lowest values should be the most anomalous -- confirm.
scores_pred = -1 * clf.decision_function(dataset_reduced)

lowest_score = min(scores_pred)
highest_score = max(scores_pred)
print("Scores' predictions range from", "{:.4f}".format(lowest_score), "to {:.4f}".format(highest_score))

# Samples from the start, the middle and the end of the series
first_scores = scores_pred[:5]
middle_scores = scores_pred[850:855]
last_scores = scores_pred[-5:]
print("\nScores' predictions for first 5 points:       ", first_scores)
print("Scores' predictions for 5 intermediate points:", middle_scores)
print("Scores' predictions for  last 5 points:       ", last_scores)

In [None]:
# Score cut-off used to label a data point as outlier: the percentile of the
# (sign-flipped) scores that corresponds to the configured outlier fraction
percentile_rank = 100 * outlier_fraction
threshold = stats.scoreatpercentile(scores_pred, percentile_rank)

threshold_text = "{:.4f}".format(threshold)
print("Threshold value to label outliers is", threshold_text)

In [None]:
# The dataset is not labeled, so this prediction cannot be used to compute any error.
# This is unsupervised learning: we only look for the index from which the
# remaining points are labeled as outliers.
y_pred = clf.predict(dataset_reduced)

# Samples from the start, the middle and the end of the series
first_labels = y_pred[:5]
middle_labels = y_pred[850:855]
last_labels = y_pred[-5:]
print("Outliers prediction for first 5 points:       ", first_labels)
print("Outliers prediction for 5 intermediate points:", middle_labels)
print("Outliers predictions for last 5 points:       ", last_labels)

## Extract outliers

In [None]:
# y_pred decides which rows are outliers; the two groups are stored
# in separate numpy arrays
reduced_values = np.array(dataset_reduced)
X_outliers, X_inliers = get_outliers_inliers(reduced_values, y_pred)

# Size of each group
n_inliers, n_outliers = len(X_inliers), len(X_outliers)
print("There are", n_inliers, "inliers and", n_outliers, "outliers")

## Scatter plot of outliers

In [None]:
figure(figsize=(7, 6), dpi=80)

# All points of the reduced dataset (first column on x, second column on y)
all_x = dataset_reduced.iloc[:,0]
all_y = dataset_reduced.iloc[:,1]
plt.scatter(all_x, all_y, marker='x')

# Red, unfilled circles drawn around the points labeled as outliers
outlier_x = X_outliers[:,0]
outlier_y = X_outliers[:,1]
plt.scatter(outlier_x, outlier_y, marker="o", facecolor="none", edgecolor="r", s=70)

plt.xlim((0,1))
plt.ylim((0,1))
plt.xlabel('Dimension X (bearing 1)')
plt.ylabel('Dimension Y (bearing 3)')
plt.title('Scatter plot encircling the outliers', fontweight ="bold")
plt.show()

## Time series plot highlighting outliers

In [None]:
# Tag each point as inlier (0) or outlier (1) in a NEW dataframe.
# dataset_reduced itself is left untouched, so the cells that fit the model or
# predict on it still see only the two feature columns when they are re-run,
# and this cell gives the same result no matter how often it is executed.
dataset_tagged = dataset_reduced.assign(outlier=y_pred)

# Subset of data points with only outliers
dataset_outliers = dataset_tagged[dataset_tagged['outlier'] == 1]

print("There are", dataset_reduced.shape[0], "data points, of which", dataset_outliers.shape[0], "are outliers")

In [None]:
# First two columns of the full reduced dataset (bearing 1 -> X, bearing 3 -> Y)
dimensionX, dimensionY = (dataset_reduced.iloc[:, col] for col in (0, 1))

# The same two columns restricted to the rows labeled as outliers
dimensionX_outliers, dimensionY_outliers = (dataset_outliers.iloc[:, col] for col in (0, 1))

In [None]:
# Window and tick settings: only the last tail_size points are drawn
tail_size    = 900
x_ticks_span = 150

# Last tail_size timestamps and the matching slices of both signals
timestamps = dataset_reduced.index[-tail_size:]
dimX_arr = dimensionX[-tail_size:]
dimY_arr = dimensionY[-tail_size:]

figure(figsize=(15, 4), dpi=80)

# The drawing order below is kept on purpose: it fixes both the legend order
# and the order in which the x values are registered on the axis.
plt.plot(timestamps, dimX_arr, color='green', label='(bearing 1) acceleration')
plt.plot(dimensionX_outliers.index, dimensionX_outliers, 'v', color='red', label='(bearing 1) outliers')
plt.plot(timestamps, dimY_arr, color='blue', label='(bearing 3) acceleration')
plt.plot(dimensionY_outliers.index, dimensionY_outliers, 'v', color='orange', label='(bearing 3) outliers')

# Axis labels, limits and evenly spaced, rotated x ticks
plt.xlabel('Timestamp')
plt.xlim(0,tail_size)
tick_positions = np.arange(0, tail_size+1, x_ticks_span)
plt.xticks(tick_positions, fontsize=10, rotation = 45)
plt.ylabel('Acceleration')

plt.legend(loc="upper left")
plt.title('Time series for accelerometers 1 and 3 showing outliers', fontweight ="bold")
plt.show()

# 