In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
import seaborn as sns
from pickle import load
from pickle import dump
def print_scores(y, y_pred, model):
    """Print accuracy, precision, recall and F1 score for a set of predictions.

    Precision, recall and F1 are computed with label ``1`` as the positive class.

    Args:
        y: True labels.
        y_pred: Predicted labels, scored against ``y``.
        model: Object printed after the scores so the output line identifies
            which model produced them.
    """
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, pos_label=1)
    recall = recall_score(y, y_pred, pos_label=1)
    f1 = f1_score(y, y_pred, pos_label=1)

    summary = 'Accuracy score: {:.02%}, Precision: {:.02%}, Recall: {:.02%}, F1 score: {:.02%} '.format(
        accuracy, precision, recall, f1
    )
    print(summary, model)
# Data understanding
# Importing the dataset
# Rows from NB.csv are labelled Fault = 1; this is the label treated as the
# positive/inlier class when the predictions are scored further down.
normal = pd.read_csv('Datasets/bearings/NB.csv').assign(Fault=1)
print('Size of normal data: ',normal.shape)

# Rows from "IR - 7.csv" are labelled Fault = -1, the value the detectors
# below are checked against when selecting predicted anomalies.
abnormal = pd.read_csv('Datasets/bearings/IR - 7.csv').assign(Fault=-1)
print('Size of abnormal data: ',abnormal.shape)
normal.head()
abnormal.head()
# combine normal and abnormal data and reset index for later use
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True yields the same fresh 0..N-1 index
# that append(...) followed by reset_index(drop=True) produced.
dataset = pd.concat([normal, abnormal], ignore_index=True)
dataset
# Re-slice both classes from the combined frame so their row indices refer to
# positions in `dataset` rather than to positions in the original CSV files.
normal = dataset[dataset.Fault == 1]
abnormal = dataset[dataset.Fault == -1]
## check null and duplicates
# Checking for null values
features = ['DE', 'FE', 'Fault']
# Per-column null counts, summed into a single total
N_null = dataset[features].isnull().sum().sum()
print("The dataset contains {} null values".format(N_null))

# Removing duplicates if there exist
# Count the repeated rows (every occurrence after the first) before dropping them
N_dupli = dataset.duplicated(keep='first').sum()
dataset = dataset.drop_duplicates(keep='first').reset_index(drop=True)
print("The dataset contains {} duplicates".format(N_dupli))

# Number of samples in the dataset
N = len(dataset)
print('Size of cleaned dataset: ', N)
6941 duplicate rows were removed.
## Basic statistics
# Print a concise summary of the cleaned dataset (columns, non-null counts, dtypes)
dataset.info()
All data types are numerical, so there is no need to encode any features.
# Descriptive statistics of the cleaned dataset
dataset.describe()
The statistics are visualized as plots later for better understanding.
# save table as csv to folder
# Build the descriptive-statistics table, then write it out with its row labels
summary_stats = dataset.describe()
summary_stats.to_csv(r'Statistics/anomaly-detection-bearing-statistics.csv', index=True)
## EDA
# Boxplot and histogram of each feature
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, column Series) pairs.
for columnName, columnData in dataset.items():

    # One figure per column: boxplot on the left, histogram on the right
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))

    # Boxplot
    ax1.boxplot(columnData)
    ax1.set_title('Boxplot for {}'.format(columnName))

    # Histogram
    ax2.hist(columnData, bins=20)
    ax2.set_title('Histogram for {}'.format(columnName))

    # Display
    plt.show()
def plot_feature(data, dataName):
    """Draw ``data`` as a line plot against its positional index on a new figure.

    Args:
        data: Values to plot; must expose ``.index`` (its length sets the
            x-range), e.g. a pandas Series.
        dataName: Text used as the y-axis label.
    """
    n_points = len(data.index)
    positions = list(range(n_points))

    # New wide figure for this feature; the following calls draw onto it
    plt.figure(figsize=(18, 6))
    plt.plot(positions, data)
    plt.xlabel('Index')
    plt.ylabel(dataName)
    plt.xlim((0, n_points))
# Line plot of every column against its row position.
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, column Series) pairs.
for columnName, columnData in dataset.items():
    plot_feature(columnData, columnName)
## Correlation matrix and heatmap
# Compute the pairwise column correlations once and reuse them below
corr_matrix = dataset.corr()
corr_matrix
# save table as csv to folder
corr_matrix.to_csv(r'Statistics/bearing-corr.csv', index=True)
# make correlation matrix to heatmap
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr_matrix, cmap='YlGnBu', vmax=.9, square=True, annot=True)
# Data preparation
### Feature scaling and Split data for novelty detection
# Fit the scaler on the normal samples only, then apply that same scaling to
# the whole (normal + abnormal) dataset.
# NOTE(review): `normal` was sliced from `dataset` before duplicates were
# dropped, so it may still contain rows no longer present in `dataset` —
# confirm this is intended.
sc = StandardScaler()
X_inliers = sc.fit_transform(normal[['DE', 'FE']])
X = sc.transform(dataset[['DE', 'FE']])
# Third column by position; presumably 'Fault' — verify against the CSV layout
y = dataset.iloc[:, 2]
X
y
X_inliers
# pickle data for later use
# `with` closes each file as soon as its dump finishes; the previous bare
# open() calls never closed their handles.
with open("pickle/dataset/bearing-anomaly-detection/X.pkl", "wb") as fh:
    dump(X, fh)
with open("pickle/dataset/bearing-anomaly-detection/y.pkl", "wb") as fh:
    dump(y, fh)
with open('pickle/dataset/bearing-anomaly-detection/X_inliers.pkl', 'wb') as fh:
    dump(X_inliers, fh)
### Feature scaling and Split data for outliers detection
# Unshuffled 80/20 split of each class. With shuffle=False the row order is
# preserved and random_state has no effect.
# NOTE: despite their names, X_train/X_test hold the normal rows and
# y_train/y_test hold the abnormal rows — they are not features/labels.
X_train, X_test = train_test_split(normal, test_size = 0.2, shuffle=False, random_state = 0)
y_train, y_test = train_test_split(abnormal, test_size = 0.2, shuffle=False, random_state = 0)
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index=True
# gives the same fresh 0..N-1 index as append + reset_index(drop=True).
train = pd.concat([X_train, y_train], ignore_index=True)
train
test = pd.concat([X_test, y_test], ignore_index=True)
test
# pickle data for later use
# `with` closes each file as soon as its dump finishes; the previous bare
# open() calls never closed their handles.
with open("pickle/dataset/bearing-anomaly-detection/train.pkl", "wb") as fh:
    dump(train, fh)
with open("pickle/dataset/bearing-anomaly-detection/test.pkl", "wb") as fh:
    dump(test, fh)
# Fit the scaler on the training rows only and reuse it for the test rows
sc = StandardScaler()
train_data = sc.fit_transform(train[['DE', 'FE']])
test_data = sc.transform(test[['DE', 'FE']])
with open("pickle/dataset/bearing-anomaly-detection/train_data.pkl", "wb") as fh:
    dump(train_data, fh)
with open("pickle/dataset/bearing-anomaly-detection/test_data.pkl", "wb") as fh:
    dump(test_data, fh)
# Novelty detection
Train on normal data only and test on a mix of normal and abnormal data => novelty detection.
## LOF, novelty detection
# load data
# `with` closes each pickle file after reading; the previous bare open() calls
# never closed their handles.
with open('pickle/dataset/bearing-anomaly-detection/X.pkl', 'rb') as fh:
    X = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/X_inliers.pkl', 'rb') as fh:
    X_inliers = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/y.pkl', 'rb') as fh:
    y = load(fh)
# novelty=True is what lets LocalOutlierFactor expose predict() for data other
# than the samples it was fitted on.
lof = LocalOutlierFactor(n_neighbors=50,novelty=True)
lof.fit(X_inliers)
dataset['pred_anomaly'] = lof.predict(X)
dataset
print_scores(y, dataset['pred_anomaly'], lof)
# Rows the model flagged as anomalies (predict() returned -1)
pred_anomalies = dataset[dataset['pred_anomaly'] == -1]
f, (ax1) = plt.subplots(figsize=(18, 6))
# Label the red markers 'pred_anomaly' (was 'DE', which duplicated the line's
# legend entry and disagreed with the plots of the other detectors).
ax1.scatter(pred_anomalies.index, pred_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax1.plot(dataset.index, dataset.DE, label='DE')
plt.xlim((0,len(dataset.index)))

plt.title('Local Outlier Factor')
plt.legend()
plt.show()
# Outliers detection
Outlier detection: train on data that contains both inliers and outliers.
## Isolation forest
# load data
# `with` closes each pickle file after reading; the previous bare open() calls
# never closed their handles.
with open('pickle/dataset/bearing-anomaly-detection/train.pkl', 'rb') as fh:
    train = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test.pkl', 'rb') as fh:
    test = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/train_data.pkl', 'rb') as fh:
    train_data = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test_data.pkl', 'rb') as fh:
    test_data = load(fh)
# Fixed seed so the fitted forest, and the scores reported from it, are
# reproducible across runs (matches the random_state=0 used elsewhere in this file).
isolationForest = IsolationForest(random_state=0)
isolationForest.fit(train_data)
train['pred_anomaly'] = isolationForest.predict(train_data)

# visualization
# Rows the model flagged as anomalies (predict() returned -1)
train_anomalies = train[train['pred_anomaly'] == -1]

f, (ax1) = plt.subplots(figsize=(18, 6))
ax1.scatter(train_anomalies.index, train_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax1.plot(train.index, train.DE, label='DE')
plt.xlim((0,len(train.index)))

plt.title('Isolation Forest')
plt.legend()
plt.show()
Print training scores
# Score the training-set predictions against the true Fault labels
print_scores(train['Fault'], train['pred_anomaly'], isolationForest)
# Predict on the test rows with the already-fitted forest
test['pred_anomaly'] = isolationForest.predict(test_data)

# visualization
# Rows flagged as anomalies (predict() returned -1)
test_anomalies = test.loc[test['pred_anomaly'] == -1]

f, ax1 = plt.subplots(figsize=(18, 6))
ax1.scatter(test_anomalies.index, test_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax1.plot(test.index, test.DE, label='DE')
ax1.set_xlim(0, len(test.index))

ax1.set_title('Isolation Forest')
ax1.legend()
plt.show()
# Score the test-set predictions
print_scores(test['Fault'], test['pred_anomaly'] , isolationForest)
## Elliptic Envelope
# load data
# `with` closes each pickle file after reading; the previous bare open() calls
# never closed their handles.
with open('pickle/dataset/bearing-anomaly-detection/train.pkl', 'rb') as fh:
    train = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test.pkl', 'rb') as fh:
    test = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/train_data.pkl', 'rb') as fh:
    train_data = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test_data.pkl', 'rb') as fh:
    test_data = load(fh)
# Elliptic Envelope
ee = EllipticEnvelope(random_state=0)
# Fit on the scaled training data and label those same rows in one step
train['pred_anomaly'] = ee.fit_predict(train_data)
# visualization
# Rows the model flagged as anomalies (prediction == -1)
train_anomalies = train[train['pred_anomaly'] == -1]

f, (ax1) = plt.subplots(figsize=(18, 6))
ax1.scatter(train_anomalies.index, train_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax1.plot(train.index, train.DE, label='DE')
plt.xlim((0,len(train.index)))

plt.title('Elliptic Envelope')
plt.legend()
plt.show()
### print training scores
print_scores(train['Fault'], train['pred_anomaly'], ee)
### Predict on testing data
test['pred_anomaly'] = ee.predict(test_data)

# visualization
test_anomalies = test[test['pred_anomaly'] == -1]

f, (ax1) = plt.subplots(figsize=(18, 6))
ax1.scatter(test_anomalies.index, test_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax1.plot(test.index, test.DE, label='DE')
plt.xlim(0, len(test.index))

plt.title('Elliptic Envelope')
plt.legend()
plt.show()
### Print the testing scores
print_scores(test['Fault'], test['pred_anomaly'], ee)
## One class SVM
# load data
# `with` closes each pickle file after reading; the previous bare open() calls
# never closed their handles.
with open('pickle/dataset/bearing-anomaly-detection/train.pkl', 'rb') as fh:
    train = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test.pkl', 'rb') as fh:
    test = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/train_data.pkl', 'rb') as fh:
    train_data = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test_data.pkl', 'rb') as fh:
    test_data = load(fh)
svm = OneClassSVM(nu = 0.01,kernel="rbf", gamma=0.01)
# fit() returns the estimator itself, not predictions, so its result is no
# longer bound to the misleading name `pred_train`.
svm.fit(train_data)
# Assign the prediction array directly, as the other detectors in this file
# do; wrapping it in pd.Series made the result depend on index alignment.
train['pred_anomaly'] = svm.predict(train_data)

# Rows the model flagged as anomalies (predict() returned -1)
train_anomalies = train[train['pred_anomaly'] == -1]

f, (ax3) = plt.subplots(figsize=(18, 6))
ax3.scatter(train_anomalies.index, train_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax3.plot(train.index, train.DE, label='DE')
plt.xlim((0,len(train.index)))

plt.title('One Class SVM')
plt.legend()
plt.show()
### Print training scores
print_scores(train['Fault'], train['pred_anomaly'], svm)
### Predict on test data
test['pred_anomaly'] = svm.predict(test_data)

test_anomalies = test[test['pred_anomaly'] == -1]

f, (ax3) = plt.subplots(figsize=(18, 6))
ax3.scatter(test_anomalies.index, test_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax3.plot(test.index, test.DE, label='DE')
plt.xlim(0, len(test.index))

plt.title('One Class SVM')
plt.legend()
plt.show()
print_scores(test['Fault'], test['pred_anomaly'], svm)
### Try default nu
# load data
# Read the pickles from the same relative directory they were written to
# earlier in this file. The previous absolute, user-specific paths pointed at
# a different folder and only worked on one machine.
# `with` also closes each file after reading.
with open('pickle/dataset/bearing-anomaly-detection/train.pkl', 'rb') as fh:
    train = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test.pkl', 'rb') as fh:
    test = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/train_data.pkl', 'rb') as fh:
    train_data = load(fh)
with open('pickle/dataset/bearing-anomaly-detection/test_data.pkl', 'rb') as fh:
    test_data = load(fh)
# Same kernel and gamma as the previous run, but nu is left at the library default
svm = OneClassSVM(kernel="rbf", gamma=0.01)
# fit() returns the estimator itself, not predictions, so its result is no
# longer bound to the misleading name `pred_train`.
svm.fit(train_data)
# Assign the prediction array directly, as the other detectors in this file
# do; wrapping it in pd.Series made the result depend on index alignment.
train['pred_anomaly'] = svm.predict(train_data)

# Rows the model flagged as anomalies (predict() returned -1)
train_anomalies = train[train['pred_anomaly'] == -1]

f, (ax3) = plt.subplots(figsize=(18, 6))
ax3.scatter(train_anomalies.index, train_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax3.plot(train.index, train.DE, label='DE')
plt.xlim((0,len(train.index)))

plt.title('One Class SVM')
plt.legend()
plt.show()
print_scores(train['Fault'], train['pred_anomaly'], svm)
test['pred_anomaly'] = svm.predict(test_data)

test_anomalies = test[test['pred_anomaly'] == -1]

f, (ax3) = plt.subplots(figsize=(18, 6))
ax3.scatter(test_anomalies.index, test_anomalies.DE, label='pred_anomaly', color='red', s=10)
ax3.plot(test.index, test.DE, label='DE')
plt.xlim(0, len(test.index))

plt.title('One Class SVM')
plt.legend()
plt.show()
print_scores(test['Fault'], test['pred_anomaly'], svm)