In [None]:
# import package
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from scipy.io import loadmat
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from pickle import dump
from pickle import load
# import data
# docker: /opt/nb/Desktop/...
# local: (copy file path and paste)
# NOTE: despite their names, `train` and `test` are not a train/test split;
# they are two separate recordings that become the two classes (Fault 0 / 1).
train = pd.read_csv('Datasets/bearings/NB.csv')
train['Fault'] = 0

test = pd.read_csv('Datasets/bearings/IR - 7.csv')
test['Fault'] = 1

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat with default arguments stacks the rows the same way and, like
# append, keeps each frame's original row index.
dataset = pd.concat([train, test])
dataset
## Data understanding
# Print a concise summary of the combined frame (columns, dtypes, non-null counts)
dataset.info()
### Basic statistics
# Descriptive statistics of the numeric columns
dataset.describe()
### EDA
# Boxplot and histogram of each feature
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, column Series) pairs.
for columnName, columnData in dataset.items():
    # One figure per column: boxplot on the left, histogram on the right
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))

    # Boxplot
    ax1.boxplot(columnData)
    ax1.set_title('Boxplot for {}'.format(columnName))

    # Histogram
    ax2.hist(columnData, bins=20)
    ax2.set_title('Histogram for {}'.format(columnName))

    # Display
    plt.show()
### Null and duplicated value check
# Count missing values over the three columns of interest
features = ['DE', 'FE', 'Fault']
nulls_per_column = dataset[features].isnull().sum()
N_null = sum(nulls_per_column)
print("The dataset contains {} null values".format(N_null))  # observed: 0 null values

# Flag rows that repeat an earlier row (the first occurrence is not flagged),
# then keep only the unflagged rows and renumber the index from 0
duplicate_mask = dataset.duplicated(keep='first')
N_dupli = sum(duplicate_mask)
dataset = dataset[~duplicate_mask].reset_index(drop=True)
print("The dataset contains {} duplicates".format(N_dupli))

# Number of samples left in the dataset
N = len(dataset)
6941 duplicate rows were dropped.
dataset.info()
### Correlation matrix and heatmap
# Pairwise correlation between the columns, computed once and reused below
corr_matrix = dataset.corr()
corr_matrix
# Draw the correlation matrix as an annotated heatmap on a 5x5 figure
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    corr_matrix,
    ax=ax,
    cmap='YlGnBu',
    vmax=.9,
    square=True,
    annot=True,
)
def plot_feature(data, dataName):
    """Draw ``data`` as a line plot against its positional index.

    A new 18x6 figure is created on every call. The figure is neither shown
    nor returned.

    data: values to plot; must expose an ``index`` attribute (the callers in
        this file pass DataFrame columns).
    dataName: text used as the y-axis label.
    """
    # x axis is simply 0..n-1, one position per row
    positions = list(range(len(data.index)))

    _, axes = plt.subplots(figsize=(18, 6))
    axes.plot(positions, data)
    axes.set_ylabel(dataName)
    axes.set_xlabel('Index')
# Line plot of every column. DataFrame.iteritems() was removed in pandas 2.0;
# items() is the drop-in replacement yielding (name, Series) pairs.
for columnName, columnData in dataset.items():
    plot_feature(columnData, columnName)
## Data preparation
### Feature engineering
### Split data
# Independent variables X: the first two columns (as a NumPy array);
# dependent variable y: the third column (kept as a pandas Series)
X = dataset.iloc[:, :2].values
y = dataset.iloc[:, 2]
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts
##### Save training data and testing data to file for later use
# Persist the unscaled split so later sections can reload exactly the same data.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically (the bare open() calls used before were never closed).
_split_dir = "pickle/dataset/bearing-failure-prediction"
for _split_name, _split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    with open("{}/{}.pkl".format(_split_dir, _split_name), "wb") as _split_file:
        dump(_split_data, _split_file)
#### Scaler
from sklearn.preprocessing import StandardScaler
# Standardisation: the scaler is fitted on the training split only and the
# fitted scaler is then applied to both splits
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)
from sklearn.preprocessing import MinMaxScaler
# Min-max scaling, again fitted on the training split only
mm = MinMaxScaler().fit(X_train)
X_train_mm = mm.transform(X_train)
X_test_mm = mm.transform(X_test)
#### Save scaled training and testing data
# Persist both scaled variants of the feature matrices.
# `with` guarantees every file is flushed and closed (the bare open() calls
# used before leaked their handles).
_scaled_dir = "pickle/dataset/bearing-failure-prediction"
for _scaled_name, _scaled_data in (
    ("X_train_sc", X_train_sc),
    ("X_test_sc", X_test_sc),
    ("X_train_mm", X_train_mm),
    ("X_test_mm", X_test_mm),
):
    with open("{}/{}.pkl".format(_scaled_dir, _scaled_name), "wb") as _scaled_file:
        dump(_scaled_data, _scaled_file)
## Modeling
def train_models(X_train, y_train):
    """Fit seven scikit-learn classifiers on the same training data.

    Fit order: logistic regression, k-nearest neighbours (5 neighbours,
    Minkowski metric with p=2), linear-kernel SVC, RBF-kernel SVC, Gaussian
    naive Bayes, an entropy decision tree and a 10-tree entropy random forest.
    Once every model is fitted, each model's score on the training data itself
    is printed.

    X_train: training features, passed unchanged to every ``fit`` call.
    y_train: training labels, passed unchanged to every ``fit`` call.

    Returns the fitted models as a 7-tuple in the fit order listed above.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # (report label, estimator) pairs, listed in fit / report / return order
    labelled_models = [
        ('[0]Logistic Regression Tranning Accurancy: ',
         LogisticRegression(random_state=0)),
        ('[1]KNeighbors Tranning Accurancy: ',
         KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
        ('[2]SVC (liner kernel) Tranning Accurancy: ',
         SVC(kernel='linear', random_state=0)),
        ('[3]SVC (RBF kernel) Tranning Accurancy: ',
         SVC(kernel='rbf', random_state=0)),
        ('[4]GaussianNB Tranning Accurancy: ',
         GaussianNB()),
        ('[5]Decision Tree Tranning Accurancy: ',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('[6]RandomForestClassifier Tranning Accurancy: ',
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ]

    # Fit everything first so the accuracy report is printed in one piece
    for _, estimator in labelled_models:
        estimator.fit(X_train, y_train)

    # Report the score of each model on the data it was trained on
    for label, estimator in labelled_models:
        print(label, estimator.score(X_train, y_train))

    return tuple(estimator for _, estimator in labelled_models)
### Try with standard scaled training data first
# load data
# Each pickle is read inside a `with` block so its file handle is closed
# immediately. Only unpickle files produced by this notebook: pickle can run
# arbitrary code while loading.
_data_dir = 'pickle/dataset/bearing-failure-prediction'
with open(_data_dir + '/X_train_sc.pkl', 'rb') as _fh:
    X_train_sc = load(_fh)
with open(_data_dir + '/X_test_sc.pkl', 'rb') as _fh:
    X_test_sc = load(_fh)
with open(_data_dir + '/y_train.pkl', 'rb') as _fh:
    y_train = load(_fh)
with open(_data_dir + '/y_test.pkl', 'rb') as _fh:
    y_test = load(_fh)
models_sc = train_models(X_train_sc, y_train)
for fitted_model in models_sc:
    # save model
    # The linear and RBF SVMs are both instances of `SVC`; naming the file by
    # class alone made the second one overwrite the first. Estimators that
    # expose a string `kernel` attribute therefore get it appended, so the two
    # SVMs are now stored as SVC_linear.pkl and SVC_rbf.pkl (no SVC.pkl).
    model_name = type(fitted_model).__name__
    kernel = getattr(fitted_model, 'kernel', None)
    if isinstance(kernel, str):
        model_name = '{}_{}'.format(model_name, kernel)
    fileName = 'pickle/models/bearing_classifiers/standard_scaled/' + model_name + '.pkl'
    # `with` closes the file even if pickling fails
    with open(fileName, 'wb') as model_file:
        dump(fitted_model, model_file)
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
def print_scores(y, y_pred, model):
    """Print accuracy, precision, recall and F1 score on one line.

    y: reference labels (passed as the first argument of every metric).
    y_pred: predicted labels (passed as the second argument of every metric).
    model: printed after the scores to identify which model they belong to.

    Precision, recall and F1 are computed with label 1 as the positive class.
    Nothing is returned.
    """
    template = 'Accuracy score: {:.02%}, Precision: {:.02%}, Recall: {:.02%}, F1 score: {:.02%} '
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, pos_label=1)
    recall = recall_score(y, y_pred, pos_label=1)
    f1 = f1_score(y, y_pred, pos_label=1)
    print(template.format(accuracy, precision, recall, f1), model)
# Evaluate every standard-scaled model on the test split:
# one confusion-matrix heatmap plus one score line per model
class_labels = ['Normal', 'Inner Race (0.007")']
for fitted_model in models_sc:
    y_pred = fitted_model.predict(X_test_sc)
    cm = confusion_matrix(y_test, y_pred)
    # Same labels on both axes of the matrix
    cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

    plt.figure(figsize=(10, 4))
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(cm_df, annot=True, fmt='g')
    plt.title('Confusion matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    print_scores(y_test, y_pred, fitted_model)
## Try with Min_Max_scaled data
# load data
# Each pickle is read inside a `with` block so its file handle is closed
# immediately. Only unpickle files produced by this notebook: pickle can run
# arbitrary code while loading.
_data_dir = 'pickle/dataset/bearing-failure-prediction'
with open(_data_dir + '/X_train_mm.pkl', 'rb') as _fh:
    X_train_mm = load(_fh)
with open(_data_dir + '/X_test_mm.pkl', 'rb') as _fh:
    X_test_mm = load(_fh)
with open(_data_dir + '/y_train.pkl', 'rb') as _fh:
    y_train = load(_fh)
with open(_data_dir + '/y_test.pkl', 'rb') as _fh:
    y_test = load(_fh)
min_max_scaled_models = train_models(X_train_mm, y_train)
for fitted_model in min_max_scaled_models:
    # save model
    # The name is derived from the model actually being saved (previously it
    # was read from the standard-scaled tuple, which only matched by position).
    # The linear and RBF SVMs are both instances of `SVC`, so estimators with a
    # string `kernel` attribute get it appended; otherwise the RBF model would
    # overwrite the linear one. They are now stored as SVC_linear.pkl and
    # SVC_rbf.pkl (no SVC.pkl).
    model_name = type(fitted_model).__name__
    kernel = getattr(fitted_model, 'kernel', None)
    if isinstance(kernel, str):
        model_name = '{}_{}'.format(model_name, kernel)
    fileName = 'pickle/models/bearing_classifiers/min_max_scaled/' + model_name + '.pkl'
    # `with` closes the file even if pickling fails
    with open(fileName, 'wb') as model_file:
        dump(fitted_model, model_file)
# Evaluate every min-max-scaled model on the test split:
# one confusion-matrix heatmap plus one score line per model
class_labels = ['Normal', 'Inner Race (0.007")']
for fitted_model in min_max_scaled_models:
    y_pred = fitted_model.predict(X_test_mm)
    cm = confusion_matrix(y_test, y_pred)
    # Same labels on both axes of the matrix
    cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

    plt.figure(figsize=(10, 4))
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(cm_df, annot=True, fmt='g')
    plt.title('Confusion matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    print_scores(y_test, y_pred, fitted_model)

### TPOT
# load data
# Each pickle is read inside a `with` block so its file handle is closed
# immediately. Only unpickle files produced by this notebook: pickle can run
# arbitrary code while loading.
_data_dir = 'pickle/dataset/bearing-failure-prediction'
with open(_data_dir + '/X_train.pkl', 'rb') as _fh:
    X_train = load(_fh)
with open(_data_dir + '/X_test.pkl', 'rb') as _fh:
    X_test = load(_fh)
with open(_data_dir + '/y_train.pkl', 'rb') as _fh:
    y_train = load(_fh)
with open(_data_dir + '/y_test.pkl', 'rb') as _fh:
    y_test = load(_fh)
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier

# Cross-validation scheme handed to TPOT: 10 stratified folds, repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Search configuration
tpot_settings = dict(
    generations=5,
    population_size=50,
    scoring='accuracy',
    verbosity=2,
    random_state=1,
    n_jobs=-1,
)
model = TPOTClassifier(cv=cv, **tpot_settings)
# Run the search on the unscaled training split
model.fit(X_train, y_train)
# Export the best pipeline twice: into the working directory and to an
# absolute, machine-specific location
for export_path in (
    'bearing_classification_tpot_best_model.py',
    '/Users/yi-chenlin/Desktop/Final project/TPOT/bearing_classifiers/bearing_classification_tpot_best_model.py',
):
    model.export(export_path)
# Score of the fitted search object on the training data
model.score(X_train, y_train)
The following matrix should be re-made since the scores are not correct.
# Evaluate the TPOT model on the test split
y_pred_tpot = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_tpot)
class_labels = ['Normal', 'Inner Race (0.007")']
# Same labels on both axes of the matrix
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)


plt.figure(figsize=(10, 4))
sns.set(font_scale=1.4)  # for label size
sns.heatmap(cm_df, annot=True, fmt='g')
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# print_scores expects (reference labels, predictions, model). The arguments
# used to be passed in the opposite order, which swapped the reported
# precision and recall.
print_scores(y_test, y_pred_tpot, model)
model.score(X_test, y_test)
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 table
cr = classification_report(y_test, y_pred_tpot, target_names=class_labels)
print(cr)
## Auto-sklearn
# print autosklearn version
import autosklearn
print('autosklearn: %s' % autosklearn.__version__)
# Reload the unscaled split from the absolute /opt/nb/... location.
# Each pickle is read inside a `with` block so its file handle is closed
# immediately. Only unpickle files produced by this notebook: pickle can run
# arbitrary code while loading.
_data_dir = '/opt/nb/Desktop/Predcitive Maintenance with Machine Learning_Yi-Chen Lin/pickle/dataset/bearing-failure-prediction'
with open(_data_dir + '/X_train.pkl', 'rb') as _fh:
    X_train = load(_fh)
with open(_data_dir + '/y_train.pkl', 'rb') as _fh:
    y_train = load(_fh)
with open(_data_dir + '/X_test.pkl', 'rb') as _fh:
    X_test = load(_fh)
with open(_data_dir + '/y_test.pkl', 'rb') as _fh:
    y_test = load(_fh)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from autosklearn.classification import AutoSklearnClassifier

# define search: 2 minutes in total, at most 30 seconds per candidate run
model = AutoSklearnClassifier(time_left_for_this_task=2*60, per_run_time_limit=30, n_jobs=8)
# perform the search
model.fit(X_train, y_train)
# summarize
print(model.sprint_statistics())
# evaluate best model on the test split
y_hat = model.predict(X_test)
acc = accuracy_score(y_test, y_hat)
print("Accuracy: %.3f" % acc)
# Persist the fitted search object; `with` makes sure the file is flushed and
# closed (the bare open() call used before was never closed).
with open('/opt/nb/Desktop/Predcitive Maintenance with Machine Learning_Yi-Chen Lin/pickle/models/bearing_classifiers/auto_sklearn.pkl', 'wb') as model_file:
    dump(model, model_file)
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
def print_scores(y, y_pred, model):
    """Print accuracy, precision, recall and F1 score on one line.

    y: reference labels (passed as the first argument of every metric).
    y_pred: predicted labels (passed as the second argument of every metric).
    model: printed after the scores to identify which model they belong to.

    Precision, recall and F1 are computed with label 1 as the positive class.
    Nothing is returned.
    """
    template = 'Accuracy score: {:.02%}, Precision: {:.02%}, Recall: {:.02%}, F1 score: {:.02%} '
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, pos_label=1)
    recall = recall_score(y, y_pred, pos_label=1)
    f1 = f1_score(y, y_pred, pos_label=1)
    print(template.format(accuracy, precision, recall, f1), model)
# Confusion matrix and score line for the auto-sklearn predictions
cm = confusion_matrix(y_test, y_hat)
class_labels = ['Normal', 'Inner Race (0.007")']
# Same labels on both axes of the matrix
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)


plt.figure(figsize=(10, 4))
sns.set(font_scale=1.4)  # for label size
sns.heatmap(cm_df, annot=True, fmt='g')
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

print_scores(y_test, y_hat, model)