In [None]:
# Computing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Graphical libraries
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from pylab import rcParams

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available dataset files are visible at the top of the notebook.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory and file name into one path
        print(os.path.join(dirname, filename))

## Build healthy/broken dataset

In [None]:
# Root of the Kaggle working environment
path = '/kaggle'

# Folder holding the stdev-aggregated gearbox CSV files (trailing slash included,
# because file names are appended to it by plain string concatenation)
input_path = "/".join([
    path,
    'input',
    'gearbox-fault-diagnosis-elaborated-datasets',
    'gearbox-fault-diagnosis-elaborated-datasets',
    'stdev',
    '',
])

# File names of the two recordings that are combined into one dataset
healthy_dataset = "healthy30hz_stdev_100.csv"
broken_dataset = "broken30hz_stdev_100.csv"

In [None]:
# Load the healthy and the broken gearbox recordings from the input folder
healthyDataset = pd.read_csv(input_path + healthy_dataset)
brokenDataset = pd.read_csv(input_path + broken_dataset)

# Stack both frames row-wise into a single dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it,
# each file keeps its own default row labels, so the same label appears twice
# and label-based lookups (.loc, index alignment) become ambiguous.
dataset = pd.concat([healthyDataset, brokenDataset], axis=0, ignore_index=True)

# Summary statistics of the combined dataset (displayed as the cell output)
dataset.describe()

# Evaluate classification

In [None]:
# Feature columns used as predictors
columns = ['a1', 'a2', 'a3', 'a4', 'load']
X = dataset.loc[:, columns]

# Target: the 'failure' column, kept as a single-column DataFrame
# (later cells index it as a 2-D array)
y = dataset.loc[:, ['failure']]

## Set up the logistic regression model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing (80% train); the fixed random_state
# makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Unfitted logistic regression classifier, allowed up to 1000 solver iterations
logis = LogisticRegression(max_iter=1000)

## Evaluate the model

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Evaluation procedure: stratified 10-fold cross-validation, repeated 3 times
# with a fixed seed so the folds are reproducible
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# y is a single-column DataFrame. scikit-learn classifiers expect a 1-D target
# and emit a DataConversionWarning on every fold fit when given a column
# vector, so flatten it once here. The values and the resulting folds are unchanged.
y_flat = np.ravel(y)

# Evaluate the model and collect one accuracy score per fold
# NOTE(review): this scores on the full X / y, including rows that the
# train/test split also places in the test set - confirm that is intended.
n_scores = cross_val_score(logis, X, y_flat, scoring='accuracy', cv=cv, n_jobs=-1)

# Report mean accuracy with its standard deviation across folds
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

## Training and prediction

In [None]:
# Train the model. y_train is a single-column DataFrame; flatten it to the 1-D
# shape that fit() expects, which avoids scikit-learn's DataConversionWarning.
# fit() returns the estimator itself, so logis_trained and logis are the same object.
logis_trained = logis.fit(X_train, np.ravel(y_train))

# Predicted class for every row of the held-out test set
y_pred = logis_trained.predict(X_test)
print("Prediction for the test data (first 10 rows):", y_pred[:10])
# np.ravel works whether y_test is a single-column frame or already 1-D
print("Actual gearbox condition  (first 10 rows):   ", np.ravel(y_test)[:10])

### Probability vs. predicted class

In [None]:
# Display NumPy arrays with 2 decimal places and without scientific notation
np.set_printoptions(precision=2, suppress=True)

In [None]:
# predict_proba returns one column per class; keep column 1
# (presumably the probability of failure == 1 - confirm against logis.classes_)
class_probabilities = logis.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
print("Probability for the test data (first 10 rows): \n", y_prob[:10])

In [None]:
# Limit the plot to the first 100 test points so the graph stays readable
n_points = 100
probability = y_prob[:n_points]
pred_class = np.array(y_pred)[:n_points]

# Scatter the predicted class against the predicted probability
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
ax.plot(
    probability,
    pred_class,
    'o',
    color='red',
    label="Predicted class as a function of probability",
)
ax.set_xlabel('Probability')
ax.set_ylabel('Predicted class (1: broken / 0: healthy) ')
ax.legend()
plt.show()

### Probability vs. actual class

In [None]:
# Limit the plot to the first 100 test points so the graph stays readable
n_points = 100
probability = y_prob[:n_points]
# y_test is a single-column frame: take column 0 of its array form
actual_class = y_test.to_numpy()[:n_points, 0]

# Scatter the actual class against the predicted probability
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
ax.plot(
    probability,
    actual_class,
    'o',
    color='blue',
    label="Actual state vs. probability",
)
ax.set_xlabel('Probability')
ax.set_ylabel('Actual class (1: broken / 0: healthy) ')
ax.legend()
plt.show()

### Probability vs. sensor data

In [None]:
# First four columns of the test features, in the order they were selected
# into X (a1..a4); only a1 is plotted in this cell
a1, a2, a3, a4 = (X_test.iloc[:, position] for position in range(4))

# Scatter the predicted probability against the a1 feature
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
ax.plot(a1, y_prob, 'o', color='red', label="Broken probability vs. a1")
ax.set_xlabel('stdev(a1)')
ax.set_ylabel('Probability of broken')
ax.legend()
plt.show()

# ROC curve

In [None]:
# The ROC curve needs more than the predicted class: it needs the continuous
# score that each prediction is based on, so the decision threshold can be swept.
# * With the default threshold of 0 on the decision score:
#  ** score > 0 => predicted class 1
#  ** score < 0 => predicted class 0
#  - The class predictions were computed earlier with .predict(X_test)
#  - Here the underlying scores are obtained with .decision_function(X_test)
y_pred_score = logis.decision_function(X_test)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, auc

# ROC points on the held-out test set: false/true positive rates and the
# decision-score thresholds at which they occur
fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=y_pred_score)

In [None]:
# Area under the ROC curve, integrated from the (fpr, tpr) points
lr_auc = auc(x=fpr, y=tpr)
print("AUC=", lr_auc)

In [None]:
# Figure size is set explicitly here rather than through rcParams
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)

# ROC curve of the classifier, with the AUC reported in the legend
ax.plot(
    fpr,
    tpr,
    color='red',
    linestyle='-',
    label="Logistic Regression (auc  = %0.3f)"%lr_auc,
)
# Diagonal reference line (TPR equal to FPR)
ax.plot([0, 1], [0, 1], color='blue', linestyle='--')

ax.set_xlabel('False Positive Rate (1-specificity)')
ax.set_ylabel('True Positive Rate (sensitivity)')
ax.legend()
plt.show()