In [None]:
# Computing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Graphical libraries
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from pylab import rcParams

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

## Build healthy/broken dataset

In [None]:
# Root directory; the CSV files are read from the dataset's 'stdev' sub-folder.
path = '/kaggle'
input_path = (
    f'{path}/input/gearbox-fault-diagnosis-elaborated-datasets'
    '/gearbox-fault-diagnosis-elaborated-datasets/stdev/'
)

# File names of the two CSV files loaded below (healthy and broken gearbox).
healthy_dataset = "healthy30hz_stdev_100.csv"
broken_dataset = "broken30hz_stdev_100.csv"

In [None]:
# Read the healthy and the broken gearbox files into separate frames.
healthyDataset = pd.read_csv(input_path + healthy_dataset)
brokenDataset = pd.read_csv(input_path + broken_dataset)

# Stack both frames row-wise into a single frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each source frame's own row labels are kept, so labels repeat and
# label-based selection on `dataset` becomes ambiguous.
dataset = pd.concat([healthyDataset, brokenDataset], axis=0, ignore_index=True)

# Summary statistics of the combined frame (displayed as the cell output).
dataset.describe()

# Exploring the dataset

In [None]:
# For each feature column, overlay the distribution of its values at load
# levels 0, 50 and 90. Rows of `dataset` are not split by failure state here.
hist_columns = ['a1', 'a2', 'a3', 'a4']
load_levels = [0, 50, 90]

# One 12x4 inch row per plotted column. The grid is sized to the number of
# columns actually drawn, so the figure carries no empty rows.
plt.figure(figsize=(12, 4 * len(hist_columns)))
gs = gridspec.GridSpec(len(hist_columns), 1)
for i, cn in enumerate(hist_columns):
    ax = plt.subplot(gs[i])
    for load_level in load_levels:
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # kept here so the cell still runs on the seaborn version this
        # notebook was written for. Drawing on `ax` explicitly avoids relying
        # on the implicit "current axes".
        sns.distplot(dataset.loc[dataset['load'] == load_level, cn], bins=50, ax=ax)
    ax.set_xlabel('')
    ax.legend([f'{load_level}%' for load_level in load_levels])
    # Only one title is set: the earlier ': healthy & broken' variant was
    # immediately overwritten and never displayed.
    ax.set_title('histogram for ' + str(cn))
plt.show()

## Correlation matrix

In [None]:
# Correlation heatmap over the combined frame (healthy and broken rows),
# including the failure flag itself.
rcParams['figure.figsize'] = (8, 6)
columns = ['failure', 'a1', 'a2', 'a3', 'a4', 'load']
corr_all = dataset.loc[:, columns].corr()
sns.heatmap(corr_all, cmap='RdYlGn', annot=True)
fig = plt.gcf()
plt.show()

In [None]:
# Correlation heatmap restricted to the healthy gearbox (rows with failure == 0).
columns = ['a1', 'a2', 'a3', 'a4', 'load']
dataset0 = dataset.loc[dataset['failure'] == 0]
corr_healthy = dataset0.loc[:, columns].corr()
sns.heatmap(corr_healthy, cmap='RdYlGn', annot=True)
fig = plt.gcf()
plt.show()

In [None]:
# Correlation heatmap restricted to the broken gearbox (rows with failure == 1).
columns = ['a1', 'a2', 'a3', 'a4', 'load']
dataset1 = dataset.loc[dataset['failure'] == 1]
corr_broken = dataset1.loc[:, columns].corr()
sns.heatmap(corr_broken, cmap='RdYlGn', annot=True)
fig = plt.gcf()
plt.show()

## Selected histograms

In [None]:
# Distribution of a1 at load levels 0, 50 and 90, with healthy and broken rows
# pooled together.
hist_columns = ['a1']
load_levels = [0, 50, 90]

# One 12x4 inch row per plotted column; the grid matches the number of plots
# so the figure is not padded with empty rows.
plt.figure(figsize=(12, 4 * len(hist_columns)))
gs = gridspec.GridSpec(len(hist_columns), 1)
for i, cn in enumerate(hist_columns):
    ax = plt.subplot(gs[i])
    for load_level in load_levels:
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # kept so the cell runs on the seaborn version the notebook targets.
        sns.distplot(dataset.loc[dataset['load'] == load_level, cn], bins=50, ax=ax)
    ax.set_xlabel('')
    ax.legend([f'{load_level}%' for load_level in load_levels])
    ax.set_title('histogram for ' + str(cn) + ': healthy & broken')
plt.show()

In [None]:
# Distribution of a1 split by both load level (50, 90) and failure flag, so
# the healthy and broken curves can be compared at the same load.
hist_columns = ['a1']
# (load level, failure flag) pairs, in the same order as the legend entries.
groups = [(50, 1), (50, 0), (90, 1), (90, 0)]

# One 12x4 inch row per plotted column; the grid matches the number of plots
# so the figure is not padded with empty rows.
plt.figure(figsize=(12, 4 * len(hist_columns)))
gs = gridspec.GridSpec(len(hist_columns), 1)
for i, cn in enumerate(hist_columns):
    ax = plt.subplot(gs[i])
    for load_level, failure_flag in groups:
        selected = (dataset['load'] == load_level) & (dataset['failure'] == failure_flag)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # kept so the cell runs on the seaborn version the notebook targets.
        sns.distplot(dataset.loc[selected, cn], bins=50, ax=ax)
    ax.set_xlabel('')
    ax.legend(['50%, broken', '50%, healthy', '90%, broken', '90%, healthy'])
    # Only one title is set: the earlier ': healthy & broken' variant was
    # immediately overwritten and never displayed.
    ax.set_title('histogram for ' + str(cn))
plt.show()

# Multinomial logistic regression

In [None]:
# Features: the four a1..a4 columns, taken from the healthy gearbox data only.
columns = ['a1', 'a2', 'a3', 'a4']
X = healthyDataset.loc[:, columns]

# Target: the load level, kept as a single-column DataFrame (not a Series).
y = healthyDataset.loc[:, ['load']]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Multinomial logistic regression classifier (not fitted yet).
model = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
)

## Evaluate the model with repeated stratified cross-validation

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Evaluation procedure: 10-fold stratified CV repeated 3 times (30 fits in total).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# `y` is a single-column DataFrame; flatten it to 1-D so scikit-learn receives
# the target in the shape it expects instead of converting a column vector
# (and warning about it) on every fit.
n_scores = cross_val_score(model, X, np.ravel(y), scoring='accuracy', cv=cv, n_jobs=-1)

# Report mean accuracy with its standard deviation across all folds.
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

## Training and prediction

In [None]:
# Train the model. `y_train` is a single-column DataFrame, so it is flattened
# to the 1-D target that `fit` expects.
model.fit(X_train, np.ravel(y_train))

# Predict the load for the test set.
y_pred = model.predict(X_test)
# Keep the probabilities of ALL classes (one column per entry of
# model.classes_). Slicing out column 1 alone would report only the second
# class's probability, which says nothing about the predicted class of a
# multi-class model.
y_prob = model.predict_proba(X_test)

# Print np arrays with 2 decimal places, without scientific notation
np.set_printoptions(suppress=True, precision=2)
print("Class order of the probability columns:", model.classes_)
print("Predicted probabilities test (first 10 rows): ")
print(y_prob[:10])
print("Prediction for the test data (first 10 rows): ", y_pred[:10])

# Actual load for the test set
print("Actual load for the test data (first 10 rows):", np.array(y_test)[:,0][:10] )

## Evaluate the model on the test set

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Imported locally so this cell brings its own classification metrics into scope.
from sklearn.metrics import accuracy_score, confusion_matrix

# The coefficients
print('Coefficients: \n', model.coef_)

# Classification metrics: the model predicts discrete load classes, so the
# share of exact hits and the per-class confusion counts are the primary
# measures of quality.
y_true = np.ravel(y_test)
print('Accuracy: %.3f' % accuracy_score(y_true, y_pred))
print('Confusion matrix (rows: actual, columns: predicted, order %s): \n'
      % model.classes_,
      confusion_matrix(y_true, y_pred, labels=model.classes_))

# Regression-style metrics on the numeric load values, kept from the original
# analysis. They treat the class labels as numbers, so they measure how far
# off a misclassification is rather than how often one occurs.
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))

In [None]:
# Plot outputs: take X axis as the best stdev predictor (a4, see correlation matrix)
plt.plot(X_test['a4'], y_test, 'o', color='black');
plt.plot(X_test['a4'], y_pred, 'x', color='blue', linewidth=1)
plt.xlabel('stdev of acceleration a4')
plt.ylabel('load level %')

plt.show()