In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Fault type identification
There are 10 classes (nine fault types plus the normal condition), each identified by its bearing defect:

- **Ball_007_1**: Ball defect (0.007 inch)
- **Ball_014_1**: Ball defect (0.014 inch)
- **Ball_021_1**: Ball defect (0.021 inch)
- **IR_007_1**: Inner race fault (0.007 inch)
- **IR_014_1**: Inner race fault (0.014 inch)
- **IR_021_1**: Inner race fault (0.021 inch)
- **Normal_1**: Normal
- **OR_007_6_1**: Outer race fault (0.007 inch, data collected from 6 O'clock position)
- **OR_014_6_1**: Outer race fault (0.014 inch, 6 O'clock)
- **OR_021_6_1**: Outer race fault (0.021 inch, 6 O'clock)

## Get the data
The file we will read is the result of preprocessing the raw data files (folder `/kaggle/input/cwru-bearing-datasets/raw/`).

Each time series segment contains 2048 points. Given that the sampling frequency is 48 kHz, each segment covers about 0.04 seconds (2048 / 48,000 ≈ 0.043 s).

In [None]:
# Read the preprocessed feature table (CSV) into a DataFrame and display it
FEATURES_CSV = "../input/cwru-bearing-datasets/feature_time_48k_2048_load_1.csv"
data_time = pd.read_csv(FEATURES_CSV)
data_time

## Split into train and test datasets

In [None]:
# Hold out 750 rows as the test set. Stratifying on the 'fault' column keeps
# the class proportions the same in both subsets; the fixed random_state makes
# the split reproducible.
train_data, test_data = train_test_split(
    data_time,
    test_size=750,
    stratify=data_time['fault'],
    random_state=1234,
)
test_data['fault'].value_counts()

## Scale features in train set

In [None]:
# Standardize every feature column (all columns but the last one, which holds
# the label) to zero mean and unit standard deviation. The scaler is fitted on
# the training set only.
train_features = train_data.iloc[:, :-1]
scaler = StandardScaler().fit(train_features)
train_data_scaled = scaler.transform(train_features)
pd.DataFrame(train_data_scaled).describe()

In [None]:
# Apply the scaling learned on the training set to the test features (all
# columns but the last one, which holds the label).
# scaler.transform reuses the fitted mean_ and scale_; unlike dividing by
# np.sqrt(scaler.var_) by hand, it does not divide by zero when a feature has
# zero variance in the training set.
test_data_scaled = scaler.transform(test_data.iloc[:, :-1])
pd.DataFrame(test_data_scaled).describe()

## Train a model using Support Vector Classifier
Call the SVC() model from sklearn and fit the model to the training data.

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline classifier: SVC with default hyperparameters, fitted on the scaled
# training features. The bare name on the last line displays the fitted model.
svc_model = SVC().fit(train_data_scaled, train_data['fault'])
svc_model

## Model Evaluation
Now get predictions from the model and create a confusion matrix and a classification report.

In [None]:
# Predict the fault class for both the training and the test features
train_predictions, test_predictions = (
    svc_model.predict(features)
    for features in (train_data_scaled, test_data_scaled)
)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

Plot the confusion matrices.

In [None]:
# Confusion matrices (rows: true class, columns: predicted class)
train_confu_matrix = confusion_matrix(y_true=train_data['fault'], y_pred=train_predictions)
test_confu_matrix = confusion_matrix(y_true=test_data['fault'], y_pred=test_predictions)

In [None]:
# Class names used as tick labels. confusion_matrix orders its rows/columns by
# sorted label value, so the tick labels are sorted too (np.unique returns
# sorted values) instead of relying on the order of appearance in the CSV.
fault_type = np.unique(data_time['fault'])

fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(18, 8))

# fmt="d" prints the counts as plain integers on both panels; seaborn's
# default format (".2g") would show large counts in scientific notation.
sns.heatmap(train_confu_matrix, annot=True, fmt="d",
            xticklabels=fault_type, yticklabels=fault_type,
            cmap="Blues", cbar=False, ax=ax_train)
ax_train.set_title('Training Confusion Matrix')
ax_train.set_xlabel('Predicted')
ax_train.set_ylabel('True')

sns.heatmap(test_confu_matrix, annot=True, fmt="d",
            xticklabels=fault_type, yticklabels=fault_type,
            cmap="Blues", cbar=False, ax=ax_test)
ax_test.set_title('Test Confusion Matrix')
ax_test.set_xlabel('Predicted')
ax_test.set_ylabel('True')

plt.show()

In [None]:
# Per-class metrics of the default SVC on the test set
class_report = classification_report(test_data['fault'], test_predictions)
print(class_report)

- **recall** = for each fault type, the proportion of correctly identified samples out of all samples that truly belong to that type = `TP / (TP + sum(FN))`
- **precision** = for each fault type, the proportion of correctly identified samples out of all samples predicted as that type = `TP / (TP + sum(FP))`

Refer to [Understanding Data Science Classification Metrics in Scikit-Learn in Python](https://towardsdatascience.com/understanding-data-science-classification-metrics-in-scikit-learn-in-python-3bc336865019) for the explanation of these metrics

## Tuning hyperparameters for model optimization

We will check a grid of parameters to find the best combination. For each parameter combination, 10-fold cross-validation is performed.
- Understand what [10 fold cross-validation](https://machinelearningmastery.com/k-fold-cross-validation/) is

In [None]:
# Hyperparameter grid explored by the grid search (RBF kernel only)
c_values = [1, 10, 45, 47, 49, 50, 51, 55, 100, 300, 500]
gamma_values = [0.01, 0.05, 0.1, 0.5, 1, 5]
parameters = {
    "C": c_values,
    "gamma": gamma_values,
    "kernel": ["rbf"],
}

In [None]:
# Set up the grid search: every combination in `parameters` is evaluated with
# 10-fold cross-validation; n_jobs=-1 lets scikit-learn use all processors.
tuned_svm_clf = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    cv=10,
    n_jobs=-1,
)
tuned_svm_clf

In [None]:
# Train the model for every combination of hyperparameters in the grid
# and determine the best combination
tuned_svm_clf.fit(X=train_data_scaled, y=train_data['fault'])

In [None]:
# Hyperparameter combination selected by the grid search
best_params = tuned_svm_clf.best_params_
best_params

In [None]:
# Keep the estimator stored by the grid search as `best_estimator_`;
# the bare name on the last line displays it
best_clf = tuned_svm_clf.best_estimator_
best_clf

## Best model evaluation

In [None]:
# Predictions of the tuned model on the training and test features
train_predictions_best, test_predictions_best = (
    best_clf.predict(features)
    for features in (train_data_scaled, test_data_scaled)
)

In [None]:
# Confusion matrices of the tuned model (rows: true class, columns: predicted class)
train_confu_matrix_best = confusion_matrix(y_true=train_data['fault'], y_pred=train_predictions_best)
test_confu_matrix_best = confusion_matrix(y_true=test_data['fault'], y_pred=test_predictions_best)

In [None]:
# Tick labels in sorted order, which is the order confusion_matrix uses for
# its rows/columns (np.unique returns sorted values).
fault_labels_sorted = np.unique(data_time['fault'])

fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(18, 8))

# fmt="d" prints the counts as plain integers on both panels; seaborn's
# default format (".2g") would show large counts in scientific notation.
sns.heatmap(train_confu_matrix_best, annot=True, fmt="d",
            xticklabels=fault_labels_sorted, yticklabels=fault_labels_sorted,
            cmap="Blues", cbar=False, ax=ax_train)
ax_train.set_title('Training Confusion Matrix (best model)')
ax_train.set_xlabel('Predicted')
ax_train.set_ylabel('True')

sns.heatmap(test_confu_matrix_best, annot=True, fmt="d",
            xticklabels=fault_labels_sorted, yticklabels=fault_labels_sorted,
            cmap="Blues", cbar=False, ax=ax_test)
ax_test.set_title('Test Confusion Matrix (best model)')
ax_test.set_xlabel('Predicted')
ax_test.set_ylabel('True')

plt.show()

### Compare with non optimized versions

In [None]:
# Per-class metrics of the tuned SVC on the test set
class_report_best = classification_report(test_data['fault'], test_predictions_best)
print(class_report_best)

In [None]:
# For comparison, print again the test-set report of the non-optimized
# (default) SVC, stored earlier in `class_report`
print(class_report)

## Multinomial logistic regression
This is the alternative model for comparing with SVC performance

In [None]:
# Logistic regression classifier
from sklearn.linear_model import LogisticRegression

# Set up the model. The `multi_class` argument is deprecated since
# scikit-learn 1.5 and is therefore not passed: with the lbfgs solver a
# multinomial model is already fitted for multiclass targets.
logis_model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Train the model on the scaled training features
logis_model.fit(train_data_scaled, train_data['fault'])

In [None]:
# Logistic-regression predictions on the scaled test features
test_predictions_lr = logis_model.predict(X=test_data_scaled)

In [None]:
# Test-set confusion matrix of the logistic regression
# (rows: true class, columns: predicted class)
test_confu_matrix_lr = confusion_matrix(y_true=test_data['fault'], y_pred=test_predictions_lr)

In [None]:
# Per-class metrics of the logistic regression on the test set
class_report_lr = classification_report(test_data['fault'], test_predictions_lr)
print(class_report_lr)

In [None]:
# Logistic-regression predictions on the training and test features
train_predictions_logis, test_predictions_logis = (
    logis_model.predict(features)
    for features in (train_data_scaled, test_data_scaled)
)

In [None]:
# Per-class metrics of the logistic regression on the test set
class_report_logis = classification_report(test_data['fault'], test_predictions_logis)
print(class_report_logis)

In [None]:
# Tick labels in sorted order, which is the order confusion_matrix uses for
# its rows/columns (np.unique returns sorted values).
lr_fault_labels = np.unique(data_time['fault'])

fig, ax = plt.subplots(figsize=(8, 6))

# fmt="d" prints the counts as plain integers; seaborn's default format
# (".2g") would show large counts in scientific notation.
sns.heatmap(test_confu_matrix_lr, annot=True, fmt="d",
            xticklabels=lr_fault_labels, yticklabels=lr_fault_labels,
            cmap="Blues", cbar=False, ax=ax)
ax.set_title('Test Confusion Matrix (logistic regression)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')

plt.show()