# Development of fault detection procedure in welding using Machine Learning (ML). The fault detection procedure is based on Remaining Useful Life (RUL) prediction of welding tool and damage identification of welding process

Modules to be imported

In [103]:
# Import the libraries used in both parts of the notebook
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error as MSE
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Part 1 of the project - Remaining Useful Lifetime (RUL) prediction of the welding tool

#### Load the data into the IDE

In [None]:

# Locations of the training and test CSV files (placeholders to be filled in).
Train_data = 'loc of train data'
Test_data = 'Loc of test data'

# Both files are read with the same encoding and parsing options.
Train_data_DF = pd.read_csv(Train_data, encoding='cp1252', parse_dates=True)
Test_data_DF = pd.read_csv(Test_data, encoding='cp1252', parse_dates=True)

# Preview the first rows of each frame.
Train_data_DF.head()
Test_data_DF.head()

#### Inspect the shape of the data and check for any null values

In [None]:
# For the training set first and the test set second, report the dimensions,
# the column names and the number of missing values per column.
for frame in (Train_data_DF, Test_data_DF):
    print(frame.shape)
    print(list(frame.columns))
    print(frame.isna().sum())

#### Inspect the statistics to check for faulty sensors

In [None]:
# Summary statistics, transposed so that every row describes one column of
# the data set (training set first, then test set).
for frame in (Train_data_DF, Test_data_DF):
    print(frame.describe().transpose())

#### Outlier removal steps

In [None]:
def _drop_iqr_outliers(frame):
    """Return *frame* without the rows that fall outside the 1.5 * IQR fences.

    The columns are processed one after another. For each column the 25th and
    75th percentiles are computed on the rows that are still present, every
    row whose value lies below ``q25 - 1.5 * IQR`` or above
    ``q75 + 1.5 * IQR`` is removed, and rows containing a NaN in any column
    are removed as well. Rows dropped for one column therefore no longer
    contribute to the quartiles of the following columns.

    The input frame itself is not modified.
    """
    for column in list(frame.columns):
        q75, q25 = np.percentile(frame[column], [75, 25])
        iqr = q75 - q25
        # Named upper/lower on purpose: the builtins max() and min() must not
        # be rebound at notebook scope.
        upper = q75 + (1.5 * iqr)
        lower = q25 - (1.5 * iqr)
        # Filter with a boolean mask instead of writing NaN markers into the
        # data; comparisons against NaN are False, so rows with missing
        # values are left to dropna().
        is_outlier = (frame[column] < lower) | (frame[column] > upper)
        frame = frame.loc[~is_outlier].dropna(axis=0)
    return frame


Train_data_DF = _drop_iqr_outliers(Train_data_DF)
print(Train_data_DF.shape)
Test_data_DF = _drop_iqr_outliers(Test_data_DF)
print(Test_data_DF.shape)

#### Visualisation of each feature

In [None]:
# One stacked subplot per column of the training data, with the column name
# written on the right-hand side of its panel.
columns = list(Train_data_DF.columns)
plt.figure(figsize=(10, 20))
for position, column in enumerate(columns, start=1):
    plt.subplot(len(columns), 1, position)
    plt.plot(Train_data_DF[column])
    plt.title(column, y=0.5, loc='right')
plt.show()

#### Scaling the data / Data preparation

In [None]:
scaler = MinMaxScaler()
# Work on copies so the cleaned frames stay available unscaled.
Ttrain = Train_data_DF.copy()
Ttest = Test_data_DF.copy()

# Every column except the last one is scaled.
train_columns = list(Ttrain.columns[:-1])
test_columns = list(Ttest.columns[:-1])

# Fit the scaler on the training data only, then apply that same mapping to
# the test data. Re-fitting on the test set would put the two sets on
# different scales and leak test-set statistics into the evaluation.
# Assigning by column label replaces the columns instead of writing into them
# in place, so integer-typed columns can take the float result.
Ttrain[train_columns] = scaler.fit_transform(Ttrain[train_columns])
Ttest[test_columns] = scaler.transform(Ttest[test_columns])

#### Drop the metadata, label and features separation

In [None]:
# Columns that are not used as model inputs: the metadata plus the 'Cycle'
# column, which is kept separately as the label.
non_feature_columns = ['Elektrode Stant', 'Elektrode number', 'lower bound', 'upper bound', 'Cycle']

# Take the labels before the column is removed from the feature frames.
TrainLabel = Ttrain['Cycle']
TestLabel = Ttest['Cycle']

Ttrain = Ttrain.drop(columns=non_feature_columns)
Ttest = Ttest.drop(columns=non_feature_columns)


#### Modeling using XGBOOST

In [None]:
# Hyperparameters of the gradient-boosted regressor, collected in one place.
xgb_params = dict(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=1,
    colsample_bytree=1,
)
model = XGBRegressor(**xgb_params)
# Train on the scaled training features against the 'Cycle' labels.
model.fit(Ttrain, TrainLabel)


#### K fold cross validation to inspect the performance of the model

In [None]:
# 10-fold cross-validation repeated 3 times with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate the model on the training features against the training labels.
# scikit-learn scorers follow a "higher is better" convention, so the MSE
# scorer is named 'neg_mean_squared_error' and returns negated values.
scores = cross_val_score(model, Ttrain, TrainLabel, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# Convert the negated MSE of every fold into an RMSE and report the spread.
cv_rmse = np.sqrt(-scores)
print("Cross-validated RMSE: %.3f (+/- %.3f)" % (cv_rmse.mean(), cv_rmse.std()))


#### Predict the test accuracy

In [None]:
# Predict on the held-out test features and compare with the test labels.
pred = model.predict(Ttest)
test_mse = MSE(TestLabel, pred)
# Root of the mean squared error.
rmse = np.sqrt(test_mse)

## Part 2 of the project - Damage detection in the welding process

#### load the data

In [None]:
# The four columns of the data set that are analysed in this part.
feature_columns = ['RMS', 'COUN', 'ENER', 'DURATION']

a = pd.read_csv('Loc of the data')
features = a[feature_columns]

#### Heat Map is inspected to ensure diversity of the features

In [None]:
# Tick labels for both axes, in the column order of `features`.
X_label = ['RMS', 'COUN', 'ENER', 'DURATION']
Y_label = ['RMS', 'COUN', 'ENER', 'DURATION']
# Correlate the selected feature columns directly: the scaled frame `X` is
# only created in a later cell, so it is not available when this cell runs.
# Standardising the columns does not change their pairwise correlations.
sns.heatmap(features.corr(), xticklabels=X_label, yticklabels=Y_label)

#### Scale the data and use the k-nearest-neighbours distances to compute `distance_desc`. The plot of `distance_desc` gives an elbow-shaped curve from which a hyperparameter is derived

In [None]:
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# Standardise the selected features; X is the data used by the later cells.
X = pd.DataFrame(StandardScaler().fit_transform(features))

# Distances from every sample to its `neighbors` nearest neighbours.
# NOTE(review): the fitted data is passed back to kneighbors, so each row
# presumably includes the sample itself at distance 0 - confirm.
neighbors = 8
nbrs = NearestNeighbors(n_neighbors=neighbors).fit(X)
distances, indices = nbrs.kneighbors(X)

# Average neighbour distance per sample, sorted from largest to smallest.
# (Alternative considered in the original: the distance to the last
# neighbour only, i.e. distances[:, -1].)
mean_dis = [row.mean() for row in distances]
distance_desc = sorted(mean_dis, reverse=True)

plt.plot(distance_desc)
plt.ylabel('Distance')

#### The hyperparameters eps=0.53 (from the k-NN distance plot) and min_samples=8 are passed, together with the data, to the DBSCAN algorithm, which detects the outliers

In [None]:
db = DBSCAN(eps=0.53, min_samples=8).fit(X)
labels = db.labels_

# Boolean mask that is True exactly at the indices DBSCAN reports as core
# samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# The label -1 marks noise, so it is not counted as a cluster.
cluster_ids = set(labels)
cluster_ids.discard(-1)
n_clusters_ = len(cluster_ids)
n_noise_ = int(np.count_nonzero(labels == -1))

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

#### PCA is used to reduce the dimension of the data to 2D for the visualisation of the result

In [None]:
# Fit a PCA on the scaled data and project the data onto its components.
pca = PCA()
pca.fit(X)
data_pca = pca.transform(X)

# Name the components PC1, PC2, ... according to how many were actually
# produced, rather than assuming there are exactly four.
component_names = ['PC%d' % (k + 1) for k in range(data_pca.shape[1])]
data_pca = pd.DataFrame(data_pca, columns=component_names)

# Bar chart of the share of variance explained by each component. With a
# single series the legend carries no information, so it is switched off.
variance_ratio = pd.DataFrame(pca.explained_variance_ratio_, index=component_names)
variance_ratio.plot.bar(legend=False)
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance')

#### plot of the result

In [None]:
# Scatter the first two principal components, one group per value of the
# core-sample mask, so that each group gets its own legend entry. Passing a
# bare string to plt.legend() would be read as a sequence of one-character
# labels. Blue and red are the two ends of matplotlib's bwr colormap.
groups = (
    (False, 'blue', 'Non-core sample'),
    (True, 'red', 'Core sample'),
)
for is_core, colour, group_label in groups:
    selected = core_samples_mask == is_core
    plt.scatter(
        data_pca.loc[selected, 'PC1'],
        data_pca.loc[selected, 'PC2'],
        c=colour,
        s=20,
        label=group_label,
    )
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()