# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

# Load the two recordings and tag every row with its class label.
# NOTE: despite the conventional names, `X` and `y` are both data frames of
# samples here: `X` holds the rows of 'NB.csv' (Fault = 0) and `y` the rows
# of 'IR - 7.csv' (Fault = 1).
X = pd.read_csv('../Datasets/NB.csv')
X['Fault'] = 0

y = pd.read_csv('../Datasets/IR - 7.csv')
y['Fault'] = 1

# Split each recording separately and without shuffling, so the first 80 % of
# the rows of each file go to training and the last 20 % to testing.
# random_state is omitted because it has no effect when shuffle=False.
X_train, X_test = train_test_split(X, test_size=0.2, shuffle=False)
y_train, y_test = train_test_split(y, test_size=0.2, shuffle=False)

# DataFrame.append was removed in pandas 2.0. pd.concat with
# ignore_index=True stacks the frames and renumbers the rows 0..n-1, which is
# what append followed by reset_index(drop=True) used to do.
train = pd.concat([X_train, y_train], ignore_index=True)
test = pd.concat([X_test, y_test], ignore_index=True)

# Elbow analysis: fit one KMeans model per candidate cluster count and plot
# the score each model achieves on the training data.
# NOTE(review): the feature set includes the 'Fault' label column, so the
# clustering sees the class label directly -- confirm this is intended.
train_data = train[['DE', 'FE', 'Fault']]
n_cluster = range(1, 20)

# A fixed random_state makes the fitted models (and every result derived from
# them) reproducible between runs. n_init is spelled out because its default
# value differs between scikit-learn releases.
kmeans = [
    KMeans(n_clusters=k, n_init=10, random_state=0).fit(train_data)
    for k in n_cluster
]
scores = [model.score(train_data) for model in kmeans]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_cluster, scores)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Score')
ax.set_title('Elbow Curve')
plt.show()

# Fit a 10-cluster KMeans model on the training features and show the
# resulting cluster assignment in a 3-D scatter plot.
X = train[['DE', 'FE', 'Fault']]
X = X.reset_index(drop=True)
# Seeded so the clustering (and the plot colours) are reproducible.
km = KMeans(n_clusters=10, n_init=10, random_state=0)
km.fit(X)
# fit() already stores the assignment of the training rows in labels_, so no
# separate predict() call is needed.
labels = km.labels_

# Plotting
fig = plt.figure(1, figsize=(7, 7))
# Create the 3-D axes through the figure: constructing Axes3D(fig, ...)
# directly no longer attaches the axes to the figure in current Matplotlib,
# which leaves the plot empty.
ax = fig.add_axes([0, 0, 0.95, 1], projection='3d')
ax.view_init(elev=30, azim=145)
# np.float was removed in NumPy 1.24; the builtin float is the same dtype.
ax.scatter(X.iloc[:, 0], X.iloc[:, 1], X.iloc[:, 2],
           c=labels.astype(float), edgecolor="k")
ax.set_xlabel("DE")
ax.set_ylabel("FE")
ax.set_zlabel("Fault")
ax.set_title("K Means", fontsize=14)
plt.show()

# PCA-style variance analysis: standardise the training features, then use
# the eigenvalues of their covariance matrix to see how much of the total
# variance each principal component explains.
X = train_data.values
X_std = StandardScaler().fit_transform(X)
mean_vec = np.mean(X_std, axis=0)
cov_mat = np.cov(X_std.T)

# A covariance matrix is symmetric, so use eigh: unlike the general-purpose
# eig it always returns real eigenvalues (no complex round-off artefacts).
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# (|eigenvalue|, eigenvector) pairs, largest eigenvalue first. Column i of
# eig_vecs is the eigenvector that belongs to eig_vals[i].
eig_pairs = list(zip(np.abs(eig_vals), eig_vecs.T))
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

tot = np.sum(eig_vals)
# Individual explained variance, in percent, largest component first.
var_exp = [(val / tot) * 100 for val in sorted(eig_vals, reverse=True)]
# Cumulative explained variance, in percent.
cum_var_exp = np.cumsum(var_exp)

plt.figure(figsize=(10, 5))
plt.bar(range(len(var_exp)), var_exp, alpha=0.3, align='center',
        label='individual explained variance', color='g')
plt.step(range(len(cum_var_exp)), cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()

def getDistanceByPoint(data, model):
    """Return the distance from each row of *data* to its nearest centroid.

    Args:
        data: 2-D table of points (for example a DataFrame), one row per
            point, with the same columns the model's centroids have.
        model: Fitted clustering model exposing ``cluster_centers_``.

    Returns:
        A pandas Series of Euclidean distances, one per row of *data*. It
        carries the index of *data* when *data* has one, otherwise a default
        0..n-1 index.
    """
    points = np.asarray(data, dtype=float)
    index = getattr(data, 'index', None)
    if len(points) == 0:
        return pd.Series(index=index, dtype=float)

    centers = np.asarray(model.cluster_centers_, dtype=float)

    # The nearest centroid is derived from the points themselves rather than
    # looked up through model.labels_: labels_ only describes the rows the
    # model was fitted on (so it is wrong for any other data set), and the
    # former ``labels_[i] - 1`` lookup picked the wrong centroid because the
    # labels are already 0-based.
    nearest = np.full(len(points), np.inf)
    for center in centers:
        # One vectorised pass per centroid keeps memory at O(n) while
        # avoiding a Python-level loop over every point.
        np.minimum(nearest, np.linalg.norm(points - center, axis=1), out=nearest)

    return pd.Series(nearest, index=index)

# Flag the training points that lie farthest from their cluster centroid.
outliers_fraction = 0.01

# kmeans[9] is the 10-cluster model: the list was built for cluster counts
# starting at 1, so index k - 1 holds the model with k clusters.
distance = getDistanceByPoint(train_data, kmeans[9])
number_of_outliers = int(outliers_fraction * len(distance))
# The smallest of the N largest distances is the cut-off. If the fraction
# rounds down to zero points the threshold is NaN and nothing is flagged.
threshold = distance.nlargest(number_of_outliers).min()
# 'anomaly' column: 0 = normal, 1 = anomaly.
train['anomaly'] = (distance >= threshold).astype(int)

# Scatter of the two signal columns with the flagged points in red.
fig, ax = plt.subplots(figsize=(10, 6))
colors = {0: 'blue', 1: 'red'}
ax.scatter(train['DE'], train['FE'], c=train['anomaly'].map(colors))
# The axes show the raw DE and FE columns, so label them as such.
ax.set_xlabel('DE')
ax.set_ylabel('FE')
plt.show()

train_anomalies = train[train['anomaly'] == 1]

# DE signal over the row index with the flagged rows marked.
f, ax2 = plt.subplots(figsize=(18, 6))
ax2.scatter(train_anomalies.index, train_anomalies.DE,
            label='anomaly', color='red', s=10)
ax2.plot(train.index, train.DE, label='DE')
ax2.set_xlim((0, len(train.index)))
ax2.set_title('K means')
ax2.set_xlabel('Data points')
# The y axis carries the DE values, not the centroid distance.
ax2.set_ylabel('DE')
ax2.legend()
plt.show()
# Flag the test points that lie farthest from their cluster centroid.
test_data = test[['DE', 'FE', 'Fault']]
outliers_fraction = 0.01

# kmeans[9] is the 10-cluster model: the list was built for cluster counts
# starting at 1, so index k - 1 holds the model with k clusters.
distance = getDistanceByPoint(test_data, kmeans[9])
number_of_outliers = int(outliers_fraction * len(distance))
# NOTE(review): the cut-off is recomputed from the test distances instead of
# reusing the one derived from the training data -- confirm this is intended.
threshold = distance.nlargest(number_of_outliers).min()

# 'anomaly' column: 0 = normal, 1 = anomaly.
test['anomaly'] = (distance >= threshold).astype(int)

# Scatter of the two signal columns with the flagged points in red.
fig, ax = plt.subplots(figsize=(10, 6))
colors = {0: 'blue', 1: 'red'}
ax.scatter(test['DE'], test['FE'], c=test['anomaly'].map(colors))
# The axes show the raw DE and FE columns, so label them as such.
ax.set_xlabel('DE')
ax.set_ylabel('FE')
plt.show()

# Rows flagged as anomalies.
test_anomalies = test[test['anomaly'] == 1]

# DE signal over the row index with the flagged rows marked.
f, ax1 = plt.subplots(figsize=(18, 6))
ax1.scatter(test_anomalies.index, test_anomalies.DE,
            label='Anomaly', color='red', s=10)
ax1.plot(test.index, test.DE, label='DE')
ax1.set_xlim((0, len(test.index)))
ax1.set_title('K Means')
ax1.set_xlabel('Data Points')
# The y axis carries the DE values, not the centroid distance.
ax1.set_ylabel('DE')
ax1.legend()
plt.show()

# Share of test rows whose anomaly flag equals the Fault label.
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): the Fault label is the ground
# truth and the anomaly flag is the prediction.
score = 100 * accuracy_score(test['Fault'], test['anomaly'])
print("Accuracy: {:.2f}%".format(score))
print("Anomalies: {}".format(test_anomalies['anomaly'].count()))