In [None]:
# Notebook parameters; both are placeholders that must be set before running.
# dataset_path is handed to pd.read_csv() when the data is loaded.
dataset_path = None
# anomaly_condition is used as the row selector in label_maker(): df.loc[anomaly_condition].
anomaly_condition = None

In [62]:
from PS0_Files.pso import ParticleSwarmOptimizedClustering
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from kneed import KneeLocator
import pandas as pd
import numpy as np
import copy

In [63]:
# Exclusive upper bound on the cluster counts tried by knee_locator() in test_valid().
maximum_cluster = 20
# NOTE(review): misspelled ("minimun") and not referenced by any cell in this notebook;
# validate() hard-codes its own starting percentile of 70 — confirm whether the two should be linked.
minimun_threshold = 70
# PSO parameters: read as globals by the `fit` class when it builds the optimizer.
n_particles = 2
max_iter = 5

In [64]:
class fit:
    """Wrapper around ``ParticleSwarmOptimizedClustering``.

    The optimizer is built in hybrid mode and takes its swarm size and
    iteration budget from the notebook-level ``n_particles`` and ``max_iter``.
    """

    def __init__(self, df, num_cluster):
        """Store the data and cluster count and construct the optimizer."""
        self.df = df
        self.num_cluster = num_cluster
        # Filled in by get_labels_centers().
        self.best_centers = None
        self.clusters = None
        self.pso = ParticleSwarmOptimizedClustering(
            n_cluster=self.num_cluster,
            n_particles=n_particles,
            data=self.df,
            hybrid=True,
            max_iter=max_iter,
        )

    def get_labels_centers(self):
        """Run the optimizer, cache its centers and clusters, and return all three results."""
        outcome = self.pso.run()
        history, self.best_centers, self.clusters = outcome
        return history, self.best_centers, self.clusters

    def predict(self, arr):
        """Assign ``arr`` to clusters using the optimizer's best particle."""
        winner = self.pso.particles[self.pso.best_particle]
        return winner._predict(arr)

In [65]:
def detector(df, percentile_threshold):
    """Cluster ``df`` and flag the rows farthest from their assigned center.

    The cluster count is the knee of the KMeans inertia curve for k = 1..4.
    The data is then clustered with ``fit`` and every row whose distance to
    its center exceeds the ``percentile_threshold``-th percentile of all
    distances is reported.

    Returns
    -------
    (indexes, anomalies, distances)
        Positions of the flagged rows, those rows as a float32 array, and the
        per-row distances for every row.
    """
    samples = df.values

    # Inertia curve for k = 1..4, used only to choose the cluster count.
    inertias = [KMeans(n_clusters=k).fit(samples).inertia_ for k in range(1, 5)]
    candidate_ks = range(1, len(inertias) + 1)
    locator = KneeLocator(candidate_ks, inertias, curve='convex', direction='decreasing')

    model = fit(samples, locator.knee)
    history, centers, labels = model.get_labels_centers()

    distances = [
        np.linalg.norm(samples[pos] - centers[label])
        for pos, label in enumerate(labels)
    ]
    cutoff = np.percentile(distances, percentile_threshold)

    indexes = [pos for pos, dist in enumerate(distances) if dist > cutoff]
    anomalies = np.asarray([samples[pos] for pos in indexes], dtype=np.float32)
    return indexes, anomalies, distances

In [66]:
def knee_locator(df, maximum_cluster):
    """Return the knee of the KMeans inertia curve for k = 1 .. maximum_cluster - 1.

    The result is whatever ``KneeLocator.knee`` reports for a convex,
    decreasing curve built from one KMeans fit per candidate k.
    """
    inertias = [
        KMeans(n_clusters=k).fit(df).inertia_
        for k in range(1, maximum_cluster)
    ]
    candidate_ks = range(1, len(inertias) + 1)
    locator = KneeLocator(candidate_ks, inertias, curve='convex', direction='decreasing')
    return locator.knee

In [67]:
def label_maker(df):
    """Return the index labels of the rows of ``df`` selected by ``anomaly_condition``.

    ``anomaly_condition`` is the notebook-level selector and is passed
    straight to ``df.loc``.
    """
    flagged_rows = df.loc[anomaly_condition]
    return list(flagged_rows.index)

In [68]:
def measurements(indexes, anomals):
    """Score a set of flagged rows against the known anomalies.

    Parameters
    ----------
    indexes : sequence of hashable
        Identifiers of the rows the detector flagged.
    anomals : sequence of hashable
        Identifiers of the rows that really are anomalous, in the same
        identifier space as ``indexes``.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1_score, false_positive_rate, false_negative_rate)``
        where precision = TP / (TP + FP), recall = TP / (TP + FN),
        ``false_positive_rate`` is FP / len(indexes) (the share of flagged rows
        that are wrong) and ``false_negative_rate`` is FN / len(anomals).
        Any ratio whose denominator is zero is reported as 0.0 rather than
        raising ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # Empty inputs or zero overlap would otherwise divide by zero.
        return numerator / denominator if denominator else 0.0

    # Sets give constant-time membership; counts still follow the input sequences.
    actual = set(anomals)
    flagged = set(indexes)

    true_positive = sum(1 for value in indexes if value in actual)
    false_positive = len(indexes) - true_positive
    false_negative = sum(1 for value in anomals if value not in flagged)

    precision = _ratio(true_positive, true_positive + false_positive)
    recall = _ratio(true_positive, true_positive + false_negative)
    f1_score = _ratio(2 * precision * recall, precision + recall)
    false_positive_rate = _ratio(false_positive, len(indexes))
    false_negative_rate = _ratio(false_negative, len(anomals))
    return precision, recall, f1_score, false_positive_rate, false_negative_rate

In [69]:
def validate(kmeans_obj, dataset, df):
    """Choose the distance percentile that gives the best F1 on a labelled split.

    Parameters
    ----------
    kmeans_obj : object
        Fitted clustering wrapper exposing ``best_centers`` and ``predict``.
    dataset : pandas.DataFrame
        The validation rows; passed to ``label_maker`` to obtain the index
        labels of the true anomalies.
    df : array-like
        Feature array for ``dataset``, indexed by row position.

    Returns
    -------
    int
        The best percentile among 70, 72, ..., 98. Ties keep the lowest
        percentile. Returns 0 when no candidate achieves an F1 above zero.
    """
    cluster_centers = kmeans_obj.best_centers
    val_kmeans_labels = kmeans_obj.predict(df)
    val_distances = [
        np.linalg.norm(df[i] - cluster_centers[val_kmeans_labels[i]])
        for i in range(len(val_kmeans_labels))
    ]

    # Flagged rows are identified by position in ``df``, so the true anomalies
    # must be positions too. Subtracting the first label only works for a
    # contiguous index; looking the labels up in the index also handles gaps
    # (for example after dropna()).
    actual_anomals = label_maker(dataset)
    final_actual_anomals = np.flatnonzero(dataset.index.isin(actual_anomals)).tolist()

    best_threshold = 0
    best_f1_score = 0
    for percentile_threshold in range(70, 100, 2):
        val_threshold_distance = np.percentile(val_distances, percentile_threshold)
        val_indexes = [
            i for i, val_distance in enumerate(val_distances)
            if val_distance > val_threshold_distance
        ]
        try:
            _, _, val_f1_score, _, _ = measurements(val_indexes, final_actual_anomals)
        except ZeroDivisionError:
            # No score can be computed for this candidate (degenerate counts);
            # skip it instead of aborting the whole sweep.
            continue
        if best_f1_score < val_f1_score:
            best_threshold = percentile_threshold
            best_f1_score = val_f1_score
    return best_threshold

In [70]:
def test_valid(df1, df2, df3):
    df1_array = df1.values
    df2_array = df2.values
    df3_array = df3.values
    num_cluster = knee_locator(df1_array, maximum_cluster)
    kmeans_obj = fit(df1_array, num_cluster)
    kmeans_obj.get_labels_centers()
    cluster_centers = kmeans_obj.best_centers
    best_threshold = validate(kmeans_obj, df2, df2_array)
    print(best_threshold)
    test_kmeans_labels = kmeans_obj.predict(df3_array)
    test_distances = [np.linalg.norm(df3_array[i] - cluster_centers[test_kmeans_labels[i]]) for i in range(len(test_kmeans_labels))]
    test_threshold_distance = np.percentile(test_distances, best_threshold)
    test_anomalies = []
    test_indexes = []
    for i, test_distance in enumerate(test_distances):
        if test_distance > test_threshold_distance:
            test_anomalies.append(df3_array[i])
            test_indexes.append(i)
    test_anomalies = np.asarray(test_anomalies, dtype=np.float32)
    return test_indexes, test_anomalies, test_distances

In [71]:
def show(ind, ano, dis, type):
    """Plot every distance, drawing flagged rows in blue and the rest in red.

    Parameters
    ----------
    ind : sequence of int
        Zero-based positions of the flagged rows. It is not modified.
    ano : any
        Unused; kept so existing calls keep working.
    dis : sequence of float
        Distance of every row, indexed by position.
    type : str
        ``'line'`` or ``'scatter'``. Any other value draws no series but still
        sets the title and calls ``plt.show()``.

    The x axis is one-based: the row at position ``p`` is drawn at ``p + 1``.
    """
    plt.rcParams["figure.figsize"] = [10, 10]
    plt.rcParams["figure.autolayout"] = True

    # Work on a sorted copy so the caller's list keeps its order.
    flagged = sorted(ind)
    flagged_lookup = set(flagged)
    regular = [pos for pos in range(len(dis)) if pos not in flagged_lookup]

    if type == 'line':
        draw = plt.plot
    elif type == 'scatter':
        draw = plt.scatter
    else:
        draw = None

    plt.title("distances")
    if draw is not None:
        # Regular rows first, flagged rows second, so the blue series is drawn on top.
        for positions, colour in ((regular, "red"), (flagged, "blue")):
            xs = [pos + 1 for pos in positions]
            ys = [dis[pos] for pos in positions]
            draw(xs, ys, color=colour)
    plt.show()

In [72]:
# Load the CSV and discard every row that has a missing value; the index is
# not reset, so row labels may have gaps afterwards.
row_data = pd.read_csv(dataset_path).dropna()
# Independent working copy: later cells modify `data` and leave `row_data` untouched.
data = row_data.copy(deep=True)

In [None]:
# Display the loaded frame (rows with missing values already dropped).
row_data

In [None]:
# Number of distinct values in the DATE column.
len(row_data.DATE.unique())

In [None]:
# Replace every string-valued column of `data` with the constant 1 (presumably so
# that data.values is purely numeric — confirm intent); `row_data` keeps the
# original text. A column counts as string-valued when its first row holds a str.
all_values = []
for i, col in enumerate(row_data.columns):
    if isinstance(row_data.iloc[0, i], str):
        # Plain column assignment replaces the column and its object dtype;
        # `.loc[:, col] = 1` may write in place and keep dtype object on recent pandas.
        data[col] = 1
        all_values.extend(row_data[col].unique())
# Every distinct string seen above maps to the same target value, 1.
targets = [1] * len(all_values)
# Named value_to_target rather than `map` so the builtin map() stays usable in later cells.
value_to_target = dict(zip(all_values, targets))
data

In [76]:
# Chronological (unshuffled) split: first 70% train, next 10% validation, rest test.
train_ratio = 0.7
val_ratio = 0.1
total_rows = data.shape[0]

train_end = int(total_rows * train_ratio)
val_end = int(total_rows * (train_ratio + val_ratio))

train_data = data[:train_end]
val_data = data[train_end:val_end]
test_data = data[val_end:]

# Deep copies so the splits are independent of `data`.
data1 = train_data.copy(deep=True)
data2 = val_data.copy(deep=True)
data3 = test_data.copy(deep=True)

In [None]:
# Training split (first 70% of the rows).
data1

In [None]:
# Validation split (the 10% of rows that follow the training split).
data2

In [None]:
# Test split (the remaining rows after the training and validation splits).
data3

In [None]:
# Unsupervised run on the full dataset: flag rows whose distance to their
# cluster center is above the 98th percentile.
indexes, anomalies, distances = detector(data, 98.0)
# Train / validate / test run: fit on data1, tune the percentile on data2,
# flag rows in data3. The returned indexes are positions within data3.
test_indexes, test_anomalies, test_distances = test_valid(data1, data2, data3)

In [None]:
# Feature rows flagged by detector(), as a float32 array.
print(anomalies)

In [None]:
# Number of rows flagged by detector().
print(len(indexes))

In [None]:
# Scatter plot of all distances from the full-data run; flagged rows are drawn in blue.
show(indexes, anomalies, distances, 'scatter')

In [None]:
# Look up the flagged rows in the original frame; `indexes` holds row
# positions (not index labels), hence iloc.
row_data.iloc[indexes]

In [None]:
# Same distances as the scatter plot, drawn as lines.
show(indexes, anomalies, distances, 'line')

Test Phase: score the full-dataset detection first, then the held-out test split

In [86]:
# detector() reports row positions within data.values, while label_maker()
# returns index labels. The two disagree once dropna() has left gaps in the
# index, so translate the labels into positions before scoring.
anomals = np.flatnonzero(data.index.isin(label_maker(data))).tolist()
precision, recall, f1_score, false_positive_rate, false_negative_rate = measurements(indexes, anomals)

In [None]:
# Report the metrics of the full-dataset run, one per line.
for caption, metric in (
    ("precision is: ", precision),
    ("recall is: ", recall),
    ("f1 score is: ", f1_score),
    ("false positive rate is: ", false_positive_rate),
    ("false negative rate is: ", false_negative_rate),
):
    print(caption, metric)

In [88]:
# test_indexes are row positions within data3, so the true anomalies must be
# expressed as positions too. Subtracting the first index label is only right
# for a contiguous index; looking the labels up in the index also handles the
# gaps that dropna() can leave.
actual_anomals = label_maker(data3)
final_actual_anomals = np.flatnonzero(data3.index.isin(actual_anomals)).tolist()
precision, recall, f1_score, false_positive_rate, false_negative_rate = measurements(test_indexes, final_actual_anomals)

In [None]:
# Report the metrics of the held-out test split, one per line.
for caption, metric in (
    ("test precision is: ", precision),
    ("test recall is: ", recall),
    ("test f1 score is: ", f1_score),
    ("test false positive rate is: ", false_positive_rate),
    ("test false negative rate is: ", false_negative_rate),
):
    print(caption, metric)