In [1]:
import pandas as pd
from sklearn.metrics import classification_report
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
df = pd.read_csv(r'C:\Users\AXW0J9E\Downloads\telemacluster\data\processed\relu\P-1_relu_final_df.csv')
print(df)

      kmeans_anomaly_score  recon_losses  lstm_error  actual_anomaly
0                 0.938890      0.019330    0.000000             0.0
1                 0.881053      0.019069    0.000000             0.0
2                 1.125757      0.019735    0.000000             0.0
3                 1.343782      0.020598    0.000000             0.0
4                 1.245892      0.020923    0.000000             0.0
...                    ...           ...         ...             ...
8500              1.850395      0.017034    6.125645             0.0
8501              6.077311      0.016994    2.138086             0.0
8502              3.060155      0.016218    2.630776             0.0
8503              1.445799      0.015940    2.169716             0.0
8504              1.165801      0.015829    1.416631             0.0

[8505 rows x 4 columns]


In [7]:
from sklearn.cluster import DBSCAN

# Assuming df is a pandas DataFrame and not a numpy array
X = df[['kmeans_anomaly_score', 'recon_losses', 'lstm_error']]

# Standardize features by removing the mean and scaling to unit variance
X_std = StandardScaler().fit_transform(X)

# Apply DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X_std)
labels = db.labels_

# Assign cluster labels back to the original DataFrame
df['cluster'] = labels

# Any point not assigned to a cluster is considered an outlier in DBSCAN
anomalies = df[df['cluster'] == -1]

# You cannot directly use the classification_report because 'anomalies' is a subset of 'df'
# You need to create a binary array where 1 indicates an anomaly and 0 indicates normal
# Then you can compare this with the 'actual_anomaly' column in 'df'
anomaly_labels = (df['cluster'] == -1).astype(int)

print(classification_report(df['actual_anomaly'], anomaly_labels))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92      7754
         1.0       0.15      0.16      0.15       751

    accuracy                           0.85      8505
   macro avg       0.53      0.53      0.53      8505
weighted avg       0.85      0.85      0.85      8505



In [8]:

# Assume df is your DataFrame and 'features' are your columns
X = df[['kmeans_anomaly_score', 'recon_losses', 'lstm_error']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import silhouette_score

best_score = -1
best_n = -1

for n in range(5000, 7000, 100):
    # Fit the model
    clf = LocalOutlierFactor(n_neighbors=n, contamination=0.1)
    y_pred = clf.fit_predict(X_scaled)

    # Compute the silhouette score
    score = silhouette_score(X_scaled, y_pred)

    # Update the best silhouette score and the best n_neighbors
    if score > best_score:
        best_score = score
        best_n = n

print('Best n_neighbors:', best_n)
print('Best silhouette score:', best_score)

Best n_neighbors: 6500
Best silhouette score: 0.5067255830348595


In [10]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report

# Fit the model
clf = LocalOutlierFactor(n_neighbors=6100, contamination=0.1)
y_pred = clf.fit_predict(X)

# Map the predicted values to 0 for normal, 1 for anomaly
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

# Compare the predictions with the actual anomalies
print(classification_report(df['actual_anomaly'], y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.90      0.91      7754
         1.0       0.10      0.11      0.11       751

    accuracy                           0.83      8505
   macro avg       0.51      0.51      0.51      8505
weighted avg       0.84      0.83      0.84      8505



In [11]:
from keras.models import Model
from keras.layers import Input, Dense


# Define the autoencoder
input_layer = Input(shape=(X.shape[1],))
encoded = Dense(100, activation='relu')(input_layer)
# Add another layer
encoded = Dense(50, activation='relu')(encoded)
encoded = Dense(25, activation='relu')(encoded)
decoded = Dense(X.shape[1], activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# Train the model
autoencoder.fit(X_scaled, X_scaled, epochs=100, batch_size=256, shuffle=True)

# Use the autoencoder to reconstruct the input data
reconstructed_X = autoencoder.predict(X_scaled)

# Compute the reconstruction error
reconstruction_error = np.mean(np.power(X_scaled - reconstructed_X, 2), axis=1)

# Define a threshold for anomalies
threshold = np.quantile(reconstruction_error, 0.95)

# Classify anomalies
anomalies = reconstruction_error > threshold

# Compare the predictions with the actual anomalies
print(classification_report(df['actual_anomaly'], anomalies))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [12]:
from sklearn import svm

# Assume df is your DataFrame and 'features' are your columns
X = df[['kmeans_anomaly_score', 'recon_losses', 'lstm_error']]

# Fit the model
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_scaled)

# Predict the anomalies in the data
pred = clf.predict(X_scaled)

# Map the predicted values to 0 for normal, 1 for anomaly
pred[pred == 1] = 0
pred[pred == -1] = 1

# Compare the predictions with the actual anomalies
print(classification_report(df['actual_anomaly'], pred))

              precision    recall  f1-score   support

         0.0       0.94      0.93      0.93      7754
         1.0       0.34      0.38      0.36       751

    accuracy                           0.88      8505
   macro avg       0.64      0.65      0.65      8505
weighted avg       0.89      0.88      0.88      8505



In [13]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

# Assume df is your DataFrame and 'features' are your columns
X = df[['kmeans_anomaly_score', 'recon_losses', 'lstm_error']]

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'contamination': [0.05, 0.1, 0.15]
}

# Create a GridSearchCV object with scoring metric
grid = GridSearchCV(IsolationForest(), param_grid, cv=5, scoring='precision')

# Fit the model and find the best parameters
grid.fit(X_scaled)

# Predict the anomalies in the data
pred = grid.predict(X_scaled)

# Map the predicted values to 0 for normal, 1 for anomaly
pred[pred == 1] = 0
pred[pred == -1] = 1

# Compare the predictions with the actual anomalies
print(classification_report(df['actual_anomaly'], pred))

Traceback (most recent call last):
  File "c:\Users\AXW0J9E\.conda\envs\telemacluster\lib\site-packages\sklearn\model_selection\_validation.py", line 808, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "c:\Users\AXW0J9E\.conda\envs\telemacluster\lib\site-packages\sklearn\model_selection\_validation.py", line 808, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "c:\Users\AXW0J9E\.conda\envs\telemacluster\lib\site-packages\sklearn\model_selection\_validation.py", line 808, in _score
    scores = scorer(estimator, X_test)
TypeError: _BaseScorer.__call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "c:\Users\AXW0J9E\.conda\envs\telemacluster\lib\site-packages\sklearn\model_selection\_validat

              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      7754
         1.0       0.23      0.13      0.17       751

    accuracy                           0.89      8505
   macro avg       0.58      0.55      0.55      8505
weighted avg       0.86      0.89      0.87      8505



In [14]:
from sklearn.ensemble import IsolationForest

# Assume df is your DataFrame and 'features' are your columns
X = df[['kmeans_anomaly_score', 'recon_losses', 'lstm_error']]

# Fit the model
clf = IsolationForest(contamination=0.1)
clf.fit(X)

# Predict the anomalies in the data
pred = clf.predict(X)

# Map the predicted values to 0 for normal, 1 for anomaly
pred[pred == 1] = 0
pred[pred == -1] = 1

# Compare the predictions with the actual anomalies
print(classification_report(df['actual_anomaly'], pred))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92      7754
         1.0       0.20      0.22      0.21       751

    accuracy                           0.85      8505
   macro avg       0.56      0.57      0.56      8505
weighted avg       0.86      0.85      0.86      8505



In [15]:
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Assume df is your DataFrame and 'features' are your columns
X = df[['kmeans_anomaly_score', 'recon_losses', 'lstm_error']]

# Use KMeans to create labels
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
labels = kmeans.labels_

# Train a Decision Tree model
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X, labels)

# Predict the "class" of new data
predictions = dt.predict(X)

# Compare the predictions with the actual anomalies
print(classification_report(df['actual_anomaly'], predictions))

              precision    recall  f1-score   support

         0.0       0.90      0.20      0.33      7754
         1.0       0.09      0.77      0.15       751

    accuracy                           0.25      8505
   macro avg       0.49      0.48      0.24      8505
weighted avg       0.83      0.25      0.31      8505



  super()._check_params_vs_input(X, default_n_init=10)


In [20]:
from sklearn.mixture import GaussianMixture

# Fit a Gaussian mixture model on the enhanced feature set
gmm = GaussianMixture(n_components=2)
gmm.fit(X)

# Use the GMM to identify points with low likelihoods as anomalies
scores = gmm.score_samples(X)
anomalies = scores < np.percentile(scores, 30)  # marking the lowest 5% as anomalies
# Compare the predictions with the actual anomalies
print(classification_report(df['actual_anomaly'], predictions))

              precision    recall  f1-score   support

         0.0       0.90      0.20      0.33      7754
         1.0       0.09      0.77      0.15       751

    accuracy                           0.25      8505
   macro avg       0.49      0.48      0.24      8505
weighted avg       0.83      0.25      0.31      8505

