In [1]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score,precision_score,accuracy_score,classification_report,roc_auc_score,confusion_matrix,roc_curve,auc
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import optuna
import matplotlib as plt

In [2]:
# Load the cleaned dataset; the cells below split it on its `label` column.
csv_path = 'data_cleaned.csv'
df = pd.read_csv(csv_path)

## Data Preprocessing for Training

In [3]:
# Data Preprocessing
# Drop the label column once, then pick the rows of each class with a boolean
# mask built from the original label values (0 -> normal, 1 -> anomalous).
label_column = df['label']
feature_frame = df.drop(columns=['label'])
df_normal = feature_frame[label_column == 0]
df_anomalous = feature_frame[label_column == 1]
# Data Distribution
print(f"Normal: {len(df_normal)}, Anomalous: {len(df_anomalous)}")

Normal: 569877, Anomalous: 20663


## Training the Model

- **K-fold cross-validation** was used to ensure the model was trained on different subsets of the data. Since only a subset of the normal class was used for testing, relying on a single split might not provide the most reliable results. K-fold cross-validation helps mitigate this issue by training the model across multiple data folds, leading to a more robust evaluation.

In [None]:
# Cross-validated evaluation of a One-Class SVM trained on normal rows only.
#
# For every fold the model is fitted on the training part of the normal data
# and tested on the held-out normal rows (label 0) plus ALL anomalous rows
# (label 1). Per-fold F1 / precision / accuracy are stored in `results` and
# averaged at the end.

# Create a KFold object with 10 splits (shuffled with a fixed seed so the folds are reproducible)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Initialize an empty list to store the per-fold metrics
results = []
# Labels and predictions pooled over every fold, so the final report covers the
# same data as the averaged metrics instead of only the last fold
y_test_all = []
y_pred_all = []
# The anomalous rows (and therefore their labels) are identical in every fold
y_anomalous = np.full(len(df_anomalous), 1)

# Iterate through the folds
for train_index, test_index in kf.split(df_normal):
    # Split the normal data into training and test sets
    X_train, X_test = df_normal.iloc[train_index], df_normal.iloc[test_index]
    y_test = np.full(len(X_test), 0)
    # Add anomalous data to the test set
    X_test = pd.concat([X_test, df_anomalous])
    y_test = np.concatenate([y_test, y_anomalous])

    # The RBF kernel is distance based, so features should be on comparable
    # scales. The scaler is fitted on the training fold only, to avoid leaking
    # test-set statistics into the model.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the One Class SVM model
    model = OneClassSVM(kernel='rbf', gamma='scale', nu=0.1)
    model.fit(X_train_scaled)

    # Predict the labels for the test set.
    # OneClassSVM.predict returns -1 for outliers and +1 for inliers; map
    # outliers to the positive (anomalous) class 1 and inliers to 0.
    y_pred = np.where(model.predict(X_test_scaled) == -1, 1, 0)

    # Calculate the metrics, treating the anomalous class as positive
    f1 = f1_score(y_test, y_pred, pos_label=1)
    precision = precision_score(y_test, y_pred, pos_label=1)
    accuracy = accuracy_score(y_test, y_pred)

    # Append the results to the list
    results.append({
        'f1': f1,
        'precision': precision,
        'accuracy': accuracy
    })
    y_test_all.append(y_test)
    y_pred_all.append(y_pred)

print(f"F1 Score: {np.mean([result['f1'] for result in results])}")
print(f"Precision: {np.mean([result['precision'] for result in results])}")
print(f"Accuracy: {np.mean([result['accuracy'] for result in results])}")
# Print the report on its own lines so its column header stays aligned
print("Classification Report (all folds pooled):")
print(classification_report(np.concatenate(y_test_all), np.concatenate(y_pred_all)))

The One-Class SVM models do not work well on this higher-dimensional data. Training takes a long time (nearly 3-4 hours), yet the results are only comparable to, or worse than, those of the Isolation Forest and AutoEncoder models, which take a few minutes at most to train. For this reason, One-Class SVM was not studied thoroughly.