<a href="https://colab.research.google.com/github/shubhamValkunde/Myportfolio/blob/main/Anomly_Detection_Using_IForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Let's first load the dataset and inspect the contents to understand its structure.
import pandas as pd

# Load the dataset
file_path = '/mnt/nfvData.csv'
data = pd.read_csv(file_path)

# Show the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,timestamp,resName,cpu|demandPct
0,01-01-2020 00:04,MISBC41_SE2900_HRU1_1,22.955334
1,01-01-2020 00:09,MISBC41_SE2900_HRU1_1,31.049334
2,01-01-2020 00:14,MISBC41_SE2900_HRU1_1,28.133333
3,01-01-2020 00:19,MISBC41_SE2900_HRU1_1,25.549999
4,01-01-2020 00:24,MISBC41_SE2900_HRU1_1,23.285334


In [2]:
# Preprocessing: Convert timestamp to datetime and extract relevant features
data['timestamp'] = pd.to_datetime(data['timestamp'], format='%d-%m-%Y %H:%M')

# Extracting features from the timestamp
data['hour'] = data['timestamp'].dt.hour
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['month'] = data['timestamp'].dt.month

# Label Encoding 'resName' column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['resName_encoded'] = le.fit_transform(data['resName'])

# Dropping original 'timestamp' and 'resName' columns
data_cleaned = data.drop(columns=['timestamp', 'resName'])

# Display the first few rows after preprocessing
data_cleaned.head()


Unnamed: 0,cpu|demandPct,hour,day_of_week,month,resName_encoded
0,22.955334,0,2,1,0
1,31.049334,0,2,1,0
2,28.133333,0,2,1,0
3,25.549999,0,2,1,0
4,23.285334,0,2,1,0


In [4]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Splitting the data into training and testing sets
X = data_cleaned.drop(columns='cpu|demandPct')
y = data_cleaned['cpu|demandPct']  # Target column for comparison (assuming CPU demand anomalies)

# For simplicity, we'll assume a threshold for anomaly detection (e.g., demand above 80% is anomalous)
threshold = 80
y_anomaly = (y > threshold).astype(int)  # Binary label for anomaly (1 for anomaly, 0 for normal)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_anomaly, test_size=0.3, random_state=42)

# Train the Isolation Forest model
isolation_forest = IsolationForest(n_estimators=200, max_samples='auto', contamination=0.05, random_state=42)
isolation_forest.fit(X_train)

# Predict anomalies
y_pred = isolation_forest.predict(X_test)

# Convert the output of Isolation Forest (-1 for anomaly, 1 for normal) to binary format (1 for anomaly, 0 for normal)
y_pred_binary = (y_pred == -1).astype(int)

# Evaluate the performance of the Isolation Forest model
accuracy = accuracy_score(y_test, y_pred_binary)
classification_rep = classification_report(y_test, y_pred_binary)

accuracy, classification_rep


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.9508889986418076,
 '              precision    recall  f1-score   support\n\n           0       1.00      0.95      0.97     64792\n           1       0.00      0.00      0.00         0\n\n    accuracy                           0.95     64792\n   macro avg       0.50      0.48      0.49     64792\nweighted avg       1.00      0.95      0.97     64792\n')

In [6]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision, recall, and f1-score for each class
precision = precision_score(y_test, y_pred_binary, average=None, zero_division=0)
recall = recall_score(y_test, y_pred_binary, average=None, zero_division=0)
f1 = f1_score(y_test, y_pred_binary, average=None, zero_division=0)

# Determine support (number of samples for each class)
support = [y_test.value_counts().get(0, 0), y_test.value_counts().get(1, 0)]  # Use .get() to avoid KeyError

# Create a DataFrame to display results in tabular format
results_df = pd.DataFrame({
    'Class': ['Normal (0)', 'Anomaly (1)'],
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Support': support  # Number of samples for each class
})

# Adding overall accuracy
accuracy = accuracy_score(y_test, y_pred_binary)
overall_metrics_df = pd.DataFrame({
    'Metric': ['Accuracy'],
    'Value': [accuracy]
})

# Display the tables
print("Per-Class Performance Metrics:")
print(results_df.to_string(index=False))

print("\nOverall Performance Metrics:")
print(overall_metrics_df.to_string(index=False))


Per-Class Performance Metrics:
      Class  Precision   Recall  F1-Score  Support
 Normal (0)        1.0 0.950889  0.974826    64792
Anomaly (1)        0.0 0.000000  0.000000        0

Overall Performance Metrics:
  Metric    Value
Accuracy 0.950889
