In [10]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   -------- ------------------------------- 2.4/11.1 MB 11.2 MB/s eta 0:00:01
   ---------------- ----------------------- 4.7/11.1 MB 11.4 MB/s eta 0:00:01
   ------------------------- -------------- 7.1/11.1 MB 11.5 MB/s eta 0:00:01
   --------------------------------- ------ 9.4/11.1 MB 11.5 MB/s eta 0:00:01
   -----------------------------


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Baseline run: Isolation Forest over the numeric columns of the capture only.

# Step 1: Load the dataset
file_path = "withoutMQTT.csv"  # Change this to your actual file
raw_df = pd.read_csv(file_path)

# Step 2: Preprocess -- keep only numerical features and drop rows with
# missing values. The loaded frame is kept untouched in `raw_df`.
features = raw_df.select_dtypes(include=[np.number]).dropna()

# Step 3: Train Isolation Forest
# `contamination` sets the proportion of rows the model will flag as outliers;
# `random_state` is fixed so repeated runs give the same labels.
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
labels = iso_forest.fit_predict(features)  # -1 = anomaly, 1 = normal

# NOTE: despite its name, "anomaly_score" holds the -1/1 label, not a
# continuous score. The column name is kept so the saved CSV schema is unchanged.
df = features.assign(anomaly_score=labels)

# Step 4: Filter anomalies
anomalies = df[df["anomaly_score"] == -1]

# Step 5: Visualization (only meaningful when there are exactly two features)
if features.shape[1] == 2:
    fig, ax = plt.subplots()
    points = ax.scatter(
        features.iloc[:, 0],
        features.iloc[:, 1],
        c=df["anomaly_score"],
        cmap="coolwarm",
    )
    ax.set_xlabel(features.columns[0])
    ax.set_ylabel(features.columns[1])
    ax.set_title("Isolation Forest Anomaly Detection")
    fig.colorbar(points, ax=ax, label="Anomaly Score")
    plt.show()

# Save results (every row together with its label)
df.to_csv("anomaly_results.csv", index=False)

print(f"Total anomalies detected: {len(anomalies)}")

# Show a bounded preview of the anomalies with all columns visible.
# The display options are scoped to this block with option_context so they do
# not leak into later cells, and the row cap keeps the cell output readable;
# raise MAX_ANOMALY_ROWS_SHOWN to see more.
MAX_ANOMALY_ROWS_SHOWN = 50
print("Anomalies detected:")
with pd.option_context(
    "display.max_columns", None,
    "display.max_rows", MAX_ANOMALY_ROWS_SHOWN,
):
    print(anomalies.head(MAX_ANOMALY_ROWS_SHOWN))
if len(anomalies) > MAX_ANOMALY_ROWS_SHOWN:
    print(
        f"... showing the first {MAX_ANOMALY_ROWS_SHOWN} of {len(anomalies)} anomalies; "
        "all rows and their labels are saved in 'anomaly_results.csv'."
    )

Total anomalies detected: 15633
Anomalies detected:
        frame.number  frame.len  tcp.len  tcp.srcport  tcp.dstport  \
0                  1         85     31.0      62733.0       1883.0   
2                  3         85     31.0      62733.0       1883.0   
4                  5         85     31.0      62733.0       1883.0   
6                  7         85     31.0      62733.0       1883.0   
8                  9         85     31.0      62733.0       1883.0   
12                13         85     31.0      62733.0       1883.0   
14                15         85     31.0      62733.0       1883.0   
16                17         85     31.0      62733.0       1883.0   
18                19         85     31.0      62733.0       1883.0   
20                21         85     31.0      62733.0       1883.0   
22                23         85     31.0      62733.0       1883.0   
24                25         56      2.0      62733.0       1883.0   
26                27         56      2

Import Libraries

In [15]:
import pandas as pd
import numpy as np

Load Dataset

In [35]:
# Location of the CSV to analyse -- point this at your own file.
file_path = "withoutMQTT.csv"

# Read the whole CSV into a DataFrame using pandas' default parsing.
df = pd.read_csv(filepath_or_buffer=file_path)

Preprocessing

In [36]:
from sklearn.preprocessing import LabelEncoder

# Preprocessing: encode the categorical columns, drop rows whose values
# contain a comma, and save the result to disk.
label_encode_cols = ["frame.time"]  # Columns for Label Encoding
one_hot_encode_cols = ["eth.src", "eth.dst", "ip.src", "ip.dst"]  # Columns for One-Hot Encoding

# Apply Label Encoding.
# The single encoder is re-fitted for each column, so after the loop it only
# remembers the classes of the last column in `label_encode_cols`.
label_encoder = LabelEncoder()
for col in label_encode_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Apply One-Hot Encoding
df = pd.get_dummies(df, columns=one_hot_encode_cols, drop_first=True)  # drop_first=True avoids dummy variable trap

# Names of the columns pandas parsed as float/int. Not used further in this
# cell; kept because notebook variables are global and other cells may read it.
numeric_cols = df.select_dtypes(include=[float, int]).columns.tolist()

# Drop every row in which any value contains a comma.
# Numeric and boolean values never contain "," when converted to text, so only
# the remaining (text-like) columns are scanned. Scanning column-wise avoids
# stringifying every cell of every row and also works on an empty frame.
text_like = df.select_dtypes(exclude=["number", "bool"])
has_comma = np.zeros(len(df), dtype=bool)
for col in text_like.columns:
    has_comma |= (
        text_like[col].astype(str).str.contains(",", regex=False).to_numpy(dtype=bool)
    )
df = df[~has_comma]

# Save the processed dataset
df.to_csv("preprocessed_dataset.csv", index=False)
print("Preprocessed dataset saved as 'preprocessed_dataset.csv'.")

Preprocessed dataset saved as 'preprocessed_dataset.csv'.


In [37]:
# Fit an Isolation Forest on the preprocessed frame, label every row, and save
# the labelled frame to disk.

# Step 1: Store feature names before training, so the label/score columns
# added below are never fed back into the model.
feature_columns = df.columns
features = df[feature_columns]

# Train Isolation Forest (random_state fixed so reruns give the same result)
iso_forest = IsolationForest(n_estimators=100, random_state=42)
iso_forest.fit(features)

# Step 2: Score the same feature set once.
# decision_function: lower = more anomalous; negative values are outliers.
raw_scores = iso_forest.decision_function(features)

# Label from the sign of the raw score: -1 = anomaly, 1 = normal.
# This is the same rule IsolationForest.predict applies. The previous override
# (`-1 if x > 0 else 1`) was inverted and marked the normal rows as anomalies.
df = df.assign(
    anomaly_score=np.where(raw_scores < 0, -1, 1),
    raw_score=raw_scores,
)

# Move Columns: push the key identifying columns and the results to the end
cols_to_move = ["frame.time", "tcp.srcport", "tcp.dstport", "anomaly_score", "raw_score"]  # Columns to move
df = df[[col for col in df.columns if col not in cols_to_move] + cols_to_move]

# Save results
df.to_csv("anomaly_results.csv", index=False)
print("Anomaly detection complete. Results saved in 'anomaly_results.csv'.")

Anomaly detection complete. Results saved in 'anomaly_results.csv'.
