In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib

# Read the raw dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="CTDAPD Dataset.csv")

# Preprocessing function
def preprocess_data(dataframe, datetime_col='Date_Time'):
    """Impute missing values and expand a timestamp column into calendar features.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw records to clean.
    datetime_col : str, default 'Date_Time'
        Name of the timestamp column. When this column is present it is parsed
        with ``pd.to_datetime``, the columns ``Hour``, ``DayOfWeek`` and
        ``Month`` are appended, and the timestamp column itself is dropped.
        When it is absent no time features are added.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with missing values filled by each column's mode
        and, if applicable, the time features in place of ``datetime_col``.

    Raises
    ------
    Whatever ``pd.to_datetime`` raises if ``datetime_col`` holds values that
    cannot be parsed as timestamps.
    """
    column_modes = dataframe.mode()
    if column_modes.empty:
        # mode() yields no rows for an empty (or entirely-NaN) frame, so there
        # is nothing to impute with; indexing row 0 would raise IndexError.
        dataframe = dataframe.copy()
    else:
        # Row 0 of mode() holds the most frequent value of every column.
        dataframe = dataframe.fillna(column_modes.iloc[0])

    if datetime_col in dataframe.columns:
        timestamps = pd.to_datetime(dataframe[datetime_col])
        dataframe = dataframe.assign(
            Hour=timestamps.dt.hour,
            DayOfWeek=timestamps.dt.dayofweek,
            Month=timestamps.dt.month,
        ).drop(columns=[datetime_col])  # raw timestamp is no longer needed

    return dataframe

# Apply preprocessing (mode imputation + optional time features).
# NOTE(review): the printed output below lists a 'Date' column rather than
# 'Date_Time', so the time-feature branch of preprocess_data appears to be
# skipped for this dataset -- confirm the intended column name.
df_processed = preprocess_data(df)

# Keep a copy of the original data so predictions can be attached to the raw rows
df_original = df.copy()

# Build the feature matrix in one pass, without in-place mutation:
#   1. keep numeric columns only (drops IP addresses, textual data, ...)
#   2. turn +/-inf into NaN so they are handled like any other missing value
#   3. drop every row that still contains a NaN
df_processed = (
    df_processed
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Index labels of the rows that SURVIVED cleaning; used below to align the
# predictions with df_original.
valid_rows = df_processed.index

# Fail early with a readable message instead of an opaque scaler error.
if df_processed.empty:
    raise ValueError(
        "No usable numeric rows remain after cleaning; cannot fit the anomaly detector."
    )

# Standardize the data (important for anomaly detection algorithms)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_processed)

# Define the model (Isolation Forest); contamination is the expected fraction
# of outliers, random_state pins the result for reproducibility.
model = IsolationForest(contamination=0.05, random_state=42)

# Fit the model
model.fit(X_scaled)

# Predict anomalies (1: normal, -1: anomaly)
y_pred = model.predict(X_scaled)

# Map the predictions back onto the original dataframe. Rows that were dropped
# during cleaning receive no prediction and stay NaN in 'Anomaly'.
df_original.loc[valid_rows, 'Anomaly'] = y_pred

# Display the rows flagged as anomalies
print("Anomalies Detected:\n")
print(df_original[df_original['Anomaly'] == -1])

# Save the model and scaler
joblib.dump(model, 'isolation_forest_model.joblib')
joblib.dump(scaler, 'scaler1.joblib')

# Save the data with the anomaly column
df_original.to_csv('processed_data_with_anomalies.csv', index=False)

print("Anomaly detection completed and saved.")


Anomalies Detected:

                      Date    Source_IP Destination_IP  Source_Port  \
33     2018-01-02 09:00:00   172.16.0.1        8.8.8.8         1554   
111    2018-01-05 15:00:00  192.168.1.1        8.8.8.8        28101   
141    2018-01-06 21:00:00  192.168.1.1        4.4.4.4        48144   
153    2018-01-07 09:00:00  192.168.1.1        4.4.4.4        54919   
199    2018-01-09 07:00:00     10.0.0.1        1.1.1.1        23974   
...                    ...          ...            ...          ...   
54630  2024-03-26 06:00:00  192.168.1.1        8.8.8.8        48873   
54694  2024-03-28 22:00:00  192.168.1.1        8.8.8.8         8685   
54718  2024-03-29 22:00:00   172.16.0.1        8.8.8.8        11816   
54732  2024-03-30 12:00:00  192.168.1.1        8.8.8.8        13217   
54767  2024-03-31 23:00:00  192.168.1.1        4.4.4.4         1277   

       Destination_Port Protocol_Type  Flow_Duration  Packet_Size  \
33                 8080           TCP             67     