## Isolation Forest for Anomaly Detection
**Objective**: Understand and apply the Isolation Forest algorithm to identify anomalies in datasets.

### Task: Anomaly Detection in Network Traffic
**Steps**:
1. Extract Features from Dataset:
    - Load `network_traffic.csv`.
2. Isolation Forest Model
3. Display Anomalies

In [1]:
# Task: Anomaly Detection in Network Traffic
# Steps:
# 1. Extract Features from Dataset:
#    - Load network_traffic.csv
# 2. Isolation Forest Model
# 3. Display Anomalies

# 1. Extract Features from Dataset
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Candidate columns for the Isolation Forest model.
# You might need to adjust these based on your dataset's characteristics.
CANDIDATE_FEATURES = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent']

# Colour used for each anomaly label in every plot below.
ANOMALY_PALETTE = {'Normal': 'blue', 'Anomaly': 'red'}


def select_numeric_features(frame, candidates):
    """Return the candidates that are numeric columns of *frame*.

    Args:
        frame: DataFrame whose columns are inspected.
        candidates: Column names to consider, in order of preference.

    Returns:
        A list with the subset of ``candidates`` that exist in ``frame`` and
        have a numeric dtype, in the same order as ``candidates``.
    """
    numeric_columns = set(frame.select_dtypes(include=np.number).columns)
    return [name for name in candidates if name in numeric_columns]


try:
    # 1. Extract Features from Dataset
    df = pd.read_csv('network_traffic.csv')
    print("Network traffic dataset loaded successfully:")
    print(df.head())

    # Identify potential numerical features for anomaly detection
    numerical_features = df.select_dtypes(include=np.number).columns.tolist()
    print("\nPotential numerical features:", numerical_features)

    # Keep only candidates that exist AND are numeric: the model cannot be
    # fitted on string/object columns.
    selected_features = select_numeric_features(df, CANDIDATE_FEATURES)

    # Tell the user about candidates that are present but unusable, so a
    # wrongly-typed column is not silently ignored.
    skipped_features = [
        name for name in CANDIDATE_FEATURES
        if name in df.columns and name not in selected_features
    ]
    if skipped_features:
        print("\nSkipping non-numeric candidate features:", skipped_features)

    if not selected_features:
        print("\nNo suitable numerical features found for anomaly detection. Please inspect the dataset.")
    else:
        print("\nSelected features for Isolation Forest:", selected_features)

        # Avoid feeding missing values to the estimator: drop incomplete rows
        # up front and report how many were removed. The frame is left
        # untouched when nothing is missing.
        incomplete_rows = df[selected_features].isna().any(axis=1)
        if incomplete_rows.any():
            print(f"\nDropping {int(incomplete_rows.sum())} rows with missing values in the selected features.")
            df = df.loc[~incomplete_rows].copy()

        X = df[selected_features]

        # 2. Isolation Forest Model
        # fit_predict labels each row 1 (inlier) or -1 (outlier);
        # random_state is fixed so repeated runs give the same labels.
        model = IsolationForest(contamination='auto', random_state=42)
        df['anomaly'] = model.fit_predict(X)
        df['anomaly_label'] = df['anomaly'].map({1: 'Normal', -1: 'Anomaly'})

        print("\nAnomaly counts:")
        print(df['anomaly_label'].value_counts())
        print("\nAnomaly Data Points:")
        print(df[df['anomaly'] == -1].head())

        # 3. Display Anomalies
        # At least one feature is guaranteed here, so only two cases remain:
        # a pair plot for several features, a stacked histogram for one.
        if len(selected_features) >= 2:
            sns.pairplot(df[selected_features + ['anomaly_label']], hue='anomaly_label', palette=ANOMALY_PALETTE)
            plt.suptitle('Anomaly Detection in Network Traffic', y=1.02)
            plt.show()
        else:
            only_feature = selected_features[0]
            plt.figure(figsize=(10, 6))
            sns.histplot(data=df, x=only_feature, hue='anomaly_label', palette=ANOMALY_PALETTE, multiple="stack")
            plt.title(f'Anomaly Distribution for {only_feature}')
            plt.xlabel(only_feature)
            plt.ylabel('Frequency')
            plt.show()

except FileNotFoundError:
    print("Error: The file 'network_traffic.csv' was not found. Please make sure the file is in the correct directory.")
except Exception as e:
    # Top-level notebook boundary: report the failure instead of a traceback.
    print(f"An error occurred: {e}")

Error: The file 'network_traffic.csv' was not found. Please make sure the file is in the correct directory.
