In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import re
import tkinter as tk
from tkinter import messagebox
import os
import json

# File to store detected anomalies
STORED_ANOMALIES_FILE = "stored_anomalies.json"

# Load stored anomalies if available
def load_stored_anomalies():
    if os.path.exists(STORED_ANOMALIES_FILE):
        with open(STORED_ANOMALIES_FILE, "r") as file:
            return json.load(file)
    return []

# Save new anomalies to the file
def save_stored_anomalies(stored_anomalies):
    with open(STORED_ANOMALIES_FILE, "w") as file:
        json.dump(stored_anomalies, file, indent=4)

# Load initial anomalies
stored_anomalies = load_stored_anomalies()

# Define signature patterns for detection
SIGNATURES = [
    r'(\bftp\b.*\broot\b)',           # Example: FTP root attempt
    r'(\bhttp\b.*\battack\b)',        # Example: HTTP request with suspicious content
    r'(\bSELECT\b.*\bFROM\b)',        # Common SQL data extraction pattern
    r'(\bDROP\b.*\bTABLE\b)',         # SQL Injection pattern for table deletion
    r'(\b<|>|\balert\b|\bscript\b)'  # Potential XSS pattern with HTML/JavaScript
]

# Load the KDD CUP 1999 dataset (update path if necessary)
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

dataset_path = 'kddcup.data_10_percent_corrected.csv'  # Replace with the correct path
data = pd.read_csv(dataset_path, names=column_names, header=None)

# Step 1: Signature-Based Detection
def signature_based_detection(data, signatures):
    detected_attacks = []
    for index, row in data.iterrows():
        payload = row['service']  # Update this column based on your data's structure
        for signature in signatures:
            if re.search(signature, payload, re.IGNORECASE):
                detected_attacks.append({
                    'src_ip': f"192.168.1.{index % 255}",  # Placeholder IP address
                    'dst_ip': f"10.0.0.{index % 255}",     # Placeholder IP address
                    'payload': payload,
                    'attack_type': 'Signature-based attack'
                })
    return detected_attacks

# Step 2: Anomaly-Based Detection
def anomaly_based_detection(data):
    # Select numerical features for anomaly detection
    numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
    
    # Ensure numeric conversion and drop invalid rows
    for col in numeric_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    data = data.dropna(subset=numeric_columns)
    
    features = data[numeric_columns].fillna(0)
    
    # Train the Isolation Forest model
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_forest.fit(features)
    
    # Predict anomalies
    predictions = isolation_forest.predict(features)

    detected_anomalies = []
    for i, prediction in enumerate(predictions):
        if prediction == -1:
            detected_anomalies.append({
                'src_ip': f"192.168.1.{i % 255}",  # Placeholder IP address
                'dst_ip': f"10.0.0.{i % 255}",     # Placeholder IP address
                'payload': 'Anomalous data point',
                'attack_type': 'Anomaly-based attack'
            })
    return detected_anomalies

# Step 3: Popup Alert Function
def show_popup(alert_message):
    root = tk.Tk()
    root.withdraw()  # Hide the main tkinter window
    messagebox.showwarning("Threat Detected", alert_message)
    root.destroy()  # Properly destroy the Tkinter instance after showing

# Step 4: Combine detection methods and show alerts
def detect_intrusions(data, signatures):
    global stored_anomalies
    signature_attacks = signature_based_detection(data, signatures)
    anomaly_attacks = anomaly_based_detection(data)

    # Check if anomaly matches stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] in stored_anomalies:
            anomaly['attack_type'] = 'Signature-based attack (from stored anomalies)'

    # Add new anomalies to stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] not in stored_anomalies:
            stored_anomalies.append(anomaly['payload'])

    # Limit the total alerts to 10
    all_attacks = (signature_attacks + anomaly_attacks)[:10]

    # Show popups for the limited attacks
    for attack in all_attacks:
        alert_message = (
            f"Attack Detected!\n"
            f"Source IP: {attack['src_ip']}\n"
            f"Destination IP: {attack['dst_ip']}\n"
            f"Detection Type: {attack['attack_type']}\n"
            f"Payload: {attack['payload']}"
        )
        show_popup(alert_message)

    return all_attacks

# Step 5: Run the detection process
detected_intrusions = detect_intrusions(data, SIGNATURES)

# Save the updated stored anomalies
save_stored_anomalies(stored_anomalies)

# Print detected intrusions to the console for verification (optional)
if detected_intrusions:
    print("Detected Intrusions:")
    for intrusion in detected_intrusions:
        print(
            f"Source IP: {intrusion['src_ip']}, "
            f"Destination IP: {intrusion['dst_ip']}, "
            f"Detection Type: {intrusion['attack_type']}, "
            f"Payload: {intrusion['payload']}"
        )
else:
    print("No intrusions detected.")


  data = pd.read_csv(dataset_path, names=column_names, header=None)


Detected Intrusions:
Source IP: 192.168.1.50, Destination IP: 10.0.0.50, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP: 192.168.1.96, Destination IP: 10.0.0.96, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP: 192.168.1.117, Destination IP: 10.0.0.117, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP: 192.168.1.119, Destination IP: 10.0.0.119, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP: 192.168.1.127, Destination IP: 10.0.0.127, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP: 192.168.1.144, Destination IP: 10.0.0.144, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP: 192.168.1.218, Destination IP: 10.0.0.218, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP: 192.168.1.36, Destination IP: 10.0.0.36, Detection Type: Anomaly-based attack, Payload: Anomalous data point
Source IP

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import re
import os
import json
import io
import ipywidgets as widgets
from IPython.display import display, clear_output

# File to store detected anomalies
STORED_ANOMALIES_FILE = "stored_anomalies.json"

# Load stored anomalies if available
def load_stored_anomalies():
    if os.path.exists(STORED_ANOMALIES_FILE):
        with open(STORED_ANOMALIES_FILE, "r") as file:
            return json.load(file)
    return []

# Save new anomalies to the file
def save_stored_anomalies(stored_anomalies):
    with open(STORED_ANOMALIES_FILE, "w") as file:
        json.dump(stored_anomalies, file, indent=4)

# Load initial anomalies
stored_anomalies = load_stored_anomalies()

# Define signature patterns for detection
SIGNATURES = [
    r'(\bftp\b.*\broot\b)',           # Example: FTP root attempt
    r'(\bhttp\b.*\battack\b)',        # Example: HTTP request with suspicious content
    r'(\bSELECT\b.*\bFROM\b)',        # Common SQL data extraction pattern
    r'(\bDROP\b.*\bTABLE\b)',         # SQL Injection pattern for table deletion
    r'(\b<|>|\balert\b|\bscript\b)'  # Potential XSS pattern with HTML/JavaScript
]

# Step 1: Signature-Based Detection
def signature_based_detection(data, signatures):
    detected_attacks = []
    for index, row in data.iterrows():
        payload = row['service']  # Update this column based on your data's structure
        for signature in signatures:
            if re.search(signature, payload, re.IGNORECASE):
                detected_attacks.append({
                    'src_ip': f"192.168.1.{index % 255}",  # Placeholder IP address
                    'dst_ip': f"10.0.0.{index % 255}",     # Placeholder IP address
                    'payload': payload,
                    'attack_type': 'Signature-based attack'
                })
    return detected_attacks

# Step 2: Anomaly-Based Detection
def anomaly_based_detection(data):
    # Select numerical features for anomaly detection
    numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
    
    # Ensure numeric conversion and drop invalid rows
    for col in numeric_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    data = data.dropna(subset=numeric_columns)
    
    features = data[numeric_columns].fillna(0)
    
    # Train the Isolation Forest model
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_forest.fit(features)
    
    # Predict anomalies
    predictions = isolation_forest.predict(features)

    detected_anomalies = []
    for i, prediction in enumerate(predictions):
        if prediction == -1:
            detected_anomalies.append({
                'src_ip': f"192.168.1.{i % 255}",  # Placeholder IP address
                'dst_ip': f"10.0.0.{i % 255}",     # Placeholder IP address
                'payload': 'Anomalous data point',
                'attack_type': 'Anomaly-based attack'
            })
    return detected_anomalies

# Step 3: Combine detection methods and display alerts
def detect_intrusions(data, signatures):
    global stored_anomalies
    signature_attacks = signature_based_detection(data, signatures)
    anomaly_attacks = anomaly_based_detection(data)

    # Check if anomaly matches stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] in stored_anomalies:
            anomaly['attack_type'] = 'Signature-based attack (from stored anomalies)'

    # Add new anomalies to stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] not in stored_anomalies:
            stored_anomalies.append(anomaly['payload'])

    # Limit the total alerts to 10
    all_attacks = (signature_attacks + anomaly_attacks)[:10]

    # Save anomalies
    save_stored_anomalies(stored_anomalies)

    return all_attacks

# Step 4: User Interface using ipywidgets
def start_interface():
    # Create widgets
    file_upload = widgets.FileUpload(accept='.csv', multiple=False)
    run_button = widgets.Button(description="Run Detection")
    output_area = widgets.Output()
    
    # Define button click handler
    def on_run_button_clicked(b):
        with output_area:
            clear_output()
            if file_upload.value:
                # Load the uploaded file
                uploaded_file = list(file_upload.value.values())[0]
                data = pd.read_csv(io.BytesIO(uploaded_file['content']), header=None)
                data.columns = [
                    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
                    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
                    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
                    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
                    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
                    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
                    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
                    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
                    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
                ]

                # Run intrusion detection
                intrusions = detect_intrusions(data, SIGNATURES)
                
                # Display results
                if intrusions:
                    print("Detected Intrusions:")
                    for intrusion in intrusions:
                        print(
                            f"Source IP: {intrusion['src_ip']}, "
                            f"Destination IP: {intrusion['dst_ip']}, "
                            f"Detection Type: {intrusion['attack_type']}, "
                            f"Payload: {intrusion['payload']}"
                        )
                else:
                    print("No intrusions detected.")
            else:
                print("Please upload a dataset file to run detection.")

    # Attach the handler to the button
    run_button.on_click(on_run_button_clicked)

    # Display widgets
    display(widgets.VBox([widgets.Label("Upload your dataset (CSV):"), file_upload, run_button, output_area]))

# Start the interface
start_interface()


VBox(children=(Label(value='Upload your dataset (CSV):'), FileUpload(value={}, accept='.csv', description='Upl…

In [10]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.41.0-py2.py3-none-any.whl (23.4 MB)
     ---------------------------------------- 23.4/23.4 MB 3.6 MB/s eta 0:00:00
Collecting cachetools<6,>=4.0
  Downloading cachetools-5.5.0-py3-none-any.whl (9.5 kB)
Collecting pyarrow>=7.0
  Downloading pyarrow-18.1.0-cp39-cp39-win_amd64.whl (25.3 MB)
     ---------------------------------------- 25.3/25.3 MB 3.7 MB/s eta 0:00:00
Collecting altair<6,>=4.0
  Downloading altair-5.5.0-py3-none-any.whl (731 kB)
     -------------------------------------- 731.2/731.2 kB 6.6 MB/s eta 0:00:00
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
     ------------------------------------- 207.3/207.3 kB 13.1 MB/s eta 0:00:00
Collecting tenacity<10,>=8.1.0
  Downloading tenacity-9.0.0-py3-none-any.whl (28 kB)
Collecting pydeck<1,>=0.8.0b4
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
     ---------------------------------------- 6.9/6.9 MB 4.0 MB/s eta 0:00:

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import re
import os
import json
import streamlit as st
from io import StringIO

# File to store detected anomalies
STORED_ANOMALIES_FILE = "stored_anomalies.json"

# Load stored anomalies if available
def load_stored_anomalies():
    if os.path.exists(STORED_ANOMALIES_FILE):
        with open(STORED_ANOMALIES_FILE, "r") as file:
            return json.load(file)
    return []

# Save new anomalies to the file
def save_stored_anomalies(stored_anomalies):
    with open(STORED_ANOMALIES_FILE, "w") as file:
        json.dump(stored_anomalies, file, indent=4)

# Load initial anomalies
stored_anomalies = load_stored_anomalies()

# Define signature patterns for detection
SIGNATURES = [
    r'(\bftp\b.*\broot\b)',           # Example: FTP root attempt
    r'(\bhttp\b.*\battack\b)',        # Example: HTTP request with suspicious content
    r'(\bSELECT\b.*\bFROM\b)',        # Common SQL data extraction pattern
    r'(\bDROP\b.*\bTABLE\b)',         # SQL Injection pattern for table deletion
    r'(\b<|>|\balert\b|\bscript\b)'   # Potential XSS pattern with HTML/JavaScript
]

# Load the KDD CUP 1999 dataset (replace with correct path)
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

# Step 1: Signature-Based Detection
def signature_based_detection(data, signatures):
    detected_attacks = []
    for index, row in data.iterrows():
        payload = row['service']  # Update this column based on your data's structure
        for signature in signatures:
            if re.search(signature, payload, re.IGNORECASE):
                detected_attacks.append({
                    'src_ip': f"192.168.1.{index % 255}",  # Placeholder IP address
                    'dst_ip': f"10.0.0.{index % 255}",     # Placeholder IP address
                    'payload': payload,
                    'attack_type': 'Signature-based attack'
                })
    return detected_attacks

# Step 2: Anomaly-Based Detection
def anomaly_based_detection(data):
    # Select numerical features for anomaly detection
    numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
    
    # Ensure numeric conversion and drop invalid rows
    for col in numeric_columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    data = data.dropna(subset=numeric_columns)
    
    features = data[numeric_columns].fillna(0)
    
    # Train the Isolation Forest model
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_forest.fit(features)
    
    # Predict anomalies
    predictions = isolation_forest.predict(features)

    detected_anomalies = []
    for i, prediction in enumerate(predictions):
        if prediction == -1:
            detected_anomalies.append({
                'src_ip': f"192.168.1.{i % 255}",  # Placeholder IP address
                'dst_ip': f"10.0.0.{i % 255}",     # Placeholder IP address
                'payload': 'Anomalous data point',
                'attack_type': 'Anomaly-based attack'
            })
    return detected_anomalies

# Step 4: Combine detection methods
def detect_intrusions(data, signatures):
    global stored_anomalies
    signature_attacks = signature_based_detection(data, signatures)
    anomaly_attacks = anomaly_based_detection(data)

    # Check if anomaly matches stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] in stored_anomalies:
            anomaly['attack_type'] = 'Signature-based attack (from stored anomalies)'

    # Add new anomalies to stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] not in stored_anomalies:
            stored_anomalies.append(anomaly['payload'])

    # Limit the total alerts to 10
    all_attacks = (signature_attacks + anomaly_attacks)[:10]
    
    return all_attacks

# Streamlit interface
st.title("Intrusion Detection System")

# File uploader for dataset
uploaded_file = st.file_uploader("Upload Dataset (CSV)", type=["csv"])

if uploaded_file is not None:
    # Read and display the dataset
    data = pd.read_csv(uploaded_file)
    st.write("Dataset Loaded:")
    st.dataframe(data.head())

    # Button to run intrusion detection
    if st.button("Detect Intrusions"):
        detected_intrusions = detect_intrusions(data, SIGNATURES)

        # Display the results
        if detected_intrusions:
            st.subheader("Detected Intrusions:")
            for intrusion in detected_intrusions:
                st.write(
                    f"Source IP: {intrusion['src_ip']}, "
                    f"Destination IP: {intrusion['dst_ip']}, "
                    f"Detection Type: {intrusion['attack_type']}, "
                    f"Payload: {intrusion['payload']}"
                )
        else:
            st.write("No intrusions detected.")

        # Save the updated stored anomalies
        save_stored_anomalies(stored_anomalies)
else:
    st.write("Please upload a dataset to start.")


2024-12-13 21:49:21.261 
  command:

    streamlit run C:\Users\Ravi choudary\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [12]:
streamlit run intrusin,pu

SyntaxError: invalid syntax (1090521028.py, line 1)

In [13]:
streamlit run intrusin.py


SyntaxError: invalid syntax (2841047790.py, line 1)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
import re
import tkinter as tk
from tkinter import messagebox
import os
import json

# File to store detected anomalies
STORED_ANOMALIES_FILE = "stored_anomalies.json"

# Load stored anomalies if available
def load_stored_anomalies():
    if os.path.exists(STORED_ANOMALIES_FILE):
        with open(STORED_ANOMALIES_FILE, "r") as file:
            return json.load(file)
    return []

# Save new anomalies to the file
def save_stored_anomalies(stored_anomalies):
    with open(STORED_ANOMALIES_FILE, "w") as file:
        json.dump(stored_anomalies, file, indent=4)

# Load initial anomalies
stored_anomalies = load_stored_anomalies()

# Define signature patterns for detection
SIGNATURES = [
    r'(\bftp\b.*\broot\b)',           # Example: FTP root attempt
    r'(\bhttp\b.*\battack\b)',        # Example: HTTP request with suspicious content
    r'(\bSELECT\b.*\bFROM\b)',        # Common SQL data extraction pattern
    r'(\bDROP\b.*\bTABLE\b)',         # SQL Injection pattern for table deletion
    r'(\b<|>|\balert\b|\bscript\b)'   # Potential XSS pattern with HTML/JavaScript
]

# Load the KDD CUP 1999 dataset (update path if necessary)
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

dataset_path = 'kddcup.data_10_percent_corrected.csv'  # Replace with the correct path
data = pd.read_csv(dataset_path, names=column_names, header=None)

# Step 1: Signature-Based Detection
def signature_based_detection(data, signatures):
    detected_attacks = []
    for index, row in data.iterrows():
        payload = row['service']  # Update this column based on your data's structure
        for signature in signatures:
            if re.search(signature, payload, re.IGNORECASE):
                detected_attacks.append({
                    'src_ip': f"192.168.1.{index % 255}",  # Placeholder IP address
                    'dst_ip': f"10.0.0.{index % 255}",     # Placeholder IP address
                    'payload': payload,
                    'attack_type': 'Signature-based attack'
                })
    return detected_attacks

# Step 2: Anomaly-Based Detection
def anomaly_based_detection(train_data, test_data):
    # Select numerical features for anomaly detection
    numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
    
    # Ensure numeric conversion and drop invalid rows for training and testing data
    for col in numeric_columns:
        train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
        test_data[col] = pd.to_numeric(test_data[col], errors='coerce')
        
    train_data = train_data.dropna(subset=numeric_columns)
    test_data = test_data.dropna(subset=numeric_columns)
    
    # Features for training and testing
    X_train = train_data[numeric_columns].fillna(0)
    X_test = test_data[numeric_columns].fillna(0)
    
    # Train the Isolation Forest model on the training data
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_forest.fit(X_train)
    
    # Predict anomalies on the test data
    predictions = isolation_forest.predict(X_test)

    detected_anomalies = []
    for i, prediction in enumerate(predictions):
        if prediction == -1:  # Anomaly detected
            detected_anomalies.append({
                'src_ip': f"192.168.1.{i % 255}",  # Placeholder IP address
                'dst_ip': f"10.0.0.{i % 255}",     # Placeholder IP address
                'payload': 'Anomalous data point',
                'attack_type': 'Anomaly-based attack'
            })
    return detected_anomalies

# Step 3: Popup Alert Function
def show_popup(alert_message):
    root = tk.Tk()
    root.withdraw()  # Hide the main tkinter window
    messagebox.showwarning("Threat Detected", alert_message)
    root.destroy()  # Properly destroy the Tkinter instance after showing

# Step 4: Combine detection methods and show alerts
def detect_intrusions(data, signatures):
    global stored_anomalies
    signature_attacks = signature_based_detection(data, signatures)

    # Step 5: Split the data into training and testing sets (80% for training, 20% for testing)
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    # Step 6: Anomaly-based detection using the train and test data
    anomaly_attacks = anomaly_based_detection(train_data, test_data)

    # Check if anomaly matches stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] in stored_anomalies:
            anomaly['attack_type'] = 'Signature-based attack (from stored anomalies)'

    # Add new anomalies to stored signatures
    for anomaly in anomaly_attacks:
        if anomaly['payload'] not in stored_anomalies:
            stored_anomalies.append(anomaly['payload'])

    # Limit the total alerts to 10
    all_attacks = (signature_attacks + anomaly_attacks)[:10]

    # Show popups for the limited attacks
    for attack in all_attacks:
        alert_message = (
            f"Attack Detected!\n"
            f"Source IP: {attack['src_ip']}\n"
            f"Destination IP: {attack['dst_ip']}\n"
            f"Detection Type: {attack['attack_type']}\n"
            f"Payload: {attack['payload']}"
        )
        show_popup(alert_message)

    return all_attacks

# Step 7: Run the detection process
detected_intrusions = detect_intrusions(data, SIGNATURES)

# Save the updated stored anomalies
save_stored_anomalies(stored_anomalies)

# Print detected intrusions to the console for verification (optional)
if detected_intrusions:
    print("Detected Intrusions:")
    for intrusion in detected_intrusions:
        print(
            f"Source IP: {intrusion['src_ip']}, "
            f"Destination IP: {intrusion['dst_ip']}, "
            f"Detection Type: {intrusion['attack_type']}, "
            f"Payload: {intrusion['payload']}"
        )
else:
    print("No intrusions detected.")


  data = pd.read_csv(dataset_path, names=column_names, header=None)


In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Load the dataset
dtype = {
    'duration': float,
    'protocol_type': str,
    'service': str,
    'flag': str,
    'src_bytes': float,
    'dst_bytes': float,
    'land': str,  # Example for non-numeric columns
    'wrong_fragment': str,
    # Add other columns with their respective data types
}

column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

dataset_path = 'kddcup.data_10_percent_corrected.csv'  # Replace with actual path
data = pd.read_csv(dataset_path, names=column_names, header=None, dtype=dtype, low_memory=False)

# Step 2: Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Feature Selection (numerical columns for anomaly detection)
numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
X_train = train_data[numeric_columns].fillna(0)
X_test = test_data[numeric_columns].fillna(0)

# Step 4: Initialize and Train the Isolation Forest model
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
isolation_forest.fit(X_train)

# Step 5: Make predictions on the test dataset
predictions = isolation_forest.predict(X_test)

# Convert -1 to 'Anomaly' and 1 to 'Normal' for evaluation purposes
predictions = ['Anomaly' if p == -1 else 'Normal' for p in predictions]

# Step 6: Show the training and testing datasets
print("Training Data (First 5 rows):")
print(X_train.head())

print("\nTesting Data (First 5 rows):")
print(X_test.head())

# Step 7: Evaluation (since the real anomaly labels are not available, this can be simulated)
# Assuming we don't have ground truth labels, you could evaluate with dummy labels or use manual evaluation:
# For demonstration, let's assume anomalies as 1 and normal as 0 (this would normally come from actual labels if available)

# Here, for evaluation purposes, we'll just print the classification report and confusion matrix
# Note: Normally, you would have a ground truth for comparison
print("\nModel Evaluation:")
print(classification_report(predictions, predictions))  # Here, you should replace with actual labels
print(confusion_matrix(predictions, predictions))  # Here, you should replace with actual labels


In [18]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [24]:
import pandas as pd

# Step 1: Load the dataset without specifying dtype initially to inspect the data
dataset_path = 'kddcup.data_10_percent_corrected.csv'  # Replace with actual path
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

# Load dataset without dtype specification
data = pd.read_csv(dataset_path, names=column_names, header=None, low_memory=False)

# Step 2: Inspect the first few rows to check for non-numeric issues
print(data.head())

# Step 3: Check data types and identify any columns that may have been incorrectly inferred as numeric
print(data.dtypes)

# Step 4: Convert numeric columns with non-numeric values to numeric (use errors='coerce' to turn invalid values to NaN)
numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Step 5: Handle missing values by filling NaN values (e.g., with 0 or mean)
data.fillna(0, inplace=True)

# Step 6: Check again after cleaning
print(data.head())

# Now, you can proceed with your train-test split and model training


   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0  duration  protocol_type  service  flag  src_bytes  dst_bytes  land   
1         0            tcp     http    SF        181       5450     0   
2         0            tcp     http    SF        239        486     0   
3         0            tcp     http    SF        235       1337     0   
4         0            tcp     http    SF        219       1337     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0  wrong_fragment  urgent  hot  ...  dst_host_srv_count   
1               0       0    0  ...                   9   
2               0       0    0  ...                  19   
3               0       0    0  ...                  29   
4               0       0    0  ...                  39   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0  dst_host_same_srv_rate  dst_host_diff_srv_rate   
1                     1.0                     0.0   
2                     1.0                     0

In [29]:
# Step 2: Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Feature Selection (numerical columns for anomaly detection)
numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
X_train = train_data[numeric_columns].fillna(0)
X_test = test_data[numeric_columns].fillna(0)

# Step 4: Initialize and Train the Isolation Forest model
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
isolation_forest.fit(X_train)

# Step 5: Make predictions on the test dataset
predictions = isolation_forest.predict(X_test)

# Convert -1 to 'Anomaly' and 1 to 'Normal' for evaluation purposes
predictions = ['Anomaly' if p == -1 else 'Normal' for p in predictions]

# Step 6: Show the training and testing datasets
print("Training Data (First 5 rows):")
print(X_train.head())



Training Data (First 5 rows):
       duration  src_bytes  dst_bytes  count  srv_count
61089       0.0      297.0      327.0   26.0       26.0
40286       0.0      346.0     3694.0    6.0        7.0
50242       0.0      208.0      393.0    8.0       13.0
46990       0.0      148.0      427.0    1.0        2.0
14217       0.0      310.0      336.0    9.0        9.0


In [30]:
data.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,...,logged_in_logged_in,root_shell_0,root_shell_1,root_shell_root_shell,su_attempted_0,su_attempted_su_attempted,is_host_login_0,is_host_login_is_host_login,is_guest_login_0,is_guest_login_is_guest_login
count,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,...,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0
mean,1.26623,2084.757629,4256.075,9.203907,11.6919,0.00445,0.004783,0.085368,0.087672,0.997529,...,1.6e-05,0.999704,0.00028,1.6e-05,0.999984,1.6e-05,0.999984,1.6e-05,0.999984,1.6e-05
std,162.532963,9826.523839,18988.77,13.219673,11.626554,0.05745,0.056281,0.277118,0.276203,0.045344,...,0.003944,0.017188,0.01673,0.003944,0.003944,0.003944,0.003944,0.003944,0.003944,0.003944
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,210.0,473.0,2.0,3.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
50%,0.0,243.0,1502.0,6.0,8.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
75%,0.0,306.0,4361.0,13.0,17.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
max,41065.0,54540.0,3916592.0,511.0,109.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
print("Training Data (First 5 rows):")
print(X_train.head())

Training Data (First 5 rows):
       duration  src_bytes  dst_bytes  count  srv_count
61089       0.0      297.0      327.0   26.0       26.0
40286       0.0      346.0     3694.0    6.0        7.0
50242       0.0      208.0      393.0    8.0       13.0
46990       0.0      148.0      427.0    1.0        2.0
14217       0.0      310.0      336.0    9.0        9.0


In [34]:
data.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,...,logged_in_logged_in,root_shell_0,root_shell_1,root_shell_root_shell,su_attempted_0,su_attempted_su_attempted,is_host_login_0,is_host_login_is_host_login,is_guest_login_0,is_guest_login_is_guest_login
count,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,...,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0,64294.0
mean,1.26623,2084.757629,4256.075,9.203907,11.6919,0.00445,0.004783,0.085368,0.087672,0.997529,...,1.6e-05,0.999704,0.00028,1.6e-05,0.999984,1.6e-05,0.999984,1.6e-05,0.999984,1.6e-05
std,162.532963,9826.523839,18988.77,13.219673,11.626554,0.05745,0.056281,0.277118,0.276203,0.045344,...,0.003944,0.017188,0.01673,0.003944,0.003944,0.003944,0.003944,0.003944,0.003944,0.003944
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,210.0,473.0,2.0,3.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
50%,0.0,243.0,1502.0,6.0,8.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
75%,0.0,306.0,4361.0,13.0,17.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
max,41065.0,54540.0,3916592.0,511.0,109.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,num_failed_logins,num_compromised,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,...,logged_in_logged_in,root_shell_0,root_shell_1,root_shell_root_shell,su_attempted_0,su_attempted_su_attempted,is_host_login_0,is_host_login_is_host_login,is_guest_login_0,is_guest_login_is_guest_login
0,0.0,0.0,0.0,num_failed_logins,num_compromised,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,...,1,0,0,1,0,1,0,1,0,1
1,0.0,181.0,5450.0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
2,0.0,239.0,486.0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
3,0.0,235.0,1337.0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
4,0.0,219.0,1337.0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0


In [38]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

# Step 1: Load the dataset
dtype = {
    'duration': float,
    'protocol_type': str,
    'service': str,
    'flag': str,
    'src_bytes': float,
    'dst_bytes': float,
    'land': str,  
    'wrong_fragment': str,
    'urgent': str, 
    'hot': str,  
    'num_failed_logins': float,  
    'logged_in': str,  
    'num_compromised': float,  
    'root_shell': str,  
    'su_attempted': str,  
    'num_root': float,  
    'num_file_creations': float,  
    'num_shells': str,  
    'num_access_files': float,  
    'num_outbound_cmds': float,  
    'is_host_login': str,  
    'is_guest_login': str,  
    'count': float,  
    'srv_count': float,  
    'serror_rate': float,  
    'srv_serror_rate': float,  
    'rerror_rate': float,  
    'srv_rerror_rate': float,  
    'same_srv_rate': float,  
    'diff_srv_rate': float,  
    'srv_diff_host_rate': float,  
    'dst_host_count': float,  
    'dst_host_srv_count': float,  
    'dst_host_same_srv_rate': float,  
    'dst_host_diff_srv_rate': float,  
    'dst_host_same_src_port_rate': float,  
    'dst_host_srv_diff_host_rate': float,  
    'dst_host_serror_rate': float,  
    'dst_host_srv_serror_rate': float,  
    'dst_host_rerror_rate': float,  
    'dst_host_srv_rerror_rate': float,  
    'label': str  
}

column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]

dataset_path = 'kddcup.data_10_percent_corrected.csv'  # Replace with actual path
data = pd.read_csv(dataset_path, names=column_names, header=None, low_memory=False)

# Step 2: Convert categorical columns to numeric using one-hot encoding
categorical_columns = ['protocol_type', 'service', 'flag', 'land', 'wrong_fragment', 'urgent', 'hot', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login']
data = pd.get_dummies(data, columns=categorical_columns)

# Step 3: Handle missing values or conversion issues
numeric_columns = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')
data.fillna(0, inplace=True)

# Step 4: Split the dataset into features (X) and target (y)
X = data.drop(columns=['label'])
y = data['label']

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Initialize and train the IsolationForest model
model = IsolationForest()
model.fit(X_train)

# Step 7: Make predictions using the model
y_pred = model.predict(X_test)

# Since IsolationForest returns -1 for anomalies and 1 for normal data, we will adjust labels:
y_pred = [1 if pred == -1 else 0 for pred in y_pred]

# Step 8: Evaluate the model performance using confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


ValueError: could not convert string to float: 'num_failed_logins'