In [2]:
import pandas as pd

# Read the Kaggle network-traffic capture (update path accordingly)
df = pd.read_csv(
    "/kaggle/input/network-traffic-data-for-intrusion-detection/network_traffic_data.csv"
)

# Keep only the source and destination address columns
df_filtered = df.loc[:, ['SourceIP', 'DestinationIP']]

# Plain-text preview of the first rows
print(df_filtered.head())

        SourceIP  DestinationIP
0  192.168.1.239  192.168.1.234
1  192.168.1.176   192.168.1.82
2  192.168.1.120  192.168.1.113
3  192.168.1.212  192.168.1.140
4   192.168.1.17  192.168.1.223


In [3]:
from collections import deque

window_size = 100  # Example: keep the 100 most recent source IPs

# A bounded deque drops its oldest entry once full, so building it straight
# from the column leaves the last `window_size` source IPs (fewer if the
# dataset is shorter) -- the same content the element-by-element append gave.
# NOTE(review): no estimate is computed each time the window fills; only this
# final window is analysed by the cells that iterate over `ip_stream`.
ip_stream = deque(df_filtered["SourceIP"], maxlen=window_size)


In [4]:
import hashlib
import numpy as np

class FlajoletMartin:
    """Flajolet-Martin style estimator of the number of distinct stream items.

    Each of ``num_hashes`` seeded hash functions remembers the largest number
    of trailing zero bits observed in the hash of any processed item. The
    estimate is 2 raised to the mean of those maxima.

    Parameters
    ----------
    num_hashes : int, default 4
        Number of seeded hash functions (seeds ``0 .. num_hashes - 1``).
        Must be at least 1.
    hash_name : str, default "md5"
        Name of a fixed-length ``hashlib`` algorithm (e.g. ``"md5"``,
        ``"sha256"``) used to hash items.

    Raises
    ------
    ValueError
        If ``num_hashes`` is smaller than 1, or ``hash_name`` is unknown to
        ``hashlib`` or names a variable-length digest.
    """

    def __init__(self, num_hashes=4, hash_name="md5"):
        if num_hashes < 1:
            # With zero hash functions the mean of the maxima would be NaN.
            raise ValueError("num_hashes must be at least 1")

        # Probe the algorithm once: fails early on unknown names and tells us
        # the digest width needed for the all-zero-hash edge case.
        probe = hashlib.new(hash_name)
        if probe.digest_size == 0:
            raise ValueError(
                f"hash algorithm {hash_name!r} has no fixed digest size"
            )

        self.num_hashes = num_hashes
        self.hash_name = hash_name
        self._hash_bits = probe.digest_size * 8
        # max_zeros[i] = most trailing zero bits seen so far under seed i
        self.max_zeros = np.zeros(num_hashes, dtype=int)

    @staticmethod
    def _count_trailing_zeros(value, bit_width):
        """Return the number of trailing zero bits of a non-negative integer.

        A value of 0 has no set bit, so every one of its ``bit_width`` bits
        counts as a trailing zero.
        """
        if value == 0:
            return bit_width
        # value & -value isolates the lowest set bit; its index is the count.
        return (value & -value).bit_length() - 1

    def hash_function(self, ip, seed):
        """Hash ``ip`` under ``seed`` and return the hash's trailing-zero count.

        ``ip`` is converted with ``str()`` first, so non-string items (for
        example integers) are accepted instead of raising ``TypeError``.
        """
        payload = (str(ip) + str(seed)).encode()
        digest = hashlib.new(self.hash_name, payload).digest()
        hash_val = int.from_bytes(digest, "big")
        return self._count_trailing_zeros(hash_val, self._hash_bits)

    def process_ip(self, ip):
        """Update the per-seed trailing-zero maxima with one stream item."""
        for i in range(self.num_hashes):
            zeros = self.hash_function(ip, i)
            if zeros > self.max_zeros[i]:
                self.max_zeros[i] = zeros

    def estimate_count(self):
        """Return the distinct-count estimate ``2 ** mean(max_zeros)``.

        No bias-correction constant is applied to the result.
        """
        return 2 ** (np.mean(self.max_zeros))  # FM estimate

# Initialize FM algorithm with its defaults and feed it every IP currently
# held in the window
fm = FlajoletMartin()
for ip in ip_stream:
    fm.process_ip(ip)

# Read the estimate once; the threshold and accuracy cells reuse this value
distinct_count_estimate = fm.estimate_count()
print(f"Estimated Distinct IPs: {distinct_count_estimate}")

Estimated Distinct IPs: 53.81737057623773


In [5]:
# Alert level for the estimated number of distinct source IPs.
# NOTE(review): ip_stream is capped at window_size (100) entries, so the true
# distinct count of a window cannot reach 500 -- confirm the intended value.
THRESHOLD = 500  # Adjust based on dataset analysis

# Report only when the estimate exceeds the threshold; otherwise print nothing
if distinct_count_estimate > THRESHOLD:
    print("Potential Attack Detected!")

In [6]:
# Exact number of distinct IPs in the window, used as ground truth
true_distinct_count = len(set(ip_stream))

# Relative error of the estimate, then expressed as a percentage "accuracy"
relative_error = abs(true_distinct_count - distinct_count_estimate) / true_distinct_count
accuracy = (1 - relative_error) * 100

print(f"Accuracy of Distinct Count: {accuracy:.2f}%")

Accuracy of Distinct Count: 68.12%


In [7]:
# Experiment with SHA-256 instead of MD5
def hash_function_sha256(ip, seed):
    """Return the trailing-zero bit count of SHA-256(str(ip) + str(seed)).

    Parameters
    ----------
    ip : object
        Stream item; converted with ``str()`` so non-string values are
        accepted instead of raising ``TypeError``.
    seed : object
        Value whose ``str()`` form is appended to the item before hashing.

    Returns
    -------
    int
        Number of trailing zero bits in the digest read as a big-endian
        integer; an all-zero digest counts as the full digest width.
    """
    digest = hashlib.sha256((str(ip) + str(seed)).encode()).digest()
    hash_val = int.from_bytes(digest, "big")
    if hash_val == 0:
        # No set bit at all: every bit of the digest is a trailing zero.
        return len(digest) * 8
    # hash_val & -hash_val isolates the lowest set bit; its index is the count.
    return (hash_val & -hash_val).bit_length() - 1