In [1]:
from random import randint
import numpy as np
from math import inf
from loaddata import *
import time

In [2]:
# Load the labeled netflow capture and preview its first rows.
netflow_path = 'datasets/capture20110811.pcap.netflow.labeled'
df = load_data(netflow_path)
df.head()

Unnamed: 0,Date_flow,start,Durat,Prot,Flags,Tos,Packets,Bytes,Flows,Label,src_ip,src_port,dst_ip,dst_port
0,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,135,1,Background,89.31.8.11,23929,147.32.84.229,13363
1,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,276,1,Background,147.32.84.229,13363,89.31.8.11,23929
2,2011-08-11,10:10:00.006,0.0,UDP,INT,0,1,62,1,Background,208.88.186.6,34042,147.32.84.229,13363
3,2011-08-11,10:10:00.008,0.0,UDP,INT,0,1,78,1,Background,92.118.218.77,55246,147.32.84.229,13363
4,2011-08-11,10:10:00.009,0.0,UDP,INT,0,1,72,1,Background,182.185.139.181,10223,147.32.84.229,13363


In [3]:
class CountMinSketch():
    """Count-Min sketch: a fixed-size table that over-estimates item frequencies.

    The sketch keeps ``d`` rows of ``buckets`` integer counters. Every row has
    its own hash function; adding an item increments one counter per row and
    the estimated frequency is the minimum of those counters. The estimate is
    never lower than the true count, only (possibly) higher due to collisions.

    Row hashes are of the form ``((a * hash(item) + b) mod p) mod buckets``
    with a random ``(a, b)`` pair per row. Unlike XOR-ing the hash with a
    random mask, this keeps the rows' collision patterns independent even when
    ``buckets`` is a power of two (with an XOR mask, two items that collide in
    one row then collide in every row, so extra rows add nothing).
    """

    # Large Mersenne prime (2**61 - 1) used as the modulus of the row hashes.
    _PRIME = (1 << 61) - 1

    # Initialization
    def __init__(self, buckets, d, seed=None):
        """Create an empty sketch.

        Args:
            buckets: number of counters per row (table width), must be >= 1.
            d: number of rows / hash functions (table depth), must be >= 1.
            seed: optional seed for the row hash parameters. When None the
                parameters are drawn from the global ``random`` state, so a
                prior ``random.seed(...)`` call still controls them.

        Raises:
            ValueError: if ``buckets`` or ``d`` is smaller than 1.
        """
        if buckets < 1 or d < 1:
            raise ValueError(
                "buckets and d must both be >= 1 (got buckets=%r, d=%r)"
                % (buckets, d))

        # Local import: the notebook only imports `randint` from `random`.
        import random
        rng = random if seed is None else random.Random(seed)

        self.d = d
        self.buckets = buckets
        # One (a, b) pair per row; a != 0 so the hash actually depends on the item
        self.hash_params = [
            (rng.randrange(1, self._PRIME), rng.randrange(self._PRIME))
            for _ in range(d)
        ]
        # table of size (d, buckets); integer counters, not floats
        self.table = np.zeros(shape=(d, buckets), dtype=np.int64)
        # Row indices 0..d-1, paired with the per-row columns when indexing
        self._rows = np.arange(d)

    # Column hit by `item` in each row
    def _columns(self, item):
        """Return the list of ``d`` column indices for ``item`` (one per row)."""
        hash_value = hash(item)
        prime = self._PRIME
        # Python's % is non-negative for a positive modulus, so negative
        # hash values are handled without special casing.
        return [((a * hash_value + b) % prime) % self.buckets
                for a, b in self.hash_params]

    # Add item into sketch
    def add_item(self, item):
        """Record one occurrence of ``item`` (any hashable object)."""
        # Each (row, column) pair is unique, so fancy-index += hits each once.
        self.table[self._rows, self._columns(item)] += 1

    # Check how many times item in sketch
    def check_item(self, item):
        """Return the estimated number of times ``item`` was added.

        The result is an ``int`` that is >= the true count.
        """
        return int(self.table[self._rows, self._columns(item)].min())

In [6]:
def run_sketch(seq, buckets, d, top_n=10):
    """Build a Count-Min sketch over ``seq`` and print its most frequent items.

    Args:
        seq: iterable of hashable items (re-iterable: it is traversed once to
            fill the sketch and once more to collect the distinct items).
        buckets: number of counters per sketch row.
        d: number of sketch rows / hash functions.
        top_n: how many of the highest-estimate items to print (default 10).
            Fewer are printed when ``seq`` has fewer distinct items.

    Prints a header, the time spent filling the sketch, and one line per
    reported item with its estimated count. Returns None.
    """
    print("---------- SKETCH with bins = %d d = %d ----------" % (buckets, d))

    # Create sketch
    cms = CountMinSketch(buckets, d)

    # Fill the sketch; perf_counter is monotonic, so the elapsed time cannot
    # be distorted by system clock adjustments.
    start_time = time.perf_counter()
    for ip in seq:
        cms.add_item(ip)
    print("Take %s seconds to build sketch" % (time.perf_counter() - start_time))

    # Query the estimated occurrence of every distinct item
    ips = list(set(seq))
    count = np.array([cms.check_item(ip) for ip in ips], dtype=float)

    # argsort is ascending: reverse it for highest-first, then cap at top_n.
    # Slicing (instead of indexing 10 fixed positions) avoids an IndexError
    # when there are fewer distinct items than requested.
    top_idx = np.argsort(count)[::-1][:max(top_n, 0)]
    for idx in top_idx:
        print("IP: %s, count: %d" % (ips[idx], count[idx]))

In [7]:
# Sequence fed to the sketches, obtained from load_ip_sequence for host_ip.
host_ip = '147.32.84.165'
seq = load_ip_sequence(df, host_ip)

# Every (bins, d) combination to try, ordered by bins first and d second.
parameters = [(bins, d) for bins in [20, 50, 100, 200, 500] for d in [3, 5, 8]]

# Build and report one sketch per combination.
for bins, d in parameters:
    run_sketch(seq, bins, d)

---------- SKETCH with bins = 20 d = 3 ----------
Take 0.5180401802062988 seconds to build sketch
IP: 158.35.238.238, count: 7905
IP: 193.23.181.44, count: 7905
IP: 168.61.70.66, count: 7711
IP: 194.106.220.35, count: 7711
IP: 184.82.155.107, count: 7711
IP: 68.202.89.226, count: 7711
IP: 216.178.72.158, count: 7711
IP: 147.32.96.45, count: 6076
IP: 46.98.83.57, count: 6076
IP: 154.32.105.132, count: 6076
---------- SKETCH with bins = 20 d = 5 ----------
Take 0.7513084411621094 seconds to build sketch
IP: 193.23.181.44, count: 7289
IP: 174.37.196.55, count: 5898
IP: 184.191.34.121, count: 5009
IP: 112.205.111.114, count: 5009
IP: 196.0.35.154, count: 4515
IP: 46.4.36.120, count: 4515
IP: 68.67.185.209, count: 4515
IP: 168.95.6.53, count: 4515
IP: 157.205.238.165, count: 4515
IP: 85.119.248.7, count: 4515
---------- SKETCH with bins = 20 d = 8 ----------
Take 1.1169641017913818 seconds to build sketch
IP: 193.23.181.44, count: 6986
IP: 216.178.72.158, count: 4720
IP: 155.212.33.146, cou