In [27]:
from random import randint
import numpy as np
from math import inf
from loaddata import *

In [29]:
# Read the labeled NetFlow capture into `df`. `load_data` is not defined in
# this notebook; presumably it comes from the `loaddata` star import -- confirm.
df = load_data('datasets/capture20110811.pcap.netflow.labeled')
# Show the first rows as a quick check of the parsed columns.
df.head()

Unnamed: 0,Date_flow,start,Durat,Prot,Flags,Tos,Packets,Bytes,Flows,Label,src_ip,src_port,dst_ip,dst_port
0,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,135,1,Background,89.31.8.11,23929,147.32.84.229,13363
1,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,276,1,Background,147.32.84.229,13363,89.31.8.11,23929
2,2011-08-11,10:10:00.006,0.0,UDP,INT,0,1,62,1,Background,208.88.186.6,34042,147.32.84.229,13363
3,2011-08-11,10:10:00.008,0.0,UDP,INT,0,1,78,1,Background,92.118.218.77,55246,147.32.84.229,13363
4,2011-08-11,10:10:00.009,0.0,UDP,INT,0,1,72,1,Background,182.185.139.181,10223,147.32.84.229,13363


In [42]:
class CountMinSketch():
    """Count-Min sketch: approximate occurrence counter for hashable items.

    The sketch keeps a ``d x buckets`` table of integer counters. Every item
    is mapped to one column per row; adding an item increments those ``d``
    counters and querying returns the smallest of them. Because each of an
    item's counters is incremented on every insertion of that item, the
    estimate is never below the true count; collisions can only inflate it.

    Parameters
    ----------
    buckets : int
        Number of columns (counters) per row. Must be >= 1.
    d : int
        Number of rows, i.e. number of hash functions. Must be >= 1.
    seed : int or None, optional
        Seed for drawing the per-row hash parameters. ``None`` (default)
        draws fresh parameters on each construction. Note that ``hash()`` of
        ``str``/``bytes`` is salted per interpreter process unless
        ``PYTHONHASHSEED`` is set, so a seed alone only makes runs repeatable
        within one process for such items.

    Raises
    ------
    ValueError
        If ``buckets`` or ``d`` is smaller than 1.
    """

    # Modulus of the per-row hash family (the Mersenne prime 2**61 - 1).
    _PRIME = (1 << 61) - 1

    # Initialization
    def __init__(self, buckets, d, seed=None):
        if buckets < 1 or d < 1:
            raise ValueError(
                "buckets and d must both be >= 1 (got buckets=%r, d=%r)"
                % (buckets, d)
            )
        self.d = d
        self.buckets = buckets

        # Each row gets its own (a * h + b) mod p mapping instead of XOR-ing
        # a per-row constant into the hash: XOR followed by `% buckets` merely
        # permutes the columns when `buckets` is a power of two, so two items
        # colliding in one row would collide in all of them.
        rng = np.random.RandomState(seed)
        # .tolist() yields plain Python ints so a * h cannot overflow int64.
        self._mult = rng.randint(1, self._PRIME, size=d, dtype=np.int64).tolist()
        self._shift = rng.randint(0, self._PRIME, size=d, dtype=np.int64).tolist()

        # table of shape (d, buckets); integer counters
        self.table = np.zeros(shape=(d, buckets), dtype=np.int64)

    def _columns(self, item):
        """Return the column index of ``item`` in each of the ``d`` rows."""
        base = hash(item)
        return [
            ((a * base + b) % self._PRIME) % self.buckets
            for a, b in zip(self._mult, self._shift)
        ]

    # Add item into sketch
    def add_item(self, item):
        """Record one occurrence of ``item`` (any hashable object)."""
        for row, column in enumerate(self._columns(item)):
            self.table[row, column] += 1

    # Check how many times item in sketch
    def check_item(self, item):
        """Return the estimated number of times ``item`` was added, as an int.

        The estimate is the minimum of the item's counters over all rows.
        """
        return int(min(
            self.table[row, column]
            for row, column in enumerate(self._columns(item))
        ))

In [60]:
def run_sketch(seq, buckets, d, top_n=10):
    """Feed ``seq`` into a fresh CountMinSketch and print its top estimates.

    Parameters
    ----------
    seq : iterable of hashable
        Items to count. It is iterated twice (once to fill the sketch, once
        to collect the distinct items), so it must be re-iterable.
    buckets : int
        Number of columns per sketch row.
    d : int
        Number of sketch rows (hash functions).
    top_n : int, optional
        Maximum number of items to print, highest estimate first
        (default 10). Fewer lines are printed when ``seq`` has fewer
        distinct items.

    Returns
    -------
    None
        Results are printed only.
    """
    print("---------- SKETCH with bins = %d d = %d ----------" % (buckets, d))

    # Create sketch and fill it with every item of the sequence
    cms = CountMinSketch(buckets, d)
    for ip in seq:
        cms.add_item(ip)

    # Query the estimated occurrence of each distinct item
    ips = list(set(seq))
    count = np.array([cms.check_item(ip) for ip in ips], dtype=float)

    # Walk the argsort result from the largest estimate downwards. Slicing
    # (rather than indexing a fixed 10 positions) caps the output at the
    # number of distinct items, so short or empty sequences do not raise.
    ranked = np.argsort(count)[::-1]
    for idx in ranked[:max(top_n, 0)]:
        print("IP: %s, count: %d" % (ips[idx], count[idx]))

In [63]:
# Sequence of IPs for the chosen host, as returned by load_ip_sequence
host_ip = '147.32.84.165'
seq = load_ip_sequence(df, host_ip)

# Every (bins, d) combination to evaluate; bins varies slowest
parameters = [(bins, d) for bins in [20, 50, 100, 200, 500] for d in [3, 5, 8]]

# One sketch run per combination
for vals in parameters:
    bins, d = vals
    run_sketch(seq, bins, d)

---------- SKETCH with bins = 20 d = 3 ----------
IP: 93.116.166.52, count: 7481
IP: 193.23.181.44, count: 7481
IP: 217.163.21.36, count: 5422
IP: 203.138.181.112, count: 5422
IP: 92.112.3.72, count: 5318
IP: 63.251.204.102, count: 5318
IP: 67.15.185.102, count: 5127
IP: 194.1.150.66, count: 5127
IP: 174.128.246.102, count: 5127
IP: 155.70.32.98, count: 5127
---------- SKETCH with bins = 20 d = 5 ----------
IP: 199.217.10.203, count: 7640
IP: 193.23.181.44, count: 7636
IP: 110.36.112.54, count: 6077
IP: 195.14.4.101, count: 6077
IP: 205.188.103.2, count: 5782
IP: 74.125.235.51, count: 5782
IP: 67.195.168.230, count: 5782
IP: 212.52.133.252, count: 5782
IP: 174.128.246.102, count: 5782
IP: 213.146.65.56, count: 5561
---------- SKETCH with bins = 20 d = 8 ----------
IP: 193.23.181.44, count: 6831
IP: 174.128.246.102, count: 4924
IP: 216.82.254.51, count: 4909
IP: 173.0.129.6, count: 4909
IP: 202.153.104.23, count: 4667
IP: 77.238.14.21, count: 4621
IP: 184.154.89.154, count: 4594
IP: 208