In [1]:
from loaddata import load_data
import pandas as pd
from random import random
from random import randint
from collections import Counter

In [2]:
df = load_data('datasets/capture20110811.pcap.netflow.labeled')
df.head()

Unnamed: 0,Date_flow,start,Durat,Prot,Flags,Tos,Packets,Bytes,Flows,Label,src_ip,src_port,dst_ip,dst_port
0,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,135,1,Background,89.31.8.11,23929,147.32.84.229,13363
1,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,276,1,Background,147.32.84.229,13363,89.31.8.11,23929
2,2011-08-11,10:10:00.006,0.0,UDP,INT,0,1,62,1,Background,208.88.186.6,34042,147.32.84.229,13363
3,2011-08-11,10:10:00.008,0.0,UDP,INT,0,1,78,1,Background,92.118.218.77,55246,147.32.84.229,13363
4,2011-08-11,10:10:00.009,0.0,UDP,INT,0,1,72,1,Background,182.185.139.181,10223,147.32.84.229,13363


In [3]:
# Get top10 most frequent ips connected to the host
def get_top_connected_ip(df, host_ip):
    
    def filter_ip(row):
        if row['src_ip'] == host_ip:
            return row['dst_ip']
        else:
            return row['src_ip']
    
    # Filter out rows without host_ip
    df = df[['src_ip', 'dst_ip']]
    df = df[(df['src_ip'] == host_ip) | (df['dst_ip'] == host_ip)]
    
    # Filter out host_ip, store the other ip
    df['connected'] = df.apply(filter_ip, axis=1)
    
    # Rank result based on occurrence
    result = df['connected'].value_counts().nlargest(10)
    return result, df

In [4]:
# Get ground truth results of most frequent ips connected to host
host_ip = '147.32.84.165'
truth, df = get_top_connected_ip(df, host_ip)
print(truth)

193.23.181.44      6442
174.128.246.102    4101
174.37.196.55      3707
173.236.31.226     3410
184.154.89.154     3344
67.19.72.206       3224
46.4.36.120        3150
72.20.15.61        3111
147.32.80.9        1573
212.117.171.138     967
Name: connected, dtype: int64


In [5]:
def reservoir_sampling(size, data):
    reservoir = data[0: reservoir_size]

    for i in range(size, len(data)):
        prob = size / i

        if random() >= prob:
            # Generate a random int to be the replaced sample index
            index = randint(0,size-1)
            # Replace old sample
            reservoir[index] = data[i]
    
    return reservoir 

In [6]:
data = df['connected'].tolist()

# Try different reservoir size
for i in [100, 1000, 2500, 5000, 7500, 10000, 20000, 30000]:
    print("********** size = %d **********" % i)
    reservoir_size = i
    
    # Reservior sampling
    result = reservoir_sampling(reservoir_size, data)
    
    # Count results
    count = Counter(result)
    print(count.most_common(10))

********** size = 100 **********
[('184.154.89.154', 10), ('217.163.21.39', 10), ('83.133.119.197', 8), ('98.137.54.237', 7), ('94.100.28.114', 6), ('212.117.171.138', 5), ('46.4.36.120', 5), ('98.139.175.225', 5), ('173.241.240.4', 3), ('217.163.21.34', 3)]
********** size = 1000 **********
[('184.154.89.154', 140), ('46.4.36.120', 89), ('147.32.80.9', 37), ('98.139.175.225', 37), ('98.137.54.237', 34), ('217.163.21.39', 33), ('209.85.143.27', 26), ('184.82.148.43', 26), ('173.192.170.88', 25), ('184.82.148.44', 19)]
********** size = 2500 **********
[('184.154.89.154', 344), ('46.4.36.120', 231), ('147.32.80.9', 115), ('173.192.170.88', 64), ('212.117.171.138', 63), ('98.137.54.237', 61), ('98.139.175.225', 59), ('184.82.148.43', 56), ('173.236.31.226', 55), ('217.163.21.39', 50)]
********** size = 5000 **********
[('184.154.89.154', 643), ('46.4.36.120', 474), ('173.236.31.226', 246), ('147.32.80.9', 183), ('212.117.171.138', 154), ('74.6.136.244', 126), ('67.195.168.31', 113), ('17