In [70]:
import pandas as pd
from random import random
from random import randint
from collections import Counter

In [4]:
# Function to load datasets as there are some inconsistencies between file formats
def load_data(path):
    # Set fixed column names to read
    column_names = ['Date_flow', 'start', 'Durat', 'Prot', 'Src_IP_Addr:Port', '->',
       'Dst_IP_Addr:Port', 'Flags', 'Tos', 'Packets', 'Bytes', 'Flows',
       'Label']
    
    df = pd.read_csv(path, names=column_names, header=None, delim_whitespace=True, skiprows=1)
    
    # Split ip_address and port into separate columns
    df[['src_ip','src_port']] = df['Src_IP_Addr:Port'].str.split(":", n=1, expand=True) 
    df[['dst_ip','dst_port']] = df['Dst_IP_Addr:Port'].str.split(":", n=1, expand=True) 
    
    # Drop old columns
    df.drop(columns=['Src_IP_Addr:Port', 'Dst_IP_Addr:Port', '->'], inplace = True) 
    
    return df

In [38]:
df = load_data('datasets/capture20110811.pcap.netflow.labeled')
df.head()

Unnamed: 0,Date_flow,start,Durat,Prot,Flags,Tos,Packets,Bytes,Flows,Label,src_ip,src_port,dst_ip,dst_port
0,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,135,1,Background,89.31.8.11,23929,147.32.84.229,13363
1,2011-08-11,10:10:00.003,0.967,UDP,INT,0,2,276,1,Background,147.32.84.229,13363,89.31.8.11,23929
2,2011-08-11,10:10:00.006,0.0,UDP,INT,0,1,62,1,Background,208.88.186.6,34042,147.32.84.229,13363
3,2011-08-11,10:10:00.008,0.0,UDP,INT,0,1,78,1,Background,92.118.218.77,55246,147.32.84.229,13363
4,2011-08-11,10:10:00.009,0.0,UDP,INT,0,1,72,1,Background,182.185.139.181,10223,147.32.84.229,13363


In [43]:
# Get top10 most frequent ips connected to the host
def get_top_connected_ip(df, host_ip):
    
    def filter_ip(row):
        if row['src_ip'] == host_ip:
            return row['dst_ip']
        else:
            return row['src_ip']
    
    # Filter out rows without host_ip
    df = df[['src_ip', 'dst_ip']]
    df = df[(df['src_ip'] == host_ip) | (df['dst_ip'] == host_ip)]
    
    # Filter out host_ip, store the other ip
    df['connected'] = df.apply(filter_ip, axis=1)
    
    # Rank result based on occurrence
    result = df['connected'].value_counts().nlargest(10)
    return result, df

In [44]:
# Get ground truth results of most frequent ips connected to host
host_ip = '147.32.84.165'
truth, df = get_top_connected_ip(df, host_ip)
print(truth)

193.23.181.44      6442
174.128.246.102    4101
174.37.196.55      3707
173.236.31.226     3410
184.154.89.154     3344
67.19.72.206       3224
46.4.36.120        3150
72.20.15.61        3111
147.32.80.9        1573
212.117.171.138     967
Name: connected, dtype: int64


In [67]:
def reservoir_sampling(size, data):
    reservoir = data[0: reservoir_size]

    for i in range(size, len(data)):
        prob = size / i

        if random() >= prob:
            # Generate a random int to be the replaced sample index
            index = randint(0,size-1)
            # Replace old sample
            reservoir[index] = data[i]
    
    return reservoir 

In [74]:
data = df['connected'].tolist()

# Try different reservoir size
for i in [100, 1000, 2500, 5000, 7500, 10000, 20000, 30000]:
    print("********** size = %d **********" % i)
    reservoir_size = i
    
    # Reservior sampling
    result = reservoir_sampling(reservoir_size, data)
    
    # Count results
    count = Counter(result)
    print(count.most_common(10))

********** size = 100 **********
[('184.154.89.154', 12), ('217.163.21.39', 10), ('94.100.28.114', 8), ('212.117.171.138', 6), ('83.133.119.197', 6), ('46.4.36.120', 5), ('98.137.54.237', 5), ('98.139.175.225', 4), ('64.12.138.161', 3), ('209.85.143.27', 3)]
********** size = 1000 **********
[('184.154.89.154', 136), ('46.4.36.120', 94), ('98.137.54.237', 43), ('147.32.80.9', 39), ('217.163.21.39', 37), ('209.85.143.27', 31), ('184.82.148.43', 28), ('98.139.175.225', 27), ('173.192.170.88', 22), ('212.117.171.138', 20)]
********** size = 2500 **********
[('184.154.89.154', 315), ('46.4.36.120', 228), ('147.32.80.9', 108), ('212.117.171.138', 72), ('184.82.148.43', 65), ('98.137.54.237', 56), ('98.139.175.225', 53), ('173.192.170.88', 52), ('94.100.28.114', 50), ('217.163.21.39', 47)]
********** size = 5000 **********
[('184.154.89.154', 634), ('46.4.36.120', 443), ('173.236.31.226', 264), ('147.32.80.9', 187), ('212.117.171.138', 154), ('67.195.168.31', 115), ('173.192.170.88', 114), (