# Task 1: Sampling
## Import libraries

In [2]:
import numpy as np
import pandas as pd

## Reservoir function

In [3]:
def save_smallest(reservoir, tags, current_ip, current_tag, k):
    """Add one (ip, tag) candidate and keep only the k smallest-tagged entries.

    Parameters
    ----------
    reservoir : array of the IPs currently held in the sample.
    tags : array of the tags belonging to ``reservoir``, position by position.
    current_ip : the IP to add as a candidate.
    current_tag : the tag attached to ``current_ip``.
    k : maximum number of entries to keep.

    Returns
    -------
    tuple
        ``(reservoir, tags)`` as new arrays holding at most ``k`` entries,
        ordered by ascending tag. The input arrays are not modified.
    """
    candidate_tags = np.append(tags, current_tag)
    candidate_ips = np.append(reservoir, current_ip)

    # positions of the k smallest tags, smallest first
    keep = candidate_tags.argsort()[:k]
    return (candidate_ips[keep], candidate_tags[keep])

## Sampling

In [4]:
# Path of the labelled netflow capture that is read below
# (note: despite its name, `df` is a file path, not a DataFrame).
df = "data/capture20110811_43_.pcap.netflow.labeled"
# Source IP whose outgoing flows are sampled and counted.
infected_ip = "147.32.84.165"
# Number of most frequent destination IPs that are reported.
n = 10
# Reservoir sizes (k) that are evaluated.
amount_samples = [100, 1000, 5000]

### Read the data

In [5]:
# Load every line of the capture into memory (line endings are kept);
# the cells below iterate over this list several times.
with open(df, "r") as ins:
    lines = list(ins)

### Compute estimates

In [6]:
import time

# Dedicated, seeded generator: a fresh "Restart & Run All" reproduces the same
# samples and estimates, and NumPy's global random state is left untouched.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# run 5 iterations to average the run-time
iterations = 5
# run_times[it][k_i]: seconds spent sampling with reservoir size
# amount_samples[k_i] during iteration `it`
run_times = np.zeros([iterations, len(amount_samples)])

for it in range(iterations):
    # rebuilt on every iteration, so after the loop `estimates` holds the
    # result of the last iteration only (one entry per reservoir size)
    estimates = []
    for k_i, k in enumerate(amount_samples):
        reservoir = np.array([])
        tags = np.array([])

        # perf_counter is monotonic, which makes it the right clock for intervals
        start = time.perf_counter()
        for line in lines:
            parts = line.split()
            # blank or truncated lines have no source/destination fields;
            # skip them instead of failing with an IndexError
            if len(parts) < 7:
                continue
            ip_port_src = parts[4].split(':')
            ip_src = ip_port_src[0]

            # if this is from our infected host
            if ip_src == infected_ip:
                ip_port_dst = parts[6].split(':')
                ip_dst = ip_port_dst[0]

                # generate tag
                r = rng.random()
                # obtain new reservoir
                (reservoir, tags) = save_smallest(reservoir, tags, ip_dst, r, k)

        stop = time.perf_counter()
        run_times[it][k_i] = stop - start

        # only use the top 10 most frequent
        ips_estimated, counts = np.unique(reservoir, return_counts=True)
        ind = np.argsort(-counts)[:n]
        # Normalise by the number of flows actually sampled: it equals k once
        # the reservoir is full, but is smaller when the host produced fewer
        # than k flows (dividing by k would then under-estimate every frequency).
        sample_size = max(len(reservoir), 1)
        estimates.append({
            'ips': ips_estimated[ind],
            'freqs': counts[ind] / sample_size
        })

### Count traffic from the infected host (ground truth)

In [7]:
# Exact per-destination flow counts for the infected host; these are the
# reference values the sampled estimates are compared against.
ips = {}
infected_flow_count = 0
for line in lines:
    parts = line.split()
    # blank or truncated lines have no source/destination fields;
    # skip them instead of failing with an IndexError
    if len(parts) < 7:
        continue
    ip_port_src = parts[4].split(':')
    ip_src = ip_port_src[0]

    # if this is from the infected host
    if ip_src == infected_ip:
        ip_port_dst = parts[6].split(':')
        ip_dst = ip_port_dst[0]

        # add a count to the destination ip (starting from 0 on first sight)
        ips[ip_dst] = ips.get(ip_dst, 0) + 1
        infected_flow_count += 1

In [8]:
# destination IPs and their flow counts as two parallel arrays
ips_ip = np.array(list(ips))
ips_count = np.array(list(ips.values()))

# positions of the n largest counts, largest first
ind = (-ips_count).argsort()[:n]
# true top-n destinations with their share of all infected-host flows
true = {
    'ips': ips_ip[ind],
    'freqs': ips_count[ind] / infected_flow_count,
}

### Create the estimation table
Each row is one rank. After the rank, the first IP/frequency pair is the true value; the following three IP/frequency pairs are the estimates for k = 100, 1000 and 5000.

In [13]:
# One tab-separated row per rank: rank, true ip, true frequency, followed by
# an (ip, frequency) pair for every reservoir size; frequencies use 3 decimals.
for i in range(n):
    row = [i + 1, true['ips'][i], round(true['freqs'][i], 3)]
    for j in range(len(amount_samples)):
        estimate = estimates[j]
        row.extend([estimate['ips'][i], round(estimate['freqs'][i], 3)])
    print("\t".join(map(str, row)))

1	193.23.181.44	0.136	193.23.181.44	0.15	193.23.181.44	0.13	193.23.181.44	0.139
2	174.128.246.102	0.076	173.236.31.226	0.08	174.37.196.55	0.091	174.128.246.102	0.074
3	174.37.196.55	0.074	67.19.72.206	0.07	67.19.72.206	0.08	174.37.196.55	0.071
4	67.19.72.206	0.069	72.20.15.61	0.07	174.128.246.102	0.074	72.20.15.61	0.068
5	72.20.15.61	0.066	174.128.246.102	0.06	72.20.15.61	0.059	67.19.72.206	0.065
6	173.236.31.226	0.038	174.37.196.55	0.06	46.4.36.120	0.045	184.154.89.154	0.039
7	184.154.89.154	0.037	46.4.36.120	0.03	184.154.89.154	0.033	46.4.36.120	0.036
8	46.4.36.120	0.036	184.82.147.252	0.03	173.236.31.226	0.03	173.236.31.226	0.035
9	147.32.80.9	0.017	64.12.90.33	0.02	147.32.80.9	0.025	147.32.80.9	0.018
10	217.163.21.37	0.015	65.55.92.168	0.02	217.163.21.36	0.019	217.163.21.37	0.016


### Performance per k, frequency distance

In [19]:
def freq_distance(true_values, estimate):
    """Sum the frequency errors of an estimate over the true IPs.

    Parameters
    ----------
    true_values : dict with parallel sequences under ``'ips'`` and ``'freqs'``
        holding the reference IPs and their frequencies.
    estimate : dict with the same two keys, holding the estimated IPs and
        their estimated frequencies.

    Returns
    -------
    number
        For every IP in ``true_values``: the absolute difference between its
        true and estimated frequency, or the full true frequency when the IP
        is missing from ``estimate``; summed over all true IPs. IPs that occur
        only in ``estimate`` do not contribute.
    """
    estimate_map = dict(zip(estimate['ips'], estimate['freqs']))

    score = 0
    # iterate the argument, not the notebook-level `true` variable, so the
    # function scores whatever reference it is actually given
    for ip, true_freq in zip(true_values['ips'], true_values['freqs']):
        if ip in estimate_map:
            score += abs(true_freq - estimate_map[ip])
        else:
            # a missed IP is penalised with its whole true frequency
            score += true_freq
    return score

### Print recall and frequency distance per k

In [20]:
# recall: number of true top-n IPs that also appear in the estimate, divided by n
# distance: result of freq_distance for that estimate, rounded to 4 decimals
print("\t".join(['k','recall','distance']))
for j, k in enumerate(amount_samples):
    estimate = estimates[j]
    overlap = np.intersect1d(true['ips'], estimate['ips'])
    recall = float(len(overlap)) / float(n)
    freq_score = round(freq_distance(true, estimate), 4)
    print("{}\t{}\t{}".format(k, recall, freq_score))

k	recall	distance
100	0.7	0.167
1000	0.9	0.086
5000	1.0	0.0196
