# Ingress Detection Algorithm

```
cidr_max = 28   # nax split cidr mask
t=60            # seconds
e=120           # expire time if IPs
q = 0.95        # needed ingress fraction
c = 64          # c* sqrt(2^(IPv * max^-cidr)) -> min number of sampled

start with no knowledge (only /0 is known) 
loop
    collect IP from Netflow
        Filter IPs for ingress
        Mask them to cidrmax
        Insert IP into corresponding range

Every t seconds 
    Check all ranges
        Remove IPs older than e seconds 
        Prevalent color still valid (s_color >= q)
            YES → join siblings ? (join(s_color ) >= q) 
                YES → join siblings and check again 
                NO → do nothing
            NO → remove all information

        Check if enough samples have been collected (s_ipcount >= n_cidr ) 
            YES → is a single color prevalent ? (s_color >=q)
                YES → color range with link color
                NO → split subnet if s_cidr < cidrmax
            NO → join siblings ? (join(s_color) >= q or join(s_ipcount) < n_cidr−1)
                YES → join siblings and check again 
                NO → do nothing
```

In [1]:
#!pip install netaddr 

import pandas as pd 
import csv
import gzip
import pytricia
import ipaddress
from netaddr import *
from collections import defaultdict
import math
import dpath.util as dp

cidr_max = 28   # nax split cidr mask
t=60            # seconds
e= 2*t          # 120  # expire time if IPs
q = 0.95        # needed ingress fraction
c = 0.00025          # c* sqrt(2^(IPv * max^-cidr))

cols=['tag', 'peer_src_ip', 'in_iface', 'out_iface', 'src_ip', 'dst_net', 'src_port', 'dst_port', 'proto', '__', '_', 'ts_start', 'ts_end', 'pkts', 'bytes']

netflow_path="/data/slow/mehner/netflow/dummy_netflow.gz"
ingresslink_file = "/data/slow/mehner/ingresslink/1605571200.gz"                # if we get more netflow, we should adjust the file 
router_ip_mapping_file="/data/slow/mehner/router_lookup_tables/1605571200.txt"

###################################################
########### ROUTER NAME <--> IP MAPPING ###########
###################################################
with open(router_ip_mapping_file, 'r') as csv_file:
    router_ip_mapping_csv = csv.reader(csv_file, delimiter=' ')
    router_ip_lookup_dict = {rows[0]:rows[1] for rows in router_ip_mapping_csv}

###################################################
###########     INGRESS LINK FILE       ###########
###################################################

print("> load ingresslink file")

ingresslink_dict= {}
with gzip.open("{}".format(ingresslink_file), 'rb') as f:
    for line in f:
        line = line.decode('utf-8').split(",")
        router= line[0].replace("PEER_SRC_IP=", "")
        in_iface= line[1].replace("IN_IFACE=", "")
        
        # ingresslink_list.append("{}.{}".format(router, in_iface))
        ingresslink_dict["{}.{}".format(router, in_iface)] = True
print("  ...done\n")




# TAG     PEER_SRC_IP  IN IFACE OUT_IFACE SRC_IP          DST_NET        SRC_PORT DST_PORT PROTO  _       _       TS_START        TS_END    PKTS    BYTES
# 0       194.25.7.141    13      1571    91.127.69.122   31.13.84.4      40730   443     tcp     0       i       1605639641      1605639641 1       121
netflow_df = pd.read_csv(netflow_path, compression='gzip', header=None, sep=',', quotechar='"', error_bad_lines=False, names=cols, usecols = ['peer_src_ip', 'in_iface', 'src_ip', 'ts_end'])
print("read: ", len(netflow_df))

## pandas pipe  -> https://towardsdatascience.com/25-pandas-functions-you-didnt-know-existed-p-guarantee-0-8-1a05dcaad5d0
netflow_df['ingress_router'] = netflow_df.peer_src_ip.apply(lambda x: router_ip_lookup_dict.get(x))
netflow_df['ingress'] = netflow_df['ingress_router'] + "." + netflow_df.in_iface.astype(str)
netflow_df.drop(columns=['ingress_router', 'peer_src_ip', 'in_iface'], inplace=True)

netflow_df.drop(netflow_df.index[netflow_df['ts_end'] == 'TIMESTAMP_END'], inplace=True)

netflow_df['is_ingresslink'] = netflow_df.ingress.apply(lambda x: ingresslink_dict.get(x,False))
netflow_df = netflow_df.loc[netflow_df.is_ingresslink]
 
netflow_df.drop(columns=['is_ingresslink'], inplace=True)
print("ingress only: ", len(netflow_df))

netflow_df['ts_end'] = netflow_df.ts_end.apply(lambda x: int(int(x) / t) * t) 
netflow_df.sort_values(by = 'ts_end', inplace=True)

# mask to cidr max
netflow_df['src_ip'] = netflow_df.src_ip.apply(lambda x: str(ipaddress.ip_network("{}/{}".format(x, cidr_max), strict=False)).split("/")[0])

netflow_df = netflow_df.convert_dtypes()




> load ingresslink file
  ...done

read:  100000




  netflow_df = pd.read_csv(netflow_path, compression='gzip', header=None, sep=',', quotechar='"', error_bad_lines=False, names=cols, usecols = ['peer_src_ip', 'in_iface', 'src_ip', 'ts_end'])
  netflow_df = pd.read_csv(netflow_path, compression='gzip', header=None, sep=',', quotechar='"', error_bad_lines=False, names=cols, usecols = ['peer_src_ip', 'in_iface', 'src_ip', 'ts_end'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflow_df.drop(columns=['is_ingresslink'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflow_df['ts_end'] = netflow_df.ts_end.apply(lambda x: int(int(x) / t) * t)
A val

ingress only:  66949


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflow_df['src_ip'] = netflow_df.src_ip.apply(lambda x: str(ipaddress.ip_network("{}/{}".format(x, cidr_max), strict=False)).split("/")[0])


In [5]:
netflow_df.head()
# netflow_df.convert_dtypes().dtypes

Unnamed: 0,src_ip,ts_end,ingress
86492,31.13.84.48,1605639480,VIE-SB5.25
82140,81.182.112.32,1605639480,VIE-SB5.17
82141,45.57.17.144,1605639480,VIE-SB5.18
89876,52.46.159.128,1605639480,VIE-SB5.1507
89874,23.47.213.80,1605639480,VIE-SB5.1493


In [93]:
cur_ts=1605639480
e=1
q = 0.7
cidr_max = 28 

##################################
### PROTOTYPING IPDRange Class ###
##################################

DEBUG =True

### DICT implementation

# if classified range will be in range dict
# def __range_atts():
    # NOTE last seen will be updated if there is any new IP that belongs to this range
    #   if last_seen < 'current now' - e: drop prefix
    # return {'last_seen': 0, 'ingress': "", 'match' : 0, 'miss' : 0}

# if not yet classified range will be in subnet dict - here ip addresses are monitored
def __subnet_atts():
    return {'last_seen': 0,  'ingress' : "", 'match' : 0, 'miss' : 0}

def __multi_dict(K, type):
    if K == 1:
        return defaultdict(type)
    else:
        return defaultdict(lambda: __multi_dict(K-1, type))

# something like range_dict[ip_version][range]{last_seen: ... , ingress: ... , match: ... , miss: ... }
# range_dict=__multi_dict(2, __range_atts)

# something like subnet_dict[ip_version][range][{ip: ... , ingress: ... , last_seen: ... }]
#subnet_dict=__multi_dict(3, __subnet_atts)
subnet_dict=__multi_dict(4, __subnet_atts) # smehner TESTING

# initialization
range_lookup_dict = __multi_dict(1, pytricia.PyTricia) #defaultdict(lambda: pytricia.PyTricia())
range_lookup_dict[4].insert("0.0.0.0/0", "0.0.0.0/0")
range_lookup_dict[6].insert("::/0", "::/0")


## lookup in pytricia tree and return corresponding range
def get_corresponding_range(ip):
    ip_version = 4 if not ":" in ip else 6
    res =range_lookup_dict[ip_version][ip]
    # if DEBUG: print("check corresponding range;  ip: {} ; range: {}".format(ip_address, res))
    return res

def mask_ip(ip_address):
    return str(ipaddress.ip_network("{}/{}".format(ip_address, cidr_max), strict=False)).split("/")[0]

def __get_min_samples(cidr, ip_version=4):
        ipv_max = 32
        if ip_version == 6:
            ipv_max = 128
        return int(c * math.sqrt( math.pow(2, (ipv_max - cidr))))    

def __split_ip_and_mask(prefix):
    # prefix should be in this format 123.123.123.123/12 or 2001:db8:abcd:0012::0/64

    ip = prefix.split("/")[0]
    mask = prefix.split("/")[1]
        
    return str(ip), int(mask)


def get_sample_count(path):
    p_path=path
    if type(p_path) == list:
        p_path = p_path.append("prevalent")
    else: 
        p_path += "/prevalent"
    
    
    count = -1
    try:    
        count= len(dp.get(subnet_dict, path))

        try:
            dp.get(subnet_dict, p_path) #, yielded=True)
            count -=1 # since a 'prevalent' key is now here too, we have to decrement by one
            print("decremtnt counter")
        except KeyError: 
            # no prevalent ingress exists
            pass
    except KeyError:
        print("ERROR: key {} does not exist".format(path))

    return count

def get_prevalent_ingress(path):
    def __init_with_zero():
        return 0
    
    cur_prevalent=None
    sample_count=get_sample_count(path)

    # calculate prevalent ingress
    counter_dict=defaultdict(__init_with_zero)
    search_path="{}/**/ingress".format(path)
    for p, v in dp.search(subnet_dict, search_path, yielded=True): 
        counter_dict[v]+=1

    ratio= -1
    for ingress in counter_dict:
        ratio = counter_dict.get(ingress) / sample_count
        if DEBUG: print("   ratio for {}: {:.2f}".format(ingress, ratio))
        if ratio >= q: 
            cur_prevalent = ingress
            break

    if DEBUG: print("prevalent for {}: {} ({:.2f})".format(path, cur_prevalent, ratio))
    
    return cur_prevalent
    


# iterates over all ranges that are already classified
def is_prevalent_ingress_still_valid():
    for p, i in dp.search(subnet_dict, "**/prevalent", yielded=True): 
        current_prevalent= i

        new_prevalent = get_prevalent_ingress(p)

        if current_prevalent == new_prevalent:
            print("     yep - ingress still valid")
        else:
            print("     NOO - not valid anymore -> delete all information")



def __split_range(range_to_be_splitted):
    ip, mask = __split_ip_and_mask(range_to_be_splitted)

    if cidr_max <= mask:
        if DEBUG: print("max_cidr reached - do nothing")
        return
    
    mask +=1
    nw= IPNetwork(range_to_be_splitted)
    for i in nw.subnet(mask):
        range_lookup_dict[nw.version].insert(str(i), str(i))
    range_lookup_dict[nw.version].delete(range_to_be_splitted)

    # TODO remove range from range_dict and update subnet_dict


    

def add_to_subnet(ip, ingress, last_seen):
    # something like subnet_dict[ip_version][range][ip]{ingress: ... , last_seen: ... }]
    # if DEBUG: print("adding ", ip, ingress, last_seen)

    ip_version = 4 if not ":" in ip else 6

    ip = mask_ip(ip)
    range, mask = __split_ip_and_mask(get_corresponding_range(ip))
    
    # subnet_dict[ip_version][mask][range]['prevalent'] = ingress # <-- we have a prevalent color for this range we can mark it somehow like this
    dp.new(subnet_dict, [str(ip_version), str(mask), str(range), str(ip), 'last_seen'], last_seen)
    dp.new(subnet_dict, [str(ip_version), str(mask), str(range), str(ip), 'ingress'], ingress)

# def add_to_range(range, ingress, last_seen, match_counter, miss_counter, ip_version=4):
#     # something like range_dict[ip_version][range]{last_seen: ... , ingress: ... , match: ... , miss: ... }
#     print("adding ", range, ingress, last_seen)
    
#     range_dict[ip_version][range]['last_seen'] = last_seen
#     range_dict[ip_version][range]['ingress'] = ingress
#     range_dict[ip_version][range]['match'] = match_counter
#     range_dict[ip_version][range]['miss'] = miss_counter


    

# remove all ips older than e seconds
def remove_old_ips_from_range(current_ts):
    
    pop_list=[]

    for path, ts in dp.search(subnet_dict, "**/last_seen",yielded=True): 
        # print(path, ts)
        if int(ts)  < current_ts - e : 
            # if DEBUG: print("remove old ip: {} ({})".format(path, ts))
            pop_list.append(path)

    print("removing {} expired IP addresses".format(len(pop_list)))
    # b= len(subnet_dict["4"]["0"]["0.0.0.0"])
    for i, path in enumerate(pop_list): 
        try:
            path_elems= path.split("/")

            #dp.delete(subnet_dict, path.replace("/last_seen", "")) # too slow
            print(subnet_dict[path_elems[0]][path_elems[1]][path_elems[2]].pop(path_elems[3]))

        except:
            print("ERROR: {} cannot be deleted".format(path))
            pass






In [94]:
INSERT = False
CHECK_RANGES= True

initial_ts= netflow_df.ts_end.unique()[0]

e= 1
#     Insert netflow data into corresponding ranges
if INSERT == True:
    for t in netflow_df.ts_end.unique(): #TODO currently only first timestamp is used ~ t=0
        # walk along the binned timestamps
        # add all flows of current timestamp to range
        #   is this prefix already classified:
        #       same ingress: match++
        #       other ingress: miss++
        cur_slice = netflow_df.loc[netflow_df.ts_end == t]
        #cur_slice['range']= cur_slice.src_ip.apply(get_corresponding_range)
        if DEBUG: print(cur_slice.head(2))
        for i in cur_slice.itertuples():
            add_to_subnet(ip=i.src_ip, ingress=i.ingress, last_seen=i.ts_end)
        
        # break

if CHECK_RANGES:
    remove_old_ips_from_range(1605639600)


#   Check all ranges

              src_ip      ts_end     ingress
86492    31.13.84.48  1605639480  VIE-SB5.25
82140  81.182.112.32  1605639480  VIE-SB5.17
            src_ip      ts_end     ingress
84758  2a03:2880::  1605639540  VIE-SB5.25
84760  2001:4c40::  1605639540  VIE-SB5.17
              src_ip      ts_end     ingress
68565  31.46.249.224  1605639600  VIE-SB5.17
68629  149.200.8.208  1605639600  VIE-SB5.17
removing 2921 expired IP addresses
{'last_seen': 1605639480, 'ingress': 'VIE-SB5.17', 'match': 0, 'miss': 0}
{'last_seen': 1605639480, 'ingress': 'VIE-SB5.1507', 'match': 0, 'miss': 0}
{'last_seen': 1605639540, 'ingress': 'VIE-SB5.1493', 'match': 0, 'miss': 0}
{'last_seen': 1605639480, 'ingress': 'VIE-SB5.1493', 'match': 0, 'miss': 0}
{'last_seen': 1605639480, 'ingress': 'VIE-SB5.1507', 'match': 0, 'miss': 0}
{'last_seen': 1605639480, 'ingress': 'VIE-SB5.1493', 'match': 0, 'miss': 0}
{'last_seen': 1605639480, 'ingress': 'VIE-SB5.1507', 'match': 0, 'miss': 0}
{'last_seen': 1605639540, 'ingress':

In [85]:
import json

# print(subnet_dict["4"]["0"]['0.0.0.0']["69.171.250.48"])
# ["69.171.250.48"]

# print(dp.get(subnet_dict, "4/0/0.0.0.0/31.13.84.48/last_seen"))
# 4/0/0.0.0.0/69.171.250.48/last_seen
print(subnet_dict)

defaultdict(<function __multi_dict.<locals>.<lambda> at 0x7fa44221e4c0>, {'4': defaultdict(<function __multi_dict.<locals>.<lambda> at 0x7fa44045daf0>, {'0': defaultdict(<function __multi_dict.<locals>.<lambda> at 0x7fa44045d550>, {'0.0.0.0': defaultdict(<function __subnet_atts at 0x7fa443aa6af0>, {'69.171.250.48': {'last_seen': 1605639600, 'ingress': 'VIE-SB5.9', 'match': 0, 'miss': 0}, '23.47.209.80': {'last_seen': 1605639480, 'ingress': 'VIE-SB5.1493', 'match': 0, 'miss': 0}, '35.169.160.144': {'last_seen': 1605639480, 'ingress': 'VIE-SB5.1507', 'match': 0, 'miss': 0}, '69.171.250.0': {'last_seen': 1605639600, 'ingress': 'VIE-SB5.9', 'match': 0, 'miss': 0}, '23.47.212.16': {'last_seen': 1605639480, 'ingress': 'VIE-SB5.1493', 'match': 0, 'miss': 0}, '62.104.23.32': {'last_seen': 1605639480, 'ingress': 'VIE-SB5.1507', 'match': 0, 'miss': 0}, '23.47.212.192': {'last_seen': 1605639600, 'ingress': 'VIE-SB5.1493', 'match': 0, 'miss': 0}, '2.21.172.80': {'last_seen': 1605639540, 'ingress':

In [54]:
dp.get(subnet_dict ,"4/0/0.0.0.0/69.171.250.32/last_seen")

1605639600

In [9]:

for cur_range in subnet_dict[4].keys():
    cur_cidr= int(cur_range.split("/")[1])
    min_samples= __get_min_samples(cur_cidr)
    cur_samples= len(subnet_dict[4].get(cur_range))

    
    print("{} --> {} samples (needed {}) ".format(cur_range,cur_samples, min_samples))
    
    # Check if enough samples   have been collected (s_ipcount >= n_cidr ) 
    if cur_samples >=min_samples:
        print("    enough samples".format(cur_range))
                
        check_prevalent_ingress(cur_range) 

        pass
    else:
        print("    not enough samples".format(cur_range))
    
    
subnet_dict
        

AttributeError: 'int' object has no attribute 'split'

In [8]:
from netaddr import *

ip = IPNetwork('84.60.150.162/21')
ip.version

4

In [11]:
import pytricia
from netaddr import *
import random
min_cidr=0
max_cidr=7

for i in range_lookup_dict[4]: print(i)

for i in range(min_cidr, max_cidr):
    # randomly choose a prefix from range and split it

    rand_chosen_network=list(range_lookup_dict[4])[random.randint(0,len(range_lookup_dict))]
    print("counter: {} | random chosen network: {} | len_range_lookup: {}".format(i, rand_chosen_network, len(range_lookup_dict)))
    __split_range(rand_chosen_network)

for i in range_lookup_dict[4]: print(i)


# for i in p: print(i)



# sibling = same mask



0.0.0.0/0


IndexError: list index out of range

In [None]:
last=None
for i in range_lookup_dict[4]: 
    ip, mask = __split_ip_and_mask(i)

    if mask < 1: break

    nw = IPNetwork(i).supernet(mask - 1)

    if nw != last:
        print(" NO {} {}".format(nw, last))
    else:
        print(" YES {} {}".format(nw, last))

    last=nw
    

 NO [IPNetwork('0.0.0.0/5')] None
 NO [IPNetwork('4.0.0.0/8')] [IPNetwork('0.0.0.0/5')]
 YES [IPNetwork('4.0.0.0/8')] [IPNetwork('4.0.0.0/8')]
 NO [IPNetwork('4.0.0.0/7')] [IPNetwork('4.0.0.0/8')]
 NO [IPNetwork('6.0.0.0/7')] [IPNetwork('4.0.0.0/7')]
 YES [IPNetwork('6.0.0.0/7')] [IPNetwork('6.0.0.0/7')]
 NO [IPNetwork('0.0.0.0/4')] [IPNetwork('6.0.0.0/7')]
 NO [IPNetwork('16.0.0.0/5')] [IPNetwork('0.0.0.0/4')]
 YES [IPNetwork('16.0.0.0/5')] [IPNetwork('16.0.0.0/5')]
 NO [IPNetwork('24.0.0.0/5')] [IPNetwork('16.0.0.0/5')]
 YES [IPNetwork('24.0.0.0/5')] [IPNetwork('24.0.0.0/5')]
 NO [IPNetwork('32.0.0.0/5')] [IPNetwork('24.0.0.0/5')]
 NO [IPNetwork('36.0.0.0/6')] [IPNetwork('32.0.0.0/5')]
 YES [IPNetwork('36.0.0.0/6')] [IPNetwork('36.0.0.0/6')]
 NO [IPNetwork('32.0.0.0/4')] [IPNetwork('36.0.0.0/6')]
 NO [IPNetwork('32.0.0.0/3')] [IPNetwork('32.0.0.0/4')]
 NO [IPNetwork('64.0.0.0/3')] [IPNetwork('32.0.0.0/3')]
 YES [IPNetwork('64.0.0.0/3')] [IPNetwork('64.0.0.0/3')]
 NO [IPNetwork('96.0.

In [4]:
# subnet_dict[ip_version][mask][range][ip]['last_seen'] = last_seen

# subnet_dict
temp_dict = subnet_dict.copy()

subnet_dict

defaultdict(<function __main__.__multi_dict.<locals>.<lambda>()>,
            {4: defaultdict(<function __main__.__multi_dict.<locals>.<lambda>()>,
                         {0: defaultdict(<function __main__.__multi_dict.<locals>.<lambda>()>,
                                      {'0.0.0.0': defaultdict(<function __main__.__subnet_atts()>,
                                                   {'31.13.84.48': {'last_seen': 1605639480,
                                                     'ingress': 'VIE-SB5.25',
                                                     'match': 0,
                                                     'miss': 0},
                                                    '81.182.112.32': {'last_seen': 1605639480,
                                                     'ingress': 'VIE-SB5.17',
                                                     'match': 0,
                                                     'miss': 0},
                                                    '4

In [34]:
# !pip install dpath

import dpath.util as dp

temp = "last_seen"

pop_list=[]




def check_prevalent_ingress(path): # dpath 
    print(path, "\n")

    def init_with_zero():
        return 0
    
    sample_count= get_sample_count(path)

    ## check if there is already a classified ingress for that range



   

    
    

    
    cur_prevalent=None

        
    
    # now differnt things can happen:
    #   1. last_prevalent == cur_prevalent and last_prevalent != None -> prevalent ingress still valid -> fine
    #   2. cur_prevalent == None                                      -> no prevalent ingress found
    #   3. cur_prevalent != None and last_prevalent == None  
    if cur_prevalent == None:
        print("no prevalent ingress found - split subnets")
    elif (last_prevalent == None and cur_prevalent != None):
        print("set new prevalent ingress for {}: {}".format(path, cur_prevalent))
    elif (last_prevalent != None and cur_prevalent != last_prevalent):
        print("current prevalent ingress != last one -> remove all information")
        print("     {} != {} ".format(cur_prevalent, last_prevalent))
    else:
        print("not catched: ", path, last_prevalent, cur_prevalent)
    

    
#is_prevalent_color_still_valid()
q=0.9
is_prevalent_ingress_still_valid()



prevalent for 4/0/0.0.0.0/prevalent: None (-1.00)
     NOO - not valid anymore -> delete all information


In [13]:
dp.new(temp_dict, "4/0/0.0.0.0/prevalent", "VIE-Uschi")
dp.search(temp_dict, "**/prevalent")

{4: defaultdict(None,
             {0: defaultdict(None,
                          {'0.0.0.0': defaultdict(None,
                                       {'prevalent': 'VIE-Uschi'})})})}