# Ingress Detection Algorithm

```
cidr_max = 28   # nax split cidr mask
t=60            # seconds
e=120           # expire time if IPs
q = 0.95        # needed ingress fraction
c = 64          # c* sqrt(2^(IPv * max^-cidr)) -> min number of sampled

start with no knowledge (only /0 is known) 
loop
    collect IP from Netflow
        Filter IPs for ingress
        Mask them to cidrmax
        Insert IP into corresponding range

Every t seconds 
    Check all ranges
        Remove IPs older than e seconds 
        Prevalent color still valid (s_color >= q)
            YES → join siblings ? (join(s_color ) >= q) 
                YES → join siblings and check again 
                NO → do nothing
            NO → remove all information

        Check if enough samples have been collected (s_ipcount >= n_cidr ) 
            YES → is a single color prevalent ? (s_color >=q)
                YES → color range with link color
                NO → split subnet if s_cidr < cidrmax
            NO → join siblings ? (join(s_color) >= q or join(s_ipcount) < n_cidr−1)
                YES → join siblings and check again 
                NO → do nothing
```

In [3]:
#!pip install netaddr 

import pandas as pd 
import csv
import gzip
import pytricia
import ipaddress
from netaddr import *
from collections import defaultdict
import math
import dpath.util as dp

cidr_max = 28   # nax split cidr mask
t=60            # seconds
e= 2*t          # 120  # expire time if IPs
q = 0.95        # needed ingress fraction
c = 0.00025          # c* sqrt(2^(IPv * max^-cidr))

cols=['tag', 'peer_src_ip', 'in_iface', 'out_iface', 'src_ip', 'dst_net', 'src_port', 'dst_port', 'proto', '__', '_', 'ts_start', 'ts_end', 'pkts', 'bytes']

netflow_path="/data/slow/mehner/netflow/dummy_netflow.gz"
ingresslink_file = "/data/slow/mehner/ingresslink/1605571200.gz"                # if we get more netflow, we should adjust the file 
router_ip_mapping_file="/data/slow/mehner/router_lookup_tables/1605571200.txt"

###################################################
########### ROUTER NAME <--> IP MAPPING ###########
###################################################
with open(router_ip_mapping_file, 'r') as csv_file:
    router_ip_mapping_csv = csv.reader(csv_file, delimiter=' ')
    router_ip_lookup_dict = {rows[0]:rows[1] for rows in router_ip_mapping_csv}

###################################################
###########     INGRESS LINK FILE       ###########
###################################################

print("> load ingresslink file")

ingresslink_dict= {}
with gzip.open("{}".format(ingresslink_file), 'rb') as f:
    for line in f:
        line = line.decode('utf-8').split(",")
        router= line[0].replace("PEER_SRC_IP=", "")
        in_iface= line[1].replace("IN_IFACE=", "")
        
        # ingresslink_list.append("{}.{}".format(router, in_iface))
        ingresslink_dict["{}.{}".format(router, in_iface)] = True
print("  ...done\n")




# TAG     PEER_SRC_IP  IN IFACE OUT_IFACE SRC_IP          DST_NET        SRC_PORT DST_PORT PROTO  _       _       TS_START        TS_END    PKTS    BYTES
# 0       194.25.7.141    13      1571    91.127.69.122   31.13.84.4      40730   443     tcp     0       i       1605639641      1605639641 1       121
netflow_df = pd.read_csv(netflow_path, compression='gzip', header=None, sep=',', quotechar='"', error_bad_lines=False, names=cols, usecols = ['peer_src_ip', 'in_iface', 'src_ip', 'ts_end'])
print("read: ", len(netflow_df))

## pandas pipe  -> https://towardsdatascience.com/25-pandas-functions-you-didnt-know-existed-p-guarantee-0-8-1a05dcaad5d0
netflow_df['ingress_router'] = netflow_df.peer_src_ip.apply(lambda x: router_ip_lookup_dict.get(x))
netflow_df['ingress'] = netflow_df['ingress_router'] + "." + netflow_df.in_iface.astype(str)
netflow_df.drop(columns=['ingress_router', 'peer_src_ip', 'in_iface'], inplace=True)

netflow_df.drop(netflow_df.index[netflow_df['ts_end'] == 'TIMESTAMP_END'], inplace=True)

netflow_df['is_ingresslink'] = netflow_df.ingress.apply(lambda x: ingresslink_dict.get(x,False))
netflow_df = netflow_df.loc[netflow_df.is_ingresslink]
 
netflow_df.drop(columns=['is_ingresslink'], inplace=True)
print("ingress only: ", len(netflow_df))

netflow_df['ts_end'] = netflow_df.ts_end.apply(lambda x: int(int(x) / t) * t) 
netflow_df.sort_values(by = 'ts_end', inplace=True)

# mask to cidr max
netflow_df['src_ip'] = netflow_df.src_ip.apply(lambda x: str(ipaddress.ip_network("{}/{}".format(x, cidr_max), strict=False)).split("/")[0])

netflow_df = netflow_df.convert_dtypes()




> load ingresslink file
  ...done

read:  100000




  netflow_df = pd.read_csv(netflow_path, compression='gzip', header=None, sep=',', quotechar='"', error_bad_lines=False, names=cols, usecols = ['peer_src_ip', 'in_iface', 'src_ip', 'ts_end'])
  netflow_df = pd.read_csv(netflow_path, compression='gzip', header=None, sep=',', quotechar='"', error_bad_lines=False, names=cols, usecols = ['peer_src_ip', 'in_iface', 'src_ip', 'ts_end'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflow_df.drop(columns=['is_ingresslink'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflow_df['ts_end'] = netflow_df.ts_end.apply(lambda x: int(int(x) / t) * t)
A val

ingress only:  66949


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  netflow_df['src_ip'] = netflow_df.src_ip.apply(lambda x: str(ipaddress.ip_network("{}/{}".format(x, cidr_max), strict=False)).split("/")[0])


In [4]:
netflow_df.head()
# netflow_df.convert_dtypes().dtypes

Unnamed: 0,src_ip,ts_end,ingress
86492,31.13.84.48,1605639480,VIE-SB5.25
82140,81.182.112.32,1605639480,VIE-SB5.17
82141,45.57.17.144,1605639480,VIE-SB5.18
89876,52.46.159.128,1605639480,VIE-SB5.1507
89874,23.47.213.80,1605639480,VIE-SB5.1493


In [158]:
cur_ts=1605639480
e=1
q = 0.7
cidr_max = 28 

##################################
### PROTOTYPING IPDRange Class ###
##################################

DEBUG =True

### DICT implementation

# if classified range will be in range dict
# def __range_atts():
    # NOTE last seen will be updated if there is any new IP that belongs to this range
    #   if last_seen < 'current now' - e: drop prefix
    # return {'last_seen': 0, 'ingress': "", 'match' : 0, 'miss' : 0}

# if not yet classified range will be in subnet dict - here ip addresses are monitored
def __subnet_atts():
    return {'last_seen': 0,  'ingress' : ""}

def __multi_dict(K, type):
    if K == 1:
        return defaultdict(type)
    else:
        return defaultdict(lambda: __multi_dict(K-1, type))

# something like range_dict[ip_version][range]{last_seen: ... , ingress: ... , match: ... , miss: ... }
# range_dict=__multi_dict(2, __range_atts)

# something like subnet_dict[ip_version][range][{ip: ... , ingress: ... , last_seen: ... }]
#subnet_dict=__multi_dict(3, __subnet_atts)
subnet_dict=__multi_dict(4, __subnet_atts) # smehner TESTING

# initialization
range_lookup_dict = __multi_dict(1, pytricia.PyTricia) #defaultdict(lambda: pytricia.PyTricia())
range_lookup_dict["4"].insert("0.0.0.0/0", "0.0.0.0/0")
range_lookup_dict["6"].insert("::/0", "::/0")


## lookup in pytricia tree and return corresponding range
def get_corresponding_range(ip):
    ip_version = 4 if not ":" in ip else 6
    res =range_lookup_dict[str(ip_version)][ip]
    # if DEBUG: print("check corresponding range;  ip: {} ; range: {}".format(ip_address, res))
    return res

def mask_ip(ip_address):
    return str(ipaddress.ip_network("{}/{}".format(ip_address, cidr_max), strict=False)).split("/")[0]

def __get_min_samples(path):
        t = path.split("/")
        ip_version = int(t[0])
        cidr = int(t[1])
        
        ipv_max = 32
        if ip_version == 6:
            ipv_max = 128
        return int(c * math.sqrt( math.pow(2, (ipv_max - cidr))))    

def __split_ip_and_mask(prefix):
    # prefix should be in this format 123.123.123.123/12 or 2001:db8:abcd:0012::0/64

    ip = prefix.split("/")[0]
    mask = prefix.split("/")[1]
        
    return str(ip), int(mask)

def __convert_range_string_to_range_path(range_string):
    ip_version = 4 if not ":" in range_string else 6

    range, mask = range_string.split("/")

    return f"{ip_version}/{mask}/{range}"

def __convert_range_path_to_single_elems(path):
    t = path.split("/")
    ip_version = int(t[0])
    mask = int(t[1])
    range= t[2]
    return ip_version, mask, range
            

def get_sample_count(path):
    print(" GET SAMPLE COUNT ", path)
    p_path=path
    if type(p_path) == list:
        p_path = p_path.append("prevalent")
    else: 
        p_path += "/prevalent"
    
    
    count = -1
    try:    
        count= len(dp.get(subnet_dict, path))

        try:
            dp.get(subnet_dict, p_path) #, yielded=True)
            count -=1 # since a 'prevalent' key is now here too, we have to decrement by one
            print("decrement counter")
        except KeyError: 
            # no prevalent ingress exists
            pass
    except KeyError:
        print("ERROR: key {} does not exist".format(path))

    return count

def check_if_enough_samples_have_been_collected(range):
    if get_sample_count(range) >= __get_min_samples(range):
        print("    YES → is a single color prevalent ? (s_color >=q)")
        # print("      YES → color range with link color")
        # print("      NO → split subnet if s_cidr < cidrmax")
            
        return True
    else:
        print("    NO → join siblings ? (join(s_color) >= q or join(s_ipcount) < n_cidr−1)")
        print("      YES → join siblings and check again ")
        print("      NO → do nothing")
        return False

def get_prevalent_ingress(path):
    def __init_with_zero():
        return 0
    
    cur_prevalent=None
    sample_count=get_sample_count(path)

    # calculate prevalent ingress
    counter_dict=defaultdict(__init_with_zero)
    search_path="{}/**/ingress".format(path)
    for p, v in dp.search(subnet_dict, search_path, yielded=True): 
        counter_dict[v]+=1

    ratio= -1
    for ingress in counter_dict:
        ratio = counter_dict.get(ingress) / sample_count
        # if DEBUG: print("       ratio for {}: {:.2f}".format(ingress, ratio))
        if ratio >= q: 
            cur_prevalent = ingress
            break
    
    if cur_prevalent == None: ratio = -1
    if DEBUG: print("        prevalent for {}: {} ({:.2f})".format(path, cur_prevalent, ratio))
    
    return cur_prevalent
    
def set_prevalent_ingress(path, ingress):
    # TODO if an ingress is prevalent we set a 'prevalent' attribute for this path 
    # TODO then we can set the counter for miss and match
    # TODO and pop the list with all single ips
    # TODO then we need to distinguish between 
    #   already classified ranges => increment counters for misses and matches; decrement by dec_function
    #   not classified ranges = add IPs 
    #               
    dp.new(subnet_dict, f"{path}/prevalent", ingress)
    sample_count = get_sample_count(path)
    match=0
    for p,v in dp.search(subnet_dict, f"{path}/**/ingress", yielded=True): match = match +1 if v == ingress else match


    dp.new(subnet_dict, f"{path}/prevalent", ingress)
    dp.new(subnet_dict, f"{path}/match", match)
    dp.new(subnet_dict, f"{path}/miss", sample_count - match)
    dp.new(subnet_dict, f"{path}/last_seen", max(dp.search(subnet_dict, f"{path}/**/last_seen", yielded=True))[1])
    


# iterates over all ranges that are already classified
def is_prevalent_ingress_still_valid():
    if DEBUG: print("  > prevalent color still valid?")
    
    for p, i in dp.search(subnet_dict, "**/prevalent", yielded=True): 
        if DEBUG: print(f"    checking {p}")
            
        current_prevalent= i

        new_prevalent = get_prevalent_ingress(p)

        if current_prevalent == new_prevalent:
            print("     yep - ingress still valid")

        else:
            print("     NOO - not valid anymore -> delete all information")




def split_range(path):
    if DEBUG: print(f"        split range {path}")
    
    ip_version, mask, range = __convert_range_path_to_single_elems(path)

    if cidr_max <= mask:
        if DEBUG: print("    max_cidr reached - do nothing")
        return
    
    nw= IPNetwork(f"{range}/{mask}")
    
    ip_version = str(nw.version)
    #print(f"nw: {nw}")
    # add range to pytrcia tree and remove supernet 
    for splitted_nw in nw.subnet(mask+1):
        #if DEBUG: print(f"     add {splitted_nw}")
        range_lookup_dict[ip_version].insert(str(splitted_nw), str(splitted_nw))
    # if DEBUG: print(f"     del {nw}")

    print("         ", list(range_lookup_dict["4"]))
    range_lookup_dict[ip_version].delete(str(nw))

    # now split subnet_dict with all IPs 
    change_list=[]
    for p,v  in dp.search(subnet_dict, f"{path}/*", yielded=True): change_list.append((p,v))

    if DEBUG: print("        #items {}; first 3 elems: {}".format(len(change_list), change_list[:3]))
    subnet_dict[ip_version][str(mask)].pop(range)
    for p,v in change_list: add_to_subnet(ip= p.split("/")[3], ingress=v.get("ingress"), last_seen=v.get("last_seen"))



def join_siblings(path):
    if DEBUG: print(f"        join siblings for range {path}")

    ip_version, mask, range = __convert_range_path_to_single_elems(path)
    
    ip_version=str(ip_version)
    if mask == 0:
        print("    join siblings not possible - we are at the root of the tree")
        return None

    nw = IPNetwork(f"{range}/{mask}")
    if DEBUG: print("NET", nw)

    #what is the potential sibling?
    nw_supernet=nw.supernet(mask-1)[0]
    supernet_ip=str(nw_supernet).split("/")[0]
    supernet_mask=str(nw_supernet).split("/")[1]

    siblings=list(nw_supernet.subnet(mask))
    for sibling in siblings:
        # if one of both siblings does not exist -> skip joining
        if range_lookup_dict[str(ip_version)].get(f"{sibling}/{mask}", None) == None: return None

    # if both siblings exists -> delete it from range_lookup_dict and add supernet
    if DEBUG: print("len before: {}".format(len(subnet_dict[ip_version][supernet_mask][supernet_ip].keys())))
    
    range_lookup_dict[ip_version].insert(str(nw_supernet), str(nw_supernet))
    for sibling in siblings:
        if DEBUG: print("sibling: ", sibling)

        # merge subnet trees to supernet 
        if DEBUG: print("{} -> {}".format(sibling, len(subnet_dict[ip_version][str(mask)][str(sibling).split("/")[0]])))
        subnet_dict[ip_version][supernet_mask][supernet_ip].update(subnet_dict[ip_version][str(mask)].pop(str(sibling).split("/")[0]))
        
        range_lookup_dict[ip_version].delete(str(sibling))
    if DEBUG: print("len now: {}".format(len(subnet_dict[ip_version][supernet_mask][supernet_ip].keys())))
    return f"{ip_version}/{supernet_mask}/{supernet_ip}"    

def add_to_subnet(ip, ingress, last_seen):
    # something like subnet_dict[ip_version][range][ip]{ingress: ... , last_seen: ... }]
    # if DEBUG: print("adding ", ip, ingress, last_seen)

    ip_version = 4 if not ":" in ip else 6

    ip = mask_ip(ip)
    range, mask = __split_ip_and_mask(get_corresponding_range(ip))
    
    # subnet_dict[ip_version][mask][range]['prevalent'] = ingress # <-- we have a prevalent color for this range we can mark it somehow like this

    dp.new(subnet_dict, [int(ip_version), int(mask), range, ip, 'last_seen'], last_seen)
    dp.new(subnet_dict, [int(ip_version), int(mask), range, ip, 'ingress'], ingress)

    
    

# remove all ips older than e seconds
def remove_old_ips_from_range(current_ts):
    if DEBUG: print(f"  > remove IPs older than {e} seconds")
    pop_list=[]

    for path, ts in dp.search(subnet_dict, "**/last_seen",yielded=True): 
        # print(path, ts)
        if int(ts)  < current_ts - e : 
            # if DEBUG: print("remove old ip: {} ({})".format(path, ts))
            pop_list.append(path)

    print("    removing {} expired IP addresses".format(len(pop_list)))
    # b= len(subnet_dict["4"]["0"]["0.0.0.0"])
    for i, path in enumerate(pop_list): 
        try:
            path_elems= path.split("/")

            #dp.delete(subnet_dict, path.replace("/last_seen", "")) # too slow
            subnet_dict[path_elems[0]][path_elems[1]][path_elems[2]].pop(path_elems[3])

        except:
            print("    ERROR: {} cannot be deleted".format(path))
            pass
    






In [159]:
INSERT = False
CHECK_RANGES= True

e= 120
#     Insert netflow data into corresponding ranges


if CHECK_RANGES:
    for current_ts in netflow_df.ts_end.unique():
        
        cur_slice = netflow_df.loc[netflow_df.ts_end == current_ts]
        for i in cur_slice.itertuples():
            add_to_subnet(ip=i.src_ip, ingress=i.ingress, last_seen=i.ts_end)
        
        print(f"current ts: {current_ts}")
        remove_old_ips_from_range(current_ts=current_ts)

        is_prevalent_ingress_still_valid()

        # now go over all already classified ranges        
        for current_range in list(range_lookup_dict['4']) + list(range_lookup_dict['6']):
            print(f"\n   current_range: {current_range}")

            # dpath path
            current_range_path = __convert_range_string_to_range_path(current_range)
            
            while True:
                if check_if_enough_samples_have_been_collected(current_range_path):
                    prevalent_ingress = get_prevalent_ingress(current_range_path) # str or None
                    if prevalent_ingress != None:
                        print(f"        YES -> color {range} with {prevalent_ingress}")
                        # TODO color range with link color
                        set_prevalent_ingress_for_range(current_range_path, prevalent_ingress)
                        break
                    else:
                        print(f"        NO -> split subnet")
                        split_range(current_range_path)
                        break

                else:
                    print("      NO -> join siblings")
                    current_range_path = join_siblings(current_range_path)

                    if current_range_path == None: break
            


        print("\n   -------------- \n")
    # TODO after IPD loop is over -> change current_ts to next one


#   Check all ranges

current ts: 1605639480
  > remove IPs older than 120 seconds
    removing 0 expired IP addresses
  > prevalent color still valid?

   current_range: 0.0.0.0/0
 GET SAMPLE COUNT  4/0/0.0.0.0
    YES → is a single color prevalent ? (s_color >=q)
 GET SAMPLE COUNT  4/0/0.0.0.0
        prevalent for 4/0/0.0.0.0: None (-1.00)
        NO -> split subnet
        split range 4/0/0.0.0.0
          ['0.0.0.0/0', '0.0.0.0/1', '128.0.0.0/1']
        #items 3063; first 3 elems: [('4/0/0.0.0.0/31.13.84.48', {'last_seen': 1605639480, 'ingress': 'VIE-SB5.25'}), ('4/0/0.0.0.0/81.182.112.32', {'last_seen': 1605639480, 'ingress': 'VIE-SB5.17'}), ('4/0/0.0.0.0/45.57.17.144', {'last_seen': 1605639480, 'ingress': 'VIE-SB5.18'})]

   current_range: ::/0
 GET SAMPLE COUNT  6/0/::
ERROR: key 6/0/:: does not exist
    NO → join siblings ? (join(s_color) >= q or join(s_ipcount) < n_cidr−1)
      YES → join siblings and check again 
      NO → do nothing
      NO -> join siblings
        join siblings for range 6

In [150]:
import json

# print(subnet_dict["4"]["0"]['0.0.0.0']["69.171.250.48"])
# ["69.171.250.48"]

ingress= 'VIE-SB5.23'

# 4/0/0.0.0.0/69.171.250.48/last_seen
#split_range("4/0/0.0.0.0")
print(counter)

#change_list = []
#print(subnet_dict['4']['3'].keys())




#print(list(range_lookup_dict['4']))
#join_siblings("4/3/160.0.0.0")


# dp.search(subnet_dict, "4/3/128.0.0.0.0")

48


In [165]:
# print("before: ", list(range_lookup_dict["4"]))
# split_range("4/0/0.0.0.0")
# print("after: ", list(range_lookup_dict["4"]))

#print(subnet_dict['4']['3']['128.0.0.0'].keys())
print(list(range_lookup_dict['4']))

# join_siblings("4/3/128.0.0.0")
# print(dp.search(subnet_dict, "4/2/128.0.0.0"))
# print(dp.search(subnet_dict, "4/3/128.0.0.0"))

# subnet_dict['4']['2']["128.0.0.0.0"] = subnet_dict['4']['3'].pop("128.0.0.0")
# subnet_dict['4']['2']["128.0.0.0.0"] = subnet_dict['4']['3'].pop("160.0.0.0")
# subnet_dict['4']['2']["128.0.0.0.0"]

#set_prevalent_ingress("4/3/128.0.0.0", "STEFAN")

dp.search(subnet_dict, "4/3/128.0.0.0").keys()

['0.0.0.0/3', '32.0.0.0/3', '64.0.0.0/3', '96.0.0.0/3', '128.0.0.0/3', '160.0.0.0/3', '192.0.0.0/3', '224.0.0.0/3']


dict_keys(['4', 4])

In [34]:
# !pip install dpath

import dpath.util as dp

temp = "last_seen"

pop_list=[]




def check_prevalent_ingress(path): # dpath 
    print(path, "\n")

    def init_with_zero():
        return 0
    
    sample_count= get_sample_count(path)

    ## check if there is already a classified ingress for that range



   

    
    

    
    cur_prevalent=None

        
    
    # now differnt things can happen:
    #   1. last_prevalent == cur_prevalent and last_prevalent != None -> prevalent ingress still valid -> fine
    #   2. cur_prevalent == None                                      -> no prevalent ingress found
    #   3. cur_prevalent != None and last_prevalent == None  
    if cur_prevalent == None:
        print("no prevalent ingress found - split subnets")
    elif (last_prevalent == None and cur_prevalent != None):
        print("set new prevalent ingress for {}: {}".format(path, cur_prevalent))
    elif (last_prevalent != None and cur_prevalent != last_prevalent):
        print("current prevalent ingress != last one -> remove all information")
        print("     {} != {} ".format(cur_prevalent, last_prevalent))
    else:
        print("not catched: ", path, last_prevalent, cur_prevalent)
    

    
#is_prevalent_color_still_valid()
q=0.9
is_prevalent_ingress_still_valid()



prevalent for 4/0/0.0.0.0/prevalent: None (-1.00)
     NOO - not valid anymore -> delete all information
