# In [None]:
import os
import gzip
import csv
import math
from collections import defaultdict
import json
import pytricia
import ipaddress

# Tuning parameters, read as module-level globals by the code below.
cidr_max = 28   # max split cidr mask; every address is masked to this prefix length before it is stored
t=60            # bin width in seconds; timestamps are rounded down to a multiple of t
e= 2*t          # 120  # expire time of IPs
q = 0.95        # needed ingress fraction
c = 0.00025          # scale factor of the sample threshold: int(c * sqrt(2^(ip_bits - cidr))), ip_bits = 32 (IPv4) / 128 (IPv6)
### TODO concurrent adding to range from multiple Netflow parsers

class IPDRange:
    """Per-time-bin bookkeeping of which ingress link source addresses arrive on.

    All observations live in ``self.ipd_range``, a five-level nested
    ``defaultdict``::

        ipd_range[last_seen][ip_version][range][ip_address][ingress] -> stats

    where ``stats`` is the dict produced by ``__atts`` (``last_seen``,
    ``match``, ``miss``, ``counter``).  Only ``counter`` is maintained so far.

    The class reads the module-level tuning values ``cidr_max``, ``c`` and
    ``q`` at call time, so they must be defined before the methods are used.
    """

    ### range as hierarchical dicts

    # |----------------------|
    # |                      | /0
    # |----------------------|
    #
    # 0.0.0.0/1  128.0.0.0/1
    # |----------------------|
    # |          |           | /1
    # |----------------------|
    # 0./2  64./2 128./2 192./2
    # |----------------------|
    # |     |    |     |     | /2
    # |----------------------|
    #
    # ....
    # ipd_range[last_seen][ip_version][range][ip_address][ingress]
    #

    def __init__(self):
        """Create an empty range structure."""
        self.ipd_range = self.__multi_dict(5, self.__atts)

        # TODO keep one pytricia.PyTricia per IP version (IPv4 / IPv6), each
        # seeded with its default route ("0.0.0.0/0" and "::/0"), to look up
        # the range an address currently belongs to.  The earlier draft built
        # these tries into local variables that were thrown away when
        # __init__ returned, so nothing ever used them.

    def __atts(self):
        """Return a fresh statistics record for one (address, ingress) pair."""
        return {'last_seen': 0, 'match': 0, 'miss': 0, 'counter': 0}

    def __multi_dict(self, K, leaf_factory):
        """Build a ``defaultdict`` nested ``K`` levels deep.

        Args:
            K: number of nesting levels, at least 1.
            leaf_factory: zero-argument callable creating the value stored at
                the innermost level.

        Raises:
            ValueError: if ``K`` is smaller than 1 (this would otherwise
                recurse without end).
        """
        if K < 1:
            raise ValueError("nesting depth must be >= 1, got {}".format(K))
        if K == 1:
            return defaultdict(leaf_factory)
        return defaultdict(lambda: self.__multi_dict(K - 1, leaf_factory))

    def __get_min_samples(self, cidr, ip_version=4):
        """Return the sample threshold for a range with prefix length ``cidr``.

        Computed as ``int(c * sqrt(2 ** (address_bits - cidr)))`` where
        ``address_bits`` is 128 for ``ip_version == 6`` and 32 otherwise, and
        ``c`` is the module-level scale factor.
        """
        ipv_max = 128 if ip_version == 6 else 32
        return int(c * math.sqrt(math.pow(2, ipv_max - cidr)))

    def add_to_range(self, ip_version, ip_address, last_seen, ingress):
        """Count one observation of ``ip_address`` arriving via ``ingress``.

        Args:
            ip_version: key for the IP-version level (callers pass 4 or 6).
            ip_address: source address as a string; it is masked to
                ``cidr_max`` bits before being stored.
            last_seen: key for the time-bin level.
            ingress: identifier of the ingress link.
        """
        # to reduce state -> mask to cidr_max
        masked = ipaddress.ip_network("{}/{}".format(ip_address, cidr_max), strict=False)
        ip_address = str(masked.network_address)

        # TODO check if already in another cidr -> if not 0
        cidr = 0
        covering = ipaddress.ip_network("{}/{}".format(ip_address, cidr), strict=False)
        self.ipd_range[last_seen][ip_version][str(covering)][ip_address][ingress]['counter'] += 1

    def add_to_range_test(self):
        """Insert a fixed set of synthetic IPv4 observations (bins 0 and 60)."""
        sb4, sb5 = "VIE-SB4", "VIE-SB5"
        # (ip_address, ingress, last_seen); the order is kept because it
        # determines dict insertion order and therefore the printed output.
        samples = (
            ("123.0.0.0", sb4, 0), ("124.0.0.0", sb4, 0),
            ("80.12.15.0", sb5, 0), ("80.12.16.0", sb5, 0),
            ("80.12.17.0", sb5, 60), ("80.12.18.0", sb5, 60),
            ("80.12.19.0", sb5, 60), ("80.12.20.0", sb5, 60),
            ("80.12.21.0", sb5, 60),
            ("125.0.0.0", sb4, 0), ("126.0.0.0", sb4, 0),
            ("80.13.15.0", sb5, 0), ("80.14.15.0", sb5, 0),
            ("80.15.15.0", sb5, 60), ("80.16.15.0", sb5, 60),
            ("80.17.15.0", sb5, 60), ("80.18.15.0", sb5, 60),
            ("80.19.15.0", sb5, 60),
            ("127.0.0.0", sb4, 0), ("128.0.0.0", sb4, 0),
            ("80.20.15.0", sb5, 0), ("80.21.15.0", sb5, 0),
            ("80.22.15.0", sb5, 60), ("80.23.15.0", sb5, 60),
            ("80.24.15.0", sb5, 60), ("80.25.15.0", sb5, 60),
            ("80.26.15.0", sb5, 60),
            ("129.0.0.0", sb4, 0), ("130.0.0.0", sb4, 0),
            ("80.27.15.0", sb5, 0), ("80.27.16.0", sb5, 0),
            ("80.12.33.0", sb5, 60), ("80.12.34.0", sb5, 60),
            ("80.12.35.0", sb5, 60), ("80.12.36.0", sb5, 60),
            ("80.12.37.0", sb5, 60),
        )
        for address, ingress, last_seen in samples:
            self.add_to_range(ip_version=4, ip_address=address,
                              ingress=ingress, last_seen=last_seen)

    def __str__(self):
        """Return the whole range structure as indented JSON."""
        return json.dumps(self.ipd_range, indent=4)

    def find_prefix_in_range(self):
        """Not implemented yet."""
        # https://www.geeksforgeeks.org/how-to-use-dict-get-with-multidimensional-dict/
        # print(dict.get('emp2', {}).get('Name', {}).get('Last Name', 'Not Found'))
        pass

    def check_prevalent_ingress(self, cur_range, min_fraction=None):
        """Return the ingress that dominates ``cur_range``, or ``None``.

        Args:
            cur_range: mapping ``ip_address -> ingress -> stats`` as stored
                below one range key of ``ipd_range``.
            min_fraction: share of all counted observations one ingress must
                reach; defaults to the module-level ``q``.

        Returns:
            The ingress whose summed ``counter`` is at least ``min_fraction``
            of the total, or ``None`` when no ingress reaches that share or
            the range holds no observations.

        NOTE(review): observations are weighted by ``counter``; confirm
        whether distinct addresses should be counted instead.
        """
        if min_fraction is None:
            min_fraction = q

        per_ingress = defaultdict(int)
        for ingress_stats in cur_range.values():
            for ingress, stats in ingress_stats.items():
                per_ingress[ingress] += stats['counter']

        total = sum(per_ingress.values())
        if total == 0:
            return None

        ingress, hits = max(per_ingress.items(), key=lambda item: item[1])
        return ingress if hits / total >= min_fraction else None

    def check_range(self):
        """Evaluate every IPv4 range of every time bin.

        Prints ``<time bin> <range>`` for each range.  For ranges holding at
        least the threshold number of distinct (masked) addresses it also
        prints the prevalent ingress, or ``None`` if there is none.
        """
        #https://stackoverflow.com/questions/22162321/search-for-a-value-in-a-nested-dictionary-python

        # iterate over every binned timestamp
        for time_bin in self.ipd_range:

            # TODO for now we only use ipv4
            # .get() keeps the lookup from inserting an empty IPv4 entry into
            # the defaultdict when a bin only holds other IP versions.
            ranges_v4 = self.ipd_range[time_bin].get(4, {})

            for net, samples in ranges_v4.items():
                print(time_bin, net)

                # check if enough samples in range
                current_cidr = int(net.split("/")[1])
                if len(samples) >= self.__get_min_samples(current_cidr):
                    # YES -> is a single color prevalent
                    prevalent = self.check_prevalent_ingress(samples)
                    print("  prevalent ingress:", prevalent)
                else:
                    # NO -> join siblings?
                    # TODO recursive?
                    pass

# Smoke run: build an empty range, load the synthetic samples and evaluate them.
ipd = IPDRange()
ipd.add_to_range_test()
ipd.check_range()

# In [None]:

# Tuning parameters for the netflow run (c overrides the value set in the first cell).
cidr_max = 28   # max split cidr mask; every address is masked to this prefix length before it is stored
t=60            # bin width in seconds; timestamps are rounded down to a multiple of t
e= 2*t          # 120  # expire time of IPs
q = 0.95        # needed ingress fraction
c = 64          # scale factor of the sample threshold: int(c * sqrt(2^(ip_bits - cidr))), ip_bits = 32 (IPv4) / 128 (IPv6)


# Input files. Archived netflow path pattern:
# /data/slow/mehner/netflow/parser_[00..25]/archived/[@000000000000001605639660.gz|@000000000000001605643260.gz]
netflow_path="/data/slow/mehner/netflow/dummy_netflow.gz"
#netflow_path="/data/slow/mehner/netflow/parser_25/archived/@000000000000001605639660.gz" # reading one full netflow file takes 21 min
ingresslink_file = "/data/slow/mehner/ingresslink/1605571200.gz"                # if we get more netflow, we should adjust the file
router_ip_mapping_file="/data/slow/mehner/router_lookup_tables/1605571200.txt"
# output folder (created below if it does not exist yet)
output_folder="results"

os.makedirs(output_folder, exist_ok=True)
    

###################################################
########### ROUTER NAME <--> IP MAPPING ###########
###################################################
# Build the lookup table from the space-separated mapping file: first column -> second column.
# newline='' lets the csv module do its own line-ending handling, as its documentation asks.
with open(router_ip_mapping_file, 'r', newline='') as csv_file:
    router_ip_mapping_csv = csv.reader(csv_file, delimiter=' ')
    # csv.reader yields [] for a blank line; rows with fewer than two columns
    # are skipped instead of aborting the load with an IndexError.
    router_ip_lookup_dict = {
        rows[0]: rows[1] for rows in router_ip_mapping_csv if len(rows) >= 2
    }

###################################################
###########     INGRESS LINK FILE       ###########
###################################################

print("> load ingresslink file")

# Membership table of known ingress links, keyed "<router>.<in_iface>".
# Kept as a dict because read_netflow_file queries it with .get().
ingresslink_dict = {}
with gzip.open(ingresslink_file, 'rb') as f:
    for line in f:
        # Drop the line terminator first: otherwise a line with exactly two
        # fields yields an interface ending in "\n" and the resulting key can
        # never match a lookup.
        fields = line.decode('utf-8').rstrip("\r\n").split(",")
        if len(fields) < 2:
            # blank or truncated line -> nothing to register
            continue
        router = fields[0].replace("PEER_SRC_IP=", "")
        in_iface = fields[1].replace("IN_IFACE=", "")

        ingresslink_dict["{}.{}".format(router, in_iface)] = True
print("  ...done\n")

###################################################
###########     READ NETFLOW             ###########
###################################################

# reads a netflow file
# filter for ingress only
# mask ip to cidr_max
# insert ip into corresponding range

def read_netflow_file(netflow_file):
    """Feed the ingress flows of one gzip-compressed netflow file into ``ipd``.

    Every line is split on "," and a line whose third-from-last field is the
    literal "TIMESTAMP_END" is skipped as the header.  For the remaining
    lines the router name is looked up in ``router_ip_lookup_dict`` and the
    flow is kept only if "<router name>.<in iface>" is present in
    ``ingresslink_dict``.  Kept flows are passed to ``ipd.add_to_range`` with
    the end timestamp rounded down to a multiple of ``t`` (as a string).

    Args:
        netflow_file: path of the gzip-compressed netflow file.
    """
    # Netflow files are written per hour, e.g. @000000000000001605639660.gz and @000000000000001605643260.gz
    #
    # TAG     PEER_SRC_IP  IN IFACE OUT_IFACE SRC_IP          DST_NET        SRC_PORT DST_PORT PROTO  _       _       TS_START        TS_END    PKTS    BYTES
    # 0       194.25.7.141    13      1571    91.127.69.122   31.13.84.4      40730   443     tcp     0       i       1605639641      1605639641 1       121
    # 0       194.25.7.141    13      1530    91.228.166.91   66.171.29.67    80      45659   tcp     0       i       1605639641      1605639641 1       46
    # 0       194.25.7.141    646     17      46.125.249.104  5.187.227.210   42681   27046   udp     0       i       1605639641      1605639642 2       2184
    # 0       194.25.7.141    633     21      216.58.212.175  178.165.131.117 443     3462    udp     0       i       1605639641      1605639641 1       56
    #
    # NOTE(review): the sample above is whitespace-separated while the parser
    # splits on "," - confirm the real on-disk format.
    with gzip.open(netflow_file, 'rb') as flows:
        for raw in flows:
            fields = raw.decode('utf-8').split(",")
            router_name = router_ip_lookup_dict.get(fields[1])

            ts_end = fields[-3]
            if ts_end == "TIMESTAMP_END":
                # header line
                continue

            in_iface = fields[2]
            src_ip = fields[4]  # masked to cidr_max later, inside add_to_range
            if ":" in src_ip:
                ip_version = 6
            else:
                ip_version = 4

            # bin the timestamp to t (~ the time between calculations)
            cur_ts = str(int(int(ts_end) / t) * t)

            # only ingress traffic is of interest: skip every flow whose
            # router/interface pair is not a known ingress link
            ingress = "{}.{}".format(router_name, in_iface)
            if not ingresslink_dict.get(ingress, False):
                continue

            # insert IP into corresponding range
            ipd.add_to_range(ip_version=ip_version, ip_address=src_ip, last_seen=cur_ts, ingress=ingress)

## TODO insert whole netflow from all parsers 


# Start from an empty range (read_netflow_file writes into the module-level `ipd`)
# and load the configured netflow file into it.
ipd = IPDRange()
read_netflow_file(netflow_path)
