Skip to content

Commit

Permalink
Add scripts to download and process BGP dumps
Browse files Browse the repository at this point in the history
  • Loading branch information
naumenkogs committed Dec 20, 2019
1 parent 6b87ab1 commit 9210017
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 0 deletions.
25 changes: 25 additions & 0 deletions remote_dumps/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
This set of scripts downloads, parses and aggregates BGP announcement dumps from open repositories, to be used in asmap construction.

### Pre-reqs

``./setup.sh``

### Use

0. ``./prepare.sh`` deletes old data.
1. ``./download_dumps.py`` downloads the latest RIPE dumps (the current date is recorded in the file names) to the `dumps` folder.
2. ``./quagga_parse.sh`` reads dumps from the `dumps` folder and
writes the human readable interpretation to the `paths` folder.
3. ``./quagga_aggregate.py`` goes through the interpreted dumps in ``paths`` folder, aggregates paths and assigns every IP prefix to the first element of the common suffix of the asn path.

Resulting ``prefix_asns.out`` can be fed to ``../buildmap.py``.

### Rationale

Consider the following scenario:
1.2.3.4: A -> B -> C -> X
1.2.3.4: A -> F -> C -> X

In this case, {C, X} is the common suffix, and we will map 1.2.3.4 to C, because C represents the single infrastructure required to reach that IP address.

Note that diversifying by C would implicitly diversify by X too.
22 changes: 22 additions & 0 deletions remote_dumps/download_dumps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
"""Download the latest full BGP table dumps (bview) from the RIPE RIS collectors.

Each collector rrcNN publishes its most recent dump at
http://data.ris.ripe.net/rrcNN/latest-bview.gz.  The files are saved into
dumps/ as dump_<NN>_<date>.gz for later processing by quagga_parse.sh.
"""

import datetime
import os
import shutil
import urllib.error
import urllib.request

# Collector ids rrc01..rrc23.
# NOTE(review): range(1, 24) skips rrc00 — confirm that is intentional.
providers = range(1, 24)
date = datetime.date.today()  # only used to tag the output file names

dumps_dir = "dumps/"
os.makedirs(dumps_dir, exist_ok=True)  # tolerate a missing or pre-existing dir

for provider in providers:
    provider_id = "{:02d}".format(provider)
    link = "http://data.ris.ripe.net/rrc{0}/latest-bview.gz".format(provider_id)
    dump_name = "dump_{0}_{1}.gz".format(provider_id, date)
    print(link)
    try:
        # Stream the response straight to disk instead of buffering the whole
        # (potentially large) dump in memory, and close both handles.
        with urllib.request.urlopen(link) as dump, \
                open(dumps_dir + dump_name, 'wb') as output:
            shutil.copyfileobj(dump, output)
    except urllib.error.URLError:
        # Best effort: one dead collector should not abort the whole run.
        print('Failed to download: ' + link)
        continue
5 changes: 5 additions & 0 deletions remote_dumps/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

# Delete artifacts of a previous run so the pipeline starts from scratch.
# -f: do not fail (or prompt) when the directories are already empty or
# prefix_asns.out has not been produced yet, e.g. on a fresh checkout.
rm -f dumps/* paths/* prefix_asns.out
70 changes: 70 additions & 0 deletions remote_dumps/quagga_aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python

import sys
import os
import re

PARSED_DUMPS_DIR = 'paths/'
RESULT_OUTPUT = 'prefix_asns.out'
first_octet = re.compile(r"^[^.|:]*")


# Remove duplicate asns in a row
# [1, 1, 2, 3, 3, 3] -> [1, 2, 3]
# Remove duplicate asns in a row
# [1, 1, 2, 3, 3, 3] -> [1, 2, 3]
def dedup(asn_path):
    """Return asn_path with runs of consecutive equal ASNs collapsed to one.

    The previous in-place scan started the comparison at index len-2 and
    therefore never compared the final pair, so a duplicate at the end of
    the path survived (e.g. [1, 1, 2, 3, 3, 3] -> [1, 2, 3, 3]).
    """
    # Keep an element only when it differs from its immediate predecessor.
    return [asn for idx, asn in enumerate(asn_path)
            if idx == 0 or asn != asn_path[idx - 1]]

def find_common_suffixes(prefix_asn_paths, common_asn_suffix):
    """For every prefix, narrow its observed AS paths to their common suffix.

    prefix_asn_paths: dict mapping prefix -> iterable of AS-path strings
        (space-separated ASNs, destination last).
    common_asn_suffix: output dict, mutated in place; maps prefix -> list of
        ASNs forming the suffix shared by the paths to that prefix.
    """
    for prefix, asn_lists in prefix_asn_paths.items():
        asn_lists = [dedup(asn_list.split(' ')) for asn_list in asn_lists] # preprocess
        # Shortest path first: the common suffix can never be longer than it.
        asn_lists.sort(key = len)
        cur_asn_suffix = asn_lists[0] # represents the common sub-path (from the end) of asns to a prefix
        for asn_list in asn_lists[1:]:
            if cur_asn_suffix == asn_list:
                continue
            if cur_asn_suffix[-1] != asn_list[-1]: # multi-homed
                # NOTE(review): on a multi-homed prefix narrowing stops but
                # the suffix accumulated so far is still recorded below —
                # confirm this is intended rather than dropping the prefix.
                break
            cur_asn_suffix_len = len(cur_asn_suffix)
            for i in range(1, cur_asn_suffix_len): # position from the end
                # First mismatch counting from the end truncates the candidate
                # suffix to the elements after the mismatch position.
                if cur_asn_suffix[len(cur_asn_suffix) - i - 1] != asn_list[len(asn_list) - i - 1]:
                    cur_asn_suffix = cur_asn_suffix[len(cur_asn_suffix) - i:]
                    break
        common_asn_suffix[prefix] = cur_asn_suffix

def process_files():
    """Aggregate all parsed dumps in PARSED_DUMPS_DIR into a dict that maps
    each announced prefix to the common ASN suffix of its AS paths.

    The files are re-scanned once per first-octet chunk so only a slice of
    the announcements is held in memory at a time.
    """
    res = dict()
    files = os.listdir(PARSED_DUMPS_DIR)
    # Matches a brace-delimited AS set such as "{123,456}".  The previous
    # pattern r'{*}' only consumed the closing brace (plus any '{' directly
    # before it), so "{123,456" garbage tokens were left in the path.
    # Compiled once here: it runs on every line of every file.
    as_set = re.compile(r'\{[^}]*\}')
    step = 40
    for i in range(1, 256, step): # process ip range chunks so that memory is not filled
        print("Working on chunk: ", i, flush=True)
        announcements = dict()
        for file_name in files:
            print('Reading file: ', file_name, flush=True)
            with open(PARSED_DUMPS_DIR + file_name, "r") as file:
                for line in file:
                    announcement_data = as_set.sub(' ', line.strip()) # removes {} sets in AS path
                    announcement_data = announcement_data.split('|')
                    prefix = announcement_data[0]
                    first_oc = re.search(first_octet, prefix).group(0)
                    # NOTE(review): the break assumes lines are sorted by
                    # prefix; a line whose leading field parses to a large
                    # number (e.g. an IPv6 prefix "2001:...") also ends the
                    # scan of this file — confirm dumps are IPv4-first.
                    if first_oc == '' or int(first_oc) > i + step: # passed current chunk
                        break
                    if int(first_oc) < i: # current chunk is ahead
                        continue
                    asns = announcement_data[1]
                    announcements.setdefault(prefix, set()).add(asns)
        find_common_suffixes(announcements, res)
    return res

def dump_result(prefix_unique_asn_suffixes):
    """Write the prefix -> ASN mapping to RESULT_OUTPUT, one line per prefix.

    Each line has the form "<prefix> AS<asn>", where <asn> is the first
    element of the prefix's common suffix.
    """
    with open(RESULT_OUTPUT, 'w+') as file:
        for prefix, unique_asn_suffix in prefix_unique_asn_suffixes.items():
            # Guard clause: skip empty suffixes or an empty leading ASN.
            # This happens very rarely. TODO debug
            if unique_asn_suffix == [] or unique_asn_suffix[0] == '':
                continue
            file.write("%s AS%s\n" % (prefix, unique_asn_suffix[0]))

# Entry point: aggregate the parsed dumps and write prefix_asns.out.
# Guarded so that importing this module for its helpers (e.g. dedup) does
# not kick off a full processing run as a side effect.
if __name__ == "__main__":
    res = process_files()
    dump_result(res)
8 changes: 8 additions & 0 deletions remote_dumps/quagga_parse.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

# Parse every MRT dump in dumps/ into a human-readable path file in paths/,
# keeping fields 6 and 7 of bgpdump's machine-readable output (the announced
# prefix and the AS path — exactly what quagga_aggregate.py consumes).
# Glob instead of parsing `ls` output, and quote every expansion, so the
# loop is safe for any filename.
for mrt in dumps/*; do
    name=$(basename "$mrt")
    /bin/echo -n "processing $name... "
    /usr/local/bin/bgpdump -vm "$mrt" | cut -d '|' -f '6,7' > "paths/$name"
done

12 changes: 12 additions & 0 deletions remote_dumps/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Prepare working directories and build/install libbgpdump (used by
# quagga_parse.sh via /usr/local/bin/bgpdump, the default install prefix).

# Abort on the first failure so a failed download or build does not
# cascade into running `make install` on nothing.
set -e

# -p: succeed even when the directories already exist (re-runs).
mkdir -p dumps paths

wget http://ris.ripe.net/source/bgpdump/libbgpdump-1.6.0.tgz
tar zxvf libbgpdump-1.6.0.tgz
rm libbgpdump-1.6.0.tgz
cd libbgpdump-1.6.0
./bootstrap.sh
make install
cd ..

0 comments on commit 9210017

Please sign in to comment.