In [115]:
import re
import sys
import pandas as pd
import numpy as np

In [116]:
# One compiled pattern per probe line type, keyed by probe name. Each pattern
# looks for "<NAME>(...)" and exposes the text between the opening parenthesis
# and the last closing parenthesis on the line as a group named after the probe.
probes_sent_dict = {
    probe: re.compile(r'{0}\((?P<{0}>.*)\)'.format(probe))
    for probe in (
        'SEQ', 'OPS', 'WIN', 'ECN',
        'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7',
        'U1', 'IE',
    )
}

# Splits one "key=value" test into its two halves. Both groups are greedy, so
# the split happens at the last '=' in the searched text.
test_parser = re.compile(r'(?P<key>.*)=(?P<value>.*)')

fingerprint_dict = {
    'SEQ.SP':np.nan,
    'SEQ.GCD':np.nan,
    'SEQ.ISR':np.nan,
    'SEQ.TI':np.nan,
    'SEQ.CI':np.nan,
    'SEQ.II':np.nan,
    'SEQ.SS':np.nan,
    'SEQ.TS':np.nan,

    'OPS.O1':np.nan,
    'OPS.O2':np.nan,
    'OPS.O3':np.nan,
    'OPS.O4':np.nan,
    'OPS.O5':np.nan,
    'OPS.O6':np.nan,

    'WIN.W1':np.nan,
    'WIN.W2':np.nan,
    'WIN.W3':np.nan,
    'WIN.W4':np.nan,
    'WIN.W5':np.nan,
    'WIN.W6':np.nan,

    'ECN.R':np.nan,
    'ECN.DF':np.nan,
    'ECN.T':np.nan,
    'ECN.TG':np.nan,
    'ECN.W':np.nan,
    'ECN.O':np.nan,
    'ECN.CC':np.nan,
    'ECN.Q':np.nan,

    'T1.R':np.nan,
    'T1.DF':np.nan,
    'T1.T':np.nan,
    'T1.TG':np.nan,
    'T1.S':np.nan,
    'T1.A':np.nan,
    'T1.F':np.nan,
    'T1.RD':np.nan,
    'T1.Q':np.nan,

    'T2.R':np.nan,
    'T2.DF':np.nan,
    'T2.T':np.nan,
    'T2.TG':np.nan,
    'T2.W':np.nan,
    'T2.S':np.nan,
    'T2.A':np.nan,
    'T2.F':np.nan,
    'T2.O':np.nan,
    'T2.RD':np.nan,
    'T2.Q':np.nan,

    'T3.R':np.nan,
    'T3.DF':np.nan,
    'T3.T':np.nan,
    'T3.TG':np.nan,
    'T3.W':np.nan,
    'T3.S':np.nan,
    'T3.A':np.nan,
    'T3.F':np.nan,
    'T3.O':np.nan,
    'T3.RD':np.nan,
    'T3.Q':np.nan,

    'T4.R':np.nan,
    'T4.DF':np.nan,
    'T4.T':np.nan,
    'T4.TG':np.nan,
    'T4.W':np.nan,
    'T4.S':np.nan,
    'T4.A':np.nan,
    'T4.F':np.nan,
    'T4.O':np.nan,
    'T4.RD':np.nan,
    'T4.Q':np.nan,

    'T5.R':np.nan,
    'T5.DF':np.nan,
    'T5.T':np.nan,
    'T5.TG':np.nan,
    'T5.W':np.nan,
    'T5.S':np.nan,
    'T5.A':np.nan,
    'T5.F':np.nan,
    'T5.O':np.nan,
    'T5.RD':np.nan,
    'T5.Q':np.nan,

    'T6.R':np.nan,
    'T6.DF':np.nan,
    'T6.T':np.nan,
    'T6.TG':np.nan,
    'T6.W':np.nan,
    'T6.S':np.nan,
    'T6.A':np.nan,
    'T6.F':np.nan,
    'T6.O':np.nan,
    'T6.RD':np.nan,
    'T6.Q':np.nan,

    'T7.R':np.nan,
    'T7.DF':np.nan,
    'T7.T':np.nan,
    'T7.TG':np.nan,
    'T7.W':np.nan,
    'T7.S':np.nan,
    'T7.A':np.nan,
    'T7.F':np.nan,
    'T7.O':np.nan,
    'T7.RD':np.nan,
    'T7.Q':np.nan,

    'U1.R':np.nan,
    'U1.DF':np.nan,
    'U1.T':np.nan,
    'U1.TG':np.nan,
    'U1.IPL':np.nan,
    'U1.UN':np.nan,
    'U1.RIPL':np.nan,
    'U1.RID':np.nan,
    'U1.RIPCK':np.nan,
    'U1.RUCK':np.nan,
    'U1.RUD':np.nan,

    'IE.R':np.nan,
    'IE.DFI':np.nan,
    'IE.T':np.nan,
    'IE.TG':np.nan,
    'IE.CD':np.nan,
}



In [117]:
def _parse_probe(line):
    """
    Identify which probe a line belongs to and extract its payload.

    The line is stripped of surrounding whitespace and tried against each
    pattern in `probes_sent_dict`, in that dict's iteration order. For the
    first pattern that matches, returns `(probe_name, captured_text)`, where
    the captured text is the group named after the probe. If no pattern
    matches, returns `(None, None)`.
    """
    stripped = line.strip()

    # Lazy: patterns are only searched until the first hit is consumed.
    attempts = (
        (name, pattern.search(stripped))
        for name, pattern in probes_sent_dict.items()
    )
    hits = ((name, found.group(name)) for name, found in attempts if found)

    return next(hits, (None, None))

    

In [118]:
def _parse_fingerprint(fingerprint):
    """
    Build one flat record from the lines of a single fingerprint entry.

    `fingerprint` is an iterable of text lines. Each line is handed to
    `_parse_probe`; lines it does not recognise are skipped. For a recognised
    probe line, the captured payload is split on '%' and every piece that
    `test_parser` matches is stored under the key "<probe>.<test key>". An
    empty value is stored as NaN; any other value is kept as the raw string.

    Returns a new dict that starts as a copy of `fingerprint_dict`, so tests
    that never appear keep their NaN default. Keys that are not in the
    template are added to the record as they are encountered.
    """
    result = fingerprint_dict.copy()

    for probe in fingerprint:
        probe_key, probe_responses = _parse_probe(probe)

        # Identity check: _parse_probe signals "no probe on this line" with None.
        if probe_key is None:
            continue

        for test in probe_responses.split('%'):
            match = test_parser.search(test)
            if not match:
                # Piece without '=' (e.g. an empty payload); nothing to record.
                continue

            column = probe_key + "." + match.group('key')
            test_value = match.group('value')
            result[column] = np.nan if test_value == "" else test_value

    return result

    

In [119]:
def parse_database(filepath):
    """
    Parse a fingerprint database file into a list of flat records.

    The file at `filepath` is read in full and split into chunks on blank
    lines ('\\n\\n'). Each chunk's lines are passed to `_parse_fingerprint`,
    and the resulting dicts are returned in file order.

    Chunks that are empty or contain only whitespace (produced by trailing
    blank lines, runs of several blank lines, or an empty file) are skipped
    so they do not show up as spurious all-NaN records. Chunks that contain
    text but no recognised probe lines are still parsed and therefore still
    yield a record holding only the NaN defaults.
    """
    # Read everything up front so the file handle is released before parsing.
    with open(filepath, 'r') as database_file:
        contents = database_file.read()

    return [
        _parse_fingerprint(chunk.splitlines())
        for chunk in contents.split('\n\n')
        if chunk.strip()
    ]


In [120]:
#db = pd.DataFrame(parse_database('db6.txt'))