In [3]:
def _readSections(path):
    """Return the sections of the text file at `path`, split on blank lines."""
    # The context manager closes the handle as soon as the text is read,
    # instead of leaving an open file for the garbage collector.
    with open(path, 'r') as handle:
        return handle.read().split('\n\n')


# Each input is a list of blank-line-separated sections of the puzzle file.
inputExample = _readSections('day19-example.txt')
inputExample2 = _readSections('day19-example2.txt')
inputReal = _readSections('day19.txt')

import re

In [4]:
def parse(input):
    """Split the puzzle input into a rule table and a list of messages.

    `input[0]` is the rules section and `input[1]` the messages section,
    each a newline-separated string.

    Returns a tuple `(rules, testCases)`: `rules` maps the integer rule
    number to the raw rule body (the text after ': '), and `testCases` is
    the list of message lines.
    """
    rules = {}
    for fields in (line.split(': ') for line in input[0].split('\n')):
        number = int(fields[0])
        rules[number] = fields[1]

    messages = input[1].split('\n')
    return (rules, messages)

In [5]:
def build(rules, rule):
    """Translate rule number `rule` into an equivalent regex fragment.

    `rules` maps rule numbers to raw rule bodies as produced by `parse`.
    A quoted terminal yields its bare character; any other rule yields a
    parenthesised group in which sub-rules are expanded recursively and
    '|' tokens become regex alternation.
    """
    pieces = []
    for token in rules[rule].split(' '):
        marker = token[0]
        if marker == '"':
            # Quoted terminal: the rule is just this single character.
            return token[1]
        if marker == '|':
            pieces.append("|")
        else:
            pieces.append(build(rules, int(token)))

    return "(" + "".join(pieces) + ")"

In [15]:
def part1(input):
    """Print the number of messages in `input` that match rule 0.

    Rule 0 is expanded into one regex via `build` and anchored at both
    ends, so a message only counts when the pattern spans all of it.
    """
    rules, messages = parse(input)

    pattern = re.compile("^" + build(rules, 0) + "$")

    print(sum(1 for message in messages if pattern.search(message)))

In [16]:
# Part 1 on the first example input (day19-example.txt).
part1(inputExample)

2


In [17]:
# Part 1 on the real puzzle input (day19.txt).
part1(inputReal)

226


## Part 2

This feels like a substantial step up from part 1 - at least if, like me, you recognised that the grammar described by the rules is a [Chomsky Type-3 grammar](https://en.wikipedia.org/wiki/Chomsky_hierarchy) (i.e., a regular grammar, which a regex can match) and used built-in regex tooling to solve it.

The addition of loops isn't, by itself, fatal to the use of regular expressions: the replacement rule 8 (`8: 42 | 42 8`) is representable in regex as `(<RULE42>)+` - all it's doing is repeating rule 42 one or more times.

However, the new rule 11 (`11: 42 31 | 42 11 31`) **is** fatal to the use of regexes. This one rule transforms the grammar as a whole from a Type-3 to a Type-2 (context-free) grammar. This is because there has to be a matching number of 42s and 31s.

Luck is, however, on our side. I'm not particularly interested in writing a full parser for *all* the rules - that sounds like far too much effort for an AoC problem. However, the new rules 8 and 11 are called from exactly one other rule - rule 0. Additionally, both rules 8 and 11 only call (themselves, or) rules 42 and 31, and everything below those rules can still be considered as a Type-3 grammar. Let's see if we can build a parser where rules 0, 8 and 11 are handled by hand-written code, and rules 42 and 31 are matched with regexes.

Through some manual shuffling of the example2 rules, we can determine that the valid examples all have a fixed length of 15. (The right-hand column is the length of the string each rule matches.)

```
0: 8 11            15
1: "a"             1
2: 1 24 | 14 4     3
3: 5 14 | 16 1     3
4: 1 1             2
5: 1 14 | 15 1     2
6: 14 14 | 1 14    2
7: 14 5 | 1 21     3
8: 42              5
9: 14 27 | 1 26    4
10: 23 14 | 28 1   4
11: 42 31          10
12: 24 14 | 19 1   3
13: 14 3 | 1 12    4
14: "b"            1
15: 1 | 14         1
16: 15 1 | 14 14   2
17: 14 2 | 1 7     4
18: 15 15          2
19: 14 1 | 14 14   2
20: 14 14 | 1 15   2
21: 14 1 | 1 14    2
22: 14 14          2
23: 25 1 | 22 14   3
24: 14 1           2
25: 1 1 | 1 14     2
26: 14 22 | 1 20   3
27: 1 6 | 14 18    3
28: 16 1           3
31: 14 17 | 1 13   5
42: 9 14 | 10 1    5
```

In [48]:
def scanRules(input):
    """Compute the fixed match length of every resolvable rule.

    The rules are taken from `parse(input)`. A quoted terminal has length
    1; any other rule's length is the sum of its sub-rules' lengths. A rule
    is resolved once all the sub-rules it references are resolved, and
    passes repeat until one makes no progress.

    Returns a dict mapping rule number to match length. Rules that never
    become resolvable (for example, ones that reference themselves or a
    missing rule) are left out of the result.

    Raises:
        Exception: if the alternatives of a rule have different lengths.
    """
    (pendingRules, _) = parse(input)
    ruleTotals = {}

    def alternativeLengths(body):
        """Return the length of each alternative in `body`, or None if a
        referenced sub-rule has no known length yet."""
        lengths = [0]
        for token in body.split(' '):
            if token[0] == '"':
                # Quoted terminal: always exactly one character.
                lengths[-1] = 1
                break
            if token == '|':
                lengths.append(0)
            elif int(token) in ruleTotals:
                lengths[-1] += ruleTotals[int(token)]
            else:
                return None
        return lengths

    progressed = True
    while progressed:
        progressed = False
        # Iterate over a snapshot so resolved rules can be removed in-loop.
        for ruleId in list(pendingRules):
            lengths = alternativeLengths(pendingRules[ruleId])
            if lengths is None:
                continue

            # Compare every alternative, not just the last two, so a rule
            # with three or more alternatives cannot hide a mismatch.
            if len(set(lengths)) > 1:
                raise Exception("Differing lengths detected!")

            del pendingRules[ruleId]
            ruleTotals[ruleId] = lengths[0]
            progressed = True

    return ruleTotals

# Report the fixed length of rule 0 for the second example and the real input.
print("Example:")
ruleTotalsExample = scanRules(inputExample2)
print(f"rule 0 length: {ruleTotalsExample[0]}")
print("\nReal input:")
ruleTotalsReal = scanRules(inputReal)
print(f"rule 0 length: {ruleTotalsReal[0]}")

Example:
rule 0 length: 15

Real input:
rule 0 length: 24


From this, we can see my input *also* has a fixed length (of 24) for rule 0.

We need to replace rules 8 and 11 with the following:
```
8: 42 | 42 8
11: 42 31 | 42 11 31
```

To make this easier, we'll just check the values for rules 42 and 31:

In [49]:
# Match lengths of the two rules that rules 8 and 11 are built from.
print(f"rule 31 length: {ruleTotalsReal[31]}")
print(f"rule 42 length: {ruleTotalsReal[42]}")

rule 31 length: 8
rule 42 length: 8


Let's just use the regexes for these rules to make the overall parser easier, and treat 31 and 42 as terminal symbols for the grammar.

In [6]:
# Module-level rule table and messages for the second example; the
# matchR* helpers below read `rules` from here.
rules, testCases = parse(inputExample2)
# Regex equivalent of rule 42.
print(build(rules, 42))

((b(a(bb|ab)|b((a|b)(a|b)))|a(b(bb)|a(bb|a(a|b))))b|(((aa|ab)a|(bb)b)b|(((a|b)a|bb)a)a)a)


In [7]:
# Regex equivalent of rule 31, built from the module-level `rules`.
print(build(rules, 31))

(b(b(a(ba)|b(aa))|a(b(ab|(a|b)a)|a(ba|ab)))|a(b((ab|(a|b)a)b|((a|b)a|bb)a)|a((ba)b|(ba|bb)a)))


In [55]:
def _matchTerminal(ruleId, input, indent, matchStack):
    """Match rule `ruleId`, treated as a terminal, at the start of `input`.

    The rule is expanded to a regex with `build` from the module-level
    `rules` table. A trace line is printed at nesting depth `indent`. On
    success the number of characters consumed is appended to `matchStack`
    and returned; on failure 0 is returned and `matchStack` is untouched.
    """
    label = "match%d" % ruleId
    regex = build(rules, ruleId)

    print("  "*indent, label, input, regex)
    result = re.match(regex, input)

    if result:
        print("  "*indent, label + " pass")
        # `append` exists on both deque and list; neither has a `push`.
        matchStack.append(result.end())
        return result.end()

    print("  "*indent, label + " fail")
    return 0


def matchR42(input, indent, matchStack):
    """Match rule 42 at the start of `input`; return chars consumed or 0."""
    return _matchTerminal(42, input, indent, matchStack)


def matchR31(input, indent, matchStack):
    """Match rule 31 at the start of `input`; return chars consumed or 0."""
    return _matchTerminal(31, input, indent, matchStack)

In [56]:
def matchR8(input, indent, matchStack):
    """Match rule 8 (`8: 42 | 42 8`) at the start of `input`.

    Matches one rule 42, then recurses on the remainder to take further
    repetitions. Returns the total number of characters consumed, or 0 if
    not even one rule 42 matches. `indent` is the trace nesting depth and
    `matchStack` is passed through to `matchR42`.

    NOTE: the match is greedy - it keeps recursing for as long as rule 42
    matches and never gives a repetition back to the caller.
    """
    pad = "  "*indent
    print(pad, "match8", input)

    head = matchR42(input, indent + 1, matchStack)
    if head == 0:
        # Rule 8 needs at least one rule 42.
        print(pad, "match8 failed first")
        return 0

    # Try to take further repetitions from what is left.
    tail = matchR8(input[head:], indent + 1, matchStack)
    if tail != 0:
        print(pad, "match8 first+second")
        return head + tail

    print(pad, "match8 firstonly")
    return head

In [57]:
def matchR11(input, indent, matchStack):
    """Match rule 11 (`11: 42 31 | 42 11 31`) at the start of `input`.

    Matches one rule 42, then tries a nested rule 11, then requires one
    rule 31. A failed nested attempt contributes 0 characters, which makes
    the call fall back to the plain `42 31` form. Returns the total number
    of characters consumed, or 0 on failure. `indent` is the trace nesting
    depth and `matchStack` is passed through to the terminal matchers.
    """
    pad = "  "*indent
    print(pad, "match11", input)

    opening = matchR42(input, indent + 1, matchStack)
    if opening == 0:
        # Both alternatives start with rule 42.
        print(pad, "match11 failed first")
        return 0

    # The nested rule 11 is optional: 0 just means "no nested pair".
    nested = matchR11(input[opening:], indent + 1, matchStack)
    consumed = opening + nested

    closing = matchR31(input[consumed:], indent + 1, matchStack)
    if closing == 0:
        print(pad, "match11 failed second")
        return 0

    print(pad, "match11 pass")
    return consumed + closing

In [58]:
def matchR0(input):
    """Match rule 0 (`0: 8 11`) at the start of `input`.

    Runs `matchR8` and then `matchR11` on whatever is left, printing a
    trace as it goes. Returns the total number of characters consumed, or
    0 if either part fails. It is up to the caller to compare the result
    with `len(input)`.

    NOTE: `matchR8` is greedy and this function never retries with fewer
    repetitions, so a message in which rule 11 needs one of the rule 42
    matches that rule 8 has already consumed is reported as a failure.
    """
    # Imported here because the file has no module-level import of deque.
    from collections import deque

    matchStack = deque()

    print("match0", input)
    first = matchR8(input, 1, matchStack)

    if first == 0:
        print("match0 failed first")
        return 0

    second = matchR11(input[first:], 1, matchStack)

    if second == 0:
        print("match0 failed second")
        return 0

    print("match0 pass")
    return first + second

Let's test some inputs!

According to the example, 12 out of the 15 inputs should now match.

In [54]:
# The [1:2] slice limits this run to the second test message only.
for t in testCases[1:2]:
    print(f"#### {t} of length {len(t)}")
    result = matchR0(t)
    print(f"matched chars: {result}")
    print()

#### bbabbbbaabaabba of length 15
match0 bbabbbbaabaabba
   match8 bbabbbbaabaabba
     match42 bbabbbbaabaabba ((b(a(bb|ab)|b((a|b)(a|b)))|a(b(bb)|a(bb|a(a|b))))b|(((aa|ab)a|(bb)b)b|(((a|b)a|bb)a)a)a)
     match42 pass
     match8 bbaabaabba
       match42 bbaabaabba ((b(a(bb|ab)|b((a|b)(a|b)))|a(b(bb)|a(bb|a(a|b))))b|(((aa|ab)a|(bb)b)b|(((a|b)a|bb)a)a)a)
       match42 pass
       match8 aabba
         match42 aabba ((b(a(bb|ab)|b((a|b)(a|b)))|a(b(bb)|a(bb|a(a|b))))b|(((aa|ab)a|(bb)b)b|(((a|b)a|bb)a)a)a)
         match42 fail
       match8 failed first
     match8 firstonly
   match8 first+second
   match11 aabba
     match42 aabba ((b(a(bb|ab)|b((a|b)(a|b)))|a(b(bb)|a(bb|a(a|b))))b|(((aa|ab)a|(bb)b)b|(((a|b)a|bb)a)a)a)
     match42 fail
   match11 failed first
match0 failed second
matched chars: 0



In [34]:
# Sanity check: apply the rule 42 regex printed earlier directly to the
# message from the failing trace, to see how much of it rule 42 consumes.
re.match("((b(a(bb|ab)|b((a|b)(a|b)))|a(b(bb)|a(bb|a(a|b))))b|(((aa|ab)a|(bb)b)b|(((a|b)a|bb)a)a)a)", "bbabbbbaabaabba")

<re.Match object; span=(0, 5), match='bbabb'>

In [None]:
def part2(input):
    """Print the number of messages matching rule 0 with looping rules.

    The rules from `parse(input)` are used with rules 8 and 11 replaced by
    their self-referencing forms. Because rule 11 needs balanced
    repetitions, no single regex is built. Each rule is instead evaluated
    to the set of *all* positions at which a match starting at a given
    position can end, so every alternative and every repetition count is
    explored. A message counts when rule 0, started at position 0, can end
    exactly at the end of the message.
    """
    (rules, testCases) = parse(input)

    rules.update({8: "42 | 42 8"})
    rules.update({11: "42 31 | 42 11 31"})

    def splitAlternatives(body):
        """Return the '|'-separated alternatives of `body` as token lists."""
        options = [[]]
        for token in body.split(' '):
            if token == '|':
                options.append([])
            else:
                options[-1].append(token)
        return options

    # Tokenise every rule once instead of on every evaluation.
    grammar = {rule: splitAlternatives(body) for rule, body in rules.items()}

    def endPositions(rule, inputString, pos, seen):
        """Return every position where `rule`, started at `pos`, can end.

        `seen` caches results per (rule, pos) for the current message.
        """
        key = (rule, pos)
        if key in seen:
            return seen[key]

        ends = set()
        for option in grammar[rule]:
            reached = {pos}
            for token in option:
                if token[0] == '"':
                    # Terminal: advance past one matching character.
                    reached = {p + 1 for p in reached
                               if inputString[p:p + 1] == token[1]}
                else:
                    reached = {end
                               for p in reached
                               for end in endPositions(int(token), inputString, p, seen)}
                if not reached:
                    # Dead alternative. Stopping here also ends the
                    # recursion of rules 8 and 11, whose self-reference
                    # only runs after rule 42 has consumed input.
                    break
            ends |= reached

        seen[key] = ends
        return ends

    def evalString(inputString):
        """Return True if rule 0 can consume exactly the whole string."""
        return len(inputString) in endPositions(0, inputString, 0, {})

    count = sum(1 for m in testCases if evalString(m))

    print(count)

In [None]:
# Part 2 on the second example input (day19-example2.txt).
part2(inputExample2)

In [None]:
# Part 2 on the real puzzle input (day19.txt).
part2(inputReal)