# LR(1) elemző

_Sipos István_  
_2020.04.20_

In [1]:
from IPython.display import display, Math, Markdown, HTML
from collections import deque
import string
import re
import pandas as pd
import numpy as np
import string

### Segédfüggvények LaTeX listák és halmazok kiíratásához

In [2]:
def iterable_to_latex(iterable, sep = "", pref = "", post = ""):
    return pref + " " + sep.join(element.latex() + " " for element in iterable) + " " + post + " "

def set_to_latex(obj):
    """Return the LaTeX form of a collection as a set: ``\\{ a , b \\}``.

    An empty collection is rendered as ``\\emptyset``.
    """
    if len(obj) > 0:
        return iterable_to_latex(obj, sep=", ", pref=r"\{", post=r"\}")
    return r"\emptyset"

def list_to_latex(obj):
    """Return the LaTeX form of a collection as a bracketed list: ``[ a , b ]``.

    An empty collection is rendered as ``\\lambda``.
    """
    if len(obj) > 0:
        return iterable_to_latex(obj, sep=", ", pref=r"[", post=r"]")
    return r"\lambda"

def queue_to_latex(obj):
    """Return the LaTeX form of a collection as a plain, undelimited sequence.

    An empty collection is rendered as ``\\lambda``.
    """
    if len(obj) > 0:
        return iterable_to_latex(obj, sep="", pref=r"", post=r"")
    return r"\lambda"

def tex_escape(text):
    """Return ``text`` with LaTeX special characters backslash-escaped.

    Only ``& % $ # _ { } ^`` are rewritten. The characters ``~``, ``\\``,
    ``<`` and ``>`` are deliberately passed through unchanged.
    """
    replacements = {
        '&': r'\&',
        '%': r'\%',
        '$': r'\$',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '^': r'\^{}',
    }
    # Every key is a single character, so a translation table performs the
    # same per-character substitution in one pass.
    return text.translate(str.maketrans(replacements))

def load_bnf_grammar():
    """Load the scanner's state-transfer table from ``simple_bnf_grammar.csv``.

    The CSV is read with its first column as the row index. The character
    class columns ``digit`` and ``letter`` are expanded into one column per
    ASCII digit / ASCII letter, symbolic column names are replaced by the
    characters they stand for, and the flag columns are converted to booleans.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    states = pd.read_csv('simple_bnf_grammar.csv', index_col=0)

    # Every digit character gets a copy of the generic 'digit' column.
    for d in string.digits:
        states[d] = states['digit']

    # Every ASCII letter (lower and upper case) gets a copy of the 'letter' column.
    for l in string.ascii_lowercase + string.ascii_uppercase:
        states[l] = states['letter']

    # The class columns are no longer needed once they have been expanded.
    states.drop(columns=['digit', 'letter'], inplace=True)
    # Columns named after a character are renamed to the character itself.
    states.rename(columns={'space': ' ', 'single_quote': "'", 'double_quote': '"'}, inplace=True)

    # The flag columns hold the text 'Yes' for true; anything else becomes False.
    states[['store', 'read', 'back']] = states[['store', 'read', 'back']] == 'Yes'
    # Missing symbol names (NaN) are normalised to the empty string.
    states[['symbol']] = states[['symbol']].replace(np.nan, '', regex=True)

    return states

# Module-wide switch for the diagnostic output produced by log().
LOGGING_ENABLED = False

def log(*args, **argv):
    """Forward the arguments to ``print`` when ``LOGGING_ENABLED`` is set."""
    if not LOGGING_ENABLED:
        return
    print(*args, **argv)

# The empty word: an empty tuple, so a production whose right-hand side is an
# empty tuple compares equal to it.
λ = ()
# NOTE(review): `dot` is not referenced anywhere else in the visible code.
dot = '.'

## Egyszerű szkenner implementáció

Ezt fogom használni az input nyelvtan feldolgozására (egy leegyszerűsített Backus–Naur-forma).

In [3]:

class Scanner():
    """Table-driven scanner (finite automaton) that splits text into tokens.

    ``state_transfer_table`` is indexed by state id. It has one column per
    input character (plus ``'other'`` for every character without its own
    column) holding the next state id, and the per-state columns ``'name'``,
    ``'read'``, ``'store'``, ``'back'`` and ``'symbol'``.
    """

    def __init__(self, state_transfer_table):
        self.state_transfer_table = state_transfer_table

    def get_state_vector(self, state_id):
        """Return the row of the transfer table that describes ``state_id``."""
        return self.state_transfer_table.loc[state_id]

    def tokenize(self, program):
        """Run the automaton over ``program`` and return the recognised tokens.

        Returns:
            A list of ``(text, symbol)`` tuples, in input order.

        Raises:
            ValueError: the automaton entered the state named ``'error'``; the
                message is the unread rest of the input in single quotes.
            IndexError: the input ended before the state named ``'stop'`` was
                reached.
        """
        state_id = 0
        index = 0
        current_token = []
        tokens = []
        columns = self.state_transfer_table.columns

        while True:
            # The bounds check has to come before the character is read,
            # otherwise it can never fire.
            if index >= len(program):
                raise IndexError("input ended before the scanner reached its 'stop' state")

            char = program[index]
            column = char if char in columns else 'other'
            state_id = self.get_state_vector(state_id)[column]
            # Fetch the row of the new state once instead of once per field.
            state = self.get_state_vector(state_id)

            if state['name'] == 'error':
                raise ValueError("'" + program[index:] + "'")
            if state['name'] == 'stop':
                return tokens

            if state['read']:
                current_token.append(char)

            if state['store']:
                tokens.append((''.join(current_token), state['symbol']))
                current_token = []

            # 'back' states do not consume the character: it is examined
            # again from the new state.
            if not state['back']:
                index += 1


### Általános típusok (szimbólum, terminális, nemterminális, szabály és nyelvtan)

In [4]:
class LatexDisplayable():
    """Mixin for classes that provide a ``latex()`` method.

    Supplies the ``_repr_latex_`` representation by wrapping ``latex()`` in
    inline math delimiters.
    """

    def _repr_latex_(self):
        return f"$ {self.latex()} $"

class Symbol(LatexDisplayable):
    """A grammar symbol: a text with an optional index shown as a subscript.

    Two symbols are equal when they have exactly the same class, text and
    index. The hash and ``repr`` are both derived from the LaTeX form.
    """

    def __init__(self, text, index = None):
        self.text = text
        self.index = index

    def __eq__(self, other):
        # The exact class is compared on purpose: a terminal and a
        # non-terminal with the same text must stay different symbols.
        return type(self) is type(other) and self.text == other.text and self.index == other.index

    def __hash__(self):
        return hash(self.latex())

    def latex(self):
        """Return the escaped text, followed by ``_{index}`` when an index is set."""
        escaped = tex_escape(str(self.text))
        if self.index is None:
            return escaped
        # The subscript is braced so that multi-character indices (e.g. 10)
        # are rendered as a whole subscript instead of only their first character.
        return f"{escaped}_{{{self.index}}}"

    def __repr__(self):
        return self.latex()
    
class NonTerminal(Symbol):
    """A non-terminal symbol, rendered between angle brackets."""

    def latex(self):
        return f"<{super().latex()}>"
        
class Terminal(Symbol):
    """A terminal symbol, rendered between double quotes."""

    def latex(self):
        return f'"{super().latex()}"'
    
class Production(LatexDisplayable):
    """A grammar rule ``left -> right``; ``right`` is a sequence of symbols."""

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def latex(self):
        """Render the rule as ``left \\rightarrow s1 s2 ...``."""
        right_side = " ".join(symbol.latex() for symbol in self.right)
        return rf"{self.left.latex()} \rightarrow {right_side}"

    def __eq__(self, other):
        if not isinstance(other, Production):
            return False
        return self.left == other.left and self.right == other.right

    def __hash__(self):
        # Hashing goes through the rendered form of the rule.
        return hash(self.latex())

class Grammar(LatexDisplayable):
    """A grammar: non-terminals, terminals, a start symbol and productions."""

    def __init__(self, non_terminals, terminals, start, productions):
        self.non_terminals = non_terminals
        self.terminals = terminals
        self.start = start
        self.productions = productions

    def latex(self):
        """Render the grammar as the 4-tuple ``\\langle N , T , S , P \\rangle``."""
        components = [
            set_to_latex(self.non_terminals),
            set_to_latex(self.terminals),
            self.start.latex(),
            set_to_latex(self.productions),
        ]
        return r"\langle " + " , ".join(components) + r" \rangle"

## Segédfüggvények a szabályok beolvasásához

In [5]:
def add_symbol_to_grammar(symbol, grammar):
    """Register ``symbol`` in the matching symbol set of ``grammar``.

    ``NonTerminal`` instances go to ``grammar.non_terminals``; every other
    symbol goes to ``grammar.terminals``.
    """
    target = grammar.non_terminals if isinstance(symbol, NonTerminal) else grammar.terminals
    target.add(symbol)


def parse_productions(grammar, scanner, lines):
    """Parse rule lines of the form ``<A> ::= alt | alt ...`` into ``grammar``.

    Each line is tokenised with a ``;`` appended. The first token must be a
    ``'term'`` and the second the ``'::='`` expansion token. Every
    ``'separator'`` token closes the current alternative, and the tokens left
    at the end of the line form the last alternative. All symbols met are
    registered in the grammar.
    """
    for line in lines:
        tokens = scanner.tokenize(line + ';')
        assert tokens[1] == ('::=', 'expand')
        assert tokens[0][1] == 'term'

        left_side = NonTerminal(tokens[0][0])
        add_symbol_to_grammar(left_side, grammar)

        alternative = []
        for text, kind in tokens[2:]:
            if kind != 'separator':
                symbol_class = NonTerminal if kind == "term" else Terminal
                symbol = symbol_class(text)
                add_symbol_to_grammar(symbol, grammar)
                alternative.append(symbol)
                continue

            # A separator ends the alternative collected so far.
            grammar.productions.append(Production(left_side, tuple(alternative)))
            alternative = []

        # Whatever follows the last separator is the final alternative.
        grammar.productions.append(Production(left_side, tuple(alternative)))
                
def parse_grammar(text, scanner):
    """Build a ``Grammar`` from its textual specification.

    Blank lines and lines starting with ``;`` (comments) are ignored. The
    first remaining line names the start symbol; every following line is
    handed to ``parse_productions``.
    """
    stripped = (raw.strip() for raw in text.splitlines())
    # Drop empty lines and comments.
    meaningful = [line for line in stripped if line and not line.startswith(";")]

    # The first line of the definition holds the start symbol.
    start = NonTerminal(scanner.tokenize(meaningful[0] + ';')[0][0])
    grammar = Grammar({start}, set(), start, deque())

    parse_productions(grammar, scanner, meaningful[1:])

    return grammar

# End-of-input marker: appended to the word being parsed and used as the
# lookahead of the initial LR(1) item.
eoi = Terminal('$')

## Egy nyelv kiterjesztése

In [6]:
def augment_grammar(grammar):
    """Return an augmented copy of ``grammar``.

    A fresh start symbol ``S``` and the production ``S` -> S`` are added, and
    the end-of-input terminal is added to the terminals. The new production
    is placed first in the production list; the input grammar is not modified.

    Raises:
        AssertionError: the grammar already contains the end-of-input
            terminal or the new start symbol.
    """
    s = grammar.start
    s_tick = NonTerminal(s.text + "`", s.index)

    assert eoi not in grammar.terminals
    assert s_tick not in grammar.non_terminals

    # The right-hand side is a tuple, like every production created by
    # parse_productions, so productions compare and behave uniformly.
    augmented_production = Production(s_tick, (s,))

    return Grammar(
        grammar.non_terminals | {s_tick},
        grammar.terminals | {eoi},
        s_tick,
        deque([augmented_production, *grammar.productions]),
    )

## A műveleteket reprezentáló osztályok

In [7]:
class Action():
    """Entry of the parsing table: an action name with an optional parameter.

    Two actions are equal when both their name (``token``) and their
    parameter match; the hash depends on the name only.
    """

    def __init__(self, token, params = None):
        self.token = token
        self.params = params

    def __eq__(self, other):
        if not isinstance(other, Action):
            return False
        return self.token == other.token and self.params == other.params

    def __hash__(self):
        return hash(self.token)

    def __str__(self):
        # Parameterless actions are shown by name only, e.g. "Accept".
        if self.params is None:
            return self.token
        return f"{self.token}({self.params})"

class Shift(Action):
    """Shift action; ``to`` is the index of the state to enter."""

    def __init__(self, to):
        self.to = to
        super().__init__('Shift', params=to)

class Reduce(Action):
    """Reduce action; ``to`` is the index of the production to reduce by."""

    def __init__(self, to):
        self.to = to
        super().__init__('Reduce', params=to)

class Goto(Action):
    """Goto entry; ``to`` is the index of the state to enter after a reduction."""

    def __init__(self, to):
        self.to = to
        super().__init__('Goto', params=to)

class Accept(Action):
    """Parameterless action marking successful recognition of the input."""

    def __init__(self):
        super().__init__('Accept', None)

class Error(Action):
    """Parameterless action named 'Error'."""

    def __init__(self):
        super().__init__('Error', None)


## LR(1) elemek

In [8]:
class Item(LatexDisplayable):
    """LR(1) item ``[A -> α.Bβ, a]``.

    ``core`` is the production, ``lookahead`` the symbol ``a`` and
    ``dotindex`` the position of the dot inside the right-hand side.
    """

    def __init__(self, core, lookahead, dotindex=0):
        self.core = core
        self.lookahead = lookahead
        self.dotindex = dotindex

    def __eq__(self, other):
        if not isinstance(other, Item):
            return False
        return self.core == other.core \
            and self.lookahead == other.lookahead \
            and self.dotindex == other.dotindex

    def __hash__(self):
        return hash(self.latex())

    def latex(self):
        """Render the item with a bullet at the dot position."""
        rendered = []
        for position, element in enumerate(self.core.right):
            text = element.latex()
            if position == self.dotindex:
                text = r"\bullet " + text
            rendered.append(text)

        right_side = " ".join(rendered)
        # A dot behind the last symbol is not attached to any element.
        if self.dotindex >= len(self.core.right):
            right_side += r" \bullet"

        if self.lookahead == λ:
            lookahead = r"\lambda"
        else:
            lookahead = self.lookahead.latex()

        return "[ " + self.core.left.latex() + r" \rightarrow " + right_side + " , " + lookahead + " ]"

    # [A -> α.Bβ, a].next() => [A -> αB.β, a]
    # [A -> α., a].next() => IndexError
    def next(self):
        """Return the item with the dot moved one symbol to the right."""
        if not self.in_tail_position:
            return Item(self.core, self.lookahead, self.dotindex + 1)
        raise IndexError()

    # Parts of [A -> α.Bβ, a]
    @property
    def α(self):
        """Symbols in front of the dot, as a tuple."""
        return tuple(self.core.right[:self.dotindex])

    @property
    def B(self):
        """The symbol directly behind the dot."""
        return self.core.right[self.dotindex]

    @property
    def β(self):
        """Symbols following ``B``, as a tuple."""
        return tuple(self.core.right[self.dotindex+1:])

    @property
    def in_tail_position(self):
        """True when the dot stands behind the last symbol."""
        return self.dotindex >= len(self.core.right)
   

## Az elemző táblázat reprezentációja

In [9]:
class SubTable():
    """Table addressed as ``table[row, symbol]``.

    There is one dictionary per row; every cell of the given symbols starts
    out as ``None``.
    """

    def __init__(self, rows, symbols):
        self.symbols = symbols
        # dict.fromkeys gives each row its own dictionary filled with None.
        self.rows = [dict.fromkeys(symbols) for _ in range(rows)]

    def __getitem__(self, pos):
        row_index, symbol = pos
        return self.rows[row_index][symbol]

    def __setitem__(self, pos, value):
        row_index, symbol = pos
        self.rows[row_index][symbol] = value

class LR1ParsingTable():
    """LR(1) parsing table made of an ACTION and a GOTO sub-table.

    ``rows`` is the number of states. ``action`` has one column per terminal
    and ``goto`` one column per non-terminal.
    """

    def __init__(self, rows, terminals, non_terminals):
        self.rows = rows
        self.terminals = terminals
        self.non_terminals = non_terminals
        self.action = SubTable(rows, terminals)
        self.goto = SubTable(rows, non_terminals)

    def _repr_html_(self):
        """Return the table as an HTML ``<table>``; empty cells stay blank.

        Symbol texts and cell values are HTML-escaped, so symbols such as
        ``<`` or ``&`` cannot break the markup.
        """
        import html

        def header_cells(symbols):
            return "".join(f"<th>{html.escape(str(x.text))}</th>" for x in symbols)

        def body_cells(subtable, row, symbols):
            cells = []
            for x in symbols:
                value = subtable[row, x]
                text = "" if value is None else html.escape(str(value))
                cells.append("<td>" + text + "</td>")
            return "".join(cells)

        body = "".join(
            "<tr>"
            + f"<td>{row}</td>"
            + body_cells(self.action, row, self.terminals)
            + body_cells(self.goto, row, self.non_terminals)
            + "</tr>"
            for row in range(self.rows)
        )

        return f"""<table>
            <thead>
                <tr>
                    <th>&nbsp;</th>
                    <th colspan="{len(self.terminals)}">Action</th>
                    <th colspan="{len(self.non_terminals)}">GOTO</th>
                </tr>
                <tr>
                    <th>State</th>
                    {header_cells(self.terminals)}
                    {header_cells(self.non_terminals)}
                </tr>
            </thead>
            <tbody>
            {body}
            </tbody>
        </table>"""

# Az LR(1) feldolgozó osztály

In [10]:
 
class LR1Parser():
    """Canonical LR(1) parser for a grammar.

    The grammar is augmented on construction. FIRST sets are computed on
    first use and cached, so ``self.grammar`` should not be modified after
    that.
    """

    def __init__(self, grammar):
        self.grammar = augment_grammar(grammar)

    def _first_sets(self):
        """Return (and cache) the FIRST set of every production left side.

        The sets are computed by fixed-point iteration, which terminates for
        any grammar, including directly or indirectly left-recursive ones.
        """
        cached = getattr(self, "_first_cache", None)
        if cached is not None:
            return cached

        first_sets = {}
        changed = True
        while changed:
            changed = False
            for production in self.grammar.productions:
                target = first_sets.setdefault(production.left, set())
                size_before = len(target)

                prefix_nullable = True
                for symbol in production.right:
                    if isinstance(symbol, Terminal):
                        symbol_first = {symbol}
                    else:
                        symbol_first = first_sets.get(symbol, set())

                    target |= symbol_first - {λ}
                    if λ not in symbol_first:
                        prefix_nullable = False
                        break

                # Every symbol can vanish (or the right side is empty):
                # the left side can derive the empty word.
                if prefix_nullable:
                    target.add(λ)

                if len(target) != size_before:
                    changed = True

        self._first_cache = first_sets
        return first_sets

    def first(self, symbol):
        """Return FIRST(symbol) as a new set.

        For a terminal this is the terminal itself. For a non-terminal it is
        the set of terminals that can start a word derived from it, plus ``λ``
        if it can derive the empty word. Symbols without productions yield an
        empty set.
        """
        if isinstance(symbol, Terminal):
            return {symbol}

        return set(self._first_sets().get(symbol, ()))

    def _first_of_sequence(self, symbols):
        """Return FIRST of a symbol sequence (``λ`` included if all can vanish)."""
        result = set()
        for symbol in symbols:
            symbol_first = self.first(symbol)
            result |= symbol_first - {λ}
            if λ not in symbol_first:
                return result

        result.add(λ)
        return result

    def closure(self, items):
        """Extend ``items`` to its LR(1) closure and return it.

        For every item ``[A -> α.Bβ, a]`` and every production ``B -> γ`` the
        item ``[B -> .γ, b]`` is added for each terminal ``b`` in FIRST(βa).
        A set passed in is extended in place.
        """
        pending = list(items)
        while pending:
            item = pending.pop()
            if item.in_tail_position:
                continue

            # FIRST of the whole sequence βa: symbols of β that can derive
            # the empty word must not hide the symbols behind them.
            lookaheads = self._first_of_sequence(item.β + (item.lookahead, ))

            for production in self.grammar.productions:
                if production.left != item.B:
                    continue

                for b in lookaheads:
                    if b == λ: continue

                    new_item = Item(production, b)
                    if new_item not in items:
                        items |= {new_item}
                        pending.append(new_item)

        return items

    def read(self, I, X):
        """Return the item set reached from ``I`` by reading the symbol ``X``.

        The result is empty when no item of ``I`` has ``X`` behind the dot.
        """
        # NOTE: according to the lecture notes the result should be extended
        # further (by reading X from the result again), but that gives wrong
        # results; without that extension the construction works.
        kernel = {
            item.next()
            for item in I
            if not item.in_tail_position and item.B == X
        }
        return self.closure(kernel)

    @property
    def canonical_production(self):
        """The production ``S` -> S`` added by the augmentation."""
        return self.grammar.productions[0]

    def I0(self):
        """Return the initial item set."""
        # closure([S` -> .S, $])
        return self.closure({Item(self.canonical_production, eoi)})

    def states(self):
        """
        Return the canonical collection of LR(1) item sets as a list.

        The first element is the initial set; the others follow in the order
        in which they are discovered.
        """
        result = [self.I0()]

        # The list grows while it is scanned, so every newly found state is
        # processed exactly once.
        position = 0
        while position < len(result):
            state = result[position]
            position += 1

            for symbol in {item.B for item in state if not item.in_tail_position}:
                new_state = self.read(state, symbol)

                if len(new_state) == 0: continue

                if new_state not in result:
                    result.append(new_state)

        return result

    def parsing_table(self):
        """Build the ACTION/GOTO table for the states of the grammar.

        Conflicts are not detected: for a grammar that is not LR(1) a later
        entry overwrites an earlier one (shift entries are written last).
        """
        states = self.states()
        table = LR1ParsingTable(len(states), self.grammar.terminals, self.grammar.non_terminals - {self.grammar.start})

        for i, state in enumerate(states):
            # Reduce / Accept for the items whose dot is at the end
            for item in state:
                if not item.in_tail_position:
                    continue

                if item.core.left != self.grammar.start:
                    table.action[i, item.lookahead] = Reduce(self.grammar.productions.index(item.core))
                else:
                    table.action[i, item.lookahead] = Accept()

            # Shift on terminals, GOTO on non-terminals behind the dot
            for symbol in {item.B for item in state if not item.in_tail_position}:
                target = self.read(state, symbol)
                if target not in states:
                    continue

                target_index = states.index(target)
                if isinstance(symbol, Terminal):
                    table.action[i, symbol] = Shift(target_index)
                else:
                    table.goto[i, symbol] = Goto(target_index)

        return table

    def parse(self, input_word):
        """Parse ``input_word``, a sequence of terminals.

        Returns:
            ``(True, productions)`` when the word is accepted, where
            ``productions`` lists the productions in the order they were
            reduced; otherwise ``(False, (stack, word))`` with the parser
            stack and the unread input at the point of failure.
        """
        def lookup(subtable, state, symbol):
            # A symbol without a column (e.g. a terminal that is not part of
            # the grammar) is treated like an empty cell.
            try:
                return subtable[state, symbol]
            except KeyError:
                return None

        word = deque([*input_word, eoi])
        table = self.parsing_table()
        stack = deque([eoi, 0])

        derivative = []

        log("A feldolgozás lépései:")

        while True:
            log(stack, word)
            action = lookup(table.action, stack[-1], word[0])
            log(action)

            if isinstance(action, Accept):
                return True, derivative

            elif isinstance(action, Shift):
                stack.append(word.popleft())
                stack.append(action.to)

            elif isinstance(action, Reduce):
                production = self.grammar.productions[action.to]
                derivative.append(production)
                # Each symbol of the right side occupies two stack slots:
                # the symbol itself and the state entered after it.
                for _ in range(len(production.right)*2): stack.pop()

                target = lookup(table.goto, stack[-1], production.left)
                if target is None:
                    return False, (stack, word)

                stack.append(production.left)
                stack.append(target.to)

            else:
                return False, (stack, word)
                

## Egy feldolgozást elvégző segédfüggvény

In [11]:
def run_parse(lang_spec, word_spec, title = "Példa"):
    """Run one complete example and display every intermediate result.

    The grammar in ``lang_spec`` is parsed, then its canonical item sets and
    parsing table are displayed. Finally the word in ``word_spec`` is parsed
    and either its derivation or a failure message is shown.
    """
    display(Markdown("# " + title))

    scanner = Scanner(load_bnf_grammar())
    grammar = parse_grammar(lang_spec, scanner)

    display(Markdown("## A bemeneti nyelv"))
    display(Math(grammar.latex()))

    parser = LR1Parser(grammar)

    display(Markdown("## A nyelv kanonikus halmazai"))
    for number, item_set in enumerate(parser.states()):
        print(number)
        display(Math(set_to_latex(item_set)))

    display(Markdown("## A feldolgozási tábla"))
    display(HTML(parser.parsing_table()._repr_html_()))

    # Only the token text is used; every token of the word is a terminal.
    word = [Terminal(token) for token, _ in scanner.tokenize(word_spec)]

    display(Markdown("## A feldolgozás eredménye"))

    parseable, derivative = parser.parse(word)

    if not parseable:
        print(f"A {word} szó nem levezethetö le!")
        return

    print(f"A {word} szó jobboldali levezetése: ")
    # The reductions are shown in reverse order.
    display(Math(list_to_latex(derivative[::-1])))

# Példák

In [12]:
# Example grammar: the first non-empty line names the start symbol, the
# following lines are the productions (alternatives separated by '|').
lang_spec = """
<S>
<S> ::= <A> <A>
<A> ::= "a" <A> | "b"
"""
# The word to parse, as quoted terminals. run_parse hands this string to the
# scanner unchanged; the trailing ';' is presumably what lets the scanner
# reach its stop state — verify against simple_bnf_grammar.csv.
word_spec = '"a" "b" "b";'

run_parse(lang_spec, word_spec, title="A jegyzetbeli példa levezetése")

# A jegyzetbeli példa levezetése

## A bemeneti nyelv

<IPython.core.display.Math object>

## A nyelv kanonikus halmazai

0


<IPython.core.display.Math object>

1


<IPython.core.display.Math object>

2


<IPython.core.display.Math object>

3


<IPython.core.display.Math object>

4


<IPython.core.display.Math object>

5


<IPython.core.display.Math object>

6


<IPython.core.display.Math object>

7


<IPython.core.display.Math object>

8


<IPython.core.display.Math object>

9


<IPython.core.display.Math object>

## A feldolgozási tábla

Unnamed: 0_level_0,Action,Action,Action,GOTO,GOTO
State,b,a,$,A,S
0,Shift(1),Shift(2),,Goto(3),Goto(4)
1,Reduce(3),Reduce(3),,,
2,Shift(1),Shift(2),,Goto(5),
3,Shift(6),Shift(8),,Goto(7),
4,,,Accept,,
5,Reduce(2),Reduce(2),,,
6,,,Reduce(3),,
7,,,Reduce(1),,
8,Shift(6),Shift(8),,Goto(9),
9,,,Reduce(2),,


## A feldolgozás eredménye

A ["a", "b", "b"] szó jobboldali levezetése: 


<IPython.core.display.Math object>