# Basic Definitions

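This module illustrates derivative-based scanning: a string is matched against a regular expression by repeatedly taking the derivative of the expression with respect to each input character and then checking whether the final expression accepts the empty string.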

In [1]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
import sys

# -- Detect if in Own Install or in Colab
try:
    # A successful import of google.colab is used as the signal that the
    # notebook is running inside Colab rather than on a local install.
    import google.colab
    OWN_INSTALL = False
except ImportError:
    # Only a failed import means "local install". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    OWN_INSTALL = True
    
if OWN_INSTALL:
    
  # Local (non-Colab) run: prepend the parent directories one to five levels
  # up, each paired with its `3rdparty` sub-directory, to the FRONT of
  # sys.path so they are searched before any other entry.
  # Presumably this lets `jove` be found whatever the notebook's depth inside
  # the checkout -- confirm against the repository layout.
  # To disable this block, wrap it between ''' ... '''.

  sys.path[0:0] = ['../../../../..',  '../../../../../3rdparty',  
                   '../../../..',  '../../../../3rdparty',  
                   '../../..',     '../../../3rdparty', 
                   '../..',        '../../3rdparty',
                   '..',           '../3rdparty' ]

else: # In Colab
  # IPython shell escape: clone the Jove repository into ./Jove only if that
  # directory does not already exist, then make it importable.
  ! if [ ! -d Jove ]; then git clone https://github.com/anon-Jove/Jove Jove; fi
  sys.path.append('./Jove')
  sys.path.append('./Jove/jove')

# -- common imports --
from jove.lex import lex
from jove.yacc import yacc
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [2]:
#!/usr/bin/env python
# re2ast.py
# A simple regexp parser to convert RE strings to an AST


# Names of every token kind the lexer may produce; the grammar rules below
# refer to these names.
tokens = ('EPS','STR','LPAREN','RPAREN','PLUS','STAR', 'NOT')

# Token patterns: each t_<NAME> is the regular expression for token <NAME>.
t_PLUS    = r'\+'          # '+' : union of two expressions
t_STAR    = r'\*'          # '*' : Kleene star (postfix)
t_LPAREN  = r'\('          # '(' : open a grouped sub-expression
t_RPAREN  = r'\)'          # ')' : close a grouped sub-expression
t_EPS     = r'\@'          # '@' : epsilon, the empty string
t_STR     = r'[a-zA-Z0-9]' # exactly one letter or digit, taken literally
t_NOT     = r'\!'          # '!' : complement (prefix)

# Characters skipped between tokens: space and tab.
t_ignore = " \t"

def t_newline(t):
    r'\n+'
    # The raw string above is the token pattern (a run of newlines), not
    # human documentation, so it must stay the first statement.
    # Advance the lexer's line counter by the number of newlines consumed;
    # nothing is returned, so no token is produced for them.
    newline_total = t.value.count("\n")
    t.lexer.lineno = t.lexer.lineno + newline_total
    
def t_error(t):
    # Lexer error hook: report the first character of the unmatched input,
    # then skip exactly that one character so scanning can continue.
    offending_char = t.value[0]
    print("Illegal character '%s'" % offending_char)
    t.lexer.skip(1)
    
# Build the lexer if you need to

#- lex()

# Parsing rules

# Operator associativity table consumed by the parser generator.
# NOTE(review): in PLY-style tables later rows bind tighter, which would make
# NOT bind tightest and PLUS loosest -- confirm that jove.yacc follows PLY here.
precedence = (
     ('left','PLUS'),
     ('left','STAR'),
     ('right','NOT')
  )

def p_expression_plus(t):
    '''expression : expression PLUS catexp'''
    # Union node: t[1] is the left alternative, t[3] the right one
    # (t[2] is the PLUS token itself and is not stored).
    left_alt, right_alt = t[1], t[3]
    t[0] = ('+', (left_alt, right_alt))

def p_expression_plus_id(t):
    '''expression : catexp'''
    # Unit production: an expression with no '+' is just its catexp AST.
    sub_ast = t[1]
    t[0] = sub_ast

def p_expression_cat(t):
    '''catexp :  catexp ordyexp'''
    # Concatenation is written by juxtaposition: t[1] is everything parsed so
    # far, t[2] the next factor. The node is tagged '.'.
    prefix, factor = t[1], t[2]
    t[0] = ('.', (prefix, factor))

def p_expression_cat_id(t):
    '''catexp :  ordyexp'''
    # Unit production: a single factor is its own concatenation AST.
    sub_ast = t[1]
    t[0] = sub_ast


def p_expression_ordy_star(t):
    'ordyexp : ordyexp STAR'
    # Postfix Kleene star: wrap the operand's AST (t[1]) in a '*' node.
    starred_operand = t[1]
    t[0] = ('*', starred_operand)

def p_expression_ordy_not(t):
    'ordyexp : NOT ordyexp'
    # Prefix complement: the operand is t[2] (t[1] is the NOT token).
    negated_operand = t[2]
    t[0] = ('!', negated_operand)

def p_expression_ordy_paren(t):
    'ordyexp : LPAREN expression RPAREN'
    # Parentheses only group: pass the inner expression's AST (t[2]) through
    # unchanged, adding no node of their own.
    inner_ast = t[2]
    t[0] = inner_ast

def p_expression_ordy_eps(t):
    'ordyexp : EPS'
    # Epsilon leaf: always the fixed node ('@', '@'), independent of the
    # token's value.
    epsilon_node = ('@', '@')
    t[0] = epsilon_node
    
def p_expression_ordy_str(t):
    'ordyexp : STR'
    # Literal leaf: store the matched character (t[1]) under the 'str' tag.
    literal_char = t[1]
    t[0] = ('str', literal_char)


def p_error(t):
    # Parser error hook: report where parsing failed.
    #
    # t is the offending token. PLY-style parsers pass None instead of a token
    # when the input ends unexpectedly (e.g. "(a+b"); reading t.value would
    # then raise AttributeError, so that case is reported separately.
    if t is None:
        print("Syntax error at EOF")
    else:
        print("Syntax error at '%s'" % t.value)



#--

def re2ast(s):
    """Parse the regular-expression string s and return the parser's result.

    A fresh lexer and a fresh parser are built on every call from the token
    and grammar definitions in this module. The grammar actions build nested
    tuples such as ('+', (left, right)), ('*', r) and ('str', c).
    """
    # Build the lexer first, then the parser, and hand the lexer to parse().
    scanner = lex()
    return yacc().parse(s, lexer=scanner)

#=== Now comes derivMatch as illustration of RE Derivative scanning

def opr(E):
    """Return the operator tag of AST node E, i.e. its first component."""
    tag = E[0]
    return tag

def arg1(E):
    """Return the first operand of a binary AST node E = (op, (left, right))."""
    operands = E[1]
    return operands[0]

def arg2(E):
    """Return the second operand of a binary AST node E = (op, (left, right))."""
    operands = E[1]
    return operands[1]

def arg(E):
    """Return the sole operand of a unary or leaf AST node E = (op, operand)."""
    operand = E[1]
    return operand

def nullable(E):
    """Return True iff the language of RE AST E contains the empty string.

    Node kinds handled (tag, payload):
      ('str', c)        single character   -> never nullable
      ('mty', 'mty')    empty language     -> never nullable
      ('@', '@')        epsilon            -> nullable
      ('*', r)          Kleene star        -> nullable
      ('!', r)          complement         -> nullable iff r is not
      ('+', (r1, r2))   union              -> nullable iff either side is
      ('.', (r1, r2))   concatenation      -> nullable iff both sides are

    Raises:
        ValueError: if E carries an unknown operator tag. Previously the
            string "???" was returned, which is truthy and was silently
            treated as True (or negated to False) by callers.
    """
    op = opr(E)
    if op in ("str", "mty"):
        return False
    if op in ("@", "*"):
        return True
    if op == "!":
        return not nullable(arg(E))
    if op == "+":
        return nullable(arg1(E)) or nullable(arg2(E))
    if op == ".":
        return nullable(arg1(E)) and nullable(arg2(E))
    raise ValueError("nullable: unknown RE node %r" % (E,))

def dv(c, E):
    """Return the derivative of RE AST E with respect to character c.

    The result is an AST for { w : c + w is in the language of E }. No
    simplification is performed, so repeated derivatives grow in size.

    Rules (D = derivative w.r.t. c):
      D('str' x)   = epsilon if x == c, else the empty language
      D(epsilon)   = D(empty language) = empty language
      D(r*)        = D(r) . r*
      D(!r)        = !D(r)
      D(r1 + r2)   = D(r1) + D(r2)
      D(r1 . r2)   = D(r1) . r2                if r1 is not nullable
                   = (D(r1) . r2) + D(r2)      if r1 is nullable

    Raises:
        ValueError: if E carries an unknown operator tag. Previously the
            string "???" was returned as if it were an AST node, which
            silently corrupted every later derivative.
    """
    op = opr(E)
    if op == "str":
        return ("@", "@") if arg(E) == c else ("mty", "mty")
    if op in ("@", "mty"):
        return ("mty", "mty")
    if op == "*":
        return (".", (dv(c, arg(E)), E))
    if op == "!":
        return ("!", dv(c, arg(E)))
    if op == "+":
        return ("+", (dv(c, arg1(E)), dv(c, arg2(E))))
    if op == ".":
        head, tail = arg1(E), arg2(E)
        if nullable(head):
            # The head may match the empty string, so c may also be consumed
            # by the tail.
            return ("+", ((".", (dv(c, head), tail)), dv(c, tail)))
        return (".", (dv(c, head), tail))
    raise ValueError("dv: unknown RE node %r" % (E,))

def matches(w, E):
    """Return whether string w belongs to the language of RE AST E.

    E is differentiated by each character of w in order; w matches iff the
    resulting expression is nullable.

    This is a loop rather than recursion on w[1:]: the recursive form used
    one Python stack frame and one copy of the remaining input per character
    (quadratic copying, and extra stack depth on long inputs). dv itself may
    still recurse deeply on the growing AST.
    """
    for ch in w:
        E = dv(ch, E)
    return nullable(E)

    

In [3]:
# "aa" is not a whole number of "aaa" blocks, so it lies in the complement
# of (aaa)* and the match succeeds.
print(' matches("aa", re2ast("!((aaa)*)")) = ', matches("aa", re2ast("!((aaa)*)")))
# re4: strings over {a, b} whose fourth symbol from the end is b.
# nre4: the complement of re4.
re4 = '(a+b)*b(a+b)(a+b)(a+b)'
nre4 = '!((a+b)*b(a+b)(a+b)(a+b))'

# The fourth symbol from the end of "aabaa" is 'a', so re4 rejects it and
# nre4 accepts it.
print(' matches("aabaa", re2ast(re4)) = ', matches("aabaa", re2ast(re4)))
print(' matches("aabaa", re2ast(nre4)) = ', matches("aabaa", re2ast(nre4)))


 matches("aa", re2ast("!((aaa)*)")) =  True
 matches("aabaa", re2ast(re4)) =  False
 matches("aabaa", re2ast(nre4)) =  True


In [4]:
# "ab" is not the one-character string "b", so it is in the complement !b.
matches("ab", re2ast("!b"))

True