In [1]:
#Token class is used here to create the individual tokens that were identified in lexer class
#__init__ constructor used to initialize the object 
#__repr__ special method in python that will give a string representation of the object

class Token:
    """A single lexical token: the kind of terminal plus the matched text."""

    def __init__(self, token_identity, value):
        """Store the token's kind (`token_identity`) and its matched `value`."""
        self.token_identity, self.value = token_identity, value

    def __repr__(self):
        """Return the token rendered as ``<token_identity>: <value>``."""
        return "{}: {}".format(self.token_identity, self.value)

import re     #python module used for the regular expressions





#lexical analyzer used in HW3 just updated to add terminals for the grammar language for later
class Lexer:
    """Regex-driven lexical analyzer that turns source text into Token objects.

    `terminals` is an ordered list of (regex, token identity) pairs. At every
    position the patterns are tried top to bottom and the FIRST one that matches
    wins, so the order of the list is significant:

    * keywords come before the identifier pattern,
    * two-character operators (`==`, `<=`, `>=`) come before their one-character
      prefixes (`=`, `<`, `>`), otherwise they could never be produced,
    * the floating-point pattern comes before the integer pattern, otherwise
      `3.14` would be cut into the integer `3` followed by an unmatched `.`.

    A token identity of None marks text that is skipped (whitespace).
    """

    terminals = [
        (r'\b(int|string|char|float|double|short|bool|long)\b', 'DataType'),
        (r'\bif\b', 'IF'),
        (r'\bwhile\b', 'WHILE'),
        (r'\belse\b', 'ELSE'),
        (r';', 'SEMICOLON'),
        (r',', 'COMMA'),
        (r'\+', 'ADDITION'),
        (r'\-', 'SUBTRACTION'),
        (r'\*', 'MULTIPLICATION'),
        (r'/', 'DIVISION'),
        (r'%', 'MODULO'),
        (r'\(', 'OPEN_PARENTHESIS'),
        (r'\)', 'CLOSE_PARENTHESIS'),
        (r'\{', 'OPEN_BRACE'),
        (r'\}', 'CLOSE_BRACE'),
        (r'==', 'EQUALS'),                      # must precede '='
        (r'=', 'ASSIGNMENT'),
        (r'<=', 'LESS_THAN_OR_EQUAL_TO'),       # must precede '<'
        (r'<', 'LESS_THAN'),
        (r'>=', 'GREATER_THAN_OR_EQUAL_TO'),    # must precede '>'
        (r'>', 'GREATER_THAN'),
        (r'&&', 'LOGICAL_AND'),
        (r'\|\|', 'LOGICAL_OR'),
        (r'!=', 'NOT_EQUAL'),
        # Whole identifier as one token (e.g. `sum`), not one token per letter.
        (r'[A-Za-z_][A-Za-z0-9_]*', 'VARIABLE_IDENTIFIER'),
        (r'\d+\.\d+', 'FLOATING_POINT_LITERAL'),    # must precede the integer pattern
        (r'\d+(i8|i16|i32|i64)?', 'INTEGER_LITERAL'),
        (r'\s+', None),       # whitespace is consumed but produces no token
    ]

    def Tokenization(self, text):
        """Split `text` into tokens.

        Args:
            text: the source string to analyse.

        Returns:
            A list of Token objects in source order; whitespace is dropped.

        Raises:
            ValueError: if no terminal matches at some position (previously this
                situation looped forever because the position never advanced).
        """
        # Compile once per call; read through `self` so an overridden
        # `terminals` list is honoured.
        compiled_terminals = [(re.compile(regex), identity) for regex, identity in self.terminals]

        the_tokens = []
        current_position = 0

        while current_position < len(text):
            for pattern, token_identity in compiled_terminals:
                # Matching with an explicit start offset (instead of slicing the
                # text) keeps `\b` aware of the character before the offset and
                # avoids copying the remaining text on every attempt.
                matcher = pattern.match(text, current_position)
                if matcher and matcher.end() > current_position:
                    break
            else:
                # No terminal matched: report it instead of spinning forever.
                raise ValueError(
                    f"Unrecognized character {text[current_position]!r} at position {current_position}"
                )

            if token_identity:      # None means "skip" (whitespace)
                the_tokens.append(Token(token_identity, matcher.group(0)))
            current_position = matcher.end()

        return the_tokens


#IPython magics: load the autoreload extension and (mode 2) automatically reload imported modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
class RDA:
    """Recursive-descent recognizer for a small C-like statement grammar.

    The parser walks a list of token objects (anything exposing a
    `token_identity` attribute) and answers True/False: do the tokens form
    valid input? One method exists per grammar rule; each returns True when the
    rule was recognized starting at `self.position` and advances `position`
    past the tokens it consumed.

    Grammar:
        <STMT>       --> <IF_STMT> | <BLOCK> | <ASSIGN> | <DECLARE> | <WHILE_LOOP>
        <STMT_LIST>  --> { <STMT> `;` }
        <WHILE_LOOP> --> `while` `(` <BOOL_EXPR> `)` <BLOCK>
        <IF_STMT>    --> `if` `(` <BOOL_EXPR> `)` <BLOCK> [ `else` <BLOCK> ]
        <BLOCK>      --> `{` <STMT_LIST> `}`
        <DECLARE>    --> DataType ID { `,` ID }
        <ASSIGN>     --> ID `=` <EXPR>
        <EXPR>       --> <TERM> { (`+`|`-`) <TERM> }
        <TERM>       --> <FACT> { (`*`|`/`|`%`) <FACT> }
        <FACT>       --> ID | INT_LIT | FLOAT_LIT | `(` <EXPR> `)`
        <BOOL_EXPR>  --> <BTERM> { (`>`|`<`|`>=`|`<=`) <BTERM> }
        <BTERM>      --> <BAND> { (`==`|`!=`) <BAND> }
        <BAND>       --> <BOR> { `&&` <BOR> }
        <BOR>        --> <EXPR> { `||` <EXPR> }
    """

    def __init__(self, tokenized_Terminals):
        """Remember the token list and start reading at its first token."""
        self.tokenized_Terminals = tokenized_Terminals
        self.position = 0

    # ------------------------------------------------------------------ helpers

    def matches(self, token_identity):
        """Consume the current token if its identity equals `token_identity`.

        Returns True (and advances `position` by one) on a match; returns False
        without consuming anything on a mismatch or when the input is exhausted.
        """
        if self.position < len(self.tokenized_Terminals) and self.tokenized_Terminals[self.position].token_identity == token_identity:
            self.position += 1
            return True
        return False

    def _matches_any(self, *token_identities):
        """Consume the current token if it is one of `token_identities`."""
        return any(self.matches(identity) for identity in token_identities)

    def _attempt(self, rule):
        """Run `rule`; on failure rewind `position` so another rule can be tried."""
        start = self.position
        if rule():
            return True
        self.position = start
        return False

    def _chain(self, operand, *operators):
        """Recognize `operand { (op1|op2|...) operand }` for the given operator tokens."""
        if not operand():
            return False
        while self._matches_any(*operators):
            if not operand():
                return False
        return True

    def _condition_then_block(self):
        """Recognize the shared tail `(` <BOOL_EXPR> `)` <BLOCK> of if/while."""
        return (
            self.matches('OPEN_PARENTHESIS')
            and self.BOOL_EXPR()
            and self.matches('CLOSE_PARENTHESIS')
            and self.BLOCK()
        )

    # -------------------------------------------------------------- entry point

    def Parse(self):
        """Check the whole token list and return True only if all of it is valid.

        Accepts `<STMT> { ';' <STMT> } [ ';' ]`, i.e. a single statement or a
        `;`-separated sequence of statements with an optional trailing `;`.
        Every token must be consumed: leftover tokens after the last statement
        make the result False. Parsing always restarts from the first token, so
        calling Parse() repeatedly yields the same answer.
        """
        self.position = 0
        total = len(self.tokenized_Terminals)

        if not self.STMT():
            return False
        while self.matches('SEMICOLON'):
            if self.position == total:      # trailing ';' after the last statement
                break
            if not self.STMT():
                return False
        return self.position == total

    # ---------------------------------------------------------------- statements

    def STMT(self):
        """<STMT> --> <IF_STMT> | <BLOCK> | <ASSIGN> | <DECLARE> | <WHILE_LOOP>

        Each alternative is tried from the same starting position; a failed
        alternative is rewound so that tokens it consumed before failing cannot
        leak into the next alternative. On failure `position` is unchanged.
        """
        alternatives = (self.IF_STMT, self.BLOCK, self.ASSIGN, self.DECLARE, self.WHILE_LOOP)
        return any(self._attempt(rule) for rule in alternatives)

    def STMT_LIST(self):
        """<STMT_LIST> --> { <STMT> `;` }

        Zero or more statements, each one followed by a `;`. Stops (successfully)
        at the first position where no statement starts; fails when a statement
        is not followed by `;`.
        """
        while self.STMT():
            if not self.matches('SEMICOLON'):
                return False
        # STMT() rewinds on failure, so the token that ended the list is still
        # available to the caller (normally the closing brace).
        return True

    def WHILE_LOOP(self):
        """<WHILE_LOOP> --> `while` `(` <BOOL_EXPR> `)` <BLOCK>"""
        return self.matches('WHILE') and self._condition_then_block()

    def IF_STMT(self):
        """<IF_STMT> --> `if` `(` <BOOL_EXPR> `)` <BLOCK> [ `else` <BLOCK> ]"""
        if not (self.matches('IF') and self._condition_then_block()):
            return False
        if self.matches('ELSE'):
            # Once `else` is present its block is mandatory.
            return self.BLOCK()
        return True

    def BLOCK(self):
        """<BLOCK> --> `{` <STMT_LIST> `}`"""
        return (
            self.matches('OPEN_BRACE')
            and self.STMT_LIST()
            and self.matches('CLOSE_BRACE')
        )

    def DECLARE(self):
        """<DECLARE> --> DataType ID { `,` ID }"""
        if not (self.matches('DataType') and self.matches('VARIABLE_IDENTIFIER')):
            return False
        while self.matches('COMMA'):
            if not self.matches('VARIABLE_IDENTIFIER'):
                return False
        return True

    def ASSIGN(self):
        """<ASSIGN> --> ID `=` <EXPR>"""
        return (
            self.matches('VARIABLE_IDENTIFIER')
            and self.matches('ASSIGNMENT')
            and self.EXPR()
        )

    # --------------------------------------------------------------- expressions

    def EXPR(self):
        """<EXPR> --> <TERM> { (`+`|`-`) <TERM> }"""
        return self._chain(self.TERM, 'ADDITION', 'SUBTRACTION')

    def TERM(self):
        """<TERM> --> <FACT> { (`*`|`/`|`%`) <FACT> }"""
        # 'DIVISION' was previously misspelled, so `/` was never consumed here.
        return self._chain(self.FACT, 'MULTIPLICATION', 'DIVISION', 'MODULO')

    def FACT(self):
        """<FACT> --> ID | INT_LIT | FLOAT_LIT | `(` <EXPR> `)`"""
        if self._matches_any('VARIABLE_IDENTIFIER', 'INTEGER_LITERAL', 'FLOATING_POINT_LITERAL'):
            return True
        if self.matches('OPEN_PARENTHESIS'):
            return self.EXPR() and self.matches('CLOSE_PARENTHESIS')
        return False

    def BOOL_EXPR(self):
        """<BOOL_EXPR> --> <BTERM> { (`>`|`<`|`>=`|`<=`) <BTERM> }"""
        return self._chain(
            self.BTERM,
            'GREATER_THAN', 'LESS_THAN', 'GREATER_THAN_OR_EQUAL_TO', 'LESS_THAN_OR_EQUAL_TO',
        )

    def BTERM(self):
        """<BTERM> --> <BAND> { (`==`|`!=`) <BAND> }"""
        return self._chain(self.BAND, 'EQUALS', 'NOT_EQUAL')

    def BAND(self):
        """<BAND> --> <BOR> { `&&` <BOR> }"""
        return self._chain(self.BOR, 'LOGICAL_AND')

    def BOR(self):
        """<BOR> --> <EXPR> { `||` <EXPR> }"""
        return self._chain(self.EXPR, 'LOGICAL_OR')

In [8]:
# Driver cell: read a source file, tokenize it with Lexer, then check it with RDA.
#
# Inline sample that can be used instead of the file (kept commented out):
# text = "int x; int y; x = 100; y = 6; int sum; sum = x + y; if (sum > 10) { int z; z = 1; } else { int z; z = 4; }"
#
# Author's note: the text file contains 25 tokens that are valid for the grammar.
# NOTE(review): the file itself is not part of this notebook — confirm its contents.

# The path is relative to the kernel's working directory.
with open("test2_textDoc_validTokens.txt", "r") as f:
    text = f.read()



lexer = Lexer()
token_terminals = lexer.Tokenization(text)   # list of Token objects
parser = RDA(token_terminals)
result = parser.Parse()                      # boolean returned by the parser
print(result)

True
