In [13]:
import re

# Token types
INTEGER = 'INTEGER'
BOOLEAN = 'BOOLEAN'
PLUS = 'PLUS'
MINUS = 'MINUS'
MULTIPLY = 'MULTIPLY'
DIVIDE = 'DIVIDE'
ASSIGN = 'ASSIGN'
EQUAL = 'EQUAL'
NOTEQUAL = 'NOTEQUAL'
IF = 'IF'
ELSE = 'ELSE'
PRINT = 'PRINT'
TRUE = 'TRUE'
FALSE = 'FALSE'
IDENTIFIER = 'IDENTIFIER'
COMMENT = 'COMMENT'
EOF = 'EOF'

# Regular expressions for token patterns
patterns = [
    (r'\d+', INTEGER),
    (r'true|false', BOOLEAN),
    (r'\+', PLUS),
    (r'-', MINUS),
    (r'\*', MULTIPLY),
    (r'/', DIVIDE),
    (r'=', ASSIGN),
    (r'==', EQUAL),
    (r'!=', NOTEQUAL),
    (r'if', IF),
    (r'else', ELSE),
    (r'print', PRINT),
    (r'[a-zA-Z][a-zA-Z0-9]*', IDENTIFIER),
    (r'//.*', COMMENT),
]

# Function to tokenize the input string
# Function to tokenize the input string
def tokenize(text):
    tokens = []
    while text:
        matched = False
        for pattern, token_type in patterns:
            match = re.match(pattern, text)
            if match:
                value = match.group(0)
                tokens.append((token_type, value))
                text = text[len(value):].strip()
                matched = True
                break
        if not matched:
            raise Exception('Invalid character: ' + text[0])
    return tokens


# Function to read MiniLang source code from a file and tokenize it
def tokenize_file(filename):
    with open(filename, 'r') as file:
        source_code = file.read()
    return tokenize(source_code)

# MiniLang parser class
class MiniLangParser:
    def __init__(self, tokens):
        self.tokens = tokens
        self.current_token = None
        self.current_index = -1
        self.advance()

    def advance(self):
        self.current_index += 1
        if self.current_index < len(self.tokens):
            self.current_token = self.tokens[self.current_index]
        else:
            self.current_token = (EOF, None)

    def eat(self, token_type):
        if self.current_token[0] == token_type:
            self.advance()
        else:
            raise SyntaxError(f"Expected {token_type}, found {self.current_token[0]}")

    def factor(self):
        token = self.current_token
        if token[0] == INTEGER:
            self.eat(INTEGER)
            return ('INTEGER', token[1])
        elif token[0] == BOOLEAN:
            self.eat(BOOLEAN)
            return ('BOOLEAN', token[1])
        elif token[0] == IDENTIFIER:
            self.eat(IDENTIFIER)
            return ('IDENTIFIER', token[1])
        elif token[0] == '(':
            self.eat('(')
            result = self.expr()
            self.eat(')')
            return result

    def term(self):
        result = self.factor()
        while self.current_token[0] in (MULTIPLY, DIVIDE):
            token = self.current_token
            if token[0] == MULTIPLY:
                self.eat(MULTIPLY)
            elif token[0] == DIVIDE:
                self.eat(DIVIDE)
            result = ('BINOP', result, token[1], self.factor())
        return result

    def expr(self):
        result = self.term()
        while self.current_token[0] in (PLUS, MINUS):
            token = self.current_token
            if token[0] == PLUS:
                self.eat(PLUS)
            elif token[0] == MINUS:
                self.eat(MINUS)
            result = ('BINOP', result, token[1], self.term())
        return result

    def parse(self):
        return self.expr()


# Test the parser with some MiniLang code
if __name__ == '__main__':
    filename = '/content/sample_data/example.minilang'  # MiniLang source code file
    try:
        tokens = tokenize_file(filename)
        parser = MiniLangParser(tokens)
        ast = parser.parse()
        print("Abstract Syntax Tree (AST):", ast)
    except Exception as e:
        print('Error:', e)


Abstract Syntax Tree (AST): ('BINOP', ('BINOP', None, '/', None), '/', ('IDENTIFIER', 'MiniLang'))
