In [4]:
import re

# Token types
INTEGER = 'INTEGER'
BOOLEAN = 'BOOLEAN'
PLUS = 'PLUS'
MINUS = 'MINUS'
MULTIPLY = 'MULTIPLY'
DIVIDE = 'DIVIDE'
ASSIGN = 'ASSIGN'
EQUAL = 'EQUAL'
NOTEQUAL = 'NOTEQUAL'
IF = 'IF'
ELSE = 'ELSE'
PRINT = 'PRINT'
TRUE = 'TRUE'
FALSE = 'FALSE'
IDENTIFIER = 'IDENTIFIER'
COMMENT = 'COMMENT'
EOF = 'EOF'

# Regular expressions for token patterns
patterns = [
    (r'\d+', INTEGER),
    (r'true|false', BOOLEAN),
    (r'\+', PLUS),
    (r'-', MINUS),
    (r'\*', MULTIPLY),
    (r'/', DIVIDE),
    (r'=', ASSIGN),
    (r'==', EQUAL),
    (r'!=', NOTEQUAL),
    (r'if', IF),
    (r'else', ELSE),
    (r'print', PRINT),
    (r'[a-zA-Z][a-zA-Z0-9]*', IDENTIFIER),
    (r'//.*', COMMENT),
]

# Function to tokenize the input string
def tokenize(text):
    tokens = []
    while text:
        matched = False
        for pattern, token_type in patterns:
            match = re.match(pattern, text)
            if match:
                value = match.group(0)
                tokens.append((token_type, value))
                text = text[len(value):].strip()
                matched = True
                break
        if not matched:
            raise Exception('Invalid character: ' + text[0])
    return tokens

# Function to read MiniLang source code from a file and tokenize it
def tokenize_file(filename):
    with open(filename, 'r') as file:
        source_code = file.read()
    return tokenize(source_code)

# Test the scanner with some MiniLang code
if __name__ == '__main__':
    filename = '/content/sample_data/example.minilang'
    try:
        tokens = tokenize_file(filename)
        for token in tokens:
            print(token)
    except Exception as e:
        print('Error:', e)


('DIVIDE', '/')
('DIVIDE', '/')
('IDENTIFIER', 'This')
('IDENTIFIER', 'is')
('IDENTIFIER', 'a')
('IDENTIFIER', 'MiniLang')
('IDENTIFIER', 'program')
('IDENTIFIER', 'x')
('ASSIGN', '=')
('INTEGER', '10')
('IDENTIFIER', 'y')
('ASSIGN', '=')
('INTEGER', '5')
('IF', 'if')
('IDENTIFIER', 'x')
('ASSIGN', '=')
('ASSIGN', '=')
('INTEGER', '10')
('PRINT', 'print')
('BOOLEAN', 'true')
('ELSE', 'else')
('PRINT', 'print')
('BOOLEAN', 'false')
