In [8]:
import re

class TokenType:
    INTEGER_LITERAL = "INTEGER_LITERAL"
    BOOLEAN_LITERAL = "BOOLEAN_LITERAL"
    IDENTIFIER = "IDENTIFIER"
    PLUS = "PLUS"
    MINUS = "MINUS"
    MULTIPLY = "MULTIPLY"
    DIVIDE = "DIVIDE"
    ASSIGN = "ASSIGN"
    EQUAL = "EQUAL"
    NOTEQUAL = "NOTEQUAL"
    IF = "IF"
    ELSE = "ELSE"
    PRINT = "PRINT"
    TRUE = "TRUE"
    FALSE = "FALSE"
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"
    LBRACE = "LBRACE"
    RBRACE = "RBRACE"
    SEMICOLON = "SEMICOLON"
    COMMENT = "COMMENT"
    ERROR = "ERROR"

# Define regular expressions for tokens
int_literal_pattern = r"\d+"
bool_literal_pattern = r"true|false"
identifier_pattern = r"[a-zA-Z][a-zA-Z0-9]*"
operators_pattern = r"[+\-*/=]=?|!="
keywords_pattern = r"if|else|print|true|false"
parentheses_braces_pattern = r"[(){};]"
comment_pattern = r"//.*"

# Combine all patterns
pattern = f"({int_literal_pattern})|({bool_literal_pattern})|({identifier_pattern})|({operators_pattern})|({keywords_pattern})|({parentheses_braces_pattern})|({comment_pattern})"

# Tokenize function
def tokenize(source_code_file):
    tokens = []
    line_number = 1
    with open(source_code_file, 'r') as file:
        for line in file:
            for match in re.finditer(pattern, line):
                for group_id, group in enumerate(match.groups()):
                    if group:
                        token_type = list(TokenType.__dict__.values())[group_id]
                        lexeme = group
                        if token_type == TokenType.COMMENT:
                            break
                        if token_type == TokenType.IDENTIFIER and len(lexeme) > 20:
                            print(f"Lexical Error: Identifier too long at line {line_number}")
                            return
                        tokens.append((token_type, lexeme, line_number))
                        break
            line_number += 1
    return tokens

# Example usage
source_code_file = "input.code"
tokens = tokenize(source_code_file)
for token_type, lexeme, line_number in tokens:
    print(f"Token: {token_type}, Lexeme: {lexeme}, Line: {line_number}")


Token: IDENTIFIER, Lexeme: /, Line: 1
Token: IDENTIFIER, Lexeme: /, Line: 1
Token: BOOLEAN_LITERAL, Lexeme: Example, Line: 1
Token: BOOLEAN_LITERAL, Lexeme: MiniLang, Line: 1
Token: BOOLEAN_LITERAL, Lexeme: program, Line: 1
Token: BOOLEAN_LITERAL, Lexeme: a, Line: 2
Token: IDENTIFIER, Lexeme: =, Line: 2
Token: __main__, Lexeme: 5, Line: 2
Token: MINUS, Lexeme: ;, Line: 2
Token: BOOLEAN_LITERAL, Lexeme: b, Line: 3
Token: IDENTIFIER, Lexeme: =, Line: 3
Token: INTEGER_LITERAL, Lexeme: true, Line: 3
Token: MINUS, Lexeme: ;, Line: 3
Token: BOOLEAN_LITERAL, Lexeme: c, Line: 4
Token: IDENTIFIER, Lexeme: =, Line: 4
Token: BOOLEAN_LITERAL, Lexeme: a, Line: 4
Token: IDENTIFIER, Lexeme: *, Line: 4
Token: __main__, Lexeme: 2, Line: 4
Token: IDENTIFIER, Lexeme: +, Line: 4
Token: __main__, Lexeme: 1, Line: 4
Token: MINUS, Lexeme: ;, Line: 4
