In [5]:
import re

# Set of reserved words
RESERVED_WORDS = ['if', 'else', 'while', 'for', 'foreach', 'return']

# Set of token names
TOKEN_NAMES = ['ID', 'NUMBER', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'ASSIGN', 'LPAREN', 'RPAREN', 'LBRACKET', 'RBRACKET', 'LBRACE', 'RBRACE', 'LT', 'LE', 'GT', 'GE', 'EQ', 'NE'] + RESERVED_WORDS

def get_tokens(text):
    # Splitting the text into lines
    lines = text.split('\n')

    # Initialize the list of tokens
    tokens = []

    # Regular expressions for each token type
    token_regexes = {
        'ID': r'[a-zA-Z_][a-zA-Z0-9_]*',
        'NUMBER': r'\d+(\.\d+)?',
        'PLUS': r'\+',
        'MINUS': r'-',
        'TIMES': r'\*',
        'DIVIDE': r'/',
        'ASSIGN': r'=',
        'LPAREN': r'\(',
        'RPAREN': r'\)',
        'LBRACKET': r'\[',
        'RBRACKET': r'\]',
        'LBRACE': r'\{',
        'RBRACE': r'\}',
        'LT': r'<',
        'LE': r'<=',
        'GT': r'>',
        'GE': r'>=',
        'EQ': r'==',
        'NE': r'!=',
        'SE': r';',
        'SP': r' ',
        
    }

    # Compiling the regular expressions
    token_patterns = [ (name, re.compile(regex)) for name, regex in token_regexes.items() ]

    # Line and character position
    line_num = 1
    pos = 0

    # Iterate through each line of the text
    for line in lines:
        # While the line has characters
        while pos < len(line):
            # Iterate through the token patterns
            for name, pattern in token_patterns:
                # Check if the pattern matches
                match = pattern.match(line, pos)
                if match:
                    # Extract the token value
                    value = match.group()
                    # Check if the token is a reserved word
                    if name in RESERVED_WORDS:
                        tokens.append((name, value))
                    else:
                        tokens.append((name, value))
                    # Update the character position
                    pos = match.end()
                    break
            else:
                # If no pattern matches, it's an error
                raise Exception('Invalid character at line {0}, character {1}'.format(line_num, pos))
        # Update the line number and reset the character position
        line_num += 1
        pos = 0

    return tokens

In [7]:
# Read in the text file
with open('input.txt', 'r') as f:
    text = f.read()

# Get the list of tokens
tokens = get_tokens(text)

# Print the tokens
for token in tokens:
    print(token)


('ID', 'foreach')
('SP', ' ')
('LPAREN', '(')
('ID', 'int')
('SP', ' ')
('ID', 'i')
('SP', ' ')
('ID', 'in')
('SP', ' ')
('ID', 'list')
('RPAREN', ')')
('SP', ' ')
('LBRACE', '{')
('SP', ' ')
('SP', ' ')
('ID', 'if')
('SP', ' ')
('LPAREN', '(')
('ID', 'i')
('SP', ' ')
('GT', '>')
('SP', ' ')
('NUMBER', '0')
('RPAREN', ')')
('SP', ' ')
('LBRACE', '{')
('SP', ' ')
('SP', ' ')
('SP', ' ')
('SP', ' ')
('ID', 'sum')
('SP', ' ')
('PLUS', '+')
('ASSIGN', '=')
('SP', ' ')
('ID', 'i')
('SE', ';')
('SP', ' ')
('SP', ' ')
('RBRACE', '}')
('RBRACE', '}')
