In [1]:
keywords = [
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do', 'double', 'else', 'enum', 'extern',
    'float', 'for', 'goto', 'if', 'include', 'int', 'long', 'register', 'return', 'short', 'signed', 'sizeof',
    'static', 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile', 'while', '%d', '%c', '%f', '%s'
]

In [2]:
operators = [
    '+', '-', '*', '/', '%', '&', '|', '^', '!', '=', '~', '?', ':'
]

In [3]:
delimiters = [
    '{', '}',
    '(', ')',
    '[', ']',
    '<', '>',
    ';', ',',
]

In [4]:
class LexicalError(Exception):
    pass

In [5]:
import re

input_file = './input.c'
code = []

def pre_processing() -> None:
    # Get all code
    with open(input_file, 'r') as f:
        for line in f:
            words = line.strip()
            words = re.sub(r'\#.+|\\[abfnrtv]|\%[dcsfegxXioubpn]|//.*|/\*.*?\*/|\'|\"', r' ', words)

            for ch in delimiters:
                words = words.replace(ch, f' {ch} ')

            words = words.split()
            if len(words) != 0:
                code.append(words)

    # Process end back-slash
    for i, line in enumerate(code):
        if line[-1] == '\\' and i < len(code)-1:
            code[i].pop(-1)
            code[i].extend(code[i+1])
            code.pop(i+1)
        elif line[-1] == '\\' and i >= len(code)-1:
            raise LexicalError(f"Invalid character '{line[-1]}' at line {i+1}")
        else:
            pass

pre_processing()
code

[['const', 'int', 'b', '=', '100', ';'],
 ['static', 'int', 'a', ';'],
 ['int', 'main', '(', ')', '{'],
 ['101', ':', 'a', '=', '2', '*', '(', '1', '+', '3', ')', ';'],
 ['if', '(', 'b', '>', '10', ')', '{'],
 ['a', '=', '1', ';'],
 ['}', 'else', 'if', '(', 'b', '>', '=', '5', ')', '{'],
 ['a', '=', '2', ';'],
 ['}', 'else', '{'],
 ['goto', '101', ';'],
 ['}'],
 ['printf', '(', ',', 'b', ')', ';'],
 ['return', '0', ';'],
 ['}']]

In [6]:
class LexicalAnalyzer:
    def __init__(self, code: list) -> None:
        self.code = code
        self.tokens = {
            'Keyword': [],
            'Identifier': [],
            'Number': [],
            'Operator': [],
            'Delimiter': [],
            'Label': [],
        }
        self.is_label = False

    def lex(self) -> dict:
        for line_num, words in enumerate(self.code):
            for word in words:
                # goto
                if self.is_label:
                    self.show('Label', word)
                    self.tokens['Label'].append((line_num, word))
                    self.is_label = False
                else:
                    if word in keywords:
                        if word == 'goto':
                            self.is_label = True
                        self.show('Keyword', word)
                        self.tokens['Keyword'].append((line_num, word))
                    elif word in operators:
                        self.show('Operator', word)
                        self.tokens['Operator'].append((line_num, word))
                    elif word in delimiters:
                        self.show('Delimiter', word)
                        self.tokens['Delimiter'].append((line_num, word))
                    elif word.isdigit():
                        self.show('Number', word)
                        self.tokens['Number'].append((line_num, word))
                    else:
                        # Labels
                        if word[-1] == ':':
                            self.show('Label', word[:-1])
                            self.tokens['Label'].append((line_num, word[:-1]))
                        # Identifiers
                        elif word[0].isalpha() or word[0] == '_':
                            self.show('Identifier', word)
                            self.tokens['Identifier'].append((line_num, word))
                        else:
                            raise LexicalError(f"Invalid character '{word}' at line {line_num + 1}")
        return self.tokens

    @staticmethod
    def show(key: str, value: str) -> None:
        print(f'({key}, \'{value}\')')

    def show_dic(self) -> None:
        for key, value in self.tokens.items():
            self.show(key, value)

In [7]:
lexical_analyzer = LexicalAnalyzer(code)
dic = lexical_analyzer.lex()

(Keyword, 'const')
(Keyword, 'int')
(Identifier, 'b')
(Operator, '=')
(Number, '100')
(Delimiter, ';')
(Keyword, 'static')
(Keyword, 'int')
(Identifier, 'a')
(Delimiter, ';')
(Keyword, 'int')
(Identifier, 'main')
(Delimiter, '(')
(Delimiter, ')')
(Delimiter, '{')
(Number, '101')
(Operator, ':')
(Identifier, 'a')
(Operator, '=')
(Number, '2')
(Operator, '*')
(Delimiter, '(')
(Number, '1')
(Operator, '+')
(Number, '3')
(Delimiter, ')')
(Delimiter, ';')
(Keyword, 'if')
(Delimiter, '(')
(Identifier, 'b')
(Delimiter, '>')
(Number, '10')
(Delimiter, ')')
(Delimiter, '{')
(Identifier, 'a')
(Operator, '=')
(Number, '1')
(Delimiter, ';')
(Delimiter, '}')
(Keyword, 'else')
(Keyword, 'if')
(Delimiter, '(')
(Identifier, 'b')
(Delimiter, '>')
(Operator, '=')
(Number, '5')
(Delimiter, ')')
(Delimiter, '{')
(Identifier, 'a')
(Operator, '=')
(Number, '2')
(Delimiter, ';')
(Delimiter, '}')
(Keyword, 'else')
(Delimiter, '{')
(Keyword, 'goto')
(Label, '101')
(Delimiter, ';')
(Delimiter, '}')
(Identifier,

In [8]:
lexical_analyzer.show_dic()

(Keyword, '[(0, 'const'), (0, 'int'), (1, 'static'), (1, 'int'), (2, 'int'), (4, 'if'), (6, 'else'), (6, 'if'), (8, 'else'), (9, 'goto'), (12, 'return')]')
(Identifier, '[(0, 'b'), (1, 'a'), (2, 'main'), (3, 'a'), (4, 'b'), (5, 'a'), (6, 'b'), (7, 'a'), (11, 'printf'), (11, 'b')]')
(Number, '[(0, '100'), (3, '101'), (3, '2'), (3, '1'), (3, '3'), (4, '10'), (5, '1'), (6, '5'), (7, '2'), (12, '0')]')
(Operator, '[(0, '='), (3, ':'), (3, '='), (3, '*'), (3, '+'), (5, '='), (6, '='), (7, '=')]')
(Delimiter, '[(0, ';'), (1, ';'), (2, '('), (2, ')'), (2, '{'), (3, '('), (3, ')'), (3, ';'), (4, '('), (4, '>'), (4, ')'), (4, '{'), (5, ';'), (6, '}'), (6, '('), (6, '>'), (6, ')'), (6, '{'), (7, ';'), (8, '}'), (8, '{'), (9, ';'), (10, '}'), (11, '('), (11, ','), (11, ')'), (11, ';'), (12, ';'), (13, '}')]')
(Label, '[(9, '101')]')
