In [1]:
keywords = [
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do', 'double', 'else', 'enum', 'extern',
    'float', 'for', 'goto', 'if', 'include', 'int', 'long', 'register', 'return', 'short', 'signed', 'sizeof',
    'static', 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile', 'while', '%d', '%c', '%f', '%s'
]

In [2]:
operators = [
    '+', '-', '*', '/', '%', '&', '|', '^', '!', '=', '~', '?', ':'
]

In [3]:
delimiters = [
    '{', '}',
    '(', ')',
    '[', ']',
    '<', '>',
    ';', ',',
]

In [4]:
class LexicalError(Exception):
    pass

In [47]:
import re

input_file = './input.c'
characters = []
headers = []
code = []

def pre_processing() -> None:
    # Get all code
    with open(input_file, 'r') as f:
        for line in f:
            # Remove leading and trailing spaces
            words = line.strip()

            # Get header
            try:
                header = re.search(r'<(.+?)>', words)
            except Exception as err: 
                raise LexicalError(err)
                
            if header is not None:
                headers.append(header.group(1))
                
            # Remove headers, escape sequence, format specifiers, commands
            words = re.sub(r'\#.+|"%[dcsfegxXioubpn].*?(?<!\\)(?:\\\\)*\\[abfnrtv]?"?|//.*|/\*(.*?)\*/', r' ', words)
            
            for ch in delimiters:
                words = words.replace(ch, f' {ch} ')

            # Get characters
            try:
                character = re.search(r'\"(.*?)\"|\'\w\'', words)
                words = re.sub(r'\"(.*?)\"|\'\w\'', r' ', words)
            except Exception as err:
                raise LexicalError(err)
            
            if character is not None:
                print(f'character: {character.group()[1:-1]}')
                characters.append(character.group()[1:-1])
            
            words = words.split()
            if len(words) != 0:
                code.append(words)

    # Process end back-slash
    for i, line in enumerate(code):
        if line[-1] == '\\' and i < len(code)-1:
            code[i].pop(-1)
            code[i].extend(code[i+1])
            code.pop(i+1)
        elif line[-1] == '\\' and i >= len(code)-1:
            raise LexicalError(f"Invalid character '{line[-1]}' at line {i+1}")
        else:
            pass
    
    # Write code into pre-processing.txt
    with open('./pre-processing.txt', 'w') as f:
        for line in code:
            f.write(' '.join(line) + '\n')

pre_processing()
headers, characters, code

character: Hello ,  world!
character: c


(['stdio.h', 'stdlib.h'],
 ['Hello ,  world!', 'c'],
 [['static', 'int', 'a', ';'],
  ['const', 'int', 'b', '=', '100', ';'],
  ['char', '*', 'message', '=', ';'],
  ['char', 'ch', '=', ';'],
  ['int', 'main', '(', ')', '{'],
  ['101', ':', 'a', '=', '2', '*', '(', '1', '+', '3', ')', ';'],
  ['if', '(', 'b', '>', '10', ')', '{'],
  ['a', '=', '1', ';'],
  ['}', 'else', 'if', '(', 'b', '>', '=', '5', ')', '{'],
  ['a', '=', '2', ';'],
  ['}', 'else', '{'],
  ['goto', '101', ';'],
  ['}'],
  ['printf', '(', ',', 'b', ')', ';'],
  ['printf', '(', ',', 'message', ')', ';'],
  ['return', '0', ';'],
  ['}']])

In [48]:
class LexicalAnalyzer:
    def __init__(self, code: list) -> None:
        self.code = code
        self.tokens = {
            'Keyword': [],
            'Identifier': [],
            'Number': [],
            'Operator': [],
            'Delimiter': [],
            'Label': [],
            'Headers': headers,
            'Characters': characters,
        }
        self.is_label = False

    def lex(self) -> dict:
        for line_num, words in enumerate(self.code):
            for word in words:
                # goto
                if self.is_label:
                    self.show('Label', word)
                    self.tokens['Label'].append((line_num, word))
                    self.is_label = False
                else:
                    if word in keywords:
                        if word == 'goto':
                            self.is_label = True
                        self.show('Keyword', word)
                        self.tokens['Keyword'].append((line_num, word))
                    elif word in operators:
                        self.show('Operator', word)
                        self.tokens['Operator'].append((line_num, word))
                    elif word in delimiters:
                        self.show('Delimiter', word)
                        self.tokens['Delimiter'].append((line_num, word))
                    elif word.isdigit():
                        self.show('Number', word)
                        self.tokens['Number'].append((line_num, word))
                    else:
                        # Labels
                        if word[-1] == ':':
                            self.show('Label', word[:-1])
                            self.tokens['Label'].append((line_num, word[:-1]))
                        # Identifiers
                        elif word[0].isalpha() or word[0] == '_':
                            self.show('Identifier', word)
                            self.tokens['Identifier'].append((line_num, word))
                        else:
                            raise LexicalError(f"Invalid character '{word}' at line {line_num + 1}")
        return self.tokens

    @staticmethod
    def show(key: str, value: str) -> None:
        print(f'({key}, \'{value}\')')

    def show_dic(self) -> None:
        for key, value in self.tokens.items():
            self.show(key, value)

In [49]:
lexical_analyzer = LexicalAnalyzer(code)
dic = lexical_analyzer.lex()

(Keyword, 'static')
(Keyword, 'int')
(Identifier, 'a')
(Delimiter, ';')
(Keyword, 'const')
(Keyword, 'int')
(Identifier, 'b')
(Operator, '=')
(Number, '100')
(Delimiter, ';')
(Keyword, 'char')
(Operator, '*')
(Identifier, 'message')
(Operator, '=')
(Delimiter, ';')
(Keyword, 'char')
(Identifier, 'ch')
(Operator, '=')
(Delimiter, ';')
(Keyword, 'int')
(Identifier, 'main')
(Delimiter, '(')
(Delimiter, ')')
(Delimiter, '{')
(Number, '101')
(Operator, ':')
(Identifier, 'a')
(Operator, '=')
(Number, '2')
(Operator, '*')
(Delimiter, '(')
(Number, '1')
(Operator, '+')
(Number, '3')
(Delimiter, ')')
(Delimiter, ';')
(Keyword, 'if')
(Delimiter, '(')
(Identifier, 'b')
(Delimiter, '>')
(Number, '10')
(Delimiter, ')')
(Delimiter, '{')
(Identifier, 'a')
(Operator, '=')
(Number, '1')
(Delimiter, ';')
(Delimiter, '}')
(Keyword, 'else')
(Keyword, 'if')
(Delimiter, '(')
(Identifier, 'b')
(Delimiter, '>')
(Operator, '=')
(Number, '5')
(Delimiter, ')')
(Delimiter, '{')
(Identifier, 'a')
(Operator, '=')
(

In [50]:
lexical_analyzer.show_dic()

(Keyword, '[(0, 'static'), (0, 'int'), (1, 'const'), (1, 'int'), (2, 'char'), (3, 'char'), (4, 'int'), (6, 'if'), (8, 'else'), (8, 'if'), (10, 'else'), (11, 'goto'), (15, 'return')]')
(Identifier, '[(0, 'a'), (1, 'b'), (2, 'message'), (3, 'ch'), (4, 'main'), (5, 'a'), (6, 'b'), (7, 'a'), (8, 'b'), (9, 'a'), (13, 'printf'), (13, 'b'), (14, 'printf'), (14, 'message')]')
(Number, '[(1, '100'), (5, '101'), (5, '2'), (5, '1'), (5, '3'), (6, '10'), (7, '1'), (8, '5'), (9, '2'), (15, '0')]')
(Operator, '[(1, '='), (2, '*'), (2, '='), (3, '='), (5, ':'), (5, '='), (5, '*'), (5, '+'), (7, '='), (8, '='), (9, '=')]')
(Delimiter, '[(0, ';'), (1, ';'), (2, ';'), (3, ';'), (4, '('), (4, ')'), (4, '{'), (5, '('), (5, ')'), (5, ';'), (6, '('), (6, '>'), (6, ')'), (6, '{'), (7, ';'), (8, '}'), (8, '('), (8, '>'), (8, ')'), (8, '{'), (9, ';'), (10, '}'), (10, '{'), (11, ';'), (12, '}'), (13, '('), (13, ','), (13, ')'), (13, ';'), (14, '('), (14, ','), (14, ')'), (14, ';'), (15, ';'), (16, '}')]')
(Labe