# Lexical Analyzer for C (Extended)

**For the `input.c`**

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*example*/
static int a;
const int _b = 1\
    00; // new line test
char c = 'c';   // character test
double d = 4.2; // float number test
char * message = "Hello, world!";   // string test

int main() {
  // test goto
  L0: a = 2 * (1 + 3);
  if (_b > 10) {
    a = 1;
  } else if (_b >= 5) {
    a = 2;
  } else {
    goto L0;
  }

  /* 
   * Test 
   * multi-line
   * command 
  **/

  // print answer
  printf("%d\n", _b + a);
  printf("%s", message);
  puts("");
  return 0;
}
```

In [1]:
keywords = [
    'auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do', 'double', 'else', 'enum', 'extern',
    'float', 'for', 'goto', 'if', 'include', 'int', 'long', 'register', 'return', 'short', 'signed', 'sizeof',
    'static', 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile', 'while',
]

In [2]:
operators = [
    '<=', '>=', '==', '!=', '++', '--', '&&', '||', '+=', '-=',
    '+', '-', '*', '/', '%', '&', '|', '^', '!', '=', '~', '?', '<', '>',
]

In [3]:
delimiters = [
    '{', '}',
    '(', ')',
    '[', ']',
    '<', '>',
    ';', ',',
]

In [4]:
from utils.error import LexicalError

In [5]:
import re

input_file = './input.c'
characters = []
headers = []
code = ''

def pre_processing() -> None:
    # Get all code
    with open(input_file, 'r') as f:
        lines = f.readlines()
        
    for line in lines:
        global code
        
        # Remove all single-line commands
        line = re.sub(r'//.*|/\*(.*?)\*/', ' ', line)
        
        # Remove leading and trailing spaces
        code += line.strip() + ' '
        
    # Remove multi-line commands
    code = re.sub(r'/\*(.*?)\*/', ' ', code)

    try:
        # Get headers
        global headers
        headers = re.findall(r'<(.+?)>', code)
        # Remove headers
        code = re.sub(r'\#(.*?)>', ' ', code)
        
        # Get characters
        global characters
        for s, c in re.findall(r'\"(.*?)\"|\'(.?)\'', code):
            if s == '':
                characters.append(c)
            elif c == '':
                characters.append(s)
        # Remove characters
        code = re.sub(r'\"(.*?)\"|\'(.?)\'', ' ', code)
    except Exception as err: 
        raise LexicalError(f'Invalid character:\n{err}')
    
    # Separate delimiters
    for c in delimiters:
        code = code.replace(c, f' {c} ')
    
    # Separate dual-operators
    for c in operators[:10]:
        code = code.replace(c, f' {c} ')

    # Process backslashes
    for i, c in enumerate(code):
        if c == '\\':
            code = code[:i] + code[i+2:]
    
    code = code.split()
    lines = []

    for line in code:
        if line in operators[:10]:
            continue
        for c in line:
            if c in operators:
                line.replace(c, f' {c} ')
        lines.append(line)
    code = ' '.join(lines)

    # Write code into pre-processing.txt
    with open('./pre-processing.txt', 'w') as f:
        f.write(code)

In [6]:
pre_processing()
headers, characters, code

(['stdio.h', 'stdlib.h', 'string.h'],
 ['c', 'Hello, world!', '%d\\n', '%s', ''],
 'static int a ; const int _b = 100 ; char c = ; double d = 4.2 ; char * message = ; int main ( ) { L0: a = 2 * ( 1 + 3 ) ; if ( _b > 10 ) { a = 1 ; } else if ( _b > = 5 ) { a = 2 ; } else { goto L0 ; } printf ( , _b + a ) ; printf ( , message ) ; puts ( ) ; return 0 ; }')

In [7]:
class LexicalAnalyzer:
    def __init__(self, code: list) -> None:
        self.code = code
        self.tokens = {
            'Keyword': [],
            'Identifier': [],
            'Number': [],
            'Operator': [],
            'Delimiter': [],
            'Label': [],
            'Headers': headers,
            'Characters': characters,
        }
        self.is_label = False

    def lex(self) -> dict:
        for word in self.code:
            # goto
            if self.is_label:
                self.display('Label', word)
                self.tokens['Label'].append(word)
                self.is_label = False
            else:
                if word in keywords:
                    if word == 'goto':
                        self.is_label = True
                    self.display('Keyword', word)
                    self.tokens['Keyword'].append(word)
                elif word in operators:
                    self.display('Operator', word)
                    self.tokens['Operator'].append(word)
                elif word in delimiters:
                    self.display('Delimiter', word)
                    self.tokens['Delimiter'].append(word)
                    self.tokens['Number'].append(word)
                else:
                    # Number or Float
                    if word.replace('.', '').isdigit():
                        self.display('Number', word)
                        self.tokens['Number'].append(word)
                    # Labels
                    elif word[-1] == ':':
                        self.display('Label', word[:-1])
                        self.tokens['Label'].append(word[:-1])
                    # Identifiers
                    elif word[0].isalpha() or word[0] == '_':
                        self.display('Identifier', word)
                        self.tokens['Identifier'].append(word)
                    else:
                        raise LexicalError(f'Invalid character: {word}')
        return self.tokens

    @staticmethod
    def display(key: str, value: str) -> None:
        print(f'Dual({key}, {value})')

    def show_dic(self) -> None:
        for key, value in self.tokens.items():
            self.display(key, value)

In [8]:
code_lst = code.split(' ')
lexical_analyzer = LexicalAnalyzer(code_lst)
dic = lexical_analyzer.lex()

Dual(Keyword, static)
Dual(Keyword, int)
Dual(Identifier, a)
Dual(Delimiter, ;)
Dual(Keyword, const)
Dual(Keyword, int)
Dual(Identifier, _b)
Dual(Operator, =)
Dual(Number, 100)
Dual(Delimiter, ;)
Dual(Keyword, char)
Dual(Identifier, c)
Dual(Operator, =)
Dual(Delimiter, ;)
Dual(Keyword, double)
Dual(Identifier, d)
Dual(Operator, =)
Dual(Number, 4.2)
Dual(Delimiter, ;)
Dual(Keyword, char)
Dual(Operator, *)
Dual(Identifier, message)
Dual(Operator, =)
Dual(Delimiter, ;)
Dual(Keyword, int)
Dual(Identifier, main)
Dual(Delimiter, ()
Dual(Delimiter, ))
Dual(Delimiter, {)
Dual(Label, L0)
Dual(Identifier, a)
Dual(Operator, =)
Dual(Number, 2)
Dual(Operator, *)
Dual(Delimiter, ()
Dual(Number, 1)
Dual(Operator, +)
Dual(Number, 3)
Dual(Delimiter, ))
Dual(Delimiter, ;)
Dual(Keyword, if)
Dual(Delimiter, ()
Dual(Identifier, _b)
Dual(Operator, >)
Dual(Number, 10)
Dual(Delimiter, ))
Dual(Delimiter, {)
Dual(Identifier, a)
Dual(Operator, =)
Dual(Number, 1)
Dual(Delimiter, ;)
Dual(Delimiter, })
Dual(Keyword

In [9]:
lexical_analyzer.show_dic()

Dual(Keyword, ['static', 'int', 'const', 'int', 'char', 'double', 'char', 'int', 'if', 'else', 'if', 'else', 'goto', 'return'])
Dual(Identifier, ['a', '_b', 'c', 'd', 'message', 'main', 'a', '_b', 'a', '_b', 'a', 'printf', '_b', 'a', 'printf', 'message', 'puts'])
Dual(Number, [';', '100', ';', ';', '4.2', ';', ';', '(', ')', '{', '2', '(', '1', '3', ')', ';', '(', '10', ')', '{', '1', ';', '}', '(', '5', ')', '{', '2', ';', '}', '{', ';', '}', '(', ',', ')', ';', '(', ',', ')', ';', '(', ')', ';', '0', ';', '}'])
Dual(Operator, ['=', '=', '=', '*', '=', '=', '*', '+', '>', '=', '>', '=', '=', '+'])
Dual(Delimiter, [';', ';', ';', ';', ';', '(', ')', '{', '(', ')', ';', '(', ')', '{', ';', '}', '(', ')', '{', ';', '}', '{', ';', '}', '(', ',', ')', ';', '(', ',', ')', ';', '(', ')', ';', ';', '}'])
Dual(Label, ['L0', 'L0'])
Dual(Headers, ['stdio.h', 'stdlib.h', 'string.h'])
Dual(Characters, ['c', 'Hello, world!', '%d\\n', '%s', ''])


# Lab 1 (Std)

**For `BASIC` program:**

```c
/*example*/
    b=1\
00
101:a=2*(1+3)
    IF(b>10) THEN
        a=1
    ELSE IF(b>=5) THEN
        a=2
    ELSE
        GOTO 101
```

In [10]:
D = ['(', ')', ',']
K = ['IF', 'ELSE', 'THEN', 'GOTO']
O = [
    '<=', '>=', '==', '!=', '++', '--', '&&', '||', '+=', '-=',
    '+', '-', '*', '/', '%', '&', '|', '^', '!', '=', '~', '?', '<', '>', ':',
]

In [11]:
from utils.pre_processing import _pre_processing
_code = _pre_processing()
_code

'b = 100 101 :  a = 2 *  ( 1 + 3 ) IF ( b > 10 ) THEN a = 1 ELSE IF ( b >= 5 ) THEN a = 2 ELSE GOTO 101'

In [12]:
def display(name: str, value: str):
    print(f'Dual({name}, {value})')

In [13]:
def lex(expr: str) -> None:
    expr = expr.split()
    is_label = False
    for word in expr:
        if is_label:
            display('L', word)
            is_label = False
        else:
            if word.isdigit():
                display('N', word)
            elif word in K:
                display('K', word)
                if word == 'GOTO':
                    is_label = True
            elif word in O:
                display('O', word)
            elif word in D:
                display('D', word)
            elif word[-1] == ':':
                display('L', word[:-1])
            elif word.isalpha():
                display('I', word)
            else:
                raise LexicalError('Invalid token')

In [14]:
lex(_code)

Dual(I, b)
Dual(O, =)
Dual(N, 100)
Dual(N, 101)
Dual(O, :)
Dual(I, a)
Dual(O, =)
Dual(N, 2)
Dual(O, *)
Dual(D, ()
Dual(N, 1)
Dual(O, +)
Dual(N, 3)
Dual(D, ))
Dual(K, IF)
Dual(D, ()
Dual(I, b)
Dual(O, >)
Dual(N, 10)
Dual(D, ))
Dual(K, THEN)
Dual(I, a)
Dual(O, =)
Dual(N, 1)
Dual(K, ELSE)
Dual(K, IF)
Dual(D, ()
Dual(I, b)
Dual(O, >=)
Dual(N, 5)
Dual(D, ))
Dual(K, THEN)
Dual(I, a)
Dual(O, =)
Dual(N, 2)
Dual(K, ELSE)
Dual(K, GOTO)
Dual(L, 101)
