LEXICAL ANALYZER

In [7]:
import re

def lexical_analyzer(code):
    token_spec = [
        ('NUMBER',   r'\d+'),
        ('ID',       r'[A-Za-z_]\w*'),
        ('ASSIGN',   r'='),
        ('END',      r';'),
        ('OP',       r'[\+\-\*/]'),
        ('COMPARE',  r'[<>!=]=?|=='),
        ('LPAREN',   r'\('),
        ('RPAREN',   r'\)'),
        ('LBRACE',   r'\{'),
        ('RBRACE',   r'\}'),
        ('IF',       r'if'),
        ('ELSE',     r'else'),
        ('WHILE',    r'while'),
        ('SKIP',     r'[ \t\n]+'),
        ('MISMATCH', r'.')
    ]
    tok_regex = '|'.join(f'(?P<{name}>{pattern})' for name, pattern in token_spec)

    tokens = []
    for mo in re.finditer(tok_regex, code):
        kind = mo.lastgroup
        value = mo.group()
        if kind == 'SKIP':
            continue
        elif kind == 'MISMATCH':
            raise RuntimeError(f'Unexpected token: {value}')
        else:
            tokens.append((kind, value))
    return tokens



sample_code = """
a = 10;
b = 20;
if (a < b) {
    c = a + b;
}
"""

tokens = lexical_analyzer(sample_code)
print("TOKENS:")
for t in tokens:
    print(t)


TOKENS:
('ID', 'a')
('ASSIGN', '=')
('NUMBER', '10')
('END', ';')
('ID', 'b')
('ASSIGN', '=')
('NUMBER', '20')
('END', ';')
('ID', 'if')
('LPAREN', '(')
('ID', 'a')
('COMPARE', '<')
('ID', 'b')
('RPAREN', ')')
('LBRACE', '{')
('ID', 'c')
('ASSIGN', '=')
('ID', 'a')
('OP', '+')
('ID', 'b')
('END', ';')
('RBRACE', '}')


SYNTAX ANALYZER

In [8]:
def syntax_analyzer(tokens):
    stack = []
    valid = True
    for t, v in tokens:
        if t == 'LBRACE':
            stack.append('{')
        elif t == 'RBRACE':
            if not stack:
                print("Syntax Error: unmatched '}' found")
                valid = False
            else:
                stack.pop()

    if stack:
        print("Syntax Error: missing '}'")
        valid = False

    semicolons = [v for t,v in tokens if t=='END']
    if not semicolons:
        print("Syntax Error: missing ';'")
        valid = False

    if valid:
        print("Syntax Analysis Passed.")
    return valid


syntax_analyzer(tokens)


Syntax Analysis Passed.


True

Symbol Table

In [9]:
def build_symbol_table(tokens):
    table = {}
    for t,v in tokens:
        if t == 'ID' and v not in ['if','else','while']:
            table[v] = {'type': 'int', 'value': None}
    print("\nSYMBOL TABLE:")
    for k,v in table.items():
        print(f"{k}: {v}")
    return table

symbol_table = build_symbol_table(tokens)



SYMBOL TABLE:
a: {'type': 'int', 'value': None}
b: {'type': 'int', 'value': None}
c: {'type': 'int', 'value': None}


Intermediate Code Generation

In [10]:
def generate_intermediate_code(tokens):
    ic = []
    i = 0
    while i < len(tokens):
        if tokens[i][0] == 'ID' and i+1 < len(tokens) and tokens[i+1][0] == 'ASSIGN':
            var = tokens[i][1]
            expr = ""
            i += 2
            while i < len(tokens) and tokens[i][0] != 'END':
                expr += tokens[i][1]
                i += 1
            ic.append(f"{var} = {expr}")

        elif tokens[i][0] == 'IF':
            cond = ""
            i += 2  # skip 'if' and '('
            while tokens[i][0] != 'RPAREN':
                cond += tokens[i][1]
                i += 1
            ic.append(f"IF {cond} GOTO L1")
            ic.append("GOTO L2")
            ic.append("LABEL L1:")

        elif tokens[i][0] == 'RBRACE':
            ic.append("LABEL L2:")
        i += 1

    print("\nINTERMEDIATE CODE:")
    for c in ic:
        print(c)
    return ic

intermediate_code = generate_intermediate_code(tokens)



INTERMEDIATE CODE:
a = 10
b = 20
c = a+b
LABEL L2:


Target Machine Code

In [11]:
def generate_machine_code(ic):
    asm = []
    for line in ic:
        if "=" in line:
            var, expr = line.split("=")
            var = var.strip()
            expr = expr.strip()
            asm.append(f"LOAD {expr}")
            asm.append(f"STORE {var}")
        elif line.startswith("IF") or line.startswith("GOTO") or line.startswith("LABEL"):
            asm.append(line)

    print("\nMACHINE CODE:")
    for a in asm:
        print(a)
    return asm

machine_code = generate_machine_code(intermediate_code)



MACHINE CODE:
LOAD 10
STORE a
LOAD 20
STORE b
LOAD a+b
STORE c
LABEL L2:


Combine All (Driver Function)

In [12]:
def mini_compiler(source_code):
    print("SOURCE CODE:\n", source_code)

    tokens = lexical_analyzer(source_code)
    if not syntax_analyzer(tokens):
        return
    build_symbol_table(tokens)
    ic = generate_intermediate_code(tokens)
    generate_machine_code(ic)
    print("\nCOMPILATION SUCCESSFUL!")


mini_compiler(sample_code)


SOURCE CODE:
 
a = 10;
b = 20;
if (a < b) {
    c = a + b;
}

Syntax Analysis Passed.

SYMBOL TABLE:
a: {'type': 'int', 'value': None}
b: {'type': 'int', 'value': None}
c: {'type': 'int', 'value': None}

INTERMEDIATE CODE:
a = 10
b = 20
c = a+b
LABEL L2:

MACHINE CODE:
LOAD 10
STORE a
LOAD 20
STORE b
LOAD a+b
STORE c
LABEL L2:

COMPILATION SUCCESSFUL!
