## Lexical Analyzer

The first step in designing a compiler is to find and create tokens from input.

There are two ways to tokenize text/code:

- Scan the text character by character, checking at each step whether the characters collected so far form a specific token.
- Split sections when a special character appears (like space, comma, etc.).

Here we follow the second method using regular expressions...

In [1]:
import re
from functools import partial
from dataclasses import dataclass, field
from typing import Optional, Callable, Type, List, Tuple

from nltk.tokenize import word_tokenize

In [4]:
# Sample program written in the notebook's toy language; it is the input
# handed to each tokenizer tried below.
code = """
# This is a useless comment
print `What is your name? `.
my $name <- readLn.
printLn `I think you are $name!`.

for my $i in 1..10 # Repeat from 1 to 10
  printLn `$i`.
done
"""

In [5]:
# A short C++ snippet, used at the end to run the lexer on a language whose
# keywords are not in the token table.
code2 = """
int age = 20;
std::cout << age << std::endl;
"""

Using built-in methods and natural language processing libraries:

In [6]:
# Built-in approach: str.split() with no arguments splits on runs of whitespace only.
print(code.split())

['#', 'This', 'is', 'a', 'useless', 'comment', 'print', '`What', 'is', 'your', 'name?', '`.', 'my', '$name', '<-', 'readLn.', 'printLn', '`I', 'think', 'you', 'are', '$name!`.', 'for', 'my', '$i', 'in', '1..10', '#', 'Repeat', 'from', '1', 'to', '10', 'printLn', '`$i`.', 'done']


In [7]:
# NLP-library approach: NLTK's word tokenizer applied to the same sample program.
print(word_tokenize(code))

['#', 'This', 'is', 'a', 'useless', 'comment', 'print', '`', 'What', 'is', 'your', 'name', '?', '`', '.', 'my', '$', 'name', '<', '-', 'readLn', '.', 'printLn', '`', 'I', 'think', 'you', 'are', '$', 'name', '!', '`', '.', 'for', 'my', '$', 'i', 'in', '1', '..', '10', '#', 'Repeat', 'from', '1', 'to', '10', 'printLn', '`', '$', 'i', '`', '.', 'done']


Alternatively, we can create our own tokenizer, shaped by our needs and a bit of creativity!

In [8]:
@dataclass(frozen=True)
class Token:
    """Immutable definition of one kind of token.

    Attributes:
        name: Identifier of the token kind (for example ``'DOT'``).
        pattern: Regular-expression source describing the token's text.
        calls: Optional callable attached to the token; ``None`` if absent.
    """

    name: str
    pattern: str
    calls: Optional[Callable] = field(default=None)

In [9]:
@dataclass
class TokenState:
    """A concrete token occurrence: a token kind paired with its text.

    Attributes:
        name: Identifier of the token kind this occurrence belongs to.
        value: The text of this occurrence.
        calls: Optional callable carried along with the token; ``None`` if absent.
    """

    name: str
    value: str
    calls: Optional[Callable] = field(default=None)

In [10]:
@dataclass
class Tokens:
    """Registry of token definitions, keyed by token name.

    ``dict_`` is created empty for every instance and is not a constructor
    argument. Being a plain ``dict`` it keeps insertion order, and adding a
    name that already exists replaces the earlier definition.
    """

    dict_: dict = field(init=False, default_factory=dict)

    def add(self, name: str, pattern: str, calls: Optional[Callable] = None) -> None:
        """Register a single token definition under ``name``."""
        self.dict_[name] = Token(name, pattern, calls)

    def add_from_sequence(self, seq: List[Tuple[str, str, Optional[Callable]]]) -> None:
        """Register every entry of ``seq`` in order.

        Each entry is ``(name, pattern)`` or ``(name, pattern, calls)``; when
        the third element is missing the token gets ``calls=None``. An entry
        with fewer than two elements raises ``ValueError`` on unpacking.
        """
        for entry in seq:
            name, pattern, *extra = entry
            self.add(name, pattern, extra[0] if extra else None)

In [27]:
@dataclass
class Lexer:
    """Regex-driven lexer: split the source on a separator, then classify each piece.

    Attributes:
        tokens: Registry whose ``dict_`` maps token names to definitions
            exposing ``name``, ``pattern`` and ``calls``. Definitions are
            tried in the registry's iteration order; the first match wins.
        separator: Regular expression handed to ``re.split``. Text captured
            by a group in it is kept as a piece to classify; text matched
            outside any group is discarded. The default ``(\\W)`` captures
            every non-word character (whitespace included), so each such
            character must itself match a token. ``None`` selects the default.
    """

    tokens: Tokens
    separator: Optional[str] = r'(\W)'

    def tokenize(self, code: str) -> List[TokenState]:
        """Convert ``code`` into a list of ``TokenState`` objects.

        Args:
            code: Source text to tokenize.

        Returns:
            One ``TokenState`` per non-empty piece produced by the split, in
            source order. An empty ``code`` yields an empty list.

        Raises:
            ValueError: If a piece is not fully matched by any token pattern.
        """
        # The annotation allows None; interpolating it would silently split
        # on the literal text "None", so fall back to the default instead.
        separator = r'(\W)' if self.separator is None else self.separator

        # re.split emits None for groups that did not take part in a match
        # and '' between adjacent separators; neither is a piece to classify.
        pieces = [piece for piece in re.split(separator, code) if piece]

        result = []
        for piece in pieces:
            for token in self.tokens.dict_.values():
                # fullmatch anchors the *whole* pattern. Appending '$' to the
                # pattern text would only anchor the last alternative of
                # something like 'a|b' and would also tolerate a trailing
                # newline.
                match = re.fullmatch(token.pattern, piece)
                if match is not None:
                    result.append(TokenState(token.name, match.group(), token.calls))
                    break
            else:
                raise ValueError(f'{repr(piece)} is not a valid token!')

        return result

The tokens of our unnamed programming language:

In [28]:
# Token table for the toy language. Lexer.tokenize tries the entries in
# insertion order and anchors each pattern to a whole piece of source text,
# so e.g. 'print' cannot claim 'printLn'.
tokens = Tokens()
tokens.add_from_sequence([
    ('COMMENT', r'#.*'),  # '#' up to the end of the line ('.' stops at a newline)
    ('DOT', r'\.'),
    ('BETWEEN', r'\.{2}'),  # the '..' in `1..10`
    ('ARROW', r'\<\-'),  # the '<-' in `my $name <- readLn`
    ('NUMBER', r'[\+\-]?\d+(\.\d+)?'),  # optional sign, digits, optional fraction
    ('STRING', r'`.*`'),  # backtick-delimited; greedy, so it runs to the last backtick on the line
    # The optional third element is a callable stored on the token's `calls`.
    ('PRINT', r'print', partial(print, end='')),
    ('PRINTLINE', r'printLn', print),
    ('READLINE', r'readLn', input),
    ('MY', r'my'),
    ('IN', r'in'),
    ('IDENTIFIER', r'\$[A-Za-z]+[_A-Za-z0-9]*'),  # '$', then letters, then letters/digits/underscores
    ('FOR', r'for'),
    ('DONE', r'done'),
])
tokens.dict_  # last expression of the cell: displays the registry contents

{'COMMENT': Token(name='COMMENT', pattern='#.*', calls=None),
 'DOT': Token(name='DOT', pattern='\\.', calls=None),
 'BETWEEN': Token(name='BETWEEN', pattern='\\.{2}', calls=None),
 'ARROW': Token(name='ARROW', pattern='\\<\\-', calls=None),
 'NUMBER': Token(name='NUMBER', pattern='[\\+\\-]?\\d+(\\.\\d+)?', calls=None),
 'STRING': Token(name='STRING', pattern='`.*`', calls=None),
 'PRINT': Token(name='PRINT', pattern='print', calls=functools.partial(<built-in function print>, end='')),
 'PRINTLINE': Token(name='PRINTLINE', pattern='printLn', calls=<built-in function print>),
 'READLINE': Token(name='READLINE', pattern='readLn', calls=<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f9f4a51e170>>),
 'MY': Token(name='MY', pattern='my', calls=None),
 'IN': Token(name='IN', pattern='in', calls=None),
 'IDENTIFIER': Token(name='IDENTIFIER', pattern='\\$[A-Za-z]+[_A-Za-z0-9]*', calls=None),
 'FOR': Token(name='FOR', pattern='for', calls=None),
 'DONE': Token(

Making the separator pattern a little more complex!

In [30]:
def _token_or_pattern(separator: str) -> str:
    """Resolve one separator entry to a regex fragment.

    Args:
        separator: Either the name of a token registered in the module-level
            ``tokens`` registry, or a raw regular-expression fragment.

    Returns:
        The registered token's pattern wrapped in a capturing group when
        ``separator`` names a token (``re.split`` keeps text matched by
        capturing groups); otherwise ``separator`` itself, unchanged.
    """
    token = tokens.dict_.get(separator)
    if token is None:
        return separator
    return f'({token.pattern})'

In [31]:
# The order does matter: regex alternation tries the alternatives left to
# right, so whole strings, comments and identifiers must be claimed before
# the single-character fallbacks. `\s` is deliberately left uncaptured so
# re.split discards whitespace, while `(\W)` keeps any other non-word
# character as a piece of its own.
# Raw strings: '\s' and '\W' are not valid string escapes, and non-raw
# literals containing them trigger a warning on recent Python versions.
separators = [
    'STRING', 'COMMENT', 'IDENTIFIER', 'BETWEEN',
    'ARROW', r'\s', r'(\W)']
separator = '|'.join(map(_token_or_pattern, separators))
separator

'(`.*`)|(#.*)|(\\$[A-Za-z]+[_A-Za-z0-9]*)|(\\.{2})|(\\<\\-)|\\s|(\\W)'

In [32]:
# Build a lexer that uses the composite separator instead of the default,
# then tokenize the sample program.
lexer = Lexer(tokens, separator)
lexer.tokenize(code)

[TokenState(name='COMMENT', value='# This is a useless comment', calls=None),
 TokenState(name='PRINT', value='print', calls=functools.partial(<built-in function print>, end='')),
 TokenState(name='STRING', value='`What is your name? `', calls=None),
 TokenState(name='DOT', value='.', calls=None),
 TokenState(name='MY', value='my', calls=None),
 TokenState(name='IDENTIFIER', value='$name', calls=None),
 TokenState(name='ARROW', value='<-', calls=None),
 TokenState(name='READLINE', value='readLn', calls=<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f9f4a51e170>>),
 TokenState(name='DOT', value='.', calls=None),
 TokenState(name='PRINTLINE', value='printLn', calls=<built-in function print>),
 TokenState(name='STRING', value='`I think you are $name!`', calls=None),
 TokenState(name='DOT', value='.', calls=None),
 TokenState(name='FOR', value='for', calls=None),
 TokenState(name='MY', value='my', calls=None),
 TokenState(name='IDENTIFIER', value='$i', cal

Testing on C++ code:

In [23]:
# `code2` is C++; no token in the table matches `int`, so this call raises ValueError.
lexer.tokenize(code2)

ValueError: 'int' is not a valid token!