## Lexical Analyzer

The first stage of a compiler is lexical analysis: scanning the input and turning it into tokens.

There are two ways to tokenize text/code:

- Scan the text character by character, checking at each step whether the characters collected so far form a specific token.
- Split sections when a special character appears (like space, comma, etc.).

Here we follow the second method using regular expressions...

In [1]:
import re
from dataclasses import dataclass, field
from typing import Optional, List, Tuple

from nltk.tokenize import word_tokenize, wordpunct_tokenize, line_tokenize

In [2]:
# Sample program written in the toy language; used as the lexer input below.
code = """
# This is a useless comment
print `What is your name? `.
my $name <- readLn.
printLn `I think you are $name!`.

for my $i in 1..10 # Repeat from 1 to 10
  printLn `$i`.
done
"""

In [3]:
# Small C++ snippet; used at the end to run the lexer on input from another language.
code2 = """
int age = 20;
std::cout << age << std::endl;
"""

Using built-in methods and natural language processing libraries:

In [4]:
# str.split() with no arguments splits on runs of whitespace only, so
# punctuation stays glued to the neighbouring word (e.g. 'readLn.').
print(code.split())

['#', 'This', 'is', 'a', 'useless', 'comment', 'print', '`What', 'is', 'your', 'name?', '`.', 'my', '$name', '<-', 'readLn.', 'printLn', '`I', 'think', 'you', 'are', '$name!`.', 'for', 'my', '$i', 'in', '1..10', '#', 'Repeat', 'from', '1', 'to', '10', 'printLn', '`$i`.', 'done']


In [5]:
# As the output shows, word_tokenize breaks '$name' into '$' and 'name'
# and '<-' into '<' and '-'.
print(word_tokenize(code))

['#', 'This', 'is', 'a', 'useless', 'comment', 'print', '`', 'What', 'is', 'your', 'name', '?', '`', '.', 'my', '$', 'name', '<', '-', 'readLn', '.', 'printLn', '`', 'I', 'think', 'you', 'are', '$', 'name', '!', '`', '.', 'for', 'my', '$', 'i', 'in', '1', '..', '10', '#', 'Repeat', 'from', '1', 'to', '10', 'printLn', '`', '$', 'i', '`', '.', 'done']


In [6]:
# As the output shows, wordpunct_tokenize keeps adjacent punctuation together
# ('<-', '..') but also merges unrelated symbols (e.g. '`.' and '`$').
print(wordpunct_tokenize(code))

['#', 'This', 'is', 'a', 'useless', 'comment', 'print', '`', 'What', 'is', 'your', 'name', '?', '`.', 'my', '$', 'name', '<-', 'readLn', '.', 'printLn', '`', 'I', 'think', 'you', 'are', '$', 'name', '!`.', 'for', 'my', '$', 'i', 'in', '1', '..', '10', '#', 'Repeat', 'from', '1', 'to', '10', 'printLn', '`$', 'i', '`.', 'done']


In [7]:
# As the output shows, line_tokenize yields one string per non-empty line.
print(line_tokenize(code))

['# This is a useless comment', 'print `What is your name? `.', 'my $name <- readLn.', 'printLn `I think you are $name!`.', 'for my $i in 1..10 # Repeat from 1 to 10', '  printLn `$i`.', 'done']


None of these ready-made tokenizers fits our language, so let's build our own lexer, tailored to our needs:

In [8]:
@dataclass(frozen=True)
class Token:
    """Definition of a token kind: a name paired with the regex that recognizes it."""
    name: str  # Identifier of the token kind, e.g. 'COMMENT'
    pattern: str  # Regular expression source; the Lexer applies it with re.fullmatch

In [9]:
@dataclass(frozen=True)
class TokenState:
    """A token occurrence produced by the lexer: the token kind plus the matched text."""
    name: str  # Name of the Token whose pattern matched
    value: str  # Exact text that was matched

In [10]:
@dataclass
class Tokens:
    """Registry of token definitions, keyed by token name.

    The registry is always created empty (``dict_`` is not an ``__init__``
    argument) and is filled through :meth:`add` / :meth:`add_from_sequence`.
    """

    # Maps token name -> Token; every instance gets its own fresh dict.
    dict_: dict = field(init=False, default_factory=dict)

    def add(self, name: str, pattern: str) -> None:
        """Register one token, overwriting any token stored under ``name``."""
        token = Token(name, pattern)
        self.dict_[name] = token

    def add_from_sequence(self, seq: List[Tuple[str, str]]) -> None:
        """Register every ``(name, pattern)`` pair of ``seq``, in order."""
        for name, pattern in seq:
            self.add(name, pattern)

In [17]:
class Lexer:
    """Regex-driven lexer: split the source on a separator, then classify each piece.

    Attributes:
        tokens: Registry whose ``dict_`` maps token names to ``Token`` objects.
        separator: Regex passed to ``re.split``. Text matched by a capturing
            group is kept as a piece to classify; text matched outside any
            group is discarded.
    """

    def __init__(self, tokens: Tokens, separator: Optional[str] = None) -> None:
        """Store the token registry and the separator regex.

        Args:
            tokens: Token registry to classify pieces against.
            separator: Regex used to split the input. Defaults to ``(\\W)``,
                which keeps every non-word character (whitespace included) as
                its own piece, so each of them must match some token.
        """
        self.tokens = tokens
        self.separator = separator if separator is not None else r'(\W)'

    def tokenize(self, code: str) -> List[TokenState]:
        """Split ``code`` into pieces and map each piece to a token.

        Every non-empty piece produced by the split is tested against the
        registered token patterns in registration order; the first pattern
        that matches the *whole* piece wins.

        Args:
            code: Source text to tokenize.

        Returns:
            The recognized tokens, in source order.

        Raises:
            ValueError: If a piece matches none of the registered patterns.
        """
        # Compile each pattern once per call instead of once per piece.
        # Done here rather than in __init__ so that tokens registered after
        # the Lexer was created are still honoured.
        matchers = [
            (token.name, re.compile(token.pattern))
            for token in self.tokens.dict_.values()
        ]
        result = []

        # re.split yields '' between adjacent separators and None for groups
        # that did not take part in a match; filter(None, ...) drops both.
        for part in filter(None, re.split(self.separator, code)):
            for name, regex in matchers:
                match = regex.fullmatch(part)
                if match is not None:
                    result.append(TokenState(name, match.group()))
                    break
            else:
                # No pattern matched this piece.
                raise ValueError(f'{part!r} is not a valid token!')

        return result

The tokens of our (unnamed) programming language:

In [12]:
# Token definitions for the toy language.  The lexer tries them in this order
# and takes the first pattern that matches a whole piece, so the keywords must
# be registered before the more general IDENTIFIER pattern.
tokens = Tokens()
tokens.add_from_sequence([
    ('COMMENT', r'#.*'),
    ('DOT', r'\.'),
    ('BETWEEN', r'\.{2}'),
    ('ARROW', r'\<\-'),
    ('MY', r'my'),
    ('FOR', r'for'),
    ('IN', r'in'),
    ('DONE', r'done'),
    ('IF', r'if'),
    ('DO', r'do'),
    ('ELSEIF', r'elseif'),
    ('END', r'end'),
    ('NUMBER', r'[\+\-]?\d+(\.\d+)?'),
    # NOTE(review): `.*` is greedy, so two strings on the same line would
    # presumably be read as one STRING -- confirm whether that is intended.
    ('STRING', r'`.*`'),
    ('IDENTIFIER', r'\$?[A-Za-z]+[_A-Za-z0-9]*'),
])
tokens.dict_

{'COMMENT': Token(name='COMMENT', pattern='#.*'),
 'DOT': Token(name='DOT', pattern='\\.'),
 'BETWEEN': Token(name='BETWEEN', pattern='\\.{2}'),
 'ARROW': Token(name='ARROW', pattern='\\<\\-'),
 'MY': Token(name='MY', pattern='my'),
 'FOR': Token(name='FOR', pattern='for'),
 'IN': Token(name='IN', pattern='in'),
 'DONE': Token(name='DONE', pattern='done'),
 'IF': Token(name='IF', pattern='if'),
 'DO': Token(name='DO', pattern='do'),
 'ELSEIF': Token(name='ELSEIF', pattern='elseif'),
 'END': Token(name='END', pattern='end'),
 'NUMBER': Token(name='NUMBER', pattern='[\\+\\-]?\\d+(\\.\\d+)?'),
 'STRING': Token(name='STRING', pattern='`.*`'),
 'IDENTIFIER': Token(name='IDENTIFIER', pattern='\\$?[A-Za-z]+[_A-Za-z0-9]*')}

Now let's build a more elaborate separator pattern out of the token patterns:

In [13]:
def _token_or_pattern(separator: str) -> str:
    """Turn one separator entry into a regex fragment.

    Args:
        separator: Either the name of a token registered in the module-level
            ``tokens`` registry, or a raw regex fragment.

    Returns:
        The token's pattern wrapped in a capturing group when ``separator``
        names a registered token; otherwise ``separator`` itself, unchanged.
    """
    token = tokens.dict_.get(separator)
    if token is None:
        # Not a token name: treat the entry as a ready-made regex fragment.
        return separator
    # The capturing group makes re.split keep the matched text.
    return f'({token.pattern})'

In [18]:
# Entries that name a token are expanded to that token's captured pattern;
# the rest are used verbatim.  Alternatives are tried left to right, so the
# order does matter.
separators = [
    'STRING',
    'COMMENT',
    'IDENTIFIER',
    'BETWEEN',
    'ARROW',
    r'\s',
    r'(\W)',
]
separator = '|'.join(_token_or_pattern(entry) for entry in separators)
separator

'(`.*`)|(#.*)|(\\$?[A-Za-z]+[_A-Za-z0-9]*)|(\\.{2})|(\\<\\-)|\\s|(\\W)'

In [19]:
# Build a lexer that splits with the custom separator, then tokenize the sample program.
lexer = Lexer(tokens, separator)
lexer.tokenize(code)

[TokenState(name='COMMENT', value='# This is a useless comment'),
 TokenState(name='IDENTIFIER', value='print'),
 TokenState(name='STRING', value='`What is your name? `'),
 TokenState(name='DOT', value='.'),
 TokenState(name='MY', value='my'),
 TokenState(name='IDENTIFIER', value='$name'),
 TokenState(name='ARROW', value='<-'),
 TokenState(name='IDENTIFIER', value='readLn'),
 TokenState(name='DOT', value='.'),
 TokenState(name='IDENTIFIER', value='printLn'),
 TokenState(name='STRING', value='`I think you are $name!`'),
 TokenState(name='DOT', value='.'),
 TokenState(name='FOR', value='for'),
 TokenState(name='MY', value='my'),
 TokenState(name='IDENTIFIER', value='$i'),
 TokenState(name='IN', value='in'),
 TokenState(name='NUMBER', value='1'),
 TokenState(name='BETWEEN', value='..'),
 TokenState(name='NUMBER', value='10'),
 TokenState(name='COMMENT', value='# Repeat from 1 to 10'),
 TokenState(name='IDENTIFIER', value='printLn'),
 TokenState(name='STRING', value='`$i`'),
 TokenState(name='DOT', value='.'),
 TokenState(name='DONE', value='done')]

Testing on C++ code:

In [20]:
# code2 contains '=', which matches none of the registered token patterns,
# so tokenize raises ValueError.
lexer.tokenize(code2)

ValueError: '=' is not a valid token!