<a href="https://colab.research.google.com/github/rogerioag/rea-comp04-compiladores/blob/main/jupyter-notebooks/01-comp-analise-lexica-cmmlex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install ply
!pip install anytree
!pip install graphviz
!pip install llvmlite
!jupyter nbextension install https://rawgit.com/jfbercher/small_nbextensions/master/highlighter.zip  --user
!jupyter nbextension enable highlighter/highlighter

Collecting ply
[?25l  Downloading https://files.pythonhosted.org/packages/a3/58/35da89ee790598a0700ea49b2a66594140f44dec458c07e8e3d4979137fc/ply-3.11-py2.py3-none-any.whl (49kB)
[K     |██████▋                         | 10kB 14.3MB/s eta 0:00:01[K     |█████████████▏                  | 20kB 10.3MB/s eta 0:00:01[K     |███████████████████▉            | 30kB 6.1MB/s eta 0:00:01[K     |██████████████████████████▍     | 40kB 5.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.0MB/s 
[?25hInstalling collected packages: ply
Successfully installed ply-3.11
Collecting anytree
[?25l  Downloading https://files.pythonhosted.org/packages/a8/65/be23d8c3ecd68d40541d49812cd94ed0f3ee37eb88669ca15df0e43daed1/anytree-2.8.0-py2.py3-none-any.whl (41kB)
[K     |████████████████████████████████| 51kB 2.1MB/s 
Installing collected packages: anytree
Successfully installed anytree-2.8.0
Downloading: https://rawgit.com/jfbercher/small_nbextensions/master/highlighter.zip -> /tmp/tmp

In [None]:
%%javascript
require("base/js/utils").load_extensions("highlighter/highlighter")

<IPython.core.display.Javascript object>

In [5]:
from sys import argv, exit

import logging
logging.basicConfig(
     level = logging.DEBUG,
     filename = "log.txt",
     filemode = "w",
     format = "%(filename)10s:%(lineno)4d:%(message)s"
)
log = logging.getLogger()

import ply.lex as lex
from ply.lex import TOKEN

In [6]:
__all__ = ['tokens', 'TOKENS_SYMBOLS']

reserved = {
    'else': 'ELSE',
    'if': 'IF',
    'int': 'INT',
    'return': 'RETURN',
    'void': 'VOID',
    'while': 'WHILE',
}

math_symbols = [
    'PLUS',  # +
    'MINUS',  # -
    'TIMES',  # *
    'DIVIDE'  # /
]

comparison_symbols = [
    'LESS_EQUAL',  # <=
    'LESS',  # <
    'GREATER_EQUAL',  # >=
    'GREATER',  # >
    'EQUALS',  # ==
    'DIFFERENT',  # !=
]

control_symbols = [
    'LPAREN',  # (
    'RPAREN',  # )
    'LBRACKETS',  # [
    'RBRACKETS',  # ]
    'LBRACES',  # {
    'RBRACES',  # }
    'ATTRIBUTION',  # =
    'SEMICOLON',  # ;
    'COMMA',  # ,
]

markers = [
    'ID',
    'NUMBER',
]

TOKENS_SYMBOLS = {
    'PLUS': '+',
    'MINUS': '-',
    'TIMES': '*',
    'DIVIDE': '/',
    'LESS_EQUAL': '<=',
    'LESS': '<',
    'GREATER_EQUAL': '>=',
    'GREATER': '>',
    'EQUALS': '==',
    'DIFFERENT': '!=',
    'LPAREN': '(',
    'RPAREN': ')',
    'LBRACKETS': '[',
    'RBRACKETS': ']',
    'LBRACES': '{',
    'RBRACES': '}',
    'ATTRIBUTION': '=',
    'SEMICOLON': ';',
    'COMMA': ',',
    'ELSE': 'else',
    'IF': 'if',
    'INT': 'int',
    'RETURN': 'return',
    'VOID': 'void',
    'WHILE': 'while',
}

tokens = markers + math_symbols + comparison_symbols + \
    control_symbols + list(reserved.values())


In [7]:
id_regex = r'[a-zA-Z][a-zA-Z]*'
comment_regex = r'\/\*[^\r]*\*\/'

#! MATH
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'

#! COMPARISON
t_LESS_EQUAL = r'<='
t_LESS = r'<'
t_GREATER_EQUAL = r'>='
t_GREATER = r'>'
t_EQUALS = r'=='
t_DIFFERENT = r'!='

#! CONTROL
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_LBRACKETS = r'\['
t_RBRACKETS = r'\]'
t_LBRACES = r'{'
t_RBRACES = r'}'
t_ATTRIBUTION = r'='
t_SEMICOLON = r';'
t_COMMA = r','

t_NUMBER = r'[0-9][0-9]*'


In [8]:
t_ignore = ' \t'

@TOKEN(id_regex)
def t_ID(t):
    t.type = reserved.get(t.value, 'ID')
    return t

@TOKEN(comment_regex)
def t_ignore_COMMENT(r):
    pass

def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)

def t_error(t):
    print("Símbolo não definido pela linguagem '%s'" % t.value[0])
    t.lexer.skip(1)

In [13]:
def get_tokens(input):
    lexer = lex.lex()
    lexer.input(input)

    tokens = []
    token = lexer.token()
    while token:
        tokens.append(token)
        token = lexer.token()
        pass

    return tokens
    pass


# lexer = lex.lex()

In [26]:
def main():
    argv[1] = 'prog-001.cm'
    aux = argv[1].split('.')
    if aux[-1] != 'cm':
      raise IOError("Not a .cm file!")
    data = open(argv[1])

    source_file = data.read()
    lexer.input(source_file)

    # Tokenize
    while True:
      tok = lexer.token()
      if not tok: 
        break      # No more input
      # print(tok)
      print(tok.type)
      #print(tok.value)

In [29]:
# Build the lexer.
__file__ = "01-comp-analise-lexica-cmmlex.ipynb"
lexer = lex.lex(optimize=True,debug=True,debuglog=log)

if __name__ == "__main__":
    main()

INT
ID
LPAREN
RPAREN
LBRACES
RETURN
NUMBER
SEMICOLON
RBRACES


In [28]:
%%writefile prog-001.cm

int main(){
    return 0;
}

Overwriting prog-001.cm


In [30]:
%%writefile prog-002.cm

int gcd (int u, int v){
  if (v == 0) return u;
  else return gcd(v,u-u/v*v);
  /8 u-u/v*v == u mod v */
}

void main(void){
  int x; int y;
  x = input();
  y = input();
  output(gcd(x,y));
}

Writing prog-002.cm


In [10]:
! mkdir lexer
! wget -O lexer/__init__.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/lexer/__init__.py
! wget -O lexer/methods.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/lexer/methods.py
! wget -O lexer/regexs.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/lexer/regexs.py
! wget -O lexer/tokens.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/lexer/tokens.py
! wget -O main.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/main.py
! mkdir utils
! wget -O utils/__init__.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/utils/__init__.py
! wget -O utils/args.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/utils/args.py
! wget -O utils/graph.py https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/utils/graph.py


mkdir: cannot create directory ‘lexer’: File exists
--2021-06-30 20:55:18--  https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/lexer/__init__.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 662 [text/plain]
Saving to: ‘lexer/__init__.py’


2021-06-30 20:55:18 (24.6 MB/s) - ‘lexer/__init__.py’ saved [662/662]

--2021-06-30 20:55:18--  https://raw.githubusercontent.com/rogerioag/rea-comp04-compiladores/main/cmmcompiler/lexer/methods.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 431 [text/pl

In [23]:
! git clone https://github.com/rogerioag/rea-comp04-compiladores.git
! cp -R rea-comp04-compiladores/cmmcompiler/* .
! cp -R rea-comp04-compiladores/cmmcompiler/tests/* .


fatal: destination path 'rea-comp04-compiladores' already exists and is not an empty directory.


In [24]:
! python main.py -l prog-001.cm

INT int
ID main
LPAREN (
VOID void
RPAREN )
LBRACES {
RETURN return
LPAREN (
NUMBER 0
RPAREN )
SEMICOLON ;
RBRACES }
