Skip to content

Commit

Permalink
Add whitespace around punctuation before tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
rbaltrusch committed Nov 7, 2021
1 parent 7875881 commit 96b8248
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion interpreter/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
@author: Korean_Crimson
"""

import re

class Lexer:
def __init__(self, token_factory):
self.token_factory = token_factory
Expand All @@ -17,6 +19,10 @@ def lex(self, string):

@staticmethod
def _split(string):
string = string.replace('\n', ' \n ').strip()
string = string.replace('\n', ' \n ')

#matches non-word characters (i.e. punctuation) and adds whitespace around them
string = re.sub('(?P<match>\W)', ' \g<match> ', string).strip()

string_tokens = [s for s in string.split(' ') if s != '']
return string_tokens

0 comments on commit 96b8248

Please sign in to comment.