Add whitespace around punctuation before tokenization

rbaltrusch · Nov 7, 2021 · 96b8248 · 96b8248
1 parent 7875881
commit 96b8248
Showing 1 changed file with 7 additions and 1 deletion.
diff --git a/interpreter/lexer.py b/interpreter/lexer.py
@@ -5,6 +5,8 @@
 @author: Korean_Crimson
 """
 
+import re
+
 class Lexer:
     def __init__(self, token_factory):
         self.token_factory = token_factory
@@ -17,6 +19,10 @@ def lex(self, string):
 
     @staticmethod
     def _split(string):
-        string = string.replace('\n', ' \n ').strip()
+        string = string.replace('\n', ' \n ')
+
+        #\W matches every non-word character (punctuation, but also whitespace such as '\n'); each match is padded with spaces so that split(' ') isolates it as its own token
+        string = re.sub('(?P<match>\W)', ' \g<match> ', string).strip()
+
         string_tokens = [s for s in string.split(' ') if s != '']
         return string_tokens