Skip to content

Commit

Permalink
fix #20
Browse files: browse the repository at this point in the history
  • Loading branch information
s committed May 23, 2020
1 parent 7f4aeab commit ca37366
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 1 deletion.
3 changes: 2 additions & 1 deletion preprocessor/defines.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
'RESERVED': 'reserved_words',
'EMOJI': 'emojis',
'SMILEY': 'smileys',
'NUMBER': 'numbers'
'NUMBER': 'numbers',
'ESCAPE_CHAR': 'escape_chars'
}
# Build the Options object by passing every key/value pair of `opts` as a
# keyword argument to `enum`.
# NOTE(review): `enum` and the full `opts` mapping are defined outside this
# view -- presumably each key becomes an attribute name; confirm.
Options = enum(**opts)
# Build the Functions object from the three positional names CLEAN, TOKENIZE
# and PARSE.
Functions = enum('CLEAN', 'TOKENIZE', 'PARSE')
Expand Down
11 changes: 11 additions & 0 deletions preprocessor/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,17 @@ def preprocess_smileys(self, tweet_string, repl):
def preprocess_numbers(self, tweet_string, repl):
    """
    This method rewrites every match of Patterns.NUMBERS_PATTERN in the input.
    :param tweet_string: input string which is searched for matches
    :param repl: text that is appended after the first captured group of each match
    :return: processed string
    """
    # Each whole match is replaced by "<group 1><repl>", i.e. whatever group 1
    # captured is kept and the remainder of the match is substituted by `repl`.
    # NOTE(review): group 1 is presumably the delimiter in front of the
    # number -- confirm against the definition of Patterns.NUMBERS_PATTERN.
    return re.sub(Patterns.NUMBERS_PATTERN, lambda m: m.group(1) + repl, tweet_string)

def preprocess_escape_chars(self, tweet_string, repl):
    """
    This method deletes ASCII control characters from the input string.

    Only the code points 1 through 31 are removed (this includes tab, line
    feed and carriage return). NUL (0), DEL (127) and the space character
    are left untouched. Characters are deleted outright, not replaced.
    :param tweet_string: input string from which the control characters are removed
    :param repl: unused for this method
    :return: processed string
    """
    # range(1, 32) deliberately mirrors the original bounds: 0x01..0x1F.
    escapes = ''.join(map(chr, range(1, 32)))
    # The third argument of maketrans lists characters that map to None,
    # so translate() drops them in a single pass.
    translator = str.maketrans('', '', escapes)
    return tweet_string.translate(translator)

def remove_unneccessary_characters(self, tweet_string):
    """
    This method normalises the whitespace of the input string.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace is collapsed into a single space.
    :param tweet_string: input string whose whitespace is normalised
    :return: processed string
    """
    # split() without arguments splits on any whitespace run and discards
    # empty pieces, so joining with one space yields the normalised text.
    return ' '.join(tweet_string.split())

Expand Down
7 changes: 7 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ def test_clean_file(self):
check_against = self._get_test_data_for_option(raw_data, opt)
self._test_clean_file(full_input_path, check_against, opt)

def test_escape_chars(self):
    """Check that p.clean() strips control characters and leaves 'I have escaped!'."""
    # The input interleaves control characters (\x01..\x17, \a, \b, \n, \r,
    # \t, \f) with the visible words; \x20 is an ordinary space.
    input_str = (
        u"\x01\x02\x03\x04I \x05\x06\x07\x10\x11have \x12\x13\x14"
        "\x15\x16\x17\x20escaped!\a\b\n\r\t\b\f"
    )
    cleaned_str = p.clean(input_str)
    self.assertEqual("I have escaped!", cleaned_str)

def _test_clean_file(self, full_input_path, check_against, *options):
output_path = p.clean_file(full_input_path, True, options)
self.assertTrue(os.path.exists(output_path))
Expand All @@ -124,5 +130,6 @@ def _get_test_data_for_option(self, raw_data, *options):
clean_data.append(p.clean(d))
return clean_data


# Run the test suite only when this file is executed directly, not on import.
if __name__ == '__main__':
    unittest.main()

0 comments on commit ca37366

Please sign in to comment.