From 708e55da02735b74153ab402c1dbb6f32210a4f0 Mon Sep 17 00:00:00 2001 From: Said Date: Sun, 31 Jan 2016 02:09:11 +0200 Subject: [PATCH] =?UTF-8?q?adds=20customized=20preprocessing=20functionali?= =?UTF-8?q?ty=20=F0=9F=8E=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- preprocessor/__init__.py | 5 +++-- preprocessor/api.py | 15 +++++++++++++-- preprocessor/defines.py | 38 +++++++++++++++++++------------------- preprocessor/enum.py | 7 +++++++ preprocessor/parse.py | 13 +++++++------ preprocessor/preprocess.py | 10 +++++----- preprocessor/utils.py | 20 +++++++++++++------- tests/test_api.py | 11 +++++++++++ 8 files changed, 78 insertions(+), 41 deletions(-) create mode 100644 preprocessor/enum.py diff --git a/preprocessor/__init__.py b/preprocessor/__init__.py index e791472..341fbf9 100755 --- a/preprocessor/__init__.py +++ b/preprocessor/__init__.py @@ -1,3 +1,4 @@ -from .api import clean, tokenize, parse +from .api import clean, tokenize, parse, set_options +from .defines import Options as OPT -__all__ = ['clean', 'tokenize', 'parse'] +__all__ = ['clean', 'tokenize', 'parse', 'set_options'] diff --git a/preprocessor/api.py b/preprocessor/api.py index 3a29b18..a12c609 100755 --- a/preprocessor/api.py +++ b/preprocessor/api.py @@ -11,7 +11,7 @@ import sys from .preprocess import Preprocess -from .constants import Functions +from .defines import Functions, Defines from .parse import Parse preprocessor = Preprocess() @@ -58,4 +58,15 @@ def parse(tweet_string): 25 """ parsed_tweet_obj = parser.parse(tweet_string) - return parsed_tweet_obj \ No newline at end of file + return parsed_tweet_obj + +def set_options(*args): + """Sets desired options for preprocessing`. + :param *args: A number of preprocessor.OPT options + :return: void + :rtype: void + Usage:: + >>> import preprocessor + >>> preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.SMILEY) + """ + Defines.FILTERED_METHODS = list(args) \ No newline at end of file diff --git a/preprocessor/defines.py b/preprocessor/defines.py index 1286b2a..23fdcbe 100644 --- a/preprocessor/defines.py +++ b/preprocessor/defines.py @@ -6,22 +6,27 @@ """ import re import sys -#from .utils import Util +from .enum import enum -PREPROCESS_METHODS_PREFIX = 'preprocess_' -PARSE_METHODS_PREFIX = 'parse_' -HIGH_PRIORITISED_METHODS = ['urls', 'mentions', 'hashtags', 'emojis', 'smileys'] -IS_PYTHON3 = sys.version_info > (3, 0, 0) +opts = { + 'URL':'urls', + 'MENTION':'mentions', + 'HASHTAG':'hashtags', + 'RESERVED':'reserved_words', + 'EMOJI':'emojis', + 'SMILEY':'smileys' +} +Options = enum(**opts) +Functions = enum('CLEAN', 'TOKENIZE', 'PARSE') -#Options = Util.enum('URL', 'HASHTAG', 'MENTION', 'HASHTAG', 'EMOJI', 'SMILEY') -class Options: - OPT_URL=1 - OPT_HASHTAG=2 - OPT_MENTION=3 - OPT_RESERVED=4 - OPT_EMOJI=5 - OPT_SMILEY=6 +class Defines: + PARSE_METHODS_PREFIX = 'parse_' + FILTERED_METHODS = opts.values() + PREPROCESS_METHODS_PREFIX = 'preprocess_' + IS_PYTHON3 = sys.version_info > (3, 0, 0) + PRIORITISED_METHODS = ['urls', 'mentions', 'hashtags', 'emojis', 'smileys'] + class Patterns: URL_PATTERN=re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))') @@ -36,9 +41,4 @@ class Patterns: # UCS-2 EMOJIS_PATTERN = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])') - SMILEYS_PATTERN = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P|S){1,}") - -class Functions: - CLEAN=1 - TOKENIZE=2 - PARSE=3 \ No newline at end of file + SMILEYS_PATTERN = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P|S){1,}") \ No newline at end of file diff --git a/preprocessor/enum.py b/preprocessor/enum.py new file mode 100644 index 0000000..55e9ff2 --- /dev/null +++ b/preprocessor/enum.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +def enum(*sequential, **named): + enums = dict(zip(sequential, range(len(sequential))), **named) + reverse = dict((value, key) for key, value in enums.items()) + enums['reverse_mapping'] = reverse + return type('Enum', (), enums) \ No newline at end of file diff --git a/preprocessor/parse.py b/preprocessor/parse.py index 52c74fe..797968e 100644 --- a/preprocessor/parse.py +++ b/preprocessor/parse.py @@ -5,8 +5,9 @@ """ -from .utils import Util -from .constants import * +import re +from .utils import Utils +from .defines import Defines, Patterns class ParseResult: urls = None @@ -33,12 +34,12 @@ def __repr__(self): class Parse: def __init__(self): - self.u = Util() + self.u = Utils() def parse(self, tweet_string): parse_result_obj = ParseResult() - parser_methods = self.u.get_worker_methods(self, PARSE_METHODS_PREFIX) + parser_methods = self.u.get_worker_methods(self, Defines.PARSE_METHODS_PREFIX) for a_parser_method in parser_methods: method_to_call = getattr(self, a_parser_method) @@ -54,7 +55,7 @@ def parser(self, pattern, string): items = [] for match_object in re.finditer(pattern, string): - if not IS_PYTHON3: + if not Defines.IS_PYTHON3: parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group().encode('utf-8')) else: parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group()) @@ -76,7 +77,7 @@ def parse_reserved_words(self, tweet_string): return self.parser(Patterns.RESERVED_WORDS_PATTERN, tweet_string) def parse_emojis(self, tweet_string): - if not IS_PYTHON3: + if not Defines.IS_PYTHON3: tweet_string = tweet_string.decode('utf-8') return self.parser(Patterns.EMOJIS_PATTERN, tweet_string) diff --git a/preprocessor/preprocess.py b/preprocessor/preprocess.py index dfc5063..c58ad9a 100644 --- a/preprocessor/preprocess.py +++ b/preprocessor/preprocess.py @@ -8,8 +8,8 @@ """ import re -from .constants import * -from .utils import Util +from .defines import * +from .utils import Utils class Preprocess: @@ -17,11 +17,11 @@ class Preprocess: def __init__(self): self.repl = None - self.u = Util() + self.u = Utils() def clean(self, tweet_string, repl): - cleaner_methods = self.u.get_worker_methods(self, PREPROCESS_METHODS_PREFIX) + cleaner_methods = self.u.get_worker_methods(self, Defines.PREPROCESS_METHODS_PREFIX) for a_cleaner_method in cleaner_methods: token = self.get_token_string_from_method_name(a_cleaner_method) @@ -48,7 +48,7 @@ def preprocess_reserved_words(self, tweet_string, repl): return Patterns.RESERVED_WORDS_PATTERN.sub(repl, tweet_string) def preprocess_emojis(self, tweet_string, repl): - if not IS_PYTHON3: + if not Defines.IS_PYTHON3: tweet_string = tweet_string.decode('utf-8') return Patterns.EMOJIS_PATTERN.sub(repl, tweet_string) diff --git a/preprocessor/utils.py b/preprocessor/utils.py index b268b36..1122d8a 100644 --- a/preprocessor/utils.py +++ b/preprocessor/utils.py @@ -4,9 +4,9 @@ This module includes utility methods which are used in Preprocessor """ -from .constants import PRIORITISED_METHODS +from .defines import Defines -class Util: +class Utils: def __init__(self): pass @@ -14,11 +14,17 @@ def __init__(self): def get_worker_methods(self, object, prefix): all_methods = dir(object) relevant_methods = list(filter(lambda x: x.startswith(prefix), all_methods)) - prefixed_prioritised_methods = [prefix+m for m in PRIORITISED_METHODS] + # Filtering according to user's options + prefixed_filtered_methods = [prefix+fm for fm in Defines.FILTERED_METHODS] + filtered_methods = list(filter(lambda x: x in prefixed_filtered_methods, relevant_methods)) + + # Prioritising offset = 0 - for ind, pri_method in enumerate(prefixed_prioritised_methods): - relevant_methods.remove(pri_method) - relevant_methods.insert(offset+ind, pri_method) + for ind, pri_method in enumerate(Defines.PRIORITISED_METHODS): + prefixed_pri_method = prefix + pri_method + if pri_method in filtered_methods: + filtered_methods.remove(prefixed_pri_method) + filtered_methods.insert(offset+ind, prefixed_pri_method) - return relevant_methods \ No newline at end of file + return filtered_methods \ No newline at end of file diff --git a/tests/test_api.py b/tests/test_api.py index 060103f..7220f47 100755 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -9,16 +9,19 @@ class PreprocessorTest(unittest.TestCase): def test_clean(self): tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org" + p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY) cleaned_tweeet = p.clean(tweet) self.assertEqual(cleaned_tweeet, 'Hello there! was awesome .') def test_tokenize(self): tweet = 'Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org' + p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY) tokenized_tweet = p.tokenize(tweet) self.assertEqual(tokenized_tweet, 'Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$') def test_parse(self): tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.' + p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY) parsed_tweet = p.parse(tweet) self.assertIsNotNone(parsed_tweet.urls) @@ -39,6 +42,14 @@ def test_parse(self): self.assertIsNotNone(parsed_tweet.smileys) self.assertEqual(1, len(parsed_tweet.smileys)) self.assertEqual(":)", parsed_tweet.smileys[0].match) + + def test_set_options(self): + tweet = 'Preprocessor now has custom #options support! https://github.com/s/preprocessor' + p.set_options(p.OPT.URL) + parsed_tweet = p.parse(tweet) + + self.assertIsNone(parsed_tweet.hashtags) + self.assertIsNotNone(parsed_tweet.urls) if __name__ == '__main__': unittest.main()