Skip to content

Commit

Permalink
adds customized preprocessing functionality 🎉
Browse files Browse the repository at this point in the history
  • Loading branch information
s committed Jan 31, 2016
1 parent 472d64e commit 708e55d
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 41 deletions.
5 changes: 3 additions & 2 deletions preprocessor/__init__.py
@@ -1,3 +1,4 @@
from .api import clean, tokenize, parse, set_options
from .defines import Options as OPT

# Public API of the package. OPT is exported as well so that
# ``from preprocessor import *`` also provides the option constants that
# set_options() expects (e.g. OPT.URL, OPT.SMILEY).
__all__ = ['clean', 'tokenize', 'parse', 'set_options', 'OPT']
15 changes: 13 additions & 2 deletions preprocessor/api.py
Expand Up @@ -11,7 +11,7 @@

import sys
from .preprocess import Preprocess
from .constants import Functions
from .defines import Functions, Defines
from .parse import Parse

preprocessor = Preprocess()
Expand Down Expand Up @@ -58,4 +58,15 @@ def parse(tweet_string):
25
"""
parsed_tweet_obj = parser.parse(tweet_string)
return parsed_tweet_obj
return parsed_tweet_obj

def set_options(*args):
    """Select which preprocessing options are active.

    The given options replace the current contents of
    ``Defines.FILTERED_METHODS``; calling this with no arguments leaves
    that list empty.

    :param *args: Any number of ``preprocessor.OPT`` options
    :return: None

    Usage::
      >>> import preprocessor
      >>> preprocessor.set_options(preprocessor.OPT.URL, preprocessor.OPT.SMILEY)
    """
    selected_options = [option for option in args]
    Defines.FILTERED_METHODS = selected_options
38 changes: 19 additions & 19 deletions preprocessor/defines.py
Expand Up @@ -6,22 +6,27 @@
"""
import re
import sys
#from .utils import Util
from .enum import enum

PREPROCESS_METHODS_PREFIX = 'preprocess_'
PARSE_METHODS_PREFIX = 'parse_'
HIGH_PRIORITISED_METHODS = ['urls', 'mentions', 'hashtags', 'emojis', 'smileys']
IS_PYTHON3 = sys.version_info > (3, 0, 0)
# Maps each public option name to the worker-method suffix it enables
# (the suffix is later combined with a 'parse_' / 'preprocess_' prefix).
opts = dict(
    URL='urls',
    MENTION='mentions',
    HASHTAG='hashtags',
    RESERVED='reserved_words',
    EMOJI='emojis',
    SMILEY='smileys',
)
# Option constants exposed as attributes, e.g. Options.URL == 'urls'.
Options = enum(**opts)
# Identifiers for the public operations, built from positional names by enum().
Functions = enum('CLEAN', 'TOKENIZE', 'PARSE')

#Options = Util.enum('URL', 'HASHTAG', 'MENTION', 'HASHTAG', 'EMOJI', 'SMILEY')

class Options:
OPT_URL=1
OPT_HASHTAG=2
OPT_MENTION=3
OPT_RESERVED=4
OPT_EMOJI=5
OPT_SMILEY=6
class Defines:
    """Package-wide settings shared by the parser, preprocessor and utils."""

    # Prefix of the worker methods looked up on the parser object.
    PARSE_METHODS_PREFIX = 'parse_'
    # Worker-method suffixes that are currently enabled. Stored as a real
    # list (not a dict view) so it has the same type as the value that
    # set_options() assigns and does not track later changes to ``opts``.
    FILTERED_METHODS = list(opts.values())
    # Prefix of the worker methods looked up on the preprocessor object.
    PREPROCESS_METHODS_PREFIX = 'preprocess_'
    # True when running under Python 3; used to decide on encode/decode steps.
    IS_PYTHON3 = sys.version_info > (3, 0, 0)
    # Worker-method suffixes that must run first, in this order.
    PRIORITISED_METHODS = ['urls', 'mentions', 'hashtags', 'emojis', 'smileys']


class Patterns:
URL_PATTERN=re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
Expand All @@ -36,9 +41,4 @@ class Patterns:
# UCS-2
EMOJIS_PATTERN = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')

SMILEYS_PATTERN = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P|S){1,}")

class Functions:
CLEAN=1
TOKENIZE=2
PARSE=3
SMILEYS_PATTERN = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P|S){1,}")
7 changes: 7 additions & 0 deletions preprocessor/enum.py
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-

def enum(*sequential, **named):
    """Build a lightweight enum-like class.

    Positional names are numbered 0, 1, 2, ... in the order given; keyword
    arguments become members with the supplied values (overriding a
    positional member of the same name).

    :param *sequential: Member names that receive consecutive integers.
    :param **named: Member names mapped to explicit values.
    :return: A new class named ``Enum`` whose attributes are the members,
        plus ``reverse_mapping``, a dict from member value back to name.
    """
    members = {name: index for index, name in enumerate(sequential)}
    members.update(named)
    # Build the value -> name lookup before 'reverse_mapping' itself is added.
    value_to_name = {value: name for name, value in members.items()}
    members['reverse_mapping'] = value_to_name
    return type('Enum', (), members)
13 changes: 7 additions & 6 deletions preprocessor/parse.py
Expand Up @@ -5,8 +5,9 @@
"""

from .utils import Util
from .constants import *
import re
from .utils import Utils
from .defines import Defines, Patterns

class ParseResult:
urls = None
Expand All @@ -33,12 +34,12 @@ def __repr__(self):
class Parse:

def __init__(self):
self.u = Util()
self.u = Utils()

def parse(self, tweet_string):
parse_result_obj = ParseResult()

parser_methods = self.u.get_worker_methods(self, PARSE_METHODS_PREFIX)
parser_methods = self.u.get_worker_methods(self, Defines.PARSE_METHODS_PREFIX)

for a_parser_method in parser_methods:
method_to_call = getattr(self, a_parser_method)
Expand All @@ -54,7 +55,7 @@ def parser(self, pattern, string):
items = []

for match_object in re.finditer(pattern, string):
if not IS_PYTHON3:
if not Defines.IS_PYTHON3:
parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group().encode('utf-8'))
else:
parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group())
Expand All @@ -76,7 +77,7 @@ def parse_reserved_words(self, tweet_string):
return self.parser(Patterns.RESERVED_WORDS_PATTERN, tweet_string)

def parse_emojis(self, tweet_string):
if not IS_PYTHON3:
if not Defines.IS_PYTHON3:
tweet_string = tweet_string.decode('utf-8')
return self.parser(Patterns.EMOJIS_PATTERN, tweet_string)

Expand Down
10 changes: 5 additions & 5 deletions preprocessor/preprocess.py
Expand Up @@ -8,20 +8,20 @@
"""

import re
from .constants import *
from .utils import Util
from .defines import *
from .utils import Utils

class Preprocess:

tweet = None

def __init__(self):
self.repl = None
self.u = Util()
self.u = Utils()

def clean(self, tweet_string, repl):

cleaner_methods = self.u.get_worker_methods(self, PREPROCESS_METHODS_PREFIX)
cleaner_methods = self.u.get_worker_methods(self, Defines.PREPROCESS_METHODS_PREFIX)

for a_cleaner_method in cleaner_methods:
token = self.get_token_string_from_method_name(a_cleaner_method)
Expand All @@ -48,7 +48,7 @@ def preprocess_reserved_words(self, tweet_string, repl):
return Patterns.RESERVED_WORDS_PATTERN.sub(repl, tweet_string)

def preprocess_emojis(self, tweet_string, repl):
if not IS_PYTHON3:
if not Defines.IS_PYTHON3:
tweet_string = tweet_string.decode('utf-8')
return Patterns.EMOJIS_PATTERN.sub(repl, tweet_string)

Expand Down
20 changes: 13 additions & 7 deletions preprocessor/utils.py
Expand Up @@ -4,21 +4,27 @@
This module includes utility methods which are used in Preprocessor
"""

from .constants import PRIORITISED_METHODS
from .defines import Defines

class Utils:
    """Utility helpers used by the parser and the preprocessor."""

    def __init__(self):
        pass

    def get_worker_methods(self, object, prefix):
        """Return the enabled worker method names of ``object`` in run order.

        :param object: Instance whose attribute names are inspected with ``dir``.
        :param prefix: Name prefix that identifies worker methods
            (e.g. ``'parse_'`` or ``'preprocess_'``).
        :return: Names starting with ``prefix`` whose suffix is listed in
            ``Defines.FILTERED_METHODS``. Names whose suffix appears in
            ``Defines.PRIORITISED_METHODS`` come first, in that order; the
            remaining names keep the order produced by ``dir``.
        :rtype: list
        """
        relevant_methods = [name for name in dir(object) if name.startswith(prefix)]

        # Filtering according to user's options
        enabled_names = set(prefix + suffix for suffix in Defines.FILTERED_METHODS)
        filtered_methods = [name for name in relevant_methods if name in enabled_names]

        # Prioritising: move each enabled priority method to the next free
        # front slot. The membership test must use the *prefixed* name, since
        # filtered_methods only contains prefixed names. The slot counter
        # advances only when a method was actually moved, so disabled
        # priority methods do not push later ones behind ordinary methods.
        next_slot = 0
        for pri_method in Defines.PRIORITISED_METHODS:
            prefixed_pri_method = prefix + pri_method
            if prefixed_pri_method in filtered_methods:
                filtered_methods.remove(prefixed_pri_method)
                filtered_methods.insert(next_slot, prefixed_pri_method)
                next_slot += 1

        return filtered_methods
11 changes: 11 additions & 0 deletions tests/test_api.py
Expand Up @@ -9,16 +9,19 @@ class PreprocessorTest(unittest.TestCase):

# Checks p.clean() on a sample tweet with the URL, hashtag, mention, emoji
# and smiley options enabled.
def test_clean(self):
tweet = "Hello there! @pyistanbul #packathon was awesome πŸ˜€. http://packathon.org"
# Options are set explicitly because set_options() changes shared state
# that other tests also modify.
p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
cleaned_tweeet = p.clean(tweet)
self.assertEqual(cleaned_tweeet, 'Hello there! was awesome .')

# Checks that p.tokenize() replaces each enabled entity type in a sample
# tweet with its $TOKEN$ placeholder.
def test_tokenize(self):
tweet = 'Packathon was a really #nice :) challenging πŸ‘Œ. @packathonorg http://packathon.org'
# Options are set explicitly because set_options() changes shared state
# that other tests also modify.
p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
tokenized_tweet = p.tokenize(tweet)
self.assertEqual(tokenized_tweet, 'Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$')

def test_parse(self):
tweet = 'A tweet with #hashtag :) @mention πŸ˜€ and http://github.com/s.'
p.set_options(p.OPT.URL, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)
parsed_tweet = p.parse(tweet)

self.assertIsNotNone(parsed_tweet.urls)
Expand All @@ -39,6 +42,14 @@ def test_parse(self):
self.assertIsNotNone(parsed_tweet.smileys)
self.assertEqual(1, len(parsed_tweet.smileys))
self.assertEqual(":)", parsed_tweet.smileys[0].match)

# Checks that set_options() restricts parsing to the selected options:
# with only OPT.URL enabled, URLs are reported and hashtags are not.
def test_set_options(self):
tweet = 'Preprocessor now has custom #options support! https://github.com/s/preprocessor'
p.set_options(p.OPT.URL)
parsed_tweet = p.parse(tweet)

# The tweet contains a hashtag, but the hashtag option was not enabled.
self.assertIsNone(parsed_tweet.hashtags)
self.assertIsNotNone(parsed_tweet.urls)

if __name__ == '__main__':
unittest.main()

0 comments on commit 708e55d

Please sign in to comment.