In [1]:
import re

In [2]:
opts = {
    'URL': 'urls',
    'MENTION': 'mentions',
    'HASHTAG': 'hashtags',
    'RESERVED': 'reserved_words',
    'EMOJI': 'emojis',
    'SMILEY': 'smileys',
    'NUMBER':  'numbers',
    'ESCAPE_CHAR': 'escape_chars'
}
class Defines:
    PARSE_METHODS_PREFIX = 'parse_'
    FILTERED_METHODS = opts.values()
    PREPROCESS_METHODS_PREFIX = 'preprocess_'
    PRIORITISED_METHODS = ['urls', 'mentions', 'hashtags', 'emojis', 'smileys']

def get_worker_methods(obj, prefix):
    all_methods = dir(obj)
    relevant_methods = list(filter(lambda x: x.startswith(prefix), all_methods))

    # Filtering according to user's options
    prefixed_filtered_methods = [prefix + fm for fm in Defines.FILTERED_METHODS]
    filtered_methods = list(filter(lambda x: x in prefixed_filtered_methods, relevant_methods))

    # Prioritising
    offset = 0
    for ind, pri_method in enumerate(Defines.PRIORITISED_METHODS):
        prefixed_pri_method = prefix + pri_method
        if pri_method in filtered_methods:
            filtered_methods.remove(prefixed_pri_method)
            filtered_methods.insert(offset + ind, prefixed_pri_method)

    return filtered_methods

class ParseItem:
    def __init__(self, start_index, end_index, match):
        self.start_index = start_index
        self.end_index = end_index
        self.match = match

    def __repr__(self):
        return '(%d:%d) => %s' % (self.start_index, self.end_index, self.match)

class ParseResult:
    urls = None
    emojis = None
    smileys = None
    numbers = None
    hashtags = None
    mentions = None
    reserved_words = None

    def __init__(self):
        pass


class Parse:

    def __init__(self):
        pass

    def parse(self, tweet_string):
        parse_result_obj = ParseResult()

        parser_methods = get_worker_methods(self, Defines.PARSE_METHODS_PREFIX)

        for a_parser_method in parser_methods:
            method_to_call = getattr(self, a_parser_method)
            attr = a_parser_method.split('_')[1]
            #attr = a_parser_method.replace(Defines.PARSE_METHODS_PREFIX,'')

            items = method_to_call(tweet_string)
            setattr(parse_result_obj, attr, items)
            print('a_parser_method:',a_parser_method)
            print('attr:',attr)
            print('items:',items)

        return parse_result_obj

    def parser(self, pattern, string):

        match_items = []
        number_match_max_group_count = 2

        for match_object in re.finditer(pattern, string):
            start_index = match_object.start()
            end_index = match_object.end()

            if Patterns.NUMBERS_PATTERN == pattern and number_match_max_group_count == len(match_object.groups()):
                match_str = match_object.groups()[1]
            else:
                match_str = match_object.group()

            parse_item = ParseItem(start_index, end_index, match_str)
            match_items.append(parse_item)

        if len(match_items):
            return match_items

    def parse_urls(self, tweet_string):
        return self.parser(Patterns.URL_PATTERN, tweet_string)

    def parse_hashtags(self, tweet_string):
        return self.parser(Patterns.HASHTAG_PATTERN, tweet_string)

    def parse_mentions(self, tweet_string):
        return self.parser(Patterns.MENTION_PATTERN, tweet_string)

    def parse_reserved_words(self, tweet_string):
        return self.parser(Patterns.RESERVED_WORDS_PATTERN, tweet_string)

    def parse_emojis(self, tweet_string):
        return self.parser(Patterns.EMOJIS_PATTERN, tweet_string)

    def parse_smileys(self, tweet_string):
        return self.parser(Patterns.SMILEYS_PATTERN, tweet_string)

    def parse_numbers(self, tweet_string):
        return self.parser(Patterns.NUMBERS_PATTERN, tweet_string)
    
class Patterns:
    URL_PATTERN_STR = r"""(?i)((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info
                      |int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|
                      bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|
                      cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|
                      gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|
                      la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|
                      nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|
                      sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|
                      uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]
                      *?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])|(?:(?<!@)
                      [a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name
                      |post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn
                      |bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg
                      |eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id
                      |ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|
                      md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|
                      ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|
                      sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|
                      za|zm|zw)\b/?(?!@)))"""
    URL_PATTERN = re.compile(URL_PATTERN_STR, re.IGNORECASE)
    HASHTAG_PATTERN = re.compile(r'#\w*')
    MENTION_PATTERN = re.compile(r'@\w*')
    RESERVED_WORDS_PATTERN = re.compile(r'\b(?<![@#])(RT|FAV)\b')

    try:
        # UCS-4
        EMOJIS_PATTERN = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2
        EMOJIS_PATTERN = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')

    SMILEYS_PATTERN = re.compile(r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE)
    NUMBERS_PATTERN = re.compile(r"(^|\s)(-?\d+([.,]?\d+)*)")

In [3]:
parse_res = ParseResult()
[x for x in dir(parse_res) if '__' not in x]

['emojis',
 'hashtags',
 'mentions',
 'numbers',
 'reserved_words',
 'smileys',
 'urls']

In [4]:
parse_obj = Parse()
[x for x in dir(parse_obj.parse('RT RT')) if '__' not in x] 

a_parser_method: parse_emojis
attr: emojis
items: None
a_parser_method: parse_hashtags
attr: hashtags
items: None
a_parser_method: parse_mentions
attr: mentions
items: None
a_parser_method: parse_numbers
attr: numbers
items: None
a_parser_method: parse_reserved_words
attr: reserved
items: [(0:2) => RT, (3:5) => RT]
a_parser_method: parse_smileys
attr: smileys
items: None
a_parser_method: parse_urls
attr: urls
items: None


['emojis',
 'hashtags',
 'mentions',
 'numbers',
 'reserved',
 'reserved_words',
 'smileys',
 'urls']