In [2]:
import pandas as pd
import re
import numpy as np
import html
import warnings
from functools import reduce
from collections import OrderedDict as od
import matplotlib.pyplot as plt

In [1]:
class DataCleaner(object):
    """Sequential text-cleaning pipeline built from the ``run_NN_*`` methods.

    Every method whose name starts with ``run_`` is collected, sorted by
    name, and applied in that order by :meth:`clean`. The numeric part of the
    method name therefore decides the execution order, not the position of
    the method in the class body.

    Parameters
    ----------
    stopWords : iterable of str
        Words dropped by ``run_05_remove_stop_words``. Matching happens after
        lower-casing, so the entries should be lower-case.
    removeProcudreInd : int, optional
        1-based position (in sorted order) of a step to leave out of the
        pipeline. ``False`` (the default) or ``None`` keeps every step.

    Attributes
    ----------
    stopwords : set of str
    procedures : list of bound methods, in execution order
    hashtags : str
        Space-separated hashtags found by the most recent :meth:`clean` call.
    """

    def __init__(self,stopWords,removeProcudreInd:int=False):
        super().__init__()
        # Initialised here so that clean() can always return it, even when
        # the hashtag step has been removed from the pipeline.
        self.hashtags = ''
        self.stopwords = set(stopWords) #hash it for O(1) check

        #use reflection to define the cleaning pipeline -> list of bound
        #methods, sorted by name. A plain prefix filter is used instead of a
        #regex over the joined names so that no step can be dropped just
        #because it happens to sort last.
        self.procedures = [getattr(self,name) for name in sorted(dir(self))
                           if name.startswith('run_')]

        #drop the desired procedure if the user asks for it (1-based index)
        if removeProcudreInd is not False and removeProcudreInd is not None:
            self.procedures.pop(removeProcudreInd-1)

    def clean(self,data):
        """Run every step of the pipeline over ``data``.

        Equivalent to ``funcN(...func2(func1(' ' + data + ' ')))``.

        Returns
        -------
        tuple of (str, str)
            The cleaned text and the space-separated hashtags found in it.
        """
        # Reset so that a previous call's hashtags are never reported when
        # the hashtag step is not part of the pipeline.
        self.hashtags = ''
        cleaned = reduce(lambda text,step:step(text),self.procedures,' '+data+' ')
        return cleaned,self.hashtags

    def run_01_to_ascii(self,data):
        """Replace HTML character references (e.g. ``&amp;``) with their characters."""
        return html.unescape(data)

    #remove html tag in form of <...>
    def run_02_remove_html_tag(self,data):
        """Remove ``<...>`` tags together with any whitespace in front of them."""
        return re.sub(r"\s*<.*?>",'', data)

    #remove url in form of (http or https://)address.domain(/file)
    def run_03_remove_url(self,data):
        """Remove URL-like tokens together with any whitespace in front of them."""
        return re.sub(r'\s*(?:https?:\/\/)?[\w.-]+(?:\.[\w.-]+)+[\w\-._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+','',data)

    def run_04_to_lower(self,data):
        """Lower-case the text."""
        return data.lower()

    def run_05_remove_stop_words(self,data):
        """Drop stop words and collapse all whitespace runs to single spaces.

        The result has no leading or trailing whitespace.
        """
        return ' '.join(word for word in data.split() if word not in self.stopwords)

    def run_06_hash_tag_retrieval(self,data):
        """Store the hashtags of ``data`` in ``self.hashtags`` and strip them.

        A hashtag is ``#`` followed by non-whitespace characters and ends at
        the next whitespace character or at the end of the text. The
        end-of-text case matters because the stop-word step strips the
        trailing padding before this step runs.
        """
        self.hashtags = ' '.join(re.findall(r'#(\S*)(?:\s|$)',data))
        # Hashtags followed by whitespace: remove the tag and that one
        # whitespace character.
        without_tags = re.sub(r'#\S*\s','',data)
        # A hashtag ending the text: remove it together with the whitespace
        # in front of it so no trailing blank is left behind.
        return re.sub(r'\s*#\S*$','',without_tags)

    def run_07_remove_all_nonAlphaNumerical_char(self,data):
        """Remove every character that is neither whitespace nor a word character."""
        return re.sub(r'[^\s\w]+','',data)

In [4]:
# Build a cleaner whose only stop word is the empty string, keeping all steps.
thisClean = DataCleaner([""])

In [7]:
# Check the container type of the reflected pipeline.
print(type(thisClean.procedures))

<class 'list'>


In [10]:
# Try the discovery regex on a hand-written list of names. The pattern needs a
# whitespace character after each name, so the final name in the joined string
# is not returned.
re.findall(r'(run_.*?)(?:\s)',' '.join(["run_04_to_lower","run_05_remove_stop_words","run_07_remove_all_nonAlphaNumerical_char"]))

['run_04_to_lower', 'run_05_remove_stop_words']

In [11]:
# str() called with no arguments evaluates to the empty string.
str()

''

In [14]:
# Apply the same regex to the sorted attribute names of a real instance to
# list the run_* steps it picks up.
re.findall(r'(run_.*?)(?:\s)',' '.join(sorted(dir(thisClean))))

['run_01_to_ascii',
 'run_02_remove_html_tag',
 'run_03_remove_url',
 'run_04_to_lower',
 'run_05_remove_stop_words',
 'run_06_hash_tag_retrieval',
 'run_07_remove_all_nonAlphaNumerical_char']

In [19]:
# Preview a seed string followed by the bound step methods, i.e. the kind of
# sequence that clean() passes to reduce().
['data'+' ']+thisClean.procedures

['data ',
 <bound method DataCleaner.run_01_to_ascii of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_02_remove_html_tag of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_03_remove_url of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_04_to_lower of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_05_remove_stop_words of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_06_hash_tag_retrieval of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_07_remove_all_nonAlphaNumerical_char of <__main__.DataCleaner object at 0x7f1b987f05c0>>]