In [7]:
import pandas as pd
import numpy as np
from collections import Counter
from functools import reduce
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import html
import re
import nltk
from nltk.tokenize import RegexpTokenizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from scipy import stats
import random
random.seed()

In [8]:
# load original data and stop words
generic_tweets = pd.read_csv('generic_tweets.txt')
airline_tweets = pd.read_csv('US_airline_tweets.csv')

with open('stop_words.txt','r') as f:
    stop_words = f.read().split('\n')

In [11]:
class DataCleaner(object):
    def __init__(self,stop_words=[""], remove_steps = []):
        super().__init__()
        self.hashtags = str()
        self.remove_steps = remove_steps
        self.stop_words = stop_words
        self.procedures = [getattr(self,procedure) for procedure in \
                        [step for step in sorted(dir(self)) if step.startswith("step_")]]
        # remove some procedures according to users' needs
        for i in sorted(self.remove_steps,reverse=True):
                self.procedures.pop(i-1)

    #use reduce to sequentially clean the raw data, equivelent to funcN(...func2(func1(data)))
    def clean(self, data):
            return reduce(lambda data,func:func(data),[data]+self.procedures),self.hashtags

    # Html character codes (i.e., &...;) are replaced with an ASCII equivalent.
    def step_01_to_ascii(self,data):
        return html.unescape(data)

    # Remove html tags and attributes in form of <...>
    def step_02_remove_html_tag(self,data):
        return re.sub(r"\s*<.*?>",'', data)

    # Remove url in form of (http or https://)address.domain(/file)
    def step_03_remove_url(self,data):
        return re.sub(r'\s*(?:https?:\/\/)?[\w.-]+(?:\.[\w.-]+)+[\w\-._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+','',data)

    # Change all characters in the text to lowercase.
    def step_04_to_lower(self,data):
        return data.lower()

    # Retrive @ tag message
    def step_05_hash_tag_retrieval(self,data):
        self.hashtags = ' '.join(re.findall(r'(?<=@)[^\s]+\s?',data))
        return re.sub(r'@.*?\s','',data)

    # Remove punctuation
    def step_06_remove_punctuation(self,data):
        tokenizer = RegexpTokenizer(r'\w+')
        words_list = tokenizer.tokenize(data)
        return ' '.join(words_list)

    # Remove stop words
    def step_07_remove_stop_words(self,data):
        words = data.split()
        new_data = [word for word in words if word not in self.stop_words]
        return ' '.join(new_data) #clean up empty charactor

    # Remove non alphanumeric characters except space
    def step_08_remove_nonAlphaNumerical(self,data):
        return re.sub(r'[^\s\w]+','',data)

In [12]:
myCleaner = DataCleaner(stop_words)
airline_tweets['clean_text'],airline_tweets['@_tags'] = np.vectorize(myCleaner.clean)(airline_tweets['text'])

In [13]:
x = Counter(airline_tweets['@_tags'])
x

Counter({'': 6,
         'americanair': 3,
         'americanair\n': 2,
         'americanair ': 2012,
         'americanair  _emmaclifford ': 1,
         'americanair  _lucy_may ': 1,
         'americanair  abc7newsbayarea ': 1,
         'americanair  actingoutmgmnt ': 1,
         'americanair  active_aly ': 1,
         'americanair  airsouthwest ': 1,
         'americanair  airtahitinui ': 1,
         'americanair  americanair ': 3,
         'americanair  americanair  usairways, ': 1,
         'americanair  americanair! ': 1,
         'americanair  americanairlines ': 1,
         'americanair  amexserve  usairways ': 1,
         'americanair  andyellwood  delk ': 1,
         'americanair  arminrosen  ggreenwald ': 1,
         'americanair  barrettkarabis ': 1,
         'americanair  bdindallas ': 1,
         'americanair  beantownmatty ': 2,
         'americanair  bershawnjackson ': 1,
         'americanair  boeingairplanes ': 1,
         'americanair  brewcrewfan8 ': 1,
         'am

In [10]:
re.findall(r'(run_.*?)(?:\s)',' '.join(["run_04_to_lower","run_05_remove_stop_words","run_07_remove_all_nonAlphaNumerical_char"]))

['run_04_to_lower', 'run_05_remove_stop_words']

In [11]:
str()

''

In [14]:
re.findall(r'(run_.*?)(?:\s)',' '.join(sorted(dir(thisClean))))

['run_01_to_ascii',
 'run_02_remove_html_tag',
 'run_03_remove_url',
 'run_04_to_lower',
 'run_05_remove_stop_words',
 'run_06_hash_tag_retrieval',
 'run_07_remove_all_nonAlphaNumerical_char']

In [19]:
['data'+' ']+thisClean.procedures

['data ',
 <bound method DataCleaner.run_01_to_ascii of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_02_remove_html_tag of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_03_remove_url of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_04_to_lower of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_05_remove_stop_words of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_06_hash_tag_retrieval of <__main__.DataCleaner object at 0x7f1b987f05c0>>,
 <bound method DataCleaner.run_07_remove_all_nonAlphaNumerical_char of <__main__.DataCleaner object at 0x7f1b987f05c0>>]