In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from functools import reduce
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import html
import re
import nltk
from nltk.tokenize import RegexpTokenizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from scipy import stats
import random
# Called without an argument, so the generator is seeded from the operating
# system's randomness source (or the current time): draws from `random` will
# differ between kernel runs. Pass a fixed integer here for reproducible results.
random.seed()

In [2]:
# Stop-word list: the file is split on newlines, one list entry per line.
with open('stop_words.txt', 'r') as stop_words_file:
    stop_words = stop_words_file.read().split('\n')

# Raw tweet tables, read from files in the working directory.
airline_tweets = pd.read_csv('US_airline_tweets.csv')
generic_tweets = pd.read_csv('generic_tweets.txt')

In [3]:
class DataCleaner(object):
    """Sequential text-cleaning pipeline for raw tweet strings.

    Every method whose name starts with ``step_`` is one stage of the
    pipeline. Stages are discovered with ``dir(self)`` and executed in
    sorted-name order, which is why each name carries a two-digit number.

    Parameters
    ----------
    stop_words : collection of str, optional
        Words dropped by ``step_07_remove_stop_words``. Defaults to ``[""]``.
    remove_steps : collection of int, optional
        1-based positions (in sorted stage order) of stages to skip.
        Duplicates are ignored. Defaults to skipping nothing.

    Raises
    ------
    IndexError
        If an entry of ``remove_steps`` is smaller than 1 or larger than the
        number of available stages.
    """

    # Patterns are compiled once for the class instead of on every call.
    _HTML_TAG = re.compile(r"\s*<.*?>")
    _URL = re.compile(r'\s*(?:https?:\/\/)?[\w.-]+(?:\.[\w.-]+)+[\w\-._~:/?#[\]@!\$&\'\(\)\*\+,;=.]+')
    _MENTION_NAME = re.compile(r'(?<=@)\S+')
    # "@" plus the following non-space run, and the whitespace after it; the
    # "$" alternative also catches a mention that ends the text.
    _MENTION_SPAN = re.compile(r'@\S*(?:\s|$)')
    _WORD = re.compile(r'\w+')
    _NON_WORD = re.compile(r'[^\s\w]+')

    def __init__(self, stop_words=None, remove_steps=None):
        super().__init__()
        self.hashtags = str()
        # None sentinels replace the former mutable defaults, which were
        # shared between every instance created without arguments.
        self.remove_steps = [] if remove_steps is None else remove_steps
        self.stop_words = [""] if stop_words is None else stop_words

        step_names = [name for name in sorted(dir(self)) if name.startswith("step_")]
        self.procedures = [getattr(self, name) for name in step_names]

        # Remove the stages the caller asked to skip. Positions are
        # de-duplicated and popped from the highest down, so each pop leaves
        # the positions still to be processed unchanged.
        available = len(self.procedures)
        for position in sorted(set(self.remove_steps), reverse=True):
            if not 1 <= position <= available:
                raise IndexError(
                    "remove_steps entry %r is outside 1..%d" % (position, available))
            self.procedures.pop(position - 1)

    def clean(self, data):
        """Run ``data`` through every remaining stage in order.

        Equivalent to ``stepN(...step2(step1(data)))``.

        Returns
        -------
        tuple of (str, str)
            The cleaned text and the space-separated @-mentions captured by
            ``step_05_hash_tag_retrieval`` (empty if that stage was removed or
            found nothing).
        """
        self.hashtags = str()  # per-call result; never carry over a previous tweet's tags
        for procedure in self.procedures:
            data = procedure(data)
        return data, self.hashtags

    def step_01_to_ascii(self, data):
        """Replace HTML character references (``&...;``) with the characters they encode."""
        return html.unescape(data)

    def step_02_remove_html_tag(self, data):
        """Remove HTML tags of the form ``<...>`` together with any whitespace before them."""
        return self._HTML_TAG.sub('', data)

    def step_03_remove_url(self, data):
        """Remove URLs of the form ``(http(s)://)address.domain/path`` and preceding whitespace."""
        return self._URL.sub('', data)

    def step_04_to_lower(self, data):
        """Lower-case the whole text."""
        return data.lower()

    def step_05_hash_tag_retrieval(self, data):
        """Store the @-mentions in ``self.hashtags`` and strip them from the text.

        Despite the method name, the tokens collected are the ones that
        follow an ``@`` sign. They are stored without the ``@`` and joined by
        single spaces. A mention at the very end of the text is stripped too.
        """
        self.hashtags = ' '.join(self._MENTION_NAME.findall(data))
        return self._MENTION_SPAN.sub('', data)

    def step_06_remove_punctuation(self, data):
        """Keep only runs of word characters, re-joined with single spaces."""
        return ' '.join(self._WORD.findall(data))

    def step_07_remove_stop_words(self, data):
        """Drop every whitespace-separated token that is listed in ``self.stop_words``."""
        # Hash lookup instead of scanning the stop-word list once per token.
        stop_lookup = frozenset(self.stop_words)
        kept = [word for word in data.split() if word not in stop_lookup]
        return ' '.join(kept)

    def step_08_remove_nonAlphaNumerical(self, data):
        """Remove every character that is neither whitespace nor a word character."""
        return self._NON_WORD.sub('', data)

In [4]:
# Build the cleaner with the loaded stop words and run it over every tweet.
myCleaner = DataCleaner(stop_words)
# otypes is given explicitly: without it np.vectorize calls clean() an extra
# time on the first tweet to infer the output types, and it raises on an
# empty input. clean() returns (cleaned_text, tags), hence two object outputs.
clean_tweet = np.vectorize(myCleaner.clean, otypes=[object, object])
airline_tweets['clean_text'], airline_tweets['@_tags'] = clean_tweet(airline_tweets['text'])

In [6]:
# Tally how often each negative_reason value occurs among the rows whose
# sentiment is 'negative'.
x = Counter(airline_tweets.loc[airline_tweets['sentiment'] == 'negative', 'negative_reason'])
x

Counter({'Bad Flight': 580,
         "Can't Tell": 1190,
         'Cancelled Flight': 847,
         'Customer Service Issue': 2910,
         'Damaged Luggage': 74,
         'Flight Attendant Complaints': 481,
         'Flight Booking Problems': 529,
         'Late Flight': 1665,
         'Lost Luggage': 724,
         'longlines': 178})

In [10]:
# Binary target: 1 where sentiment is 'positive', 0 for every other row.
# A vectorized comparison replaces the Python-level iterrows() loop; the
# result is the same int64 column.
airline_tweets['class'] = (airline_tweets['sentiment'] == 'positive').astype('int64')

In [11]:
# Preview the first five rows, including the columns added above.
airline_tweets.head(n=5)

Unnamed: 0,id,sentiment,negative_reason,user,retweet_count,text,clean_text,@_tags,class
0,5.70301e+17,positive,,jnardino,0,@VirginAmerica plus you've added commercials t...,ve commercials tacky,virginamerica,1
1,5.70301e+17,negative,Bad Flight,jnardino,0,@VirginAmerica it's really aggressive to blast...,aggressive blast obnoxious entertainment guest...,virginamerica,0
2,5.70301e+17,negative,Can't Tell,jnardino,0,@VirginAmerica and it's a really big bad thing...,big bad thing,virginamerica,0
3,5.70301e+17,negative,Can't Tell,jnardino,0,@VirginAmerica seriously would pay $30 a fligh...,seriously pay 30 flight seats didn t playing b...,virginamerica,0
4,5.70301e+17,positive,,cjmcginnis,0,"@VirginAmerica yes, nearly every time I fly VX...",time fly vx â œear wormâ wonâ t,virginamerica,1


In [14]:
# List the attribute names of the cleaner object that start a "run_" token.
# NOTE(review): `thisClean` is not defined in the cells visible here; confirm
# it is created in an earlier cell before relying on Restart & Run All.
attribute_listing = ' '.join(sorted(dir(thisClean)))
re.findall(r'(run_.*?)(?:\s)', attribute_listing)

['run_01_to_ascii',
 'run_02_remove_html_tag',
 'run_03_remove_url',
 'run_04_to_lower',
 'run_05_remove_stop_words',
 'run_06_hash_tag_retrieval',
 'run_07_remove_all_nonAlphaNumerical_char']