In [15]:
import os
import re
import sys
import math
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from itertools import chain

In [2]:
# Logger: redirect the stream on screen and to file.
class Logger(object):
    def __init__(self, filename = "log.txt"):
        self.terminal = sys.stdout
        self.log = open(filename, "a")
    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
    def flush(self):
        pass

In [3]:
# The main function.
def main():
    # initialize the log file.
    logPath = './logistic_regression.txt'
    if os.path.exists(logPath):
        os.remove(logPath)
    sys.stdout = Logger(logPath)

    print("-- AIT726 Homework 1 from Julia Jeng, Shu Wang, and Arman Anwar --")
    
    # create the vocabulary.
    CreateVocabulary()
    
    # run demo.
    DemoLogisticRegression('noStem', 'freq')
    DemoLogisticRegression('noStem', 'bin')
    DemoLogisticRegression('noStem', 'tfidf')
    DemoLogisticRegression('Stem', 'freq')
    DemoLogisticRegressions('Stem', 'bin')
    DemoLogisticRegression('Stem', 'tfidf')
    return


In [4]:
def DemoLogisticRegression(lStem = 'noStem', method = 'freq'):
    '''
    a demo of Logistic Regression classifier with different dataset and features.
    :param lStem: stem setting - 'noStem', 'Stem'
    :param method: feature selection - 'freq', 'tfidf'
    :return: none
    '''
    # input validation.
    if lStem not in ['noStem', 'Stem']:
        print('Error: stem setting invalid!')
        return
    if method not in ['freq', 'bin', 'tfidf']:
        print('Error: method setting invalid!')
        return

    # extract training features with 'method' on 'lStem' dataset.
    featTrain = ExtractFeatures('Train', lStem, method)
    
    # get the model parameters.
    prior, likelihood = TrainLogisticRegression(featTrain)
    
    # extract testing features with 'method' on 'lStem' dataset.
    featTest = ExtractFeatures('Test', lStem, method)
    
    # get testing predictions using model parameters.
    accuracy, confusion = TestLogisticRegression(prior, likelihood, featTest)
    
    # output the results on screen and to files.
    OutputLogisticRegression(accuracy, confusion, lStem, method)
    
    # debug
    return

### Read train/test sets and create vocabulary.

In [33]:
def CreateVocabulary():
    def Preprocess(data):
        # remove url
        pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        data = re.sub(pattern, '', data)
        
        # remove html special characters. 
        pattern = r'&[(amp)(gt)(lt)]+;'
        data = re.sub(pattern, '', data)
        
        # remove independent numbers.
        pattern = r' \d+ '
        data = re.sub(pattern, ' ', data)
        
        # lower case capitalized words.
        pattern = r'([A-Z][a-z]+)'
        def LowerFunc(matched):
            return matched.group(1).lower()
        data = re.sub(pattern, LowerFunc, data)
        
        # remove hashtags.
        pattern = r'[@#]([A-Za-z]+)'
        data = re.sub(pattern, '', data)
        return data
    
    def GetTokens(data):
        # use tweet tokenizer
        tknzr = TweetTokenizer()
        tokens = tknzr.tokenize(data)
        
        # tokenize at each punctuation
        pattern = r'[A-Za-z]+\'[A-Za-z]+'
        for tk in tokens:
            if re.match(pattern, tk):
                subtokens = word_toknize(tk)
                tokens.remove(tk)
                tokens = tokens + subtokens
        return tokens
    
    # process tokens with setmming.
    def WithStem(tokens):
        poster = PorterStemmer()  ####
        tokensStem = []
        for tk in tokens:
            tokensStem.append(porter.stem(tk))  ####
        return tokensStem
    
    # if there is no 'tmp' folder, create one.
    if not os.path.exists('./tmp/'):
        os.mkdir('./tmp/')
        
    #### read training data
    labelTrain = []
    dataTrain = []
    dataTrainStem = []
    
    for root, ds, fs in os.walk('./tweet/train/'):
        for file in fs:
            fullname = os.path.join(root, file)
            
            # get the training label
            if "positve" in fullname:
                label = 1
            else: 
                label = 0
            data = open(fullname, encoding = "utf8").read()
            print("Label_dataTrain", data)
            
            # Preprocess Labeled Data
            data = Preprocess(data)
            print("Preprocess_Lable_dataTrain",data)
            
            # Get tokens for the data
            tokens = GetTokens(data)
            dataTrain.append(tokens)
            print("Token_dataTrain[] =",tokens)
            
            # Getthe stemmed tokens for the data
            tokensStem = WithStem(tokens)
            dataTrainStem.append(tokensStem)
            print("Check Stemmed token data",tokensStem)
            
    print('Load TrainSet:%d/%d positive/negative samples.' %(sum(labelTrain), len(labelTrain)-sum(lavelTrain)))
    np.savez('tmp/Train.npz', 
             labelTrain = labelTrain, 
             dataTrain =dataTrain, 
             dataTrainStem = dataTrainStem) 
    
    
    #### build the vocabulary from training set.
    vocab = list(set(list(chain.from_iterable(dataTrain))))
    vocabStem = list(set(list(chain.from_iterable(dataTrainStem))))
    print('Vocabulary: %d items.' % len(vocab))
    print('Vocabulary (stem): %d items.' % len(vocabStem))
    np.savez('tmp/Vocab.npz', vocab = vocab, vocabStem = vocabStem)
    
    #### read the testing data.
    labelTest = []
    dataTest = []
    dataTestStem = []
    for root, ds, fs in os.walk('./tweet/test/'):
        for file in fs:
            fullname = os.path.join(root, file)
            # get the testing label.
            if "positive" in fullname:
                label = 1
            else:
                label = 0
            labelTest.append(label)
            # get the testing data.
            data = open(fullname, encoding="utf8").read()
            # print(data)
            # preprocess the data.
            data = Preprocess(data)
            # print(data)
            # get the tokens for the data.
            tokens = GetTokens(data)
            dataTest.append(tokens)
            # print(tokens)
            # get the stemmed tokens for the data.
            tokensStem = WithStem(tokens)
            dataTestStem.append(tokensStem)
            # print(tokensStem)
    print('Load TestSet: %d/%d positive/negative samples.' % (sum(labelTest), len(labelTest)-sum(labelTest)))
    np.savez('tmp/Test.npz', labelTest = labelTest, dataTest = dataTest, dataTestStem = dataTestStem)
    
    return
        

### Extract Features for a dataset with /without "stemming" using "method"

In [43]:
def ExtractFeatrues(dataset = 'Train', lStem = 'noStem', method = 'freq'):
    if dataset not in ['Train','Test']:
        print('Error: dataset input invalid!')
        return
    if lStem not in ['noStem', 'Stem']:
        print('Error: stem setting invalid!')
        return
    
    if method not in ['freq', 'bin', 'tfidf']:
        print('Error: method setting invalid!')
        return
    
    
    # Sparse the corrsponding dataset.
    
    dset = np.load('./tmp/' + dataset + '.npz', allow_pickle = True)
    if 'Stem' == lStem:
        data = dset['data' + dataset + lStem]
    else:
        data = dset['data' + dataset]
    D = len(data)
    
    
    #Sparse the corrsponding vocavulary.
    vset = np.load('./tmp/Vocab.npz', allow_pickle = True)
    if 'Stem' == lStem:
        vocab = vset['vocab' + lStem]
    else:
        vocab = vset['vocab']
    V = len(vocab)
                   
    vocabDic = dic(zip(vocab, range(V)))
                   
    # Get the feature matix (freq)
    if 'freq' == method:
        features = np.zeros((D, V))
                   
    ind = 0
    for doc in data:
        for item in doc:
            if item in vocabDict:
                features[ind][vocabDict[item]] += 1
        ind += 1
    return features
                   
    # get the feature matrix (bin).
    if 'bin' == method:
        features = np.zeros((D, V))
        ind = 0
        for doc in data:
            for item in doc:
                if item in vocabDict:
                    features[ind][vocabDict[item]] = 1
            ind += 1
        return features

    # get the feature matrix (tfidf):
    if 'tfidf' == method:
        # get freq and bin features.
        termFreq = np.zeros((D, V))
        termBin = np.zeros((D, V))
        for ind, doc in enumerate(data):
            for item in doc:
                if item in vocabDict:
                    termFreq[ind][vocabDict[item]] += 1
                    termBin[ind][vocabDict[item]] = 1
        # get tf (1+log10)
        tf = np.zeros((D, V))
        for ind in range(D):
            for i in range(V):
                if termFreq[ind][i] > 0:
                    tf[ind][i] = 1 + math.log(termFreq[ind][i], 10)
        del termFreq
                   
        # find idf
        if 'Train' == dataset:
            # get df
            df = np.zeros((V, 1))
            for ind in range(D):
                for i in range(V):
                    df[i] += termBin[ind][i]
            # get idf (log10(D/df))
            idf = np.zeros((V, 1))
            for i in range(V):
                if df[i] > 0:
                    idf[i] = math.log(D, 10) - math.log(df[i], 10)
            del df
            np.save('./tmp/idf.npy', idf)
        else:
            # if 'Test' == dataset, get idf from arguments.
            idf = np.load('./tmp/idf.npy')
        del termBin
        # get tfidf
        tfidf = np.zeros((D, V))
        for ind in range(D):
            for i in range(V):
                tfidf[ind][i] = tf[ind][i] * idf[i]
        return tfidf
    return                  

### Train the Logistic Regression Model

In [None]:
### Referece from Github 

import os
import pandas as pd
import re
import time
import numpy as np
import itertools
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from collections import defaultdict
from sklearn.metrics import confusion_matrix
import string
import pickle

class Logistic_Regression_L2_SGD:
    """ Defining a Logistic Regression class with L2 regularization and 
        Stochastic Gradient Descent
    
    The parameters are:
        l2: lambda value for l2 regularization
        n_iter: number of iterations over the dataset
        eta: learning rate
        batch_size: size of each batch (SGD=1 and full batch = len(X))
    """
    
    def __init__(self, l2=0.0, n_iter=1000, eta=0.05, batch_size=1):
        self.l2 = l2
        self.n_iter = n_iter
        self.eta = eta
        self.batch_size = batch_size
            
    def sigmoid(self, z):
        # This is the sigmoid function of z
        return 1/(1+ np.exp(-z))
    
    def fit(self, X, y):
        print("Fitting Logistic Regression, eta = %s, %s iterations, L2 = %s --- %s minutes ---" % (self.eta, self.n_iter,self.l2,round((time.time() - start_time)/60,2)))
        # fit the training data
        
        y = y.reshape(-1,1)
        # initialize the values of the weights to zero
        self.theta = np.zeros((X.shape[1],1))
        m = y.shape[0]
        pad = 1e-6
        self.cost_values = []
        for _ in range(self.n_iter):
            # shuffling each iteration as to prevent overfitting
            shuffled_values = np.random.permutation(m)
            X_shuffled = X[shuffled_values]
            y_shuffled = y[shuffled_values]
            # iterating over each batch
            for batch in range(0, m, self.batch_size):
                x_batch = X_shuffled[batch:batch+self.batch_size]
                y_batch = y_shuffled[batch:batch+self.batch_size]
                z = self.sigmoid(np.dot(x_batch, self.theta))
                # calculating the gradient with the derived formula
                gradient = x_batch.T.dot(z-y_batch)/m + (self.l2/m*self.theta)
                self.theta -= self.eta * gradient
                # implementing the cost (objective) function given
                cost = np.average(-y_batch*np.log(z+pad) - ((1-y_batch)*np.log(1-z+pad)))
                l2_cost = cost + (self.l2/(2*m) * np.linalg.norm(self.theta[1:])**2)  # we don't regularize the intersect
                self.cost_values.append(l2_cost)

        return self
    

    
    def predict(self, X, threshold=0.5):
        # return the predicted values in (0,1) format
        return np.where(self.sigmoid(X.dot(self.theta)) >= threshold,1,0)
    
    def predict_prob(self, X):
        # return the predicted values in percentage format
        return self.sigmoid(X.dot(self.theta))
    
#Logistic Regression
#nostem no l2
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=1, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_freq.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowfreq-noL2',LR_model)

predictions = LR_model.predict(np.load('Stored/Vectors/testbow_freq.npy'))
evaluate(predictions, y_test, "LOGISTIC_FREQ_NOL2")

LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=1, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_binary.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowbinary-noL2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/testbow_binary.npy'))
evaluate(predictions, y_test, "LOGISTIC_BINARY_NOL2")
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=1, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/train_tfidf.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-tfidf-noL2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/test_tfidf.npy'))
evaluate(predictions, y_test, "LOGISTIC_TFIDF_NOL2")

#stem no l2
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=1, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_stem_freq.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowfreq-stem-noL2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/testbow_stem_freq.npy'))
evaluate(predictions, y_test, "LOGISTIC_FREQ_STEM_NOL2")
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=1, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_stem_binary.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowbinary-stem-noL2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/testbow_stem_binary.npy'))
evaluate(predictions, y_test, "LOGISTIC_BINARY_STEM_NOL2")
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=1, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/train_tfidf_stem.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-tfidf-stem-noL2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/test_tfidf_stem.npy'))
evaluate(predictions, y_test, "LOGISTIC_TFIDF_STEM_NOL2")

#l2
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=0.1,l2=5, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_freq.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowfreq-L2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/testbow_freq.npy'))
evaluate(predictions, y_test, "LOGISTIC_FREQ_L2")
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=0.1,l2=5, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_binary.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowbinary-L2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/testbow_binary.npy'))
evaluate(predictions, y_test, "LOGISTIC_BINARY_L2")
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=0.1,l2=5, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/train_tfidf.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-tfidf-L2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/test_tfidf.npy'))
evaluate(predictions, y_test, "LOGISTIC_TFIDF_L2")

#stem l2
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=0.1,l2=5, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_stem_freq.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowfreq-stem-L2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/trainbow_stem_freq.npy'))
evaluate(predictions, y_test, "LOGISTIC_FREQ_STEM_L2")
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=0.1,l2=5, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/trainbow_stem_binary.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-bowbinary-stem-L2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/trainbow_stem_binary.npy'))
evaluate(predictions, y_test, "LOGISTIC_BINARY_STEM_L2")
LR_model = Logistic_Regression_L2_SGD(n_iter=15,eta=0.1,l2=5, batch_size=10000)
LR_model.fit(np.load('Stored/Vectors/train_tfidf_stem.npy'), np.load('Stored/DocsVocab/y_train.npy'))
save_obj('Stored/Models/LR-tfidf-stem-L2',LR_model)
predictions = LR_model.predict(np.load('Stored/Vectors/test_tfidf_stem.npy'))
evaluate(predictions, y_test, "LOGISTIC_TFIDF_STEM_L2")

### Logistic Regression Functions

In [None]:
def sigmoid(z):
    return 1/(1+ np.exp(-z))

def Cost(theta):
    z = 

def fit(X, y):
    y = r.reshape(-1,1)  
    theta = np.zeros((X.shape[1],1))
    
def predict(X, threshold=0.5):
    return np.where(sigmoid(X.dot(self.theta)) > = threshold,1,0)

def predict_prob(X):
    return sigmoid(X.dot(theta)

### Other online reference

In [None]:
def get_top_k_predictions(model, X_test, k):
    
    #get probabilities instead of predicted lables
    probs = model.predict_proba(X_test)
    
    #get top k prdictions by prob (these are just index)
    best_n = np.argsort(probs, axis =1)[:,-k:]
    
    # Get Category of Prediction
    preds = [[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]
    
    # Reverse Categories (Descending order of importance)
    preds = [item[::-1] for item in preds]
    return preds 

In [None]:
### Build Model

X_train =
y_train =

# from scikit package: Logistic Regression with built-in corss validation
scikit_log_reg = LogisticRegressionCV(cv = verbose =1, solver = 'liblinear', random_state = 0, C =5, penalty = '12', max_iter = 1000m)
model = scikit_log_reg.fit(X_train,y_train) 

X_test =
y_test =

# Get Top k prideictions
preds = get_top_k_predictions(model, X_test, top_k)

# get predicted value and ground truth into a list of list
eval_items = collect_preds(y_test, preds)

# Get evaluation number on Testset
accuracy = compute_accuracy(eval_items)
mrr_at_k = compute_mrr_at_k(eval_items)

### Test Functions

In [44]:
def sigmoid(z):
    return np.exp(z)/(np.exp(z)+1)

In [45]:
sigmoid(0.5)

0.6224593312018546

In [24]:
def Preprocess(data):
        # remove url
        pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        data = re.sub(pattern, '', data)
        
        # remove html special characters. 
        pattern = r'&[(amp)(gt)(lt)]+;'
        data = re.sub(pattern, '', data)
        
        # remove independent numbers.
        pattern = r' \d+ '
        data = re.sub(pattern, ' ', data)
        
        # lower case capitalized words.
        pattern = r'([A-Z][a-z]+)'
        def LowerFunc(matched):
            return matched.group(1).lower()
        data = re.sub(pattern, LowerFunc, data)
        
        # remove hashtags.
        pattern = r'[@#]([A-Za-z]+)'
        data = re.sub(pattern, '', data)
        return data

In [20]:
def GetTokens(data):
    # use tweet tokenizer
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(data)

    # tokenize at each punctuation
    pattern = r'[A-Za-z]+\'[A-Za-z]+'
    for tk in tokens:
        if re.match(pattern, tk):
            subtokens = word_tokenize(tk)
            tokens.remove(tk)
            tokens = tokens + subtokens
    return tokens

In [18]:
# process tokens with setmming.
def WithStem(tokens):
    porter = PorterStemmer()  ####
    tokensStem = []
    for tk in tokens:
        tokensStem.append(porter.stem(tk))  ####
    return tokensStem

In [30]:
labelTrain = []
dataTrain = []
dataTrainStem = []
    
for root, ds, fs in os.walk('./tweet/train/'):
    for file in fs:
        fullname = os.path.join(root, file)
            
        # get the training label
        if "positve" in fullname:
            label = 1
        else: 
            label = 0
        data = open(fullname, encoding = "utf8").read()
        #print("Label_dataTrain", data)
            
        # Preprocess Labeled Data
        data = Preprocess(data)
        #print("Preprocess_Lable_dataTrain",data)

        # Get tokens for the data
        tokens = GetTokens(data)
        dataTrain.append(tokens)
        #print("Token_dataTrain[] =",tokens)

        # Getthe stemmed tokens for the data
        tokensStem = WithStem(tokens)
        dataTrainStem.append(tokensStem)
        print("Check Stemmed token data",tokensStem)

Check Stemmed token data ['so', 'I', 'still', 'need', 'to', 'stay', 'on', 'hold', '?', 'so', 'I', 'still', 'need', 'to', 'stay', 'on', 'hold', '?']
Check Stemmed token data ['realli', '...', 'you', 'charg', 'me', '$', '25', 'to', 'check', 'a', 'bag', 'and', 'then', 'you', 'put', 'it', 'on', 'a', 'differ', 'flight', '...', 'still', 'have', 'my', 'bag', '!', '!', '!', 'do', "n't"]
Check Stemmed token data ['.', 'thank', 'for', 'make', 'an', 'effort', '.', 'credit', 'where', 'credit', 'is', 'due', ':', 'flight', 'delay', 'hour', 'instead', 'of', 'hour', '.']
Check Stemmed token data ['plz', 'advertis', 'wifi', 'if', 'not', 'gonna', 'work', 'thank', 'do', "n't", 'it', "'s"]
Check Stemmed token data ['-', 'is', 'not', 'friendli', 'int', "'", 'l']
Check Stemmed token data ['thank', 'for', 'a', 'subpar', 'travel', 'experi', 'and', 'not', 'even', 'over', 'yet', 'it', "'s"]
Check Stemmed token data ['my', 'ex-boyfriend', 'pick', 'up', 'my', 'call', 'more', 'than', 'you', 'do', '!', 'flightedfli

Check Stemmed token data ['whi', 'would', 'I', 'even', 'consid', 'continu', 'your', 'point', 'program', 'when', 'I', 'receiv', 'no', 'perk', 'or', 'continu', 'bad', 'custom', 'servic', '?']
Check Stemmed token data ['1/2', 'just', 'be', 'honest', 'with', 'your', 'custom', '.', 'the', 'new', 'seat', 'are', 'design', 'to', 'fit', 'more', 'row', 'on', 'the', 'plane', '.', 'more', 'row', '=', 'more', 'money', '.']
Check Stemmed token data ['the', 'internet', 'is', 'a', 'great', 'thing', '.', 'I', 'am', 'email', 'execut', 'in', 'your', 'compani', 'mayb', 'they', 'will', 'respond', 'to', 'me', 'in', 'a', 'time', 'manner', '.']
Check Stemmed token data ['after', 'min', 'they', 'tell', 'us', 'they', 'check', 'temperatur', 'for', 'at', 'least', 'more', 'hour', '.', 'so', 'back', 'to', 'the', 'gate', 'to', 'wait', '.', 'ca', "n't", 'it', "'s"]
Check Stemmed token data ['are', 'you', 'kid', 'ME', '?', '?', 'bad', 'custom', 'servic', 'folk', '.', 'flew', 'from', 'psp', 'to', 'phx', 'flight', 'on',

Check Stemmed token data ['I', 'guess', '.', 'but', 'what', 'is', 'the', 'excus', 'for', 'understaf', 'costum', 'servic', '?', 'peopl', 'had', 'to', 'deal', 'with', 'lot', 'of', 'angri', 'passeng', '.', 'weather', 'too', '?']
Check Stemmed token data ['no', '.', 'I', 'need', 'you', 'to', 'have', 'more', 'than', 'agent', 'at', 'check', 'in', '.']
Check Stemmed token data ['also', 'not', 'have', 'food', 'avail', 'even', 'for', 'purchas', 'is', 'quit', 'shock', 'sinc', 'thi', 'is', 'a', 'hour', 'intern', 'flight', '.']
Check Stemmed token data ['will', 'tweet', 'all', 'day', 'until', 'I', 'get', 'a', 'live', 'cust', 'rep', '.', 'I', 'will', 'make', 'sure', 'the', 'world', 'know', 'you', 'care', 'about', 'your', 'custom', 'health', '.', 'do', "n't"]
Check Stemmed token data ['made', 'me', 'remov', 'pound', 'from', 'my', 'check', 'bag', 'otherwis', 'they', 'would', 'charg', 'me', '$', '200', '.', 'loyal', 'flier', 'for', 'year', '-', 'time', 'for', 'chang', '.']
Check Stemmed token data ['n

Check Stemmed token data ['ha', 'me', 'on', 'my', 'toe', 'whether', 'go', 'to', 'make', 'my', 'flight', 'back', 'to', 'boston', 'tonight', 'I', "'m"]
Check Stemmed token data ['ruin', 'my', 'sunday', '.', '3', '+', 'hour', 'of', 'delay', 'out', 'of', 'dia', 'now', 'my', 'bag', 'are', 'at', 'the', 'wrong', 'airport', '.', '$', '35', 'to', 'deliv', 'them', '?', 'insult', 'to', 'injuri', '.']
Check Stemmed token data ['defin', '"', 'sincer', '"', '.', 'your', 'action', 'do', 'not', 'reflect', 'what', 'say', 'in', 'a', 'poor', 'attempt', 'to', 'redeem', 'yourselv', 'on', 'social', 'media', '.', 'you', "'re"]
Check Stemmed token data ['it', 'would', 'make', 'my', 'day', 'if', 'I', 'could', 'take', 'a', 'flight', 'on', 'your', 'airlin', 'without', 'ani', 'hiccup', 'on', 'your', 'end', '.', 'train', 'your', 'employe', 'to', 'commun']
Check Stemmed token data ['flight', 'in', 'D', '/', 'FW', 'now', 'delay', 'hour', 'wait', 'sit', 'in', 'plane', 'for', 'baggag', 'to', 'be', 'load', '-', 'whi', 

Check Stemmed token data ['take', 'a', 'look', 'at', '1715', '.', 'all', 'F', 'seat', 'fill', 'so', 'someon', 'upgrad', 'their', 'friend', 'instead', 'of', 'follow', 'rule', '.']
Check Stemmed token data ['nope', '.', 'make', 'chang', 'onlin', 'and', 'after', 'min', 'on', 'hold', 'and', 'time', 'deal', 'w', '/', 'the', 'rude', 'rep', 'the', 'hour', 'window', 'ha', 'close', '.', 'could', "n't"]
Check Stemmed token data ['do', 'you', 'have', 'the', 'phone', 'number', 'of', 'a', 'supervisor', 'i', 'can', 'speak', 'to', 'regard', 'my', 'travel', 'today']
Check Stemmed token data ['I', 'have', 'never', 'been', 'so', 'gross', 'out', 'in', 'my', 'life', '.', 'what', 'a', 'disappointment-yet', 'again-fli', 'with', 'you', '.']
Check Stemmed token data ['I', 'did', '.', 'time', '.', 'with', 'of', 'those', 'be', 'hung', 'up', 'on', '.', 'the', 'other', 'one', 'it', 'final', 'sent', 'me', 'to', 'the', 'call', 'back', 'notif', '.']
Check Stemmed token data ['#', 'rudestaff', '#', 'incompetentmanag'

Check Stemmed token data ['-', 'I', 'realli', 'down', 'think', 'downgrad', 'the', 'booz', '/', 'food', 'select', 'at', 'the', 'club', 'is', 'a', 'good', 'way', 'to', 'go', '.', 'how', 'about', 'you', 'look', '?']
Check Stemmed token data ['you', 'might', 'be', 'the', 'onli', 'airlin', 'not', 'offer', 'waiver', '...', 'keep', 'up', 'the', 'good', 'fight']
Check Stemmed token data ['ye', 'they', 'said', 'the', 'runway', 'readi', 'and', 'wait', 'for', 'port', 'author', 'to', 'remov', 'the', 'snow', 'and', 'they', 'are', 'no', 'where', 'in', 'sight', '.', 'is', "n't", 'they', "'re"]
Check Stemmed token data ['told', 'me', 'I', 'deserv', 'to', 'be', 'delay', 'becaus', 'I', 'book', 'the', 'last', 'flight', 'and', 'then', 'I', 'hear', '"', 'the', 'onli', 'inconvi', 'is', 'miss', 'your', 'flight', '"']
Check Stemmed token data ['site', 'show', 'on-tim', 'but', 'flightawar', 'site', 'show', 'delay', '.', 'tri', 'to', 'decid', 'if', 'I', 'get', 'out', 'in', 'thi', 'ice', 'to', 'catch', 'flt', 't

Check Stemmed token data ['thank', 'for', 'reach', 'out', '.', 'the', 'seat', 'wa', 'hard', 'not', 'enough', 'cushion', '.', 'OK', 'for', 'short', 'flight', '.', 'btw', 'ua6465', 'flight', 'and', 'crew', 'were', 'great', '!']
Check Stemmed token data ['you', 'cancel', 'flightl', 'my', 'flight', 'to', 'bna', '.', 'I', 'have', 'flight', 'from', 'bna', 'mdw', 'tomorrow', 'AM', '.', 'you', 'strand', 'me', 'in', 'den', '.', 'been', 'on', 'hold', 'all', 'day', 'I', "'ve"]
Check Stemmed token data ['help', 'need', '!', 'ticket', 'passeng', 'with', 'no', 'avail', 'seat', 'schedul', 'to', 'leav', 'late', 'flightr', 'today']
Check Stemmed token data ['bag', 'is', 'supposedli', 'here', 'in', 'boston']
Check Stemmed token data ['to', 'arriv', 'the', 'plane', 'sit', 'on', 'need', 'to', 'take', 'off', '...', 'wish', 'someon', 'would', 'tell', 'us', 'what', 'the', 'holdup', 'is', 'I', "'m"]
Check Stemmed token data ['.', 'be', 'receiv', 'a', 'long', 'detail', 'complaint', 'letter', 'soon', '.', 'long

Check Stemmed token data ['ye', 'myself', 'and', 'about', 'other', 'peopl', 'also', '.']
Check Stemmed token data ['I', 'put', 'a', 'call', 'back', 'in', 'at', '1am', 'and', 'still', 'noth', 'at', '8am', '.', 'you', 'think', 'you', 'might', 'get', 'back', 'to', 'me', '?']
Check Stemmed token data ['monday', 'even', '.']
Check Stemmed token data ['had', 'cancel', 'flight', 'my', 'trip', 'to', 'LA', 'bc', 'weather', 'wa', 'easi', 'but', 'US', 'air', 'is', 'make', 'me', 'pay', '$', '200', 'to', 'use', 'the', 'credit', '.', 'never', 'again']
Check Stemmed token data ['am', 'I', 'illiter', 'or', 'is', 'there', 'a', 'magic', 'code', 'to', 'make', 'your', 'custom', 'servic', 'site', 'work', '?']
Check Stemmed token data ['flight', '(', 'ha', '!', ')', '-', 'one', 'employe', 'said', 'they', 'were', 'reassign', 'anoth', 'said', 'they', 'just', 'left', '.', 'I', 'just', 'want', 'some', 'warm', 'weather', '!']
Check Stemmed token data ['.', 'glad', 'you', 'appreci', 'it', 'it', '’', 'll', 'be', '

Check Stemmed token data ['ok', 'make', 'no', 'sens', 'tho', 'sinc', 'give', 'me', 'a', 'free', 'upgrad', 'to', 'first', '.', 'you', "'ll"]
Check Stemmed token data ['hi', 'im', 'tri', 'to', 'speak', 'with', 'someon', 'to', 'book', 'travel', 'for', 'a', 'minor', '.', 'the', 'phone', 'keep', 'disconnect', '.', 'pleas', 'advis', '.', 'websit', 'wont', 'allow', '.']
Check Stemmed token data ['I', 'have', 'to', 'spend', 'more', 'than', 'the', 'cost', 'of', 'the', 'flight', 'just', 'to', 'get', 'a', 'free', 'room', 'and', 'transport', 'to', 'and', 'from', 'the', 'airport']
Check Stemmed token data ['no', 'but', 'serious', 'wtf', '?']
Check Stemmed token data ['is', 'the', 'worst', '!', '!', 'still', 'left', '..', 'everyon', 'miss', 'connect', '.', 'have', "n't"]
Check Stemmed token data ['second', 'time', 'fli', 'into', 'houston', 'and', '45', '+', 'min', 'wait', 'for', 'luggag', 'at', 'baggag', '.', 'typic', '?', 'still', 'wait', '..']
Check Stemmed token data ['not', 'even', 'on', 'the', 

Check Stemmed token data ['you', 'advertis', 'the', 'flight', 'and', 'it', 'still', 'on', 'you', 'websit', 'and', 'still', 'be', 'book', '.', 'spent', 'over', 'hour', 'with', 'unit', 'support', 'ca', "n't"]
Check Stemmed token data ['good', 'expect', 'that', 'but', 'I', 'get', 'through', 'on', 'the', 'phone', 'to', 'make', 'ani', 'chang', '.', 'can', 'I', 'chang', 'it', 'onlin', '?', 'that', "'s", 'I', "'d", 'ca', "n't"]
Check Stemmed token data ['I', 'have', 'been', 'on', 'hold', 'for', 'over', 'one', 'hour', 'wait', 'to', 'reschedul', 'a', 'flight', '.', 'how', 'much', 'longer', 'should', 'I', 'expect', '?']
Check Stemmed token data ['sat', 'on', 'the', 'ramp', 'for', 'min', 'and', 'miss', 'my', 'connect', '.', 'no', 'one', 'seem', 'to', 'concern', 'at', 'usairway', '.']
Check Stemmed token data ['zcc', '82u', 'cancel', 'flight', 'flight', '16h', 'in', 'advanc', '?', '?', '?', 'need', 'connect', 'flight', 'reschedul', 'so', 'the', 'link', 'help', '.', 'help', '!', '?', 'gf', 'wait', 

Check Stemmed token data ['uh', '-', 'I', 'book', 'it', 'through', 'the', 'UA', 'websit', '.', 'whi', 'the', 'price', 'chang', '?']
Check Stemmed token data ['as', 'far', 'as', 'be', 'delay', 'goe', '…', 'look', 'like', 'tailwind', 'are', 'go', 'to', 'make', 'up', 'for', 'it', '.', 'good', 'news', '!']
Check Stemmed token data ['usair', '#703', 'ha', 'return', 'to', 'phl', 'due', 'to', 'issu', 'with', 'the', 'fire', 'detector', '.', 'what', 'happen', 'next', '?']
Check Stemmed token data ['thi', 'is', 'how', 'concern', 'your', 'gate', 'attend', 'at', 'c26', 'are', 'thi', 'afternoon', '.']
Check Stemmed token data ['my', 'seat', 'is', 'disgust', '.', 'old', 'and', 'dirti', '.', 'when', 'are', 'you', 'go', 'to', 'refurbish', 'thi', 'plane', '?', 'US', 'air', 'jfk', 'to', 'phx']
Check Stemmed token data ['thi', 'is', 'realli', 'unhelp', '.', 'whi', 'I', 'rebook', 'onlin', '?', 'ca', "n't"]
Check Stemmed token data ['not', 'your', 'finest', 'moment', 'in', 'boston', 'right', 'now', '!']
Ch

Check Stemmed token data ['help', '!', 'it', 'say', '"', 'the', 'payment', 'is', 'deni', '"', 'but', 'my', 'money', 'ha', 'been', 'taken', 'out', 'of', 'my', 'bank', '?', 'and', 'it', 'say', 'I', 'have', 'no', 'trip', 'come', 'up', '?', '?']
Check Stemmed token data ['doe', 'anyon', 'from', 'your', 'airlin', 'know', 'how', 'to', 'do', 'their', 'job', '?']
Check Stemmed token data ['no', '.', 'pleas', 'stop', '.']
Check Stemmed token data ['slowest', 'baggag', 'claim', 'ever']
Check Stemmed token data ['I', 'got', 'rebook', 'on', 'on', 'us', 'airway', 'by', 'you', 'guy', 'but', 'the', 'flight', 'I', 'got', 'work', '.', 'ani', 'help', 'would', 'be', 'appreci', '.', 'wo', "n't"]
Check Stemmed token data ['well', 'your', 'new', 'baggag', 'fee', 'fare', 'structur', 'and', 'reduc', 'legroom', 'ha', 'turn', 'you', 'into', 'everi', 'other', 'hate', 'carrier', 'in', 'the', 'US', '.', 'congrat']
Check Stemmed token data ['heyyyi', 'guyyyi', '..', 'been', 'tri', 'to', 'get', 'through', 'for', 'an

Check Stemmed token data ['you', 'are', 'beyond', 'redempt', '.', 'jfk', '.', 'baggag', 'claim', 'look', 'like', 'a', 'luggag', 'warehous']
Check Stemmed token data ['are', 'you', 'kid', 'me', '?', 'no', 'one', 'answer', 'call', 'on', 'reserv', 'line', 'due', 'to', 'high', 'call', 'volum', 'and', 'not', 'even', 'an', 'option', 'to', 'wait', '.', 'brutal']
Check Stemmed token data ['stuck', 'on', 'the', 'ramp', 'at', 'dca', 'US', 'air', 'comput', 'system', 'crash', '...', 'everywher', '.']
Check Stemmed token data ['know', 'what', 'I', 'like', 'about', 'gate', '35x', '?', 'nothin', '.', 'I', 'like', 'nothin', 'bout', 'yo', 'gate']
Check Stemmed token data ['can', 'I', 'get', 'ani', 'kind', 'of', 'updat', 'on', 'the', 'delay', 'flight', 'from', 'boston', 'to', 'houston', 'at', '7:30', '?', 'realli', 'need', 'to', 'be', 'back', 'home', 'tonight', '!']
Check Stemmed token data ['just', 'call', 'reserv', '.', 'cannot', 'chang', 'just', 'abl', 'to', 'put', 'note', 'on', 're', '.', 'problem',

Check Stemmed token data [':', 'ceo', 'ha', 'decid', 'to', 'outsourc', 'and', 'or', 'push', 'out', 'more', 'and', 'more', 'of', 'their', 'skill', 'and', 'loyal', 'employe', '..', 'lack', 'of', 'staff', 'unit', "'s"]
Check Stemmed token data ['now', 'the', 'plane', 'is', 'degre', 'and', 'we', 'are', 'burn', 'up', '.', 'thi', 'is', 'complet', 'illeg', '.']
Check Stemmed token data ['_njairport', 'aa1224', ':', 'min', 'for', 'prioriti', 'bag', 'to', 'arriv', 'now', 'late', 'flight', 'for', 'even', 'plan', '!', 'delta', 'min', 'bag', 'guarante', 'anyon', '?', ':(']
Check Stemmed token data ['serious', 'on', 'make', 'stranger', 'share', 'a', 'room']
Check Stemmed token data ['all', 'flt', 'to', 'jfk', 'cancel', 'flightl', 'thx', 'to', 'UR', 'agent', 'at', 'sfo', 'im', 'rebook', 'on', 'UA', '.', 'get', 'name', '.', 'she', 'wa', 'awesom', '!', 'did', "n't"]
Check Stemmed token data ['delay', 'cancel', 'flightlat', 'a', 'doubl', 'layov', 'and', 'hour', 'late', 'flightr', '...', 'could', 'I', '

Check Stemmed token data ['i', 'did', 'and', 'not', 'stupid', '.', 'done', 'web', 'develop', 'for', 'yr', '.', 'the', 'issu', '(', 'now', 'confirm', 'by', 'your', 'support', ')', 'is', 'your', 'websit', '.', 'i', "'m", 'i', "'ve"]
Check Stemmed token data ['we', 'knew', 'it', 'wa', 'not', 'go', 'to', 'leav', 'but', 'we', 'had', 'hour', 'wait', '...', 'that', 'wa', 'the', 'hurri', '..']
Check Stemmed token data ['also', 'we', 'are', 'in', 'famili', 'crisi', 'you', 'charg', 'full', 'price', 'for', 'all', 'these', 'flight', 'plu', '$', '200', 'chang', 'flight', 'even', 'for', 'emerg', '.']
Check Stemmed token data ['all', 'reserv', 'number', 'and', 'none', 'are', 'in', 'hungari', '.', 'and', 'my', 'phone', 'not', 'work', 'here', '.', 'you', 'make', 'it', 'too', 'difficult', '.', 'they', "'re"]
Check Stemmed token data ['hey', '!', 'I', 'have', 'a', 'name', 'issu', 'on', 'my', 'reserv', 'can', 'you', 'help', '?']
Check Stemmed token data ['I', 'had', 'to', 'pay', 'extra', '$', '$', 'co', '

Check Stemmed token data ['it', 'would', 'be', 'aw', 'nice', 'if', 'you', 'could', 'send', 'an', 'alert', 'when', 'my', 'flight', 'is', 'cancel', 'flight', '.', 'no', 'text', 'no', 'email', 'no', 'love', '...']
Check Stemmed token data ['ive', 'tri', 'on', 'my', 'phone', 'and', 'laptop', ':(']
Check Stemmed token data ['if', 'you', 'want', 'to', 'fli', 'in', 'a', 'storm', 'your', 'right', '.', 'but', 'give', 'us', 'the', 'choic', '.', 'we', 'have', 'kid', 'and', 'want', 'to', 'chanc', 'it', '.', 'that', "'s", 'do', "n't"]
Check Stemmed token data ['serious', '?', 'min', 'on', 'the', 'plane', 'at', 'bhm', 'and', 'now', 'min', 'in', 'baggag', 'and', 'still', 'no', 'bag', '?']
Check Stemmed token data ['been', 'on', 'hold', 'on', 'the', 'phone', 'for', 'well', 'over', 'half', 'an', 'hour', 'wait', 'to', 'make', 'a', 'reserv', 'chang', 'I', '*', 'want', '*', 'to', 'make', 'onlin', '.', 'help', '!']
Check Stemmed token data ['hour', 'late', 'flightr', 'and', 'site', 'on', 'anoth', 'plane', 

Check Stemmed token data ['tri', 'to', 'charg', 'us', '$', '550', 'dollar', 'cash', 'for', 'ticket', 'we', 'paid', 'for', 'month', 'in', 'advanc']
Check Stemmed token data ['-', 'burn', 'a', 'lot', 'of', 'time', 'and', 'cell', 'minut', 'work', 'through', 'thi', 'flight', 'issu', '-', 'hope', 'we', 'fli', 'home', 'in', 'the', 'morn', '.']
Check Stemmed token data ['obvious', 'we', 'did', 'see', 'an', 'agent-book', 'us', 'for', 'tomorrow', 'morn', '.', 'no', 'hotel', 'no', 'transport', '.']
Check Stemmed token data ['my', 'friend', 'at', 'kphl', 'were', 'told', 'by', 'your', 'rep', '"', 'if', 'it', 'were', 'me', 'I', 'woulda', 'just', 'gone', 'home', 'thi', 'am', 'and', 'tri', 'again', 'tomorrow', '.', '"', '1/2']
Check Stemmed token data ['if', 'you', 'have', 'a', 'rule', 'you', 'should', 'probabl', 'appli', 'them', 'all', 'the', 'time', 'not', 'just', 'sometim', '.']
Check Stemmed token data ['see', '?', 'we', 'were', 'told', 'repeatedli', 'that', 'the', 'pilot', 'wa', 'late', 'flight'

Check Stemmed token data ['thank', 'you', 'for', 'the', 'quick', 'custom', 'servic', 'today', '.', 'I', 'know', 'that', 'winter', 'weather', 'is', 'not', 'your', 'fault', '.']
Check Stemmed token data ['you', 'are', 'the', 'best', 'airway', '!', 'follow', 'me', 'pleas', '!', '!', '!', '🙏', '🙏', '🙏']
Check Stemmed token data ['and', 'again', '!', 'anoth', 'rep', 'kick', 'butt', '!', 'naelah', 'repres', 'your', 'team', 'so', 'beauti', '!', '!', 'thank', 'you', '!', '!', '!']
Check Stemmed token data ['thank', 'for', 'repli', 'I', 'my', 'conf', '#', '!', 'I', 'love', 'aviat', 'and', 'southwest', 'and', 'all', 'I', 'want', 'for', 'my', '18th', 'wa', 'to', 'fli', 'swa', 'for', 'the', 'day', '!', 'se', "n't"]
Check Stemmed token data ['fantast', 'thank', '!', 'will', 'tri', 'and', 'tweet', 'a', 'photo', 'of', 'the', 'view', ':)']
Check Stemmed token data ['thank', '!', 'have', 'a', 'good', 'sunday', '.']
Check Stemmed token data ['so', 'far', 'so', 'good', '.', 'just', 'step', 'down', 'in', 

Check Stemmed token data ['thank', 'connect', 'thru', 'nashvil', 'have', 'A1', 'board', 'pass', 'get', 'to', 'dalla', 'gate', 'board', 'min', 'befor', 'flt', 'get', 'end', 'of', 'B', 'group']
Check Stemmed token data ['😭', '😭', '😭', 'yall', 'are', 'realli', 'better', 'then', 'american', 'airlin', 'though', '.']
Check Stemmed token data ['thank', 'for', 'the', 'upgrad', 'today', 'great', 'way', 'to', 'start', 'my', 'week', '!', 'cc', ':']
Check Stemmed token data ['aww', 'thank', 'AA', '..', 'dfw', 'wa', 'on', 'gma', 'up', 'here', 'thi', 'AM', '..', 'so', 'i', 'understand', '..', 'btw', 'A', '.', 'A', 'is', 'my', 'airlin', 'when', 'im', 'abl', 'to', 'trv', '..', 'love', 'you', 'guy', '.', ':)']
Check Stemmed token data ['thank', 'so', 'much', '!']
Check Stemmed token data ['thank', 'you', '.', 'been', 'tri', 'for', 'two', 'day', 'to', 'set', 'thi', 'up', '.']
Check Stemmed token data ['ye', 'call', 'your', 'UK', 'number', 'on', 'skype', '…', 'well', 'worth', 'the', '$', '.', '50', '.', 

Check Stemmed token data ['ok', 'gotcha', '!', '✈', '️', '😃', '👍']
Check Stemmed token data ['pass', 'along', 'the', 'advic', '.', 'you', 'guy', 'rock', '!', '!', 'I', "'ll"]
Check Stemmed token data ['thank', '!', 'everyth', 'a-ok', 'now', '.']
Check Stemmed token data ['gate', 'agent', 'alavera', 'is', 'amaz']
Check Stemmed token data ['tell', 'my', 'fam', 'in', 'vega', 'now', '.', ':)']
Check Stemmed token data ['land', 'in', 'nashvil', '!', 'thank', 'for', 'take', 'care', 'of', 'us', '!']
Check Stemmed token data ['been', 'with', 'my', 'GF', 'for', '2.5', 'yr', 'from', 'SF', 'I', 'live', 'in', 'tulsa', '.', 'swa', 'alway', 'take', 'me', 'there', 'to', 'see', 'my', 'love', '!', 'she', "'s"]
Check Stemmed token data ['10', 'thank', 'you', 'for', 'your', 'kind', 'respons', '.', 'the', 'acknowledg', 'and', 'apolog', 'go', 'a', 'long', 'way', '!']
Check Stemmed token data ['anoth', 'perfect', 'flight', '.', 'how', 'come', 'on', 'your', 'plane', 'the', 'sun', 'visor', 'can', 'stay', 'dow

Check Stemmed token data ['ye', 'pleas', '...', 'how', 'do', 'we', 'get', 'that', 'companion', 'pass', '?']
Check Stemmed token data ['got', 'it', 'squar', 'away', '.', 'someon', 'pick', 'up', 'as', 'soon', 'as', 'I', 'tweet', '.', 'should', 'have', 'tweet', 'sooner', '.', '😉']
Check Stemmed token data ['thank', 'have', 'email', 'them', '.', 'how', 'long', 'should', 'I', 'expect', 'for', 'a', 'respons', '?']
Check Stemmed token data ['thank', 'you', '.', 'finger', 'cross', '.']
Check Stemmed token data ['jackpot', '!']
Check Stemmed token data ['how', 'about', 'some', 'free', 'drink', 'on', 'the', 'flight', 'back', 'for', 'the', 'free', 'promo', '?']
Check Stemmed token data ['I', 'can', 'hardli', 'believ', 'it', '!', 'bundl', 'up', 'and', 'stay', 'warm', '!', '😉']
Check Stemmed token data ['tonight', 'you', 'made', 'thi', 'fred', 'flintston', 'happi', 'with', 'an', 'upgrad', 'to', '1st', 'class', '.']
Check Stemmed token data ['twitter', 'team', '.', 'you', 'guy', 'kill', 'it', 'for',

Check Stemmed token data ['pleas', 'than', 'robin', 'at', 'ewr', 'premier', 'desk', 'for', 'help', 'me', 'get', 'on', 'a', 'an', 'earlier', 'flight', '.', 'she', 'did', 'the', 'work', '.', 'huge', 'thank', '.']
Check Stemmed token data ['IT', 'ha', 'been', 'found', 'thank', 'you']
Check Stemmed token data ['you', 'reimburs', 'everyon', 'on', 'the', 'flight', 'a', 'portion', 'of', 'their', 'ticket', '.', 'I', 'still', 'love', 'jetblu', '!', 'best', 'american', 'airlin', '!']
Check Stemmed token data ['thank', 'you', '!', 'seattl', 'and', 'here', 'we', 'come', '!', '__roe']
Check Stemmed token data ['those', 'were', 'snack', 'we', 'left', 'on', 'purpos', 'for', 'your', 'team', '.', ':)', 'for', 'be', 'so', 'help', 'thi', 'morn', 'at', 'the', 'desk', 'and', 'on', 'the', 'phone', '!']
Check Stemmed token data ['I', 'manag', 'to', 'get', 'sort', 'out', 'over', 'the', 'phone', '.', 'good', 'luck', 'deal', 'with', 'the', 'snow', 'in', 'texa', '!']
Check Stemmed token data ['thank', 'you', 'fo

Check Stemmed token data ['but', 'thank', 'you', '!', 'love', 'an', 'anxiou', 'flyer', '.']
Check Stemmed token data ['ok', 'thank', 'you', '!']
Check Stemmed token data ['awesom', '.', 'I', 'flew', 'yall', 'sat', 'morn', '.', 'ani', 'way', 'we', 'can', 'correct', 'my', 'bill', '?']
Check Stemmed token data ['no', 'thank', '.', 'took', 'care', 'of', 'it', 'when', 'I', 'call']
Check Stemmed token data ['thank', 'you', '!']
Check Stemmed token data ['awesom', '.', 'thank', 'you', '!']
Check Stemmed token data ['got', 'help', 'from', 'a', 'nice', 'ladi', 'on', 'the', 'phone', 'in', 'georgia', '.', 'thank', 'you', '!']
Check Stemmed token data ['still', 'not', 'in', 'the', 'air', 'for', 'deic', 'of', 'plane', '.', 'miss', 'connect', 'but', 'custom', 'servic', 'wa', 'help', '.', 'hope', 'to', 'catch', 'the', 'next', 'flight', 'I', "'ll", 'I', "'m"]
Check Stemmed token data ['thank', 'you', 'for', 'check', 'in', '.', 'ticket', 'are', 'purchas', 'and', 'custom', 'is', 'happi', ';-)']
Check St

In [31]:
print('Load TrainSet:%d/%d positive/negative samples.' 
      %(sum(labelTrain), len(labelTrain)-sum(labelTrain)))
np.savez('tmp/Train.npz', 
         labelTrain = labelTrain, 
         dataTrain =dataTrain, 
         dataTrainStem = dataTrainStem)

# Load TrainSet: 1181/3000 positive/negative samples.

Load TrainSet:0/0 positive/negative samples.


In [47]:
 print(labelTrain)

[]


In [52]:
dset = np.load('./tmp/Train.npz', allow_pickle = True)
labelTrain = dset['labelTrain']
print(dset)

<numpy.lib.npyio.NpzFile object at 0x000001E8A2369DD8>


### Linear Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
my_model = LogisticRegression()

In [None]:
# Scikit-Learn
def TrainLogisticRegression(features):
    
    def sigmoid(z):
        return np.exp(z)/(np.exp(z)+1)
    #sigmoid(0.5)   
    dset = np.load('./tmp/Train.npz', allow_pickle = True)
    labelTrain = dset['labelTrain']
    
    my_model = LogisticRegression()
    
    # Define Dataset
    X = np.load('labelTrain.npy')
    Y = np.load('lavelTest.npy')
    
    # fit X, Y
    my_model.fit(X,Y)
    
    # predict 
    print("Accuracy = ", my_model.score(X,Y))