In [2]:
import numpy as np
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from symspellpy.symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

import process_tweet
import importlib
importlib.reload(process_tweet)

import warnings;
warnings.filterwarnings('ignore');

[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
#read in the word embeddings
vec_length = 100
#pre-allocated table; presumably 1193514 is the line count of the GloVe Twitter file - TODO confirm
#rows that are never filled (skipped lines) stay all-zero
embeddings = np.zeros((1193514, vec_length))

#two-way map, index->word and word->index
glove = {}

index = 0
with open('data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % vec_length, encoding='utf-8') as f:
    for l in f:
        line = l.split()
        if len(line) != vec_length+1:
            #the line did not split into one token plus vec_length numbers; skip it
            print('empty line')
            continue

        if index >= len(embeddings):
            #more usable rows than pre-allocated slots: stop, but say so instead of failing silently
            print('embedding table is full, ignoring the remaining lines')
            break

        try:
            #np.float was removed from NumPy; np.float64 is the same dtype
            vector = np.array(line[1:]).astype(np.float64)
        except ValueError:
            #only a non-numeric field is expected here; report it and keep reading
            print('skipping line with non-numeric values: %r' % line[0])
            continue

        word = line[0]
        embeddings[index] = vector
        glove[index] = word
        glove[word] = index
        index += 1

empty line


In [3]:
#load the labelled tweets and keep only the first 1000 rows
df = pd.read_csv('data/mturk_0-4000_manual3.csv').head(1000)
print(df.shape)
df.head()

(1000, 3)


Unnamed: 0,Text,Relevancy,Urgency
0,\n#Harvey a marathon not a sprint | Severe th...,0,0
1,"your rescue boats, vehicles, volunteer craft ...",0,0
2,! @Billcassidy on adding short-term debt limit...,0,0
3,"!! According to #PBS, #Houston convention cen...",0,0
4,"""#FEMA flood maps outdated do not reflect incr...",0,0


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class ProcessTweet(BaseEstimator, TransformerMixin):
    """Pipeline step that runs process_tweet.process_tweet over the 'Text' column."""

    def fit(self, X, y=None):
        """Nothing is learned; returns self so the step can be chained."""
        return self

    def transform(self, X):
        """Return a new frame whose 'Text' column holds the processed tweets.

        X must have a 'Text' column. X itself is left untouched, so running the
        pipeline again on the same frame does not re-process already-processed text.
        """
        tknzr = TweetTokenizer(strip_handles=False, reduce_len=True)
        # NOTE(review): the meaning of the trailing (None, True) arguments is defined
        # in the process_tweet module, which is not visible here.
        processed = X['Text'].map(lambda x: process_tweet.process_tweet(x, tknzr, None, True))
        return X.assign(Text=processed)

In [6]:
#takes dataframe with processed tweets and returns dataframe with word embeddings
def tweets_to_df(df, labels, embeddings, glove):
    """Average the word vectors of each tweet into one feature row.

    Parameters:
        df: frame with a 'Text' column of whitespace-separated tokens.
        labels: one label per row of df, in the same order (Series, list or array).
        embeddings: 2-D array, one word vector per row.
        glove: mapping from token to its row number in embeddings.

    Returns:
        DataFrame with columns 'v0'..'v{d-1}' (d = width of embeddings) holding the
        mean vector of the tokens found in glove, plus a 'class' column that is 0
        when the label equals 0 and 1 otherwise. Tweets with no known token are
        left out; the result is re-indexed from 0.

    Raises:
        ValueError: if labels and df do not have the same number of rows.
    """
    #feature width comes from the embedding table itself, not from a notebook global
    n_dims = np.shape(embeddings)[1]

    #walk rows by position so that a non-default index (e.g. after filtering) is safe
    texts = df['Text'].tolist()
    label_values = np.asarray(labels).tolist()
    if len(texts) != len(label_values):
        raise ValueError('labels has %d entries but df has %d rows'
                         % (len(label_values), len(texts)))

    vectors = []
    classes = []
    for tweet, label in zip(texts, label_values):
        token_vectors = [embeddings[glove[token]] for token in tweet.split() if token in glove]
        if not token_vectors:
            #no known word in this tweet, so there is nothing to average
            continue

        vectors.append(np.mean(token_vectors, axis=0))
        classes.append(0 if label == 0 else 1)

    #np.vstack rejects an empty list, so build the empty matrix explicitly
    matrix = np.vstack(vectors) if vectors else np.empty((0, n_dims))

    result = pd.DataFrame(matrix, columns=['v' + str(j) for j in range(n_dims)])
    result['class'] = classes
    return result


class GenerateEmbeddings(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a frame of tweets into averaged word-vector features.

    embeddings and glove are handed to tweets_to_df unchanged; class_name is the
    name of the column of the incoming frame that is used as the label.
    """

    def __init__(self, embeddings, glove, class_name):
        self.class_name = class_name
        self.glove = glove
        self.embeddings = embeddings

    def fit(self, X, y=None):
        """Nothing to fit; returns self."""
        return self

    def transform(self, X):
        """Return tweets_to_df's output for X, labelled by the class_name column."""
        label_column = X[self.class_name]
        return tweets_to_df(X, label_column, self.embeddings, self.glove)

In [7]:
from sklearn.pipeline import Pipeline

#pre-processing pipeline: clean each tweet, then convert it to an averaged embedding row
tweet_process = Pipeline(steps=[
    ('process tweet', ProcessTweet()),
    ('generate embeddings',
     GenerateEmbeddings(embeddings=embeddings, glove=glove, class_name='Relevancy')),
])

#run the pipeline, then split the label column off the feature frame
dfv = tweet_process.fit_transform(df)
labels = dfv.pop('class')

In [8]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

#recursive feature elimination, cross-validated over 10 folds with a logistic regression
# NOTE(review): df_rfs is not used by the cells visible here - the later print_stats
# call is given dfv; confirm whether the reduced features were meant to be evaluated.
df_rfs = (
    RFECV(estimator=LogisticRegression(), step=1, cv=10)
    .fit_transform(dfv, labels)
)
print(df_rfs.shape)

(997, 10)


In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def average(x):
    """Return the arithmetic mean of x: the sum of its items divided by its length."""
    total = sum(x)
    count = len(x)
    return total / count

def get_stats(model, X, y, cv, verbose=False):
    """Cross-validate model on (X, y) and return the raw cross_validate result.

    The scorers requested are accuracy, precision, recall, f1 and roc_auc;
    train scores are not computed. cv is passed straight to cross_validate.
    When verbose is true the result is also printed.
    """
    cv_results = cross_validate(model, X, y,
                                scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
                                cv=cv, return_train_score=False)

    if verbose:
        print(cv_results)

    return cv_results

def print_stats(models, method, dfv, labels):
    """Cross-validate every model and display one summary row per model.

    Parameters:
        models: dict of estimators; only the values are used, in insertion order.
        method: display labels for the 'Method' column, in the same order as models.
        dfv: feature matrix handed to get_stats.
        labels: target values handed to get_stats.

    Displays a DataFrame with columns Method, f1, precision, recall, accuracy
    (each formatted 'mean ± std' over the folds) and auc (mean only).
    Uses 10-fold cross-validation.

    Raises:
        ValueError: if method is a sized, non-string sequence whose length differs
            from the number of models. This is checked before any model is trained.
    """
    #fail fast: a label/model mismatch would otherwise only surface after all the training
    if hasattr(method, '__len__') and not isinstance(method, str) and len(method) != len(models):
        raise ValueError('got %d method labels for %d models' % (len(method), len(models)))

    cv = 10
    #metrics reported as mean ± std, in display order
    spread_metrics = ('f1', 'precision', 'recall', 'accuracy')
    columns = {name: [] for name in spread_metrics}
    auc = []

    for model in models.values():
        stats = get_stats(model, dfv, labels, cv)
        for name in spread_metrics:
            scores = stats['test_' + name]
            columns[name].append('%.2f ± %.2f' % (np.average(scores), np.std(scores)))
        auc.append('%.2f' % np.average(stats['test_roc_auc']))

    df_view = pd.DataFrame(data={'Method': method, **columns, 'auc': auc})
    display(df_view)

In [11]:
#fixed seed so the stochastic estimators (MLP, AdaBoost) produce the same table on every run
RANDOM_STATE = 0

models = {'Naive Bayes': GaussianNB(),
          'Voting': VotingClassifier(estimators=[('mlp', MLPClassifier(random_state=RANDOM_STATE)),
                                            ('ada', AdaBoostClassifier(random_state=RANDOM_STATE)),
                                            ('nb', GaussianNB())], voting='soft'),
          'Perceptron': MLPClassifier(random_state=RANDOM_STATE),
          'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
          'Support Vector Machine': SVC()
        }
#display labels, matched to the dict above by position (the 'Perceptron' entry is shown as 'MLP')
method = ['Naive Bayes', 'Voting', 'MLP', 'AdaBoost', 'Support Vector Machine']
print_stats(models, method, dfv, labels)

Unnamed: 0,Method,f1,precision,recall,accuracy,auc
0,Naive Bayes,0.59 ± 0.06,0.45 ± 0.08,0.87 ± 0.09,0.57 ± 0.10,0.73
1,Voting,0.60 ± 0.07,0.48 ± 0.09,0.82 ± 0.08,0.61 ± 0.11,0.77
2,MLP,0.56 ± 0.08,0.61 ± 0.08,0.53 ± 0.13,0.71 ± 0.05,0.75
3,AdaBoost,0.55 ± 0.10,0.58 ± 0.07,0.53 ± 0.14,0.71 ± 0.04,0.75
4,Support Vector Machine,0.03 ± 0.04,0.32 ± 0.41,0.02 ± 0.02,0.65 ± 0.01,0.76
