In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from symspellpy.symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

import collections
import torch
import torch.nn as nn
from torch import optim
import random
import time

import warnings;
warnings.filterwarnings('ignore');

# import process_tweet
# import importlib
# importlib.reload(process_tweet)

import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from symspellpy.symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

#create spell checker/word splitter
def create_symspell(max_edit_distance, prefix_length, freq_file_path):
    # create object
    sym_spell = SymSpell(max_edit_distance, prefix_length)
    
    # create dictionary using corpus.txt
    if not sym_spell.create_dictionary(freq_file_path):
        print("Corpus file not found")
        return None
    return sym_spell

def is_valid_token(w):
    special = ['<url>','<number>', '<user>']
    return w.isalpha() or w in special

def process_tweet(tweet, tknzr, sym_spell=None, advanced=False):
    st_1 = []
    for w in tknzr.tokenize(tweet):
        #remove retweet annotation if present:
        if w == 'RT':
            if advanced:
                st_1.append('rt')
        elif w[0] == '@':
            if advanced:
                st_1.append('<user>')
        #remove hashtag symbol
        elif w[0] == '#':
            st_1.append(w[1:])
        #replace link with LINK keyword
        elif w[:4] == 'http':
            st_1.append('<url>')
        elif w.isnumeric():
            if advanced:
                st_1.append('<number>')
        else:
            st_1.append(w)
    
    st_2 = []
    
    #remove stop words and punctuation, make everything lowercase
    if sym_spell != None:
        st_2 = [sym_spell.word_segmentation(w.lower()).corrected_string 
                for w in st_1 if w.isalpha() and not w.lower() in stop_words]
    elif advanced:
        st_2 = [w.lower() for w in st_1 if is_valid_token(w) and 
                    not w.lower() in stop_words]
    else:
        st_2 = [w.lower() for w in st_1 if w.isalpha() and
                not w.lower() in stop_words]
    
    #lemmatization (converts all words to root form for standardization)
    lem = WordNetLemmatizer()
    st_3 = list(map(lambda x: lem.lemmatize(x, pos='v'), st_2))
    
    #now do word segmentation/spell check
    return ' '.join(st_3)

[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv('data/labeled_prelim.csv').dropna()
df.pop('Id')
df = df.astype({'Relevancy':np.int32, 'Urgency':np.int32}).reset_index(drop=True)
df.head()

Unnamed: 0,Text,Relevancy,Urgency
0,More millions in #Afghanistan even with ZERO a...,0,0
1,These are the last post my brother made on soc...,2,1
2,In @cityofcc listening to local officials abou...,0,0
3,So so so damn proud of @5ugarcane who is tirel...,3,0
4,How can you help with #Harvey disaster respons...,0,0


In [3]:
#sym_spell = create_symspell(2,7,'data/frequency_dictionary_en_82_765.txt')
tknzr = TweetTokenizer(strip_handles=False, reduce_len=True)
df['Text'] = df['Text'].map(lambda x: process_tweet(x, tknzr, None, True))
df.head()

Unnamed: 0,Text,Relevancy,Urgency
0,millions afghanistan even zero attack isis <nu...,0,0
1,last post brother make social media phone go v...,2,1
2,<user> listen local officials epa help harvey ...,0,0
3,damn proud <user> tirelessly help fellow texan...,3,0
4,help harvey disaster response help victims nat...,0,0


In [4]:
process_tweet('@user #hello world', tknzr, None,True)

'<user> hello world'

In [6]:
#list of embeddings
vec_length = 50
embeddings = np.zeros((1193514, vec_length))

#two-way map, index->word and word->index
glove = {}

index = 0
with open('data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % vec_length) as f:
    for l in f:
        line = []
        try:
            line = l.split()
            if len(line) != vec_length+1:
                print('empty line')
                continue
            
            word = line[0]
            embeddings[index] = np.array(line[1:]).astype(np.float)
            glove[index] = word
            glove[word] = index
            index += 1
        except:
            print(line)
            print(index)
            break

empty line


In [7]:
#only handles binary classification for now
def tweets_to_df(df, labels, embeddings, glove):
    
    weights = []
    index_omit = []
    index = -1
    tweets = df['Text']
    
    #a column for each entry in the embedding vector
    for i in range(vec_length+1):
        weights.append([])
    
    for i in range(len(tweets)):
        index += 1
        cur_embed = []
        cur_tweet = tweets[i]
        cur_label = labels[i]
        for i in cur_tweet.split():
            if i in glove:
                cur_embed.append(embeddings[glove[i]])
        
        if len(cur_embed) == 0:
            #make sure we drop this row from the input dataframe
            index_omit.append(index)
            continue
        
        x = np.asarray(np.mean(cur_embed, axis=0))
        
        for j in range(vec_length):
            weights[j].append(x[j])
        weights[vec_length].append(0 if cur_label == 0 else 1)
        #weights[vec_length].append(cur_label)
        
    df_pruned = df.drop(index_omit)
    
    #convert to dataframe
    cols = {}
    for i in range(vec_length):
       cols['v' + str(i)] = weights[i]
    
    cols['class'] = weights[vec_length]
    
    df2 = pd.DataFrame(data=cols)
    return df2

In [15]:
dfv = tweets_to_df(df, df['Relevancy'], embeddings, glove)
labels = dfv.pop('class')
dfv.head()

Unnamed: 0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v40,v41,v42,v43,v44,v45,v46,v47,v48,v49
0,0.145285,0.249597,-0.022808,-0.239989,-0.089231,0.137348,-0.11584,-0.154314,0.180048,-0.446195,...,-0.288326,-0.137274,-0.02505,-0.286796,0.122503,-0.205003,0.066433,-0.032183,0.089838,0.364138
1,0.501971,0.402457,-0.249822,-0.262696,0.0831,-0.011195,0.801255,-0.111244,0.115108,0.036542,...,-1.043849,0.247093,-0.034573,-0.021694,0.356123,-0.281959,0.013459,-0.126864,-0.351086,0.064725
2,0.629816,0.439907,-0.291955,-0.3383,0.089501,-0.135476,0.28545,0.134249,0.282678,-0.452744,...,-0.657474,0.140981,0.320352,0.254273,0.120237,0.076802,0.043022,0.034155,-0.342334,-0.069852
3,0.428845,0.142557,-0.211487,-0.277093,-0.118845,0.066956,0.393465,0.308246,-0.150303,-0.074032,...,-0.357974,-0.023626,0.257242,0.200877,0.0691,-0.032368,0.2309,0.030372,-0.283986,0.070049
4,0.525467,0.297099,-0.519886,-0.454296,0.389153,-0.321658,0.43176,-0.061013,0.126266,-0.593969,...,-0.391478,-0.115271,0.374848,-0.047085,0.366144,-0.014719,0.114251,0.043626,-0.217644,0.097062


In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import * 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import *
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def average(x):
    return sum(x)/len(x)

def get_stats(model, X, y, cv, verbose=False):
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []
        
    cv_results = cross_validate(model, X, y, scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc'], 
                                cv=cv, return_train_score=False)
    
    if verbose:
        print(cv_results)
    
    #now return the data
    return cv_results

In [17]:
models = {'Naive Bayes': GaussianNB(),
          'Voting': VotingClassifier(estimators=[('mlp', MLPClassifier()),
                                            ('ada', AdaBoostClassifier()),
                                            ('nb', GaussianNB())], voting='soft'),
          'Perceptron': MLPClassifier(),
          'AdaBoost': AdaBoostClassifier()}

vals = []
metric = []
model_name = []

f1 = []
precision = []
recall = []
accuracy = []
auc = []
method = ['Naive Bayes', 'Voting', 'MLP', 'AdaBoost']

cv = 10
for k,v in models.items():
    stats = get_stats(v, dfv, labels, cv)
    accuracy_avg = np.average(stats['test_accuracy'])
    accuracy_std = np.std(stats['test_accuracy'])
    precision_avg = np.average(stats['test_precision'])
    precision_std = np.std(stats['test_precision'])
    recall_avg = np.average(stats['test_recall'])
    recall_std = np.std(stats['test_recall'])
    f1_avg = np.average(stats['test_f1'])
    f1_std = np.std(stats['test_f1'])
    auc_avg = np.average(stats['test_roc_auc'])
    
    f1.append('%.2f ± %.2f' % (f1_avg, f1_std))
    precision.append('%.2f ± %.2f' % (precision_avg, precision_std))
    recall.append('%.2f ± %.2f' % (recall_avg, recall_std))
    accuracy.append('%.2f ± %.2f' % (accuracy_avg, accuracy_std))
    auc.append('%.2f' % auc_avg)

df_view = pd.DataFrame(data={'Method': method, 'f1': f1, 
                             'precision':precision, 'recall':recall,
                             'accuracy':accuracy, 'auc':auc})
df_view

Unnamed: 0,Method,f1,precision,recall,accuracy,auc
0,Naive Bayes,0.54 ± 0.04,0.44 ± 0.04,0.69 ± 0.07,0.70 ± 0.03,0.76
1,Voting,0.56 ± 0.06,0.51 ± 0.05,0.61 ± 0.09,0.75 ± 0.03,0.78
2,MLP,0.50 ± 0.05,0.57 ± 0.07,0.45 ± 0.06,0.77 ± 0.03,0.77
3,AdaBoost,0.44 ± 0.05,0.52 ± 0.04,0.40 ± 0.08,0.75 ± 0.01,0.74
