In [79]:
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
import gensim.downloader as api
from gensim.models import KeyedVectors

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
#from symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

from process_tweet import *

import collections
import torch
import torch.nn as nn
from torch import optim
import random
import time

import warnings;
warnings.filterwarnings('ignore');

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amitjoshi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [80]:
df = pd.read_csv('data/labeled_prelim_processed.csv')
df = df.dropna()
df.pop('Id')
df = df.astype({'Relevancy':np.int32, 'Urgency':np.int32})
df.head()

Unnamed: 0,Text,Relevancy,Urgency
0,millions afghanistan even zero attack isis sym...,0,0
1,last post brother make social media phone go v...,2,1
2,listen local officials epa help harvey respons...,0,0
3,damn proud tirelessly help fellow texans affec...,3,0
4,help harvey disaster response help victims nat...,0,0


In [81]:
#sym_spell = create_symspell(2,7,'data/frequency_dictionary_en_82_765.txt')
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

doc_sample = df['Text'][0]
process_tweet(doc_sample, tknzr)

u'millions afghanistan even zero attack isis sympathizers invest texas nation build harvey texasflood'

In [128]:
#list of embeddings
vec_length = 50
embeddings = np.zeros((1193514, vec_length))

keys = list()
f = open('amitkeys2.txt', 'r')
for line in f:
	raw = line.lower().replace("\n","").split(" ")
	keys.append(raw)

#two-way map, index->word and word->index
glove = {}

index = 0
with open('data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % vec_length) as f:
    for l in f:
        line = []
        try:
            line = l.split()
            if len(line) != vec_length+1:
                print('empty line')
                continue
            
            word = line[0]
            embeddings[index] = np.array(line[1:]).astype(np.float)
            glove[index] = word
            glove[word] = index
            index += 1
        except:
            print(line)
            print(index)
            break

In [129]:
#convert columns to numpy arrays
text = df['Text'].values
relevancy = df['Relevancy'].values
print (relevancy)
urgency = df['Urgency'].values

[0 2 0 ... 3 0 0]


In [186]:
class RelevancyClassifier(nn.Module):
    def __init__(self, index, embeddings, embed_len, num_classes):
        super(RelevancyClassifier, self).__init__()
        self.hidden_size = 30
        self.embed_len = embed_len
        #print(embeddings.shape)
        #self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.fc1 = nn.Linear(embed_len, self.hidden_size)
        self.nl = nn.LeakyReLU()
        self.fc2 = nn.Linear(self.hidden_size, num_classes)

    def forward(self, x):
        return self.fc2(self.nl(self.fc1(x)))

def weightingHeuristic(wv, word, keys, embeddings, glove):
    #new weighting heuristic finds the meaning of a tweet by looking at how each word
    #relates to the keys.  Finds cosine similarity of this word to all the keys and finds the max
    #and average of all the cosine similarities.  Returns 0.75 * max + 0.25 * avg
    
    total = 0
    num = 0
    word = word.lower().replace("\n","")
    similarities = list()
    for kili in keys:
        #if the kili is multiple words long, do the average
        #print (kili)
        innerTotal = 0
        innerNum = 0
        for kWord in kili:
            try:
                #print (embeddings[kWord])
                curCosineSimilarity = np.dot(embeddings[glove[kWord]], embeddings[glove[word]])
                #print (curCosineSimilarity)
                #curCosineSimilarity = wv.similarity(kWord, word)
                innerNum += 1
                innerTotal += curCosineSimilarity
            except:
                continue
        
        innerAvg = 0
        if innerNum != 0:
            innerAvg = float(innerTotal)/float(innerNum)
            similarities.append(innerAvg)
        total += innerAvg
        num += 1
    if len(similarities) == 0:
        return 0
    
    maxSimilarity = max(similarities);
    avgSimilarity = float(total)/float(num)
    return (0.75 * maxSimilarity) + (0.25 * avgSimilarity)
def train_relevancy_classifier(train_exs, train_labels, embeddings, index):
    try:
        wv = KeyedVectors.load("data/word2vec.kv", mmap='r')
        epochs = 70
        lr = .00005
        num_classes = 2
        rc = RelevancyClassifier(index, embeddings, len(embeddings[0]), num_classes)
        optimizer = optim.Adam(rc.parameters(), lr=lr)
        loss = nn.CrossEntropyLoss()
        
        idxToCurEmbed = dict()
        for epoch in range(epochs):
            ex_indices = [i for i in range(len(train_exs))]
            random.shuffle(ex_indices)
            total_loss = 0.0
            num_tweets = len(ex_indices)
            
            for idx in ex_indices:
                
                cur_tweet = train_exs[idx]
                cur_embed = []
                if idx in idxToCurEmbed:
                    cur_embed = idxToCurEmbed[idx]
                else:
                    for i in cur_tweet.split():
                        if i in index:
                            #print ("here")
                            newHeuristic = weightingHeuristic(wv, i, keys, embeddings, index)
                            cur_embed.append(newHeuristic * embeddings[index[i]])
                    idxToCurEmbed[idx] = cur_embed
                if len(cur_embed) == 0:
                    num_tweets -= 1
                    continue
                
                x = torch.from_numpy(np.asarray(np.mean(cur_embed, axis=0)).reshape(1,vec_length)).float()
                y = np.asarray(train_labels[idx]).reshape(1)
                if y[0] > 0:
                    y[0] = 1
                y = torch.tensor(y).long()
                rc.zero_grad()
                probs = rc.forward(x)
                cur_loss = loss(probs, y)
                total_loss += cur_loss
                cur_loss.backward()
                optimizer.step()
            if epoch % 10 == 0:
                print("Avg loss on epoch %i: %f" % (epoch, total_loss/num_tweets))
        return rc
    except KeyboardInterrupt:
        return rc

In [187]:
X_train, X_test, y_train, y_test = train_test_split(df['Text'].values, relevancy, test_size=0.33)

In [188]:
model = train_relevancy_classifier(X_train, y_train, embeddings, glove)

Avg loss on epoch 0: 0.729923
Avg loss on epoch 10: 0.442603
Avg loss on epoch 20: 0.410383
Avg loss on epoch 30: 0.387440
Avg loss on epoch 40: 0.369778
Avg loss on epoch 50: 0.351115
Avg loss on epoch 60: 0.334685


In [189]:
val_tweets = X_test
val_labels = y_test
for i in range(len(val_labels)):
    if val_labels[i] > 0:
        val_labels[i] = 1

num_correct = 0
num_true_pos = 0
num_false_pos = 0
num_false_neg = 0

wv = KeyedVectors.load("data/word2vec.kv", mmap='r')

for i in range(len(val_tweets)):
    cur_embed = []
    cur_tweet = val_tweets[i]
    cur_label = val_labels[i]
    for i in cur_tweet.split():
        if i in glove:
            newHeuristic = weightingHeuristic(wv, i, keys, embeddings, glove)
            cur_embed.append(newHeuristic * embeddings[glove[i]])
    if len(cur_embed) == 0:
        print ("curembed was 0")
        continue
    x = torch.from_numpy(np.asarray(np.mean(cur_embed, axis=0)).reshape(1,vec_length)).float()
    probs = model.forward(x).detach().numpy().reshape(2)
    pred_label = np.argmax(probs)
    #print ("pred_label: " + str(pred_label) + " cur_label: " + str(cur_label))
    if pred_label == cur_label:
        num_correct += 1
        if pred_label > 0:
            num_true_pos += 1
    else:
        if pred_label == 0:
            num_false_neg += 1
        else:
            num_false_pos += 1
print ("numCorrect: " + str(num_correct))
print ("numTruePos: " + str(num_true_pos))
print ("numFalsePos: " + str(num_false_pos))
print ("numFalseNeg: " + str(num_false_neg))
accuracy = float(num_correct)/len(val_tweets)
precision = float(num_true_pos)/(num_true_pos + num_false_pos)
recall = float(num_true_pos)/(num_true_pos + num_false_neg)
f1 = 2*precision*recall/(precision+recall)

print('accuracy: %f' % accuracy)
print('precision: %f' % precision)
print('recall: %f' % recall)
print('f1: %f' % f1)

numCorrect: 385
numTruePos: 57
numFalsePos: 33
numFalseNeg: 74
accuracy: 0.782520
precision: 0.633333
recall: 0.435115
f1: 0.515837


In [193]:
#urgency
X_train, X_test, y_train, y_test = train_test_split(df['Text'].values, urgency, test_size=0.33)
model = train_relevancy_classifier(X_train, y_train, embeddings, glove)

Avg loss on epoch 0: 0.856552
Avg loss on epoch 10: 0.254290
Avg loss on epoch 20: 0.235053
Avg loss on epoch 30: 0.215762
Avg loss on epoch 40: 0.201644
Avg loss on epoch 50: 0.186645
Avg loss on epoch 60: 0.172226


In [194]:
val_tweets = X_test
val_labels = y_test
for i in range(len(val_labels)):
    if val_labels[i] > 0:
        val_labels[i] = 1

num_correct = 0
num_true_pos = 0
num_false_pos = 0
num_false_neg = 0

wv = KeyedVectors.load("data/word2vec.kv", mmap='r')

for i in range(len(val_tweets)):
    cur_embed = []
    cur_tweet = val_tweets[i]
    cur_label = val_labels[i]
    for i in cur_tweet.split():
        if i in glove:
            newHeuristic = weightingHeuristic(wv, i, keys, embeddings, glove)
            cur_embed.append(newHeuristic * embeddings[glove[i]])
    if len(cur_embed) == 0:
        print ("curembed was 0")
        continue
    x = torch.from_numpy(np.asarray(np.mean(cur_embed, axis=0)).reshape(1,vec_length)).float()
    probs = model.forward(x).detach().numpy().reshape(2)
    pred_label = np.argmax(probs)
    #print ("pred_label: " + str(pred_label) + " cur_label: " + str(cur_label))
    if pred_label == cur_label:
        num_correct += 1
        if pred_label > 0:
            num_true_pos += 1
    else:
        if pred_label == 0:
            num_false_neg += 1
        else:
            num_false_pos += 1
print ("numCorrect: " + str(num_correct))
print ("numTruePos: " + str(num_true_pos))
print ("numFalsePos: " + str(num_false_pos))
print ("numFalseNeg: " + str(num_false_neg))
accuracy = float(num_correct)/len(val_tweets)
precision = float(num_true_pos)/(num_true_pos + num_false_pos)
recall = float(num_true_pos)/(num_true_pos + num_false_neg)
f1 = 2*precision*recall/(precision+recall)

print('accuracy: %f' % accuracy)
print('precision: %f' % precision)
print('recall: %f' % recall)
print('f1: %f' % f1)

numCorrect: 422
numTruePos: 9
numFalsePos: 13
numFalseNeg: 57
accuracy: 0.857724
precision: 0.409091
recall: 0.136364
f1: 0.204545


In [195]:
#only handles binary classification for now
def tweets_to_df(df, labels, embeddings, glove):
    wv = KeyedVectors.load("data/word2vec.kv", mmap='r')
    weights = []
    index_omit = []
    index = -1
    tweets = df['Text']
    
    for i in range(vec_length+1):
        weights.append([])
    
    for i in range(len(tweets)):
        index += 1
        cur_embed = []
        cur_tweet = tweets[i]
        cur_label = labels[i]
        for i in cur_tweet.split():
            if i in glove:
                newHeuristic = weightingHeuristic(wv, i, keys, embeddings, glove)
                cur_embed.append(newHeuristic * embeddings[glove[i]])
        
        if len(cur_embed) == 0:
            #make sure we drop this row from the input dataframe
            index_omit.append(index)
            continue
        
        x = np.asarray(np.mean(cur_embed, axis=0))
        
        for j in range(vec_length):
            weights[j].append(x[j])
        weights[vec_length].append(0 if cur_label == 0 else 1)
        #weights[vec_length].append(cur_label)
        
    df_pruned = df.drop(index_omit)
    
    #convert to dataframe
    cols = {}
    for i in range(vec_length):
       cols['v' + str(i)] = weights[i]
    
    cols['class'] = weights[vec_length]
    
    df2 = pd.DataFrame(data=cols)
    return df2

In [196]:
dfv = tweets_to_df(df, relevancy, embeddings, glove)
labels = dfv.pop('class')
dfv.head()

Unnamed: 0,v0,v1,v10,v11,v12,v13,v14,v15,v16,v17,...,v45,v46,v47,v48,v49,v5,v6,v7,v8,v9
0,3.998035,5.347114,0.950726,-1.466031,-58.230695,2.642284,6.643521,1.476769,-6.476167,3.053299,...,-2.344103,2.369313,2.817833,1.587355,7.929418,3.433002,0.103426,-4.444109,3.601828,-7.109326
1,10.605123,11.142267,-5.486157,-0.896788,-95.222239,-4.549225,-6.169168,4.328559,1.857858,-2.36118,...,-7.594231,-1.014837,-2.841846,-4.090874,2.883946,-1.407894,21.839877,-2.375379,4.392766,2.105177
2,12.350399,13.051163,-3.55858,-6.126958,-68.863806,-7.538091,2.289408,3.754879,-0.834112,1.938239,...,0.102241,1.185584,1.660065,-4.959424,0.344646,-6.032435,11.982321,-1.327724,6.428528,-11.003581
3,8.809885,5.458992,-2.47159,1.22746,-77.099327,-3.85733,-1.095846,4.377424,0.624772,-3.6022,...,-3.280418,5.682988,1.137328,-3.541299,3.428471,-1.458458,12.939525,4.937977,-1.503626,0.343659
4,11.60692,11.16761,1.128147,-7.107333,-74.370362,-0.794647,5.265394,11.115533,-2.088763,0.902111,...,-1.667678,1.449313,2.282958,-1.661326,1.893424,-8.783475,12.07451,-2.774548,3.245433,-12.295974


In [199]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import * 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import *
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def average(x):
    return sum(x)/len(x)

def get_stats(model, X, y, cv, verbose=False):
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
    auc = []
        
    cv_results = cross_validate(model, X, y, scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc'], 
                                cv=cv, return_train_score=False)
    
    if verbose:
        print(cv_results)
    
    #now return the data
    return cv_results

In [200]:
models = {'Naive Bayes': GaussianNB(),
          'Voting': VotingClassifier(estimators=[('mlp', MLPClassifier()),
                                            ('ada', AdaBoostClassifier()),
                                            ('nb', GaussianNB())], voting='soft'),
          'Perceptron': MLPClassifier(),
          'AdaBoost': AdaBoostClassifier()}

vals = []
metric = []
model_name = []

f1 = []
precision = []
recall = []
accuracy = []
auc = []
method = ['Naive Bayes', 'Voting', 'MLP', 'AdaBoost']

cv = 10
for k,v in models.items():
    stats = get_stats(v, dfv, labels, cv)
    accuracy_avg = np.average(stats['test_accuracy'])
    accuracy_std = np.std(stats['test_accuracy'])
    precision_avg = np.average(stats['test_precision'])
    precision_std = np.std(stats['test_precision'])
    recall_avg = np.average(stats['test_recall'])
    recall_std = np.std(stats['test_recall'])
    f1_avg = np.average(stats['test_f1'])
    f1_std = np.std(stats['test_f1'])
    auc_avg = np.average(stats['test_roc_auc'])
    
    f1.append('%.2f ± %.2f' % (f1_avg, f1_std))
    precision.append('%.2f ± %.2f' % (precision_avg, precision_std))
    recall.append('%.2f ± %.2f' % (recall_avg, recall_std))
    accuracy.append('%.2f ± %.2f' % (accuracy_avg, accuracy_std))
    auc.append('%.2f' % auc_avg)
    
#     print('%s (%.2f, %.4f) (%.2f, %.4f) (%.2f, %.4f) (%.2f, %.4f) %.2f' % 
#           (k, accuracy_avg, accuracy_std, precision_avg, precision_std, recall_avg, 
#                recall_std, f1_avg, f1_std, auc_avg))

df_view = pd.DataFrame(data={'Method': method, 'f1': f1, 
                             'precision':precision, 'recall':recall,
                             'accuracy':accuracy, 'auc':auc})
df_view

Unnamed: 0,Method,accuracy,auc,f1,precision,recall
0,Naive Bayes,0.76 ± 0.04,0.77,0.47 ± 0.08,0.55 ± 0.10,0.41 ± 0.09
1,Voting,0.74 ± 0.03,0.76,0.55 ± 0.06,0.49 ± 0.06,0.62 ± 0.09
2,MLP,0.68 ± 0.04,0.74,0.53 ± 0.04,0.42 ± 0.04,0.70 ± 0.06
3,AdaBoost,0.73 ± 0.04,0.73,0.42 ± 0.08,0.48 ± 0.09,0.38 ± 0.10
