In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from symspellpy.symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

from process_tweet import *

import collections
import torch
import torch.nn as nn
from torch import optim
import random
import time

import warnings;
warnings.filterwarnings('ignore');

[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the labelled tweets, drop incomplete rows and discard the 'Id' column.
df = (
    pd.read_csv('data/labeled_prelim_lda.csv')
    .dropna()
    .drop(columns='Id')
    .astype({'Relevancy': np.int32, 'Urgency': np.int32})
    # dropna() leaves gaps in the index, while later cells read rows with
    # positional integers (df['Text'][0], tweets[i]); renumber the rows so
    # that index label and position coincide.
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Text,Relevancy,Urgency,top0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,top11,top12,top13,top14
0,millions afghanistan even zero attack isis sym...,0,0,0.0,0.326608,0.0,0.098502,0.089703,0.0,0.0,0.0,0.158985,0.0,0.088616,0.0,0.0,0.0,0.191432
1,last post brother make social media phone go v...,2,1,0.0,0.0,0.2675,0.0,0.0,0.0,0.0,0.0,0.145833,0.106667,0.0,0.106667,0.0,0.0,0.306667
2,listen local officials epa help harvey respons...,0,0,0.0,0.0,0.314598,0.132971,0.0,0.258455,0.202309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,damn proud tirelessly help fellow texans affec...,3,0,0.0,0.0,0.088902,0.088699,0.0,0.11002,0.0,0.154831,0.0,0.410966,0.0,0.0,0.0,0.0,0.096582
4,help harvey disaster response help victims nat...,0,0,0.0,0.0,0.211954,0.282543,0.0,0.0,0.0,0.22222,0.0,0.0,0.109938,0.0,0.0,0.0,0.106679


In [4]:
#sym_spell = create_symspell(2,7,'data/frequency_dictionary_en_82_765.txt')
# Tokenizer shared by the preprocessing calls below.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# Sanity check: run process_tweet on the first tweet. Select it by position
# (.iloc) rather than by index label, so the cell still works when the row
# labelled 0 was removed by dropna().
doc_sample = df['Text'].iloc[0]
process_tweet(doc_sample, tknzr)

'millions afghanistan even zero attack isis sympathizers invest texas nation build harvey texasflood'

In [5]:
#list of embeddings
vec_length = 50
# Row count is hard-coded; presumably the vocabulary size of the GloVe Twitter
# file -- verify against the data file if a different release is used.
embeddings = np.zeros((1193514, vec_length))

#two-way map, index->word and word->index
# (integer keys map to words, string keys map to row numbers in `embeddings`)
glove = {}

index = 0
# Explicit encoding: the platform default may not be able to decode the file.
with open('data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % vec_length, encoding='utf-8') as f:
    for raw_line in f:
        line = raw_line.split()
        # A valid row is one token followed by exactly vec_length numbers;
        # anything else is reported and skipped.
        if len(line) != vec_length+1:
            print('empty line')
            continue

        try:
            # np.float64 replaces the `np.float` alias, which no longer exists
            # in current NumPy releases.
            embeddings[index] = np.array(line[1:], dtype=np.float64)
        except (ValueError, IndexError):
            # ValueError: a field is not numeric. IndexError: the file holds
            # more rows than were preallocated. Report where loading stopped.
            # Other exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
            print(line)
            print(index)
            break

        word = line[0]
        glove[index] = word
        glove[word] = index
        index += 1

empty line


In [6]:
# Pull the text and both label columns out of the frame as numpy arrays.
text, relevancy, urgency = (
    df[column].values for column in ('Text', 'Relevancy', 'Urgency')
)

In [7]:
class RelevancyClassifier(nn.Module):
    """Small feed-forward classifier: Linear -> LeakyReLU -> Linear.

    `index` and `embeddings` are accepted by the constructor but are not used
    by the module; forward() expects inputs that are already vectors of width
    `embed_len` and returns one score per class.
    """

    def __init__(self, index, embeddings, embed_len, num_classes):
        super().__init__()
        self.hidden_size = 30
        self.embed_len = embed_len
        # Layers are created in the order fc1, nl, fc2.
        self.fc1 = nn.Linear(embed_len, self.hidden_size)
        self.nl = nn.LeakyReLU()
        self.fc2 = nn.Linear(self.hidden_size, num_classes)

    def forward(self, x):
        """Return the output of fc2 applied to the activated fc1 output."""
        hidden = self.fc1(x)
        activated = self.nl(hidden)
        return self.fc2(activated)

def train_relevancy_classifier(train_exs, train_labels, embeddings, index, epochs=100, lr=.0001):
    """Train a binary RelevancyClassifier on mean-pooled word embeddings.

    Each tweet is represented by the mean of the embedding rows of those
    whitespace-separated tokens that are keys of `index`; tweets without any
    such token are skipped. Labels greater than 0 are treated as class 1,
    everything else as class 0. Training is one example per optimizer step
    with Adam and cross-entropy loss.

    Args:
        train_exs: sequence of tweet strings.
        train_labels: sequence of integer labels, aligned with `train_exs`.
        embeddings: 2-D array; row `index[token]` is the vector of `token`.
        index: mapping from token to row number in `embeddings`.
        epochs: number of passes over the training set (default 100).
        lr: Adam learning rate (default 1e-4).

    Returns:
        The trained RelevancyClassifier. If training is interrupted with
        Ctrl-C, the partially trained model is returned instead of raising.
    """
    num_classes = 2
    # Use the actual embedding width rather than a notebook-level global.
    embed_len = len(embeddings[0])
    # Built outside the try block so `rc` is always bound when an interrupt
    # is handled below.
    rc = RelevancyClassifier(index, embeddings, embed_len, num_classes)
    optimizer = optim.Adam(rc.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()

    try:
        for epoch in range(epochs):
            ex_indices = list(range(len(train_exs)))
            random.shuffle(ex_indices)
            total_loss = 0.0
            num_tweets = len(ex_indices)
            for idx in ex_indices:
                cur_embed = [embeddings[index[tok]]
                             for tok in train_exs[idx].split() if tok in index]
                if len(cur_embed) == 0:
                    # No known token: exclude the tweet from the loss average.
                    num_tweets -= 1
                    continue

                x = torch.from_numpy(
                    np.asarray(np.mean(cur_embed, axis=0)).reshape(1, embed_len)).float()
                y = torch.tensor([1 if train_labels[idx] > 0 else 0], dtype=torch.long)

                rc.zero_grad()
                logits = rc(x)
                cur_loss = loss(logits, y)
                # .item() yields a plain float; adding the tensor itself would
                # keep every example's autograd graph alive for the whole epoch.
                total_loss += cur_loss.item()
                cur_loss.backward()
                optimizer.step()
            if epoch % 10 == 0:
                # max(..., 1) avoids a ZeroDivisionError when every tweet was skipped.
                print("Avg loss on epoch %i: %f" % (epoch, total_loss/max(num_tweets, 1)))
    except KeyboardInterrupt:
        # Deliberate: allow the user to stop early and keep the current model.
        pass
    return rc

In [9]:
# Hold out 33% of the tweets for validation. A fixed random_state keeps the
# split, and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'].values, relevancy, test_size=0.33, random_state=42)

In [10]:
# Fit the relevancy classifier on the training split, using the GloVe lookup.
model = train_relevancy_classifier(
    train_exs=X_train,
    train_labels=y_train,
    embeddings=embeddings,
    index=glove,
)

Avg loss on epoch 0: 0.577541
Avg loss on epoch 10: 0.471307
Avg loss on epoch 20: 0.449612
Avg loss on epoch 30: 0.436855
Avg loss on epoch 40: 0.425467
Avg loss on epoch 50: 0.415612
Avg loss on epoch 60: 0.405068
Avg loss on epoch 70: 0.394910
Avg loss on epoch 80: 0.384393
Avg loss on epoch 90: 0.375414


In [11]:
# Evaluate the trained classifier on the held-out split (label > 0 counts as positive).
val_tweets = X_test
# Binarise into a new array so that y_test itself is not overwritten in place.
val_labels = np.where(np.asarray(y_test) > 0, 1, 0)

num_correct = 0
num_true_pos = 0
num_false_pos = 0
num_false_neg = 0
num_evaluated = 0

# Inference only: no autograd graph is needed.
with torch.no_grad():
    for cur_tweet, cur_label in zip(val_tweets, val_labels):
        cur_embed = [embeddings[glove[tok]] for tok in cur_tweet.split() if tok in glove]
        if len(cur_embed) == 0:
            # No known token, so no prediction is made for this tweet.
            continue
        num_evaluated += 1
        x = torch.from_numpy(np.asarray(np.mean(cur_embed, axis=0)).reshape(1, -1)).float()
        probs = model(x).numpy().reshape(2)
        pred_label = np.argmax(probs)
        if pred_label == cur_label:
            num_correct += 1
            if pred_label > 0:
                num_true_pos += 1
        else:
            if pred_label == 0:
                num_false_neg += 1
            else:
                num_false_pos += 1

# Accuracy is computed over the tweets that were actually classified, matching
# what precision and recall already do; every ratio is guarded against an
# empty denominator.
num_pred_pos = num_true_pos + num_false_pos
num_actual_pos = num_true_pos + num_false_neg
accuracy = num_correct/num_evaluated if num_evaluated else 0.0
precision = num_true_pos/num_pred_pos if num_pred_pos else 0.0
recall = num_true_pos/num_actual_pos if num_actual_pos else 0.0
f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0

print('accuracy: %f' % accuracy)
print('precision: %f' % precision)
print('recall: %f' % recall)
print('f1: %f' % f1)

accuracy: 0.760163
precision: 0.511111
recall: 0.383333
f1: 0.438095


In [12]:
#only handles binary classification for now
def tweets_to_df(df, labels, embeddings, glove):
    """Build a feature table of mean word embeddings, one row per usable tweet.

    Each tweet in df['Text'] is split on whitespace; the embedding rows of the
    tokens that are keys of `glove` are averaged. Tweets without any such
    token are left out of the result.

    Args:
        df: DataFrame with a 'Text' column of strings. Rows are read by
            position, so the frame's index labels do not matter.
        labels: sequence of labels aligned by position with the rows of `df`.
        embeddings: 2-D array; row `glove[token]` is the vector of `token`.
        glove: mapping from token to row number in `embeddings`.

    Returns:
        DataFrame with columns 'v0' .. 'v{d-1}' (d = embedding width) and a
        'class' column that is 0 where the label equals 0 and 1 otherwise.
    """
    # Use the embedding width itself rather than a notebook-level global.
    vec_len = len(embeddings[0])
    # Plain arrays give positional access; indexing the Series with 0..n-1
    # would be a label lookup and break on a non-default index.
    tweets = df['Text'].values
    labels = np.asarray(labels)

    rows = []
    classes = []
    for cur_tweet, cur_label in zip(tweets, labels):
        cur_embed = [embeddings[glove[tok]] for tok in cur_tweet.split() if tok in glove]
        if len(cur_embed) == 0:
            # No known token: this tweet contributes no row.
            continue
        rows.append(np.asarray(np.mean(cur_embed, axis=0)))
        classes.append(0 if cur_label == 0 else 1)
        #classes.append(cur_label)

    # np.vstack rejects an empty list, so build the empty table explicitly.
    features = np.vstack(rows) if rows else np.empty((0, vec_len))

    #convert to dataframe
    df2 = pd.DataFrame(features, columns=['v' + str(j) for j in range(vec_len)])
    df2['class'] = np.asarray(classes, dtype=np.int64)
    return df2

In [13]:
# Build the mean-embedding feature table, then split off the target column.
dfv = tweets_to_df(df=df, labels=relevancy, embeddings=embeddings, glove=glove)
labels = dfv.pop('class')
dfv.head()

Unnamed: 0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v40,v41,v42,v43,v44,v45,v46,v47,v48,v49
0,0.155161,0.263687,-0.029247,-0.204771,-0.086706,0.11964,-0.093732,-0.177305,0.228087,-0.533599,...,-0.05722,-0.203605,0.035893,-0.323683,0.077375,-0.104995,0.097013,0.07906,0.138891,0.369817
1,0.506732,0.533653,-0.23627,-0.220237,0.152508,-0.093731,0.910368,-0.188011,0.156793,-0.024209,...,-1.003719,0.280569,-0.034819,-0.100392,0.27576,-0.321023,-0.030767,-0.124344,-0.230642,0.072494
2,0.616114,0.568701,-0.347945,-0.320448,0.147845,-0.314545,0.45459,-0.031889,0.34414,-0.661508,...,-0.412471,0.25362,0.302392,0.076902,0.06233,0.034372,0.041387,0.031561,-0.225155,0.007954
3,0.402962,0.203641,-0.242108,-0.260091,-0.097311,-0.035554,0.515226,0.211987,-0.145409,-0.181648,...,-0.16968,0.037749,0.24001,0.07818,0.026234,-0.069752,0.245466,0.028327,-0.201005,0.133578
4,0.530228,0.428295,-0.506334,-0.411837,0.458561,-0.404194,0.540873,-0.13778,0.16795,-0.65472,...,-0.351348,-0.081795,0.374603,-0.125783,0.285781,-0.053783,0.070025,0.046146,-0.0972,0.10483


In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import * 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import *
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

def average(x):
    """Return the arithmetic mean of the sized iterable `x`."""
    total = sum(x)
    count = len(x)
    return total / count

def get_stats(model, X, y, cv, verbose=False, scoring=None):
    """Cross-validate `model` on (X, y) and return the raw result dictionary.

    Args:
        model: estimator passed to cross_validate.
        X: feature matrix.
        y: target vector.
        cv: cross-validation setting passed through to cross_validate.
        verbose: when True, print the result dictionary before returning it.
        scoring: metric names to compute; defaults to accuracy, precision,
            recall, f1 and roc_auc.

    Returns:
        The dictionary produced by cross_validate (train scores are not
        requested). Callers read the per-fold scores from its
        'test_<metric>' entries.
    """
    if scoring is None:
        scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    cv_results = cross_validate(model, X, y, scoring=scoring,
                                cv=cv, return_train_score=False)

    if verbose:
        print(cv_results)

    #now return the data
    return cv_results

In [18]:
# Candidate classifiers, all with default hyper-parameters; 'Voting' is a
# soft-voting ensemble of three of them.
models = {
    'Perceptron': MLPClassifier(),
    'KNN': KNeighborsClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Naive Bayes': GaussianNB(),
    'Voting': VotingClassifier(
        estimators=[
            ('mlp', MLPClassifier()),
            ('ada', AdaBoostClassifier()),
            ('nb', GaussianNB()),
        ],
        voting='soft',
    ),
}

vals = []
metric = []
model_name = []

# NOTE(review): X_new is not used by the loop below, which evaluates every
# model on the full feature table dfv -- confirm whether that is intended.
X_new = SelectKBest(k=25).fit_transform(dfv, labels)

cv = 10
for k, v in models.items():
    stats = get_stats(v, dfv, labels, cv)

    # (mean, std) over the folds for each thresholded metric.
    ((accuracy_avg, accuracy_std),
     (precision_avg, precision_std),
     (recall_avg, recall_std),
     (f1_avg, f1_std)) = (
        (np.average(stats[key]), np.std(stats[key]))
        for key in ('test_accuracy', 'test_precision', 'test_recall', 'test_f1')
    )
    auc_avg = np.average(stats['test_roc_auc'])

    # One line per model: name, (mean, std) for accuracy / precision / recall / f1, mean AUC.
    summary = (k,
               accuracy_avg, accuracy_std,
               precision_avg, precision_std,
               recall_avg, recall_std,
               f1_avg, f1_std,
               auc_avg)
    print('%s (%.2f, %.4f) (%.2f, %.4f) (%.2f, %.4f) (%.2f, %.4f) %.2f' % summary)

Perceptron (0.76, 0.0381) (0.57, 0.1004) (0.42, 0.0448) (0.48, 0.0576) 0.77
KNN (0.75, 0.0273) (0.53, 0.0643) (0.42, 0.0777) (0.46, 0.0572) 0.72
AdaBoost (0.75, 0.0325) (0.52, 0.0756) (0.42, 0.0727) (0.46, 0.0610) 0.75
Naive Bayes (0.70, 0.0387) (0.45, 0.0449) (0.69, 0.0728) (0.54, 0.0473) 0.76
Voting (0.75, 0.0337) (0.52, 0.0567) (0.63, 0.0787) (0.57, 0.0569) 0.78
