In [1]:
#Loading csv file and creating tweets and category list 
import os
import csv
import pandas as pd
import nltk
from sklearn.model_selection import KFold #import KFold
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors
from gensim.models.keyedvectors import Word2VecKeyedVectors, FastTextKeyedVectors
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
from keras.wrappers.scikit_learn import KerasClassifier
from keras import backend as K
from nltk.tokenize import RegexpTokenizer
import math
import tensorflow as tf
from sklearn.pipeline import Pipeline
from glove import Glove

Using TensorFlow backend.


In [2]:
def loadWordVector(index):
    if index==0:
        google_vecs = KeyedVectors.load_word2vec_format(
            '/home/hnakai/GoogleNews-vectors-negative300.bin', 
            binary=True, limit=200000)
        return google_vecs
    if index==1:
        fasttext_vecs = FastTextKeyedVectors.load("fasttext_300.wv")
        return fasttext_vecs
    elif index!=5:
        wv_files=["word2vec_tweets_300.wv","word2vec_tweets_100.wv","word2vec_tweets_500.wv"]
        word_vec = Word2VecKeyedVectors.load(wv_files[index-2])
        return word_vec
    else: 
        glove = Glove.load('glove.wv')
        word_vec = {}
        for word, i in glove.dictionary.iteritems():
            word_vec[word]=glove.word_vectors[i]
        return word_vec

filename = []
filename.append("/home/hnakai/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv")
filename.append("/home/hnakai/CrisisLexT6/2013_Alberta_Floods/2013_Alberta_Floods-ontopic_offtopic.csv")
filename.append("/home/hnakai/CrisisLexT6/2013_Boston_Bombings/2013_Boston_Bombings-ontopic_offtopic.csv")
filename.append("/home/hnakai/CrisisLexT6/2013_Oklahoma_Tornado/2013_Oklahoma_Tornado-ontopic_offtopic.csv")
filename.append("/home/hnakai/CrisisLexT6/2013_Queensland_Floods/2013_Queensland_Floods-ontopic_offtopic.csv")
filename.append("/home/hnakai/CrisisLexT6/2013_West_Texas_Explosion/2013_West_Texas_Explosion-ontopic_offtopic.csv")

def loadCsv(filename):
    print("**************************************************************")
    print("")
    print("File Name: "+filename)
    print("")
    lines = csv.reader(open(filename, 'rt'))
    dataset = list(lines)
    nltk_tweets = []
    tweets = []
    filtered_tweets = []
    full_category = []

    for i in range(len(dataset)):
        if i>0:
            nltk_tweets.append((dataset[i][1], dataset[i][2])) #array containing both tweets and category
            tweets.append(dataset[i][1]) #array containing just tweets
            full_category.append(dataset[i][2]) #array containing just category

    return nltk_tweets,tweets,full_category


In [3]:
#Cross Validation - 5-Fold Validation


def crossvalidate(data):
    x = np.array(data) #convert tweets into np array
    kf = KFold(n_splits=5) # implementing 5-fold validation
    kf.get_n_splits(x) # returns the number of splitting iterations in cross-validator
    return kf,x


In [4]:
# Neural Network


# Parameters
dropout = 0.25
learning_rate = 0.00005 
n_hidden = 3
n_epochs = 50
min_cc=5

def fit_and_predict_w2v(kf, x, classifier, tweets, full_category, wv=1):
    average = []
    
    for train_index, test_index in kf.split(x):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = x[train_index], x[test_index]
        train_set = []
        test_set = []
        categories_train = []
        categories_test = []
        for index in train_index:
            train_set.append(tweets[index])
            categories_train.append(0 if full_category[index]=='off-topic' else 1)
        for ind in test_index:
            test_set.append(tweets[ind])
            categories_test.append(0 if full_category[ind]=='off-topic' else 1)
        tweet_train_counts, categories_train = word2vec(train_set, categories_train, wv)
        tweet_test_counts, categories_test = word2vec(test_set, categories_test, wv)
        print(tweet_train_counts.shape)
        n_input = tweet_train_counts.shape[1]
        
        classifier = classifier.fit(tweet_train_counts, categories_train)
        acc = accuracy_score(categories_test, classifier.predict(tweet_test_counts))
        print acc
        average.append(acc)
    
    sum = 0
    for value in average:
        sum = sum + value
    print("Accuracy: ")
    print((sum/5)*100)
    
    return sum/5*100

def create_model(n_input):
    model=Sequential()
    n_neurons = int(math.ceil(float(n_input)*2.0/3.0))
    model.add(Dense(n_neurons, activation='relu', input_dim=n_input))
    model.add(Dropout(dropout))
    for _ in range(1, n_hidden):
        model.add(Dense(n_neurons, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', 
        optimizer=RMSprop(lr=learning_rate),
        metrics=['accuracy'])
    return model

def fit_and_predict_w2v_nn(kf, x, tweets, full_category, wv=1):
    average = []
    
    for train_index, test_index in kf.split(x):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = x[train_index], x[test_index]
        train_set = []
        test_set = []
        categories_train = []
        categories_test = []
        for index in train_index:
            train_set.append(tweets[index])
            categories_train.append(0 if full_category[index]=='off-topic' else 1)
        for ind in test_index:
            test_set.append(tweets[ind])
            categories_test.append(0 if full_category[ind]=='off-topic' else 1)
        tweet_train_counts, categories_train = word2vec(train_set, categories_train, wv)
        tweet_test_counts, categories_test = word2vec(test_set, categories_test, wv)
        print(tweet_train_counts.shape)
        n_input = tweet_train_counts.shape[1]
        
        #Reinitialize TensorFlow: due to bug in TF
        tf.reset_default_graph()
        K.clear_session()
        K.set_session(tf.Session())
        
        model = create_model(n_input)
        model.fit(tweet_train_counts, categories_train, epochs=n_epochs, batch_size=50, verbose=0)
        acc = model.evaluate(tweet_test_counts, categories_test, batch_size=50, verbose=0)[1]
        print acc
        average.append(acc)
    
    sum = 0
    for value in average:
        sum = sum + value
    print("Accuracy: ")
    print((sum/5)*100)
    
    return sum/5*100
    
def word2vec(texts, related, wv=1, pos_filter = "nofilter"):
    vects, voided = [], []
    tokenizer = RegexpTokenizer('(?:@?)+\w+(?:(?:\'|-)\w+)?')
    tokens = [[w for w in tokenizer.tokenize(d.lower()) 
               if (pos_filter == "nofilter"
                   or (pos_filter == "stopwords" and w not in stop_words)
                   or map_tag('en-ptb', 'universal', pos_tag([w])[0][1]) in pos_filter
                  )]
              for d in texts]
    word_vec=loadWordVector(wv)
    for i, d in enumerate(tokens):
        count = 0
        doc_vect = np.zeros(len(word_vec["hurricane"]))
        for w in d:
            try:
                doc_vect += word_vec[w.decode('ascii', 'ignore')]
                count+=1
            except KeyError:
                continue
        if np.isnan(np.min(doc_vect)) or count == 0:
            voided.append(i)
            continue
        doc_vect /= count
        vects.append(doc_vect)
    return np.array(vects), np.delete(related, voided, 0)

In [5]:
def classify_crisis_tweets_w2v(wv=1):
    accuracies = []
    
    for i, file in enumerate(filename):
        nltk_tweets= []
        tweets = []
        full_category = []
        nltk_tweets, tweets, full_category = loadCsv(file)
        kf,x = crossvalidate(nltk_tweets)
        
        accs = []
        
        names = ["Naive Bayes", "Logistic Regression", "Support Vector Machine"]
        for i, classifier in enumerate([GaussianNB(),
                                        LogisticRegression(),
                                        SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)
                                       ]):
            print("**************************************************************")
            print(names[i]+" Classification using Word2Vec Word Embeddings")
            print("**************************************************************")
            accs.append(fit_and_predict_w2v(kf, x, classifier, tweets, full_category, wv))
        
        print("**************************************************************")
        print("Neural Network Classification using Word2Vec Word Embeddings")
        print("**************************************************************")
        accs.append(fit_and_predict_w2v_nn(kf, x, tweets, full_category, wv))
        accuracies.append(accs)
    return accuracies
accs=[]
wvn=["google","fasttext","tw300","tw100","tw500","glove"]
for i in range(6):
    if "w2v_comparison_"+wvn[i]+".csv" in os.listdir('.'):
        accs.append(None)
    else:
        accuracies = classify_crisis_tweets_w2v(i)
        accs.append(accuracies)

**************************************************************

File Name: /home/hnakai/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv

**************************************************************
Naive Bayes Classification using Word2Vec Word Embeddings
**************************************************************
('TRAIN:', array([ 2002,  2003,  2004, ..., 10005, 10006, 10007]), 'TEST:', array([   0,    1,    2, ..., 1999, 2000, 2001]))
(8006, 300)
0.7182817182817183
('TRAIN:', array([    0,     1,     2, ..., 10005, 10006, 10007]), 'TEST:', array([2002, 2003, 2004, ..., 4001, 4002, 4003]))
(8006, 300)
0.7387612387612388
('TRAIN:', array([    0,     1,     2, ..., 10005, 10006, 10007]), 'TEST:', array([4004, 4005, 4006, ..., 6003, 6004, 6005]))
(8006, 300)
0.7737262737262737
('TRAIN:', array([    0,     1,     2, ..., 10005, 10006, 10007]), 'TEST:', array([6006, 6007, 6008, ..., 8004, 8005, 8006]))
(8007, 300)
0.8700649675162418
('TRAIN:', array([   0, 

(8025, 300)
0.9257228295444134
Accuracy: 
91.7555155259881
**************************************************************

File Name: /home/hnakai/CrisisLexT6/2013_Boston_Bombings/2013_Boston_Bombings-ontopic_offtopic.csv

**************************************************************
Naive Bayes Classification using Word2Vec Word Embeddings
**************************************************************
('TRAIN:', array([ 2003,  2004,  2005, ..., 10009, 10010, 10011]), 'TEST:', array([   0,    1,    2, ..., 2000, 2001, 2002]))
(8009, 300)
0.8262606090863704
('TRAIN:', array([    0,     1,     2, ..., 10009, 10010, 10011]), 'TEST:', array([2003, 2004, 2005, ..., 4003, 4004, 4005]))
(8009, 300)
0.8227658512231653
('TRAIN:', array([    0,     1,     2, ..., 10009, 10010, 10011]), 'TEST:', array([4006, 4007, 4008, ..., 6005, 6006, 6007]))
(8010, 300)
0.8396603396603397
('TRAIN:', array([    0,     1,     2, ..., 10009, 10010, 10011]), 'TEST:', array([6008, 6009, 6010, ..., 8007, 8008, 8009

(7994, 300)
0.8633633596641762
Accuracy: 
87.65973546180248
**************************************************************

File Name: /home/hnakai/CrisisLexT6/2013_Queensland_Floods/2013_Queensland_Floods-ontopic_offtopic.csv

**************************************************************
Naive Bayes Classification using Word2Vec Word Embeddings
**************************************************************
('TRAIN:', array([ 2007,  2008,  2009, ..., 10030, 10031, 10032]), 'TEST:', array([   0,    1,    2, ..., 2004, 2005, 2006]))
(8026, 300)
0.8121574489287494
('TRAIN:', array([    0,     1,     2, ..., 10030, 10031, 10032]), 'TEST:', array([2007, 2008, 2009, ..., 4011, 4012, 4013]))
(8026, 300)
0.8290981564524166
('TRAIN:', array([    0,     1,     2, ..., 10030, 10031, 10032]), 'TEST:', array([4014, 4015, 4016, ..., 6018, 6019, 6020]))
(8026, 300)
0.8704534130543099
('TRAIN:', array([    0,     1,     2, ..., 10030, 10031, 10032]), 'TEST:', array([6021, 6022, 6023, ..., 8024, 8025,

(8005, 300)
0.9695152461320266
('TRAIN:', array([   0,    1,    2, ..., 8002, 8003, 8004]), 'TEST:', array([ 8005,  8006,  8007, ..., 10003, 10004, 10005]))
(8005, 300)
0.9775112464629311
Accuracy: 
95.75266812151526


In [6]:
corpus = []
corpus.append("sandy")
corpus.append("alberta")
corpus.append("boston")
corpus.append("oklahoma")
corpus.append("queensland")
corpus.append("west_texas")

wvn=["google","fasttext","tw300","tw100","tw500","glove"]

df_accs=[]
for i, a in enumerate(accs):
    if "w2v_comparison_"+wvn[i]+".csv" in os.listdir('.'):
        df_accs.append(pd.read_csv("w2v_comparison_"+wvn[i]+".csv", header=0))
    else:
        df_accs.append(pd.DataFrame(a,
                     columns=["Naive Bayes","Logistic Regression", "SVM", "Neural Network"],
                     index = corpus
                    ))
        df_accs[-1].to_csv("w2v_comparison_"+wvn[i]+".csv")


In [7]:
#Google
df_accs[0]

Unnamed: 0.1,Unnamed: 0,Naive Bayes,Logistic Regression,SVM,Neural Network
0,sandy,87.6507,90.408437,89.649241,91.137857
1,alberta,81.686748,85.205678,82.872697,86.930483
2,boston,82.821263,86.646599,85.977687,88.06506
3,oklahoma,86.448495,89.731207,90.451903,91.813154
4,queensland,92.166232,95.405547,95.016868,96.372447
5,west_texas,92.334552,94.553278,94.163363,95.952493


In [8]:
#FastText
df_accs[1]

Unnamed: 0.1,Unnamed: 0,Naive Bayes,Logistic Regression,SVM,Neural Network
0,sandy,73.193084,88.819726,85.303382,89.109546
1,alberta,78.686819,92.66261,89.840975,93.25084
2,boston,78.657334,90.112558,88.514237,91.081345
3,oklahoma,81.185112,90.702198,87.430467,91.552858
4,queensland,84.073748,95.415666,95.226264,96.033563
5,west_texas,89.65652,95.572853,90.755137,96.841959


In [9]:
#W2V 300
df_accs[2]

Unnamed: 0.1,Unnamed: 0,Naive Bayes,Logistic Regression,SVM,Neural Network
0,sandy,80.436725,89.649196,85.812104,90.978032
1,alberta,81.338307,91.994679,90.648647,93.430287
2,boston,81.483384,90.632059,90.691775,91.980341
3,oklahoma,84.307068,90.561978,90.372238,92.483544
4,queensland,87.522127,95.196424,94.608576,96.183095
5,west_texas,92.374742,95.812698,95.352978,96.851924


In [10]:
#W2V 100
df_accs[3]

Unnamed: 0.1,Unnamed: 0,Naive Bayes,Logistic Regression,SVM,Neural Network
0,sandy,78.058573,87.15124,84.653562,87.571004
1,alberta,80.859792,88.884324,86.262001,88.914214
2,boston,80.124972,88.06517,86.796533,89.153682
3,oklahoma,83.416373,89.361042,88.410787,90.131692
4,queensland,87.980523,94.747889,93.871107,95.186389
5,west_texas,91.695151,94.933168,93.054142,95.332903


In [11]:
#W2V 500
df_accs[4]

Unnamed: 0.1,Unnamed: 0,Naive Bayes,Logistic Regression,SVM,Neural Network
0,sandy,80.62651,89.699062,86.721744,90.908122
1,alberta,81.707195,92.493163,91.286594,93.649545
2,boston,81.553379,91.001604,91.05135,92.469737
3,oklahoma,84.367164,90.432033,90.962503,92.363534
4,queensland,87.890857,95.296085,94.209742,96.272746
5,west_texas,92.414727,96.012598,95.243173,96.861949


In [12]:
df_accs[5]

Unnamed: 0,Naive Bayes,Logistic Regression,SVM,Neural Network
sandy,79.138123,86.622049,85.962823,88.800225
alberta,83.272311,88.465729,88.844399,91.755516
boston,84.568941,89.872753,89.932853,91.910336
oklahoma,81.035067,83.336092,82.066053,87.659735
queensland,87.143453,92.575209,91.07062,95.206424
west_texas,91.585256,93.414087,92.994222,95.752668
