In [186]:
import numpy as np
import pandas as pd
import random
import time
from itertools import product

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from symspellpy.symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim


import process_tweet
import importlib
importlib.reload(process_tweet)

import warnings;
warnings.filterwarnings('ignore');

[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#read in the word embeddings
vec_length = 100
embeddings = np.zeros((1193514+2, vec_length))

#two-way map, index->word and word->index
glove = {}

#add special tokens for unknown and padding
embeddings[0] = np.zeros(vec_length)
glove[0] = 'UNK'
glove['UNK'] = 0

embeddings[1] = np.zeros(vec_length)
glove[1] = 'PAD'
glove['PAD'] = 1

index = 2
with open('data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % vec_length) as f:
    for l in f:
        line = []
        try:
            line = l.split()
            if len(line) != vec_length+1:
                print('empty line')
                continue
            
            word = line[0]
            embeddings[index] = np.array(line[1:]).astype(np.float)
            glove[index] = word
            glove[word] = index
            index += 1
        except:
            break

empty line


In [3]:
#read in the dataset
df = pd.read_csv('data/final_dataset.csv')
print(df.shape)
df.head()

(4078, 3)


Unnamed: 0,Text,Relevancy,Urgency
0,That #HarveyStorm water is waist deep. My resp...,0,0
1,Find out how you help those affected by Hurric...,1,0
2,#MountainView heroes deployed to help with #Ha...,3,0
3,"To help those impacted by #HurricaneHarvey, we...",0,0
4,So much flooding in Houston. Wow! Just tuning ...,0,0


In [4]:
#preprocess the tweets
sym_spell = process_tweet.create_symspell(2,7,'data/frequency_dictionary_en_82_765.txt')
tknzr = TweetTokenizer(strip_handles=False, reduce_len=True)
df['Text'] = df['Text'].map(lambda x: process_tweet.process_tweet(x, glove, tknzr, sym_spell, True))
df.head()

Unnamed: 0,Text,Relevancy,Urgency
0,harveystorm water waist deep respect cop help ...,0,0
1,find help affect hurricane harvey yeg hurrican...,1,0
2,mountainview heroes deploy help harvey <url>,3,0
3,help impact hurricaneharvey weave activate don...,0,0
4,much flood houston wow tune news prayers sympa...,0,0


In [96]:
#now convert the tweets into a list of indices
X = []
unk_percent = []
unk_words = set()
max_len = 0
for tweet in df['Text']:
    indices = []
    words = tweet.split()
    if len(words) > max_len:
        max_len = len(words)
    unknown = 0
    for word in words:
        if word in glove:
            indices.append(glove[word])
        else:
            indices.append(glove['UNK'])
            unk_words.add(word)
            unknown += 1
        unk_percent.append(unknown/len(words))
    X.append(indices)

# add padding to make every tweet the same length
for i in range(len(X)):
    tweet = X[i]
    if len(tweet) < max_len:
        tweet = np.append(tweet, np.ones(max_len - len(tweet)))
    X[i] = tweet

X = np.asarray(X, dtype=np.int64)
y = np.array(list(map(lambda x: 1 if x > 0 else 0, df['Relevancy'].values)), dtype=np.int64)
print(np.mean(unk_percent))
print(len(unk_words))

0.035785945166337485
735


In [169]:
"""
This architecture is inspired by the one used in the paper
'Twitter Sentiment Analysis with Deep Convolutional Neural Networks' (Severyn et al., 2015)
"""
class CNN(nn.Module):
    def __init__(self, embeddings, n_filters, filter_sizes, n_classes, dropout):
        
        super().__init__()
        
        #length of the word embeddings
        embedding_dim = embeddings.shape[1]
        
        #architecture
        self.embedding = nn.Embedding.from_pretrained(embeddings)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, n_classes)
        
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax()
        
    def forward(self, tweet_indices):
        
        embedded = self.embedding(tweet_indices)
        embedded = embedded.unsqueeze(1)
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))
        
        return self.softmax(self.fc(cat))
    
    def predict(self, tweet):
        return np.argmax(self.forward(tweet).detach().numpy())

In [181]:
"""
X_train: 2d np array, where each row is the indices corresponding to each word of a specific tweet
y_train: 1d np array of same length as X_train with 0/1 based on relevant/not relevant or urgent/not urgent
embeddings: GloVe word embeddings created above
"""
def train_cnn_classifier(X_train, y_train, embeddings, num_classes, manual_params=None, verbose=False):
    try:
        start = time.time()
        embeddings = torch.from_numpy(embeddings).float()
        embed_len = len(embeddings[0])
        seq_len = len(X_train[0])
        
        #default parameters for the model
        params = {'batch_size': 10, 'epochs': 50, 'lr': 0.0001, 'n_filters': 100, 'filter_sizes': [3,4,5],
                 'dropout': 0.5}
        
        #replace default parameters with any user-defined ones
        if manual_params is not None:
            for p in manual_params:
                params[p] = manual_params[p]
                
        batch_size = params['batch_size']
        epochs = params['epochs']
        lr = params['lr']
        
        #initialize network and optimizer
        cnn = CNN(embeddings, n_filters=params['n_filters'], filter_sizes=params['filter_sizes'], 
                n_classes=num_classes, dropout=params['dropout'])
        optimizer = optim.Adam(cnn.parameters(), lr=lr)
        loss = nn.CrossEntropyLoss()
        
        cnn.train()
        for epoch in range(epochs):
            ex_indices = [i for i in range(len(X_train))]
            random.shuffle(ex_indices)
            total_loss = 0.0
            for idx in range(len(ex_indices)//batch_size):
                
                #create input batch to feed in
                cur_batch_idx = ex_indices[idx*batch_size:(idx+1)*batch_size]
                cur_X = torch.from_numpy(np.asarray([X_train[i] for i in cur_batch_idx])).long()
                cur_y = torch.from_numpy(np.asarray([y_train[i] for i in cur_batch_idx]))
                
                #train
                cnn.zero_grad()
                probs = cnn.forward(cur_X)
                
                #calculate loss and update weights
                cur_loss = loss(probs, cur_y)
                total_loss += cur_loss
                cur_loss.backward()
                optimizer.step()
            
            if verbose:
                print("Avg loss on epoch %i: %f" % (epoch+1, total_loss/len(ex_indices)))
        end = time.time()
        print("Time taken: %f seconds" % (end-start))
        return cnn
    except KeyboardInterrupt:
        end = time.time()
        print("Time taken: %f seconds" % (end-start))
        return cnn

In [222]:
# evaluates binary classification model
def calc_metrics(model, X_test, y_test):
    num_correct = 0
    num_true_pos = 0
    num_false_pos = 0
    num_false_neg = 0
    
    num_test_exs = len(X_test)

    model.eval()
    for i in range(num_test_exs):
        
        cur_batch_idx = [i]
        cur_X = torch.from_numpy(np.asarray([X_test[i] for i in cur_batch_idx])).long()
        
        y_pred = model.predict(cur_X)
        y_gold = y_test[i]
        if y_pred == y_gold:
            num_correct += 1
            if y_gold > 0:
                num_true_pos += 1
        else:
            if y_pred == 0:
                num_false_neg += 1
            else:
                num_false_pos += 1

    accuracy = num_correct/num_test_exs
    precision = num_true_pos/(num_true_pos + num_false_pos)
    recall = num_true_pos/(num_true_pos + num_false_neg)
    f1 = 2*precision*recall/(precision+recall)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

In [223]:
def kfold(X, y, embeddings, manual_params=None, k=10):
    ex_indices = list(range(X.shape[0]))
    random.shuffle(ex_indices)
    
    accuracy = np.zeros(k)
    precision = np.zeros(k)
    recall = np.zeros(k)
    f1 = np.zeros(k)
    
    #calculate the splitting scheme
    splits = [X.shape[0]//k] * k
    for i in range(X.shape[0] % k):
        splits[i] += 1
    
    #keeps track of current location in 
    index = 0
    for i in range(k):
        #come up with the train-test split
        X_test = np.asarray([X[i] for i in ex_indices[index:index+splits[i]]])
        y_test = np.asarray([y[i] for i in ex_indices[index:index+splits[i]]])
        
        train_indices = ex_indices[0:index] + ex_indices[index+splits[i]:]
        X_train = np.asarray([X[i] for i in train_indices])
        y_train = np.asarray([y[i] for i in train_indices])
        
        #now train the model on this split and save the metrics
        cnn = train_cnn_classifier(X_train, y_train, embeddings, num_classes=2, manual_params=manual_params, verbose=False)
        
        results = calc_metrics(cnn, X_test, y_test)
        accuracy[i] = results['accuracy']
        precision[i] = results['precision']
        recall[i] = results['recall']
        f1[i] = results['f1']
        
        index += splits[i]
    
    return {'accuracy': np.mean(accuracy), 'precision': np.mean(precision), 
           'recall': np.mean(recall), 'f1': np.mean(f1)}

In [241]:
def gridsearch(X, y, embeddings, params, metric='f1', k=10):
    
    results = []
    keys = []
    values = []
    for key in params:
        keys.append(key)
        values.append(params[key])
    
    for config in product(*values):
        p = {}
        for i, v in enumerate(config):
            p[keys[i]] = v
        
        res = kfold(X, y, embeddings, manual_params=p, k=k)
        results.append((p, res))
    
    return sorted(results, reverse=True, key=lambda x: x[1][metric])

In [210]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

print('Training Relevancy CNN Classifier')
cnn = train_cnn_classifier(X_train, y_train, embeddings, 2, {'n_filters': 300})
print('\nRelevancy metrics:')
calc_metrics(cnn, X_test, y_test)

Training Relevancy CNN Classifier
Time taken: 1.857088 seconds

Relevancy metrics:


{'accuracy': 0.6404160475482912,
 'precision': 0.7235955056179775,
 'recall': 0.47144948755490484,
 'f1': 0.5709219858156028}

In [173]:
kfold(X, y, embeddings, k=10)

Time taken: 134.847982 seconds
Time taken: 105.110723 seconds
Time taken: 91.684669 seconds
Time taken: 93.566537 seconds
Time taken: 98.692508 seconds
Time taken: 84.461049 seconds
Time taken: 85.574390 seconds
Time taken: 89.420281 seconds
Time taken: 89.701171 seconds
Time taken: 90.888552 seconds


{'accuracy': 0.6927355831767597,
 'precision': 0.6899113483892558,
 'recall': 0.6983622580028672,
 'f1': 0.692984823435019}

In [None]:
#figure out region size first
filter_sizes = []
for i in range(10):
    filter_sizes.append([i+1])
filter_sizes.append([15])
filter_sizes_search = gridsearch(X, y, embeddings, params={'filter_sizes': filter_sizes})

In [None]:
filter_sizes_search

In [None]:
#now try multiple filters with sizes around opt
fs_opt = filter_sizes_search[0][0]['filter_sizes'][0]
print('optimal filter size: ' + str(fs_opt))

filter_sizes2 = [[fs_opt], [fs_opt]*2, [fs_opt]*3, [fs_opt]*4]

if fs_opt > 3:
    filter_sizes2.append([fs_opt-3, fs_opt-2, fs_opt-1])
if fs_opt > 2:
    filter_sizes2.append([fs_opt-2, fs_opt-1, fs_opt])
    filter_sizes2.append([fs_opt-2, fs_opt-1, fs_opt, fs_opt+1])
    filter_sizes2.append([fs_opt-2, fs_opt-1, fs_opt, fs_opt+1, fs_opt+2])
if fs_opt > 1:
    filter_sizes2.append([fs_opt-1, fs_opt, fs_opt+1])
    filter_sizes2.append([fs_opt-1, fs_opt-1, fs_opt, fs_opt])

filter_sizes2.append([fs_opt, fs_opt+1, fs_opt+2])
filter_sizes2.append([fs_opt, fs_opt, fs_opt+1, fs_opt+1])
filter_sizes2.append([fs_opt, fs_opt+1, fs_opt+2, fs_opt+3])

filter_sizes_search2 = gridsearch(X, y, embeddings, params={'filter_sizes': filter_sizes2})

In [247]:
filter_sizes_search2

In [None]:
fs_opt2 = filter_sizes_search2[0][0]['filter_sizes']
print('optimal filter size 2: ' + str(fs_opt2))

#now adjust the number of feature maps for each filter size to find best one
n_filters = [100,200,300,400,500,600,700,1000]
n_filters_search = gridsearch(X, y, embeddings, params={'dropout': [0.1], 'n_filters': n_filters, 'filter_sizes': [fs_opt2]})

In [None]:
n_filters_search

In [None]:
n_filters_opt = n_filters_search[0][0]['n_filters']
dropout_opt = 0.1
print('optimal n_filters: ' + str(n_filters_opt))

# we need to increase the dropout and try again with higher values
n_filters2 = [400,500,600,700,1000,2000]
n_filters_search2 = gridsearch(X, y, embeddings, params={'dropout': [0.5,0.75], 'n_filters': n_filters2, 'filter_sizes': [fs_opt2]})
print(n_filters_search2)

n_filters_opt2 = n_filters_search2[0][0]['n_filters']
print('optimal n_filters 2: ' + str(n_filters_opt2))
if n_filters_search2[0][1]['f1'] > n_filters_search[0][1]['f1']:
    print('new optimum!')
    n_filters_opt = n_filters_opt2
    dropout_opt = n_filters_search2[0][0]['dropout']

In [None]:
#now fine-tune epochs and batch size
epochs = [10,30,50,70,100]
batch_size = [10,30,50,100,200]

final_search = gridsearch(X, y, embeddings, params={'dropout': [dropout_opt], 
                                                    'n_filters': [n_filters_opt], 
                                                    'filter_sizes': [fs_opt2],
                                                    'epochs': epochs,
                                                    'batch_size': batch_size})

In [None]:
final_search