In [7]:
import numpy as np
import pandas as pd
import random
import time
from itertools import product

import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import warnings;
warnings.filterwarnings('ignore');

In [8]:
#read in the word embeddings
vec_length = 100
embeddings = np.zeros((1193514+2, vec_length))

#two-way map, index->word and word->index
glove = {}

#add special tokens for unknown and padding
embeddings[0] = np.zeros(vec_length)
glove[0] = 'UNK'
glove['UNK'] = 0

embeddings[1] = np.zeros(vec_length)
glove[1] = 'PAD'
glove['PAD'] = 1

index = 2
with open('glove.twitter.27B.%dd.txt' % vec_length) as f:
    for l in f:
        line = []
        try:
            line = l.split()
            if len(line) != vec_length+1:
                print('empty line')
                continue
            
            word = line[0]
            embeddings[index] = np.array(line[1:]).astype(np.float)
            glove[index] = word
            glove[word] = index
            index += 1
        except:
            break

empty line


In [10]:
#read in the dataset
df = pd.read_csv('final_dataset_processed.csv')
print(df.shape)
df.head()

(4078, 3)


Unnamed: 0,Text,Relevancy,Urgency
0,harveystorm water waist deep respect cop help ...,0,0
1,find help affect hurricane harvey yeg hurrican...,1,0
2,mountainview heroes deploy help harvey <url>,3,0
3,help impact hurricaneharvey weave activate don...,0,0
4,much flood houston wow tune news prayers sympa...,0,0


In [11]:
#now convert the tweets into a list of indices
X = []
unk_percent = []
unk_words = set()
max_len = 0
for tweet in df['Text']:
    indices = []
    words = tweet.split()
    if len(words) > max_len:
        max_len = len(words)
    unknown = 0
    for word in words:
        if word in glove:
            indices.append(glove[word])
        else:
            indices.append(glove['UNK'])
            unk_words.add(word)
            unknown += 1
        unk_percent.append(unknown/len(words))
    X.append(indices)

# add padding to make every tweet the same length
for i in range(len(X)):
    tweet = X[i]
    if len(tweet) < max_len:
        tweet = np.append(tweet, np.ones(max_len - len(tweet)))
    X[i] = tweet

X = np.asarray(X, dtype=np.int64)
y = np.array(list(map(lambda x: 1 if x > 0 else 0, df['Relevancy'].values)), dtype=np.int64)
print(np.mean(unk_percent))
print(len(unk_words))

0.035785945166337485
735


In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

"""
This architecture is inspired by the one used in the paper
'Twitter Sentiment Analysis with Deep Convolutional Neural Networks' (Severyn et al., 2015)
"""
class CNN(nn.Module):
    def __init__(self, embeddings, n_filters, filter_sizes, n_classes, dropout):
        
        super().__init__()
        
        #length of the word embeddings
        embedding_dim = embeddings.shape[1]
        
        #architecture
        self.embedding = nn.Embedding.from_pretrained(embeddings)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[0], embedding_dim))
        
        self.conv_1 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[1], embedding_dim))
        
        self.conv_2 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes[2], embedding_dim))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, n_classes)
        
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax()
        
    def forward(self, tweet_indices):
        
        embedded = self.embedding(tweet_indices)
        embedded = embedded.unsqueeze(1)
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))
        
        return self.softmax(self.fc(cat))
    
    def predict(self, tweet):
        return np.argmax(self.forward(tweet).detach().cpu().numpy())

cuda:0


In [21]:
"""
X_train: 2d np array, where each row is the indices corresponding to each word of a specific tweet
y_train: 1d np array of same length as X_train with 0/1 based on relevant/not relevant or urgent/not urgent
embeddings: GloVe word embeddings created above
"""
def train_cnn_classifier(X_train, y_train, embeddings, num_classes, manual_params=None, verbose=False):
    try:
        start = time.time()
        embeddings = torch.from_numpy(embeddings).float().to(device)
        embed_len = len(embeddings[0])
        seq_len = len(X_train[0])
        
        #default parameters for the model
        params = {'batch_size': 10, 'epochs': 50, 'lr': 0.0001, 'n_filters': 100, 'filter_sizes': [3,4,5],
                 'dropout': 0.5}
        
        #replace default parameters with any user-defined ones
        if manual_params is not None:
            for p in manual_params:
                params[p] = manual_params[p]
                
        batch_size = params['batch_size']
        epochs = params['epochs']
        lr = params['lr']
        
        #initialize network and optimizer
        cnn = CNN(embeddings, n_filters=params['n_filters'], filter_sizes=params['filter_sizes'], 
                n_classes=num_classes, dropout=params['dropout'])
        cnn.to(device)
        
        optimizer = optim.Adam(cnn.parameters(), lr=lr)
        loss = nn.CrossEntropyLoss()
        
        cnn.train()
        for epoch in range(epochs):
            ex_indices = [i for i in range(len(X_train))]
            random.shuffle(ex_indices)
            total_loss = 0.0
            for idx in range(len(ex_indices)//batch_size):
                
                #create input batch to feed in
                cur_batch_idx = ex_indices[idx*batch_size:(idx+1)*batch_size]
                cur_X = torch.from_numpy(np.asarray([X_train[i] for i in cur_batch_idx])).long().to(device)
                cur_y = torch.from_numpy(np.asarray([y_train[i] for i in cur_batch_idx])).to(device)
                
                #train
                cnn.zero_grad()
                probs = cnn.forward(cur_X)
                
                #calculate loss and update weights
                cur_loss = loss(probs, cur_y)
                total_loss += cur_loss
                cur_loss.backward()
                optimizer.step()
            
            if verbose:
                print("Avg loss on epoch %i: %f" % (epoch+1, total_loss/len(ex_indices)))
        end = time.time()
        print("Time taken: %f seconds" % (end-start))
        return cnn
    except KeyboardInterrupt:
        end = time.time()
        print("Time taken: %f seconds" % (end-start))
        return cnn

In [22]:
# evaluates binary classification model
def calc_metrics(model, X_test, y_test):
    num_correct = 0
    num_true_pos = 0
    num_false_pos = 0
    num_false_neg = 0
    
    num_test_exs = len(X_test)

    model.eval()
    for i in range(num_test_exs):
        
        cur_batch_idx = [i]
        cur_X = torch.from_numpy(np.asarray([X_test[i] for i in cur_batch_idx])).long().to(device)
        
        y_pred = model.predict(cur_X)
        y_gold = y_test[i]
        if y_pred == y_gold:
            num_correct += 1
            if y_gold > 0:
                num_true_pos += 1
        else:
            if y_pred == 0:
                num_false_neg += 1
            else:
                num_false_pos += 1

    accuracy = num_correct/num_test_exs
    precision = num_true_pos/(num_true_pos + num_false_pos)
    recall = num_true_pos/(num_true_pos + num_false_neg)
    f1 = 2*precision*recall/(precision+recall)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

In [23]:
def kfold(X, y, embeddings, manual_params=None, k=10):
    ex_indices = list(range(X.shape[0]))
    random.shuffle(ex_indices)
    
    accuracy = np.zeros(k)
    precision = np.zeros(k)
    recall = np.zeros(k)
    f1 = np.zeros(k)
    
    #calculate the splitting scheme
    splits = [X.shape[0]//k] * k
    for i in range(X.shape[0] % k):
        splits[i] += 1
    
    #keeps track of current location in 
    index = 0
    for i in range(k):
        #come up with the train-test split
        X_test = np.asarray([X[i] for i in ex_indices[index:index+splits[i]]])
        y_test = np.asarray([y[i] for i in ex_indices[index:index+splits[i]]])
        
        train_indices = ex_indices[0:index] + ex_indices[index+splits[i]:]
        X_train = np.asarray([X[i] for i in train_indices])
        y_train = np.asarray([y[i] for i in train_indices])
        
        #now train the model on this split and save the metrics
        cnn = train_cnn_classifier(X_train, y_train, embeddings, num_classes=2, manual_params=manual_params, verbose=False)
        
        results = calc_metrics(cnn, X_test, y_test)
        accuracy[i] = results['accuracy']
        precision[i] = results['precision']
        recall[i] = results['recall']
        f1[i] = results['f1']
        
        index += splits[i]
    
    return {'accuracy': np.mean(accuracy), 'precision': np.mean(precision), 
           'recall': np.mean(recall), 'f1': np.mean(f1)}

In [24]:
def gridsearch(X, y, embeddings, params, metric='f1', k=10):
    
    results = []
    keys = []
    values = []
    for key in params:
        keys.append(key)
        values.append(params[key])
    
    for config in product(*values):
        p = {}
        for i, v in enumerate(config):
            p[keys[i]] = v
        
        res = kfold(X, y, embeddings, manual_params=p, k=k)
        results.append((p, res))
    
    return sorted(results, reverse=True, key=lambda x: x[1][metric])

In [None]:
kfold(X, y, embeddings, k=10)

In [None]:
#figure out region size first
filter_sizes = []
for i in range(10):
    filter_sizes.append([i+1])
filter_sizes.append([15])
filter_sizes_search = gridsearch(X, y, embeddings, params={'filter_sizes': filter_sizes})

In [None]:
filter_sizes_search

In [None]:
#now try multiple filters with sizes around opt
fs_opt = filter_sizes_search[0][0]['filter_sizes'][0]
print('optimal filter size: ' + str(fs_opt))

filter_sizes2 = [[fs_opt], [fs_opt]*2, [fs_opt]*3, [fs_opt]*4]

if fs_opt > 3:
    filter_sizes2.append([fs_opt-3, fs_opt-2, fs_opt-1])
if fs_opt > 2:
    filter_sizes2.append([fs_opt-2, fs_opt-1, fs_opt])
    filter_sizes2.append([fs_opt-2, fs_opt-1, fs_opt, fs_opt+1])
    filter_sizes2.append([fs_opt-2, fs_opt-1, fs_opt, fs_opt+1, fs_opt+2])
if fs_opt > 1:
    filter_sizes2.append([fs_opt-1, fs_opt, fs_opt+1])
    filter_sizes2.append([fs_opt-1, fs_opt-1, fs_opt, fs_opt])

filter_sizes2.append([fs_opt, fs_opt+1, fs_opt+2])
filter_sizes2.append([fs_opt, fs_opt, fs_opt+1, fs_opt+1])
filter_sizes2.append([fs_opt, fs_opt+1, fs_opt+2, fs_opt+3])

filter_sizes_search2 = gridsearch(X, y, embeddings, params={'filter_sizes': filter_sizes2})

In [247]:
filter_sizes_search2

In [None]:
fs_opt2 = filter_sizes_search2[0][0]['filter_sizes']
print('optimal filter size 2: ' + str(fs_opt2))

#now adjust the number of feature maps for each filter size to find best one
n_filters = [100,200,300,400,500,600,700,1000]
n_filters_search = gridsearch(X, y, embeddings, params={'dropout': [0.1], 'n_filters': n_filters, 'filter_sizes': [fs_opt2]})

In [None]:
n_filters_search

In [26]:
n_filters_opt = 100
dropout_opt = 0.1
print('optimal n_filters: ' + str(n_filters_opt))

# we need to increase the dropout and try again with higher values
n_filters2 = [400,500,600,700,1000,2000]
n_filters_search2 = gridsearch(X, y, embeddings, params={'dropout': [0.5,0.75], 'n_filters': n_filters2, 'filter_sizes': [[1,1,1]]})

optimal n_filters: 100
Time taken: 55.411257 seconds
Time taken: 55.921366 seconds
Time taken: 56.436141 seconds
Time taken: 55.501133 seconds
Time taken: 54.791711 seconds
Time taken: 55.152720 seconds
Time taken: 55.614873 seconds
Time taken: 55.211932 seconds
Time taken: 54.935635 seconds
Time taken: 54.456240 seconds
Time taken: 54.977714 seconds
Time taken: 55.548565 seconds
Time taken: 54.816603 seconds
Time taken: 56.436665 seconds
Time taken: 55.529351 seconds
Time taken: 55.977760 seconds
Time taken: 55.363546 seconds
Time taken: 54.436212 seconds
Time taken: 55.826600 seconds
Time taken: 55.520164 seconds
Time taken: 55.533190 seconds
Time taken: 54.887025 seconds
Time taken: 54.544723 seconds
Time taken: 56.047434 seconds
Time taken: 54.831809 seconds
Time taken: 55.841985 seconds
Time taken: 55.745844 seconds
Time taken: 55.250174 seconds
Time taken: 55.585490 seconds
Time taken: 55.630190 seconds
Time taken: 56.433008 seconds
Time taken: 55.620598 seconds
Time taken: 54.85

In [27]:
n_filters_search2

[({'dropout': 0.75, 'n_filters': 400, 'filter_sizes': [1, 1, 1]},
  {'accuracy': 0.7170291949703714,
   'precision': 0.7108499987233488,
   'recall': 0.7333624974603714,
   'f1': 0.7212108573859786}),
 ({'dropout': 0.75, 'n_filters': 500, 'filter_sizes': [1, 1, 1]},
  {'accuracy': 0.7177458206869971,
   'precision': 0.7168908465633443,
   'recall': 0.7207278323401358,
   'f1': 0.7179898960524425}),
 ({'dropout': 0.5, 'n_filters': 400, 'filter_sizes': [1, 1, 1]},
  {'accuracy': 0.7128667437490966,
   'precision': 0.7055637933851523,
   'recall': 0.7293383015203121,
   'f1': 0.7169432988044626}),
 ({'dropout': 0.75, 'n_filters': 1000, 'filter_sizes': [1, 1, 1]},
  {'accuracy': 0.7145601483836778,
   'precision': 0.7130406361017637,
   'recall': 0.7211848764500807,
   'f1': 0.7154566986638359}),
 ({'dropout': 0.75, 'n_filters': 600, 'filter_sizes': [1, 1, 1]},
  {'accuracy': 0.7091697981403864,
   'precision': 0.7010760122269847,
   'recall': 0.7318993496138287,
   'f1': 0.715030153800041

In [32]:
n_filters_opt2 = n_filters_search2[0][0]['n_filters']
dropout_opt = 0.75

print('updated optimal n_filters: ' + str(n_filters_opt2))
print('updated optimal dropout: ' + str(dropout_opt))

updated optimal n_filters: 400
updated optimal dropout: 0.75


In [30]:
#now fine-tune epochs and batch size
epochs = [10,50,100,200]
batch_size = [10,25,50,100,200]

final_search = gridsearch(X, y, embeddings, params={'dropout': [dropout_opt], 
                                                    'n_filters': [n_filters_opt2], 
                                                    'filter_sizes': [[1,1,1]],
                                                    'epochs': epochs,
                                                    'batch_size': batch_size})

Time taken: 11.475208 seconds
Time taken: 11.781382 seconds
Time taken: 11.755190 seconds
Time taken: 11.277146 seconds
Time taken: 11.619232 seconds
Time taken: 11.378665 seconds
Time taken: 11.339352 seconds
Time taken: 11.338720 seconds
Time taken: 11.046167 seconds
Time taken: 11.712664 seconds
Time taken: 33.269053 seconds
Time taken: 33.276424 seconds
Time taken: 33.248067 seconds
Time taken: 33.228979 seconds
Time taken: 33.229292 seconds
Time taken: 33.305548 seconds
Time taken: 33.217199 seconds
Time taken: 33.205963 seconds
Time taken: 33.191382 seconds
Time taken: 33.217854 seconds
Time taken: 3.708146 seconds
Time taken: 3.716348 seconds
Time taken: 3.714643 seconds
Time taken: 3.696690 seconds
Time taken: 3.692441 seconds
Time taken: 3.719661 seconds
Time taken: 3.705287 seconds
Time taken: 3.708778 seconds
Time taken: 3.686179 seconds
Time taken: 3.683019 seconds
Time taken: 3.127100 seconds
Time taken: 3.102632 seconds
Time taken: 3.104588 seconds
Time taken: 3.146895 se

In [31]:
final_search

[({'dropout': 0.75,
   'n_filters': 400,
   'filter_sizes': [1, 1, 1],
   'epochs': 200,
   'batch_size': 50},
  {'accuracy': 0.7165305680011562,
   'precision': 0.7105332920476017,
   'recall': 0.7316548997856122,
   'f1': 0.7203522038738812}),
 ({'dropout': 0.75,
   'n_filters': 400,
   'filter_sizes': [1, 1, 1],
   'epochs': 50,
   'batch_size': 10},
  {'accuracy': 0.7194723466782291,
   'precision': 0.7210557020517993,
   'recall': 0.7158398179089841,
   'f1': 0.7178588187252875}),
 ({'dropout': 0.75,
   'n_filters': 400,
   'filter_sizes': [1, 1, 1],
   'epochs': 50,
   'batch_size': 25},
  {'accuracy': 0.715291227055933,
   'precision': 0.7128450501673116,
   'recall': 0.7220024533730067,
   'f1': 0.7163535968996311}),
 ({'dropout': 0.75,
   'n_filters': 400,
   'filter_sizes': [1, 1, 1],
   'epochs': 200,
   'batch_size': 200},
  {'accuracy': 0.7155477670183552,
   'precision': 0.7133871406861227,
   'recall': 0.7188571738935536,
   'f1': 0.7157658659770727}),
 ({'dropout': 0.75