In [7]:
import numpy as np
import pandas as pd
import random
import time
from itertools import product

import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import warnings;
warnings.filterwarnings('ignore');

In [8]:
#read in the word embeddings
vec_length = 100
embeddings = np.zeros((1193514+2, vec_length))

#two-way map, index->word and word->index
glove = {}

#add special tokens for unknown and padding
embeddings[0] = np.zeros(vec_length)
glove[0] = 'UNK'
glove['UNK'] = 0

embeddings[1] = np.zeros(vec_length)
glove[1] = 'PAD'
glove['PAD'] = 1

index = 2
with open('glove.twitter.27B.%dd.txt' % vec_length) as f:
    for l in f:
        line = []
        try:
            line = l.split()
            if len(line) != vec_length+1:
                print('empty line')
                continue
            
            word = line[0]
            embeddings[index] = np.array(line[1:]).astype(np.float)
            glove[index] = word
            glove[word] = index
            index += 1
        except:
            break

empty line


In [19]:
#read in character-level embeddings
char_vec_length = 300
char_embeddings = np.zeros((94+2, char_vec_length))

#two-way map, index->word and word->index
char_glove = {}

#add special tokens for unknown and padding
char_embeddings[0] = np.zeros(char_vec_length)
char_glove[0] = 'UNK'
char_glove['UNK'] = 0

char_embeddings[1] = np.zeros(char_vec_length)
char_glove[1] = 'PAD'
char_glove['PAD'] = 1

index = 2
with open('glove.840B.%dd-char.txt' % char_vec_length) as f:
    for l in f:
        line = []
        try:
            line = l.split()
            if len(line) != char_vec_length+1:
                print('empty line')
                continue
            
            word = line[0]
            char_embeddings[index] = np.array(line[1:]).astype(np.float)
            char_glove[index] = word
            char_glove[word] = index
            index += 1
        except:
            break

In [20]:
#read in the dataset
df = pd.read_csv('final_dataset_processed.csv')
print(df.shape)
df.head()

(4078, 3)


Unnamed: 0,Text,Relevancy,Urgency
0,harveystorm water waist deep respect cop help ...,0,0
1,find help affect hurricane harvey yeg hurrican...,1,0
2,mountainview heroes deploy help harvey <url>,3,0
3,help impact hurricaneharvey weave activate don...,0,0
4,much flood houston wow tune news prayers sympa...,0,0


In [21]:
#now convert the tweets into a list of indices
X = []
unk_percent = []
unk_words = set()
max_len = 0

# set of all words found in all tweets
all_words = set() 

for tweet in df['Text']:
    indices = []
    words = tweet.split()
    if len(words) > max_len:
        max_len = len(words)
    
    unknown = 0
    for word in words:
        all_words.add(word)
        if word in glove:
            indices.append(glove[word])
        else:
            indices.append(glove['UNK'])
            unk_words.add(word)
            unknown += 1
        unk_percent.append(unknown/len(words))
    X.append(indices)

# add padding to make every tweet the same length
for i in range(len(X)):
    tweet = X[i]
    if len(tweet) < max_len:
        tweet = np.append(tweet, np.ones(max_len - len(tweet)))
    X[i] = tweet

X = np.asarray(X, dtype=np.int64)
y = np.array(list(map(lambda x: 1 if x > 0 else 0, df['Relevancy'].values)), dtype=np.int64)

print(np.mean(unk_percent))
print('number of unknown words: ' + str(len(unk_words)))
print('max tweet length: ' + str(max_len))

0.035785945166337485
number of unknown words: 735
max tweet length: 22


In [22]:
#create map of words to arrays of character indices in two-way embedding map
word_to_charr = {}
for word in all_words:
    if word[0] == '<':
        continue
    word_to_charr[word] = list(map(lambda x: char_glove[x] if x in char_glove else 0, list(word)))

In [24]:
#figure out a good cutoff for word length
word_lengths = {}
for tweet in df['Text']:
    for word in tweet.split():
        if len(word) not in word_lengths:
            word_lengths[len(word)] = 1
        else:
            word_lengths[len(word)] += 1

sorted_lengths = sorted(list(word_lengths.keys()))

for l in sorted_lengths:
    print(l, word_lengths[l])

#now figure out the max word length threshold that captures 90% words
num_words = sum(word_lengths.values())
curr_sum = 0
index = -1
while curr_sum < 0.9*num_words:
    index += 1
    curr_sum += word_lengths[sorted_lengths[index]]

max_word_len = sorted_lengths[index]
print('\nchosen word length threshold: ' + str(max_word_len))

1 189
2 737
3 3399
4 8961
5 9685
6 8544
7 3812
8 2799
9 1729
10 833
11 592
12 870
13 512
14 136
15 1566
16 79
17 38
18 19
19 32
20 8
21 14
22 4
23 3
24 1
25 2
29 2
30 2

chosen word length threshold: 10


In [45]:
#now we generate the character index array
X_char = []

for tweet in df['Text']:
    tweet_indices = []
    for word in tweet.split():
        
        word_indices = []
        if word in word_to_charr:
            word_indices = word_to_charr[word]
        else:
            #word is not in map, fill it with dummy chars
            word_indices = [char_glove['PAD']] * max_word_len
        
        if len(word) > max_word_len:
            word_indices = word_indices[:max_word_len]
        else:
            word_indices = word_indices + [char_glove['PAD']]*(max_word_len-len(word_indices))
        
        tweet_indices.append(word_indices)
    
    X_char.append(tweet_indices)

#now add more padding to make all tweets same length
for i in range(len(X_char)):
    tweet_indices = X_char[i]
    if len(tweet_indices) < max_len:
        taco = len(tweet_indices)
        for j in range(max_len-taco):
            tweet_indices.append([char_glove['PAD']]*max_word_len)
        X_char[i] = tweet_indices 

X_char = np.array(X_char, dtype=np.int64) 
X_char.shape

(4078, 22, 10)

In [46]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [104]:
"""
This architecture is inspired by the one used in the paper
'Deep Convolutional Neural Networks for Sentiment Analysis of Short Texts' (Santos and Gatti, 2014)
"""
class CNN(nn.Module):
    def __init__(self, w_embeddings, c_embeddings, n_w_filters, n_c_filters,
                    w_filter_sizes, c_filter_sizes, n_classes, dropout):
        
        super(CNN, self).__init__()
        
        #length of the word and character embeddings
        word_embedding_dim = w_embeddings.shape[1]
        char_embedding_dim = c_embeddings.shape[1]
        
        #architecture
        self.word_embedding = nn.Embedding.from_pretrained(w_embeddings).cuda()
        self.char_embedding = nn.Embedding.from_pretrained(c_embeddings).cuda()
        
        self.word_convs = [nn.Conv2d(in_channels = 1, 
                               out_channels = n_w_filters, 
                               kernel_size = (f_size, word_embedding_dim + len(c_filter_sizes)*n_c_filters)).cuda() 
                     for f_size in w_filter_sizes]
        
        self.char_convs = [nn.Conv2d(in_channels = 1, 
                               out_channels = n_c_filters, 
                               kernel_size = (f_size, char_embedding_dim)).cuda() 
                     for f_size in c_filter_sizes]
        
        
        self.fc = nn.Linear(len(w_filter_sizes) * n_w_filters, n_classes).cuda()
        
        self.dropout = nn.Dropout(dropout).cuda()
        self.softmax = nn.Softmax().cuda()
        
    def forward(self, word_indices, char_indices):
        
        #first generate word vectors from character embeddings
        char_indices = self.char_embedding(char_indices)
        char_vectors = [self.process_chars(tweet) for tweet in char_indices]
        char_vectors = torch.reshape(torch.cat(char_vectors, dim=0), 
                                    (len(char_vectors), *char_vectors[0].shape))
        char_vectors = char_vectors.unsqueeze(1)
        
        embedded = self.word_embedding(word_indices)
        embedded = embedded.unsqueeze(1)
        embedded = torch.cat((embedded, char_vectors), dim=3)
        
        conved = [F.tanh(conv(embedded)).squeeze(3) for conv in self.word_convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        return self.softmax(self.fc(cat))
    
    def process_chars(self, tweet):
        tweet = tweet.unsqueeze(1)
        conved = [F.tanh(conv(tweet)).squeeze(3) for conv in self.char_convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        word_vector = torch.cat(pooled, dim=1)
        return word_vector
    
    def predict(self, tweet, chars):
        return np.argmax(self.forward(tweet, chars).detach().cpu().numpy())

In [120]:
def train_cnn_classifier(X_train, X_char_train, y_train, w_embeddings, c_embeddings, num_classes, manual_params=None, verbose=False):
    try:
        start = time.time()
        w_embeddings = torch.from_numpy(w_embeddings).float().to(device)
        c_embeddings = torch.from_numpy(c_embeddings).float().to(device)
        
        #default parameters for the model
        params = {'batch_size': 10, 'epochs': 50, 'lr': 0.0001, 'n_w_filters': 400, 
                  'n_c_filters': 100, 'w_filter_sizes': [1,1,1], 'c_filter_sizes': [3,4,5], 'dropout': 0.75}
        
        #replace default parameters with any user-defined ones
        if manual_params is not None:
            for p in manual_params:
                params[p] = manual_params[p]
                
        batch_size = params['batch_size']
        epochs = params['epochs']
        lr = params['lr']
        
        #initialize network and optimizer
        cnn = CNN(w_embeddings, c_embeddings, n_w_filters=params['n_w_filters'], 
                  n_c_filters=params['n_c_filters'], w_filter_sizes=params['w_filter_sizes'],
                  c_filter_sizes=params['c_filter_sizes'], n_classes=num_classes, dropout=params['dropout'])
        cnn.to(device)
        
        optimizer = optim.Adam(cnn.parameters(), lr=lr)
        loss = nn.CrossEntropyLoss()
        
        cnn.train()
        for epoch in range(epochs):
            
            ex_indices = [i for i in range(len(X_train))]
            random.shuffle(ex_indices)
            total_loss = 0.0
            
            for idx in range(len(ex_indices)//batch_size):
                
                #create input batch to feed in
                cur_batch_idx = ex_indices[idx*batch_size:(idx+1)*batch_size]
                cur_X = torch.from_numpy(np.asarray([X_train[i] for i in cur_batch_idx])).long().to(device)
                cur_X_char = torch.from_numpy(np.asarray([X_char_train[i] for i in cur_batch_idx])).long().to(device)
                cur_y = torch.from_numpy(np.asarray([y_train[i] for i in cur_batch_idx])).to(device)
                
                #train
                cnn.zero_grad()
                probs = cnn.forward(cur_X, cur_X_char)
                
                #calculate loss and update weights
                cur_loss = loss(probs, cur_y)
                total_loss += cur_loss
                cur_loss.backward()
                optimizer.step()
            
            if verbose:
                print("Avg loss on epoch %i: %f" % (epoch+1, total_loss/len(ex_indices)))
        end = time.time()
        print("Time taken: %f seconds" % (end-start))
        return cnn
    except KeyboardInterrupt:
        end = time.time()
        print("Time taken: %f seconds" % (end-start))
        return cnn

In [121]:
# evaluates binary classification model
def calc_metrics(model, X_test, X_char_test, y_test):
    num_correct = 0
    num_true_pos = 0
    num_false_pos = 0
    num_false_neg = 0
    
    num_test_exs = len(X_test)
    
    model.eval()
    for i in range(num_test_exs):
        
        cur_batch_idx = [i]
        cur_X = torch.from_numpy(np.asarray([X_test[i] for i in cur_batch_idx])).long().to(device)
        cur_X_char = torch.from_numpy(np.asarray([X_char_test[i] for i in cur_batch_idx])).long().to(device)
        
        y_pred = model.predict(cur_X, cur_X_char)
        y_gold = y_test[i]
        if y_pred == y_gold:
            num_correct += 1
            if y_gold > 0:
                num_true_pos += 1
        else:
            if y_pred == 0:
                num_false_neg += 1
            else:
                num_false_pos += 1

    accuracy = num_correct/num_test_exs
    precision = num_true_pos/(num_true_pos + num_false_pos)
    recall = num_true_pos/(num_true_pos + num_false_neg)
    f1 = 2*precision*recall/(precision+recall)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

In [130]:
def kfold(X, X_char, y, embeddings, char_embeddings, manual_params=None, k=10):
    ex_indices = list(range(X.shape[0]))
    random.shuffle(ex_indices)
    
    accuracy = np.zeros(k)
    precision = np.zeros(k)
    recall = np.zeros(k)
    f1 = np.zeros(k)
    
    #calculate the splitting scheme
    splits = [X.shape[0]//k] * k
    for i in range(X.shape[0] % k):
        splits[i] += 1
    
    #keeps track of current location in 
    index = 0
    for i in range(k):
        #come up with the train-test split
        X_test = np.asarray([X[i] for i in ex_indices[index:index+splits[i]]])
        X_char_test = np.asarray([X_char[i] for i in ex_indices[index:index+splits[i]]])
        y_test = np.asarray([y[i] for i in ex_indices[index:index+splits[i]]])
        
        train_indices = ex_indices[0:index] + ex_indices[index+splits[i]:]
        X_train = np.asarray([X[i] for i in train_indices])
        X_char_train = np.asarray([X_char[i] for i in train_indices])
        y_train = np.asarray([y[i] for i in train_indices])
        
        #now train the model on this split and save the metrics
        cnn = train_cnn_classifier(X_train, X_char_train, y_train, 
                                   embeddings, char_embeddings, num_classes=2, 
                                   manual_params=manual_params, verbose=False)
        
        results = calc_metrics(cnn, X_test, X_char_test, y_test)
        accuracy[i] = results['accuracy']
        precision[i] = results['precision']
        recall[i] = results['recall']
        f1[i] = results['f1']
        
        index += splits[i]
    
    return {'accuracy': np.mean(accuracy), 'precision': np.mean(precision), 
           'recall': np.mean(recall), 'f1': np.mean(f1)}

In [131]:
def gridsearch(X, X_char, y, embeddings, char_embeddings, params, metric='f1', k=10):
    
    results = []
    keys = []
    values = []
    for key in params:
        keys.append(key)
        values.append(params[key])
    
    for config in product(*values):
        p = {}
        for i, v in enumerate(config):
            p[keys[i]] = v
        
        res = kfold(X, X_char, y, embeddings, char_embeddings, manual_params=p, k=k)
        results.append((p, res))
    
    return sorted(results, reverse=True, key=lambda x: x[1][metric])

In [124]:
taco = int(2*len(X)/3)
X_train = X[:taco]
X_char_train = X_char[:taco]
y_train = y[:taco]

X_test = X[taco:]
X_char_test = X_char[taco:]
y_test = y[taco:]

print('Training Relevancy CNN Classifier')
cnn = train_cnn_classifier(X_train, X_char_train, y_train, embeddings, char_embeddings, 2)
print('\nRelevancy metrics:')
calc_metrics(cnn, X_test, X_char_test, y_test)

Training Relevancy CNN Classifier
Time taken: 251.318575 seconds

Relevancy metrics:


{'accuracy': 0.6588235294117647,
 'precision': 0.6293436293436293,
 'recall': 0.7353383458646616,
 'f1': 0.6782246879334257}

In [132]:
kfold(X, X_char, y, embeddings, char_embeddings, k=10)

Time taken: 406.577297 seconds


TypeError: calc_metrics() missing 1 required positional argument: 'y_test'

In [None]:
#first gridsearch for the best region size
#figure out region size first
filter_sizes = []
for i in range(10):
    filter_sizes.append([i+1])
filter_sizes.append([15])
filter_sizes_search = gridsearch(X, y, embeddings, params={'filter_sizes': filter_sizes})