In [7]:
from time import time
import sys

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.model_selection import train_test_split
from sklearn import metrics as skmetrics
from nltk.tokenize import TweetTokenizer
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter

np.random.seed(1234) # help reproducibility

In [8]:
%config IPCompleter.greedy=True

In [48]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_filters, num_classes, max_tweet_length, embeddings=None):
        super(CNN, self).__init__()

        if embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(embeddings)
        else:
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=2)
#         self.conv3 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=3)
        self.max_pool = nn.MaxPool1d(kernel_size=max_tweet_length+1, padding=(max_tweet_length+1) // 2)
        self.g = nn.ReLU()
        self.linear = nn.Linear(in_features=num_filters * 2, out_features=num_classes)
        self.loss = nn.CrossEntropyLoss()
                
    def forward(self, x):
        
        emb = self.embedding(x).T
        emb = emb.unsqueeze(0)
        
        c1 = self.conv1(emb)
        c2 = self.conv2(emb)
#         c3 = self.conv3(emb)
        
        c1_pooled = self.max_pool(c1)
        c2_pooled = self.max_pool(c2)
#         c3_pooled = self.max_pool(c3)
        
        h1 = self.g(c1_pooled)
        h2 = self.g(c2_pooled)
#         h3 = self.g(c3_pooled)
        
        try:
#             all_out = torch.cat((h1, h2, h3), 1).squeeze()
            all_out = torch.cat((h1, h2), 1).squeeze()
        except Exception as e:
            print(e)
            print(h1.shape)
            print(h2.shape)
#             print(h3.shape)
            sys.exit()
        
        y_hat = self.linear(all_out)
        
        return y_hat
    
    def train(self, X, Y, iterations, learning_rate):
        print('beginning training...')
        num_samples = len(Y)
        
        optimizer = optim.Adam(params=self.parameters(), lr=learning_rate)
        
        for e in range(iterations):
            # randomize order of samples at each epoch
            rand_idx = np.random.permutation(num_samples)

            total_loss = 0
            count = 0
            for i in rand_idx:
                x = X[i]
                y = Y[i].long().unsqueeze(0)
                
                self.zero_grad()
                y_hat = self(x).unsqueeze(0)
                
                loss = self.loss.forward(y_hat, y)
                total_loss += loss
                loss.backward()
                optimizer.step()
                
#                 if count % 1000 == 0:
#                     print(f'loss as of sample {count}: {total_loss}')
                    
#                 count += 1

            print(f'total loss on epoch {e}: {total_loss}')
        print('done training!')
        return
    
    def eval(self, X, Y, idx=None):
        print('evaluating...')
        Y_hat = torch.zeros(len(Y))
        
        if idx is not None:
            fp = open('predictions/cnn-validation.csv', 'w')
            fp.write('id,prediction\n')
        
        for i in range(len(X)):
            y_prob = self(X[i])
            pred = torch.argmax(y_prob)
            Y_hat[i] = pred
            if idx is not None:
                fp.write(f'{idx[i]},{pred}\n')
                
        
        if idx is not None:
            fp.close()
        
        
        metrics = {}
        
        metrics['acc'] = skmetrics.accuracy_score(Y, Y_hat)
        metrics['f1'] = skmetrics.f1_score(Y, Y_hat)
        metrics['prec'] = skmetrics.precision_score(Y, Y_hat)
        metrics['rec'] = skmetrics.recall_score(Y, Y_hat)
        
        
        return metrics

In [45]:
def get_embeddings(word2id, embedding_dim):
    print('loading word embeddings...')
    embedding_file = 'data/glove.twitter.27B/glove.twitter.27B.%dd.txt' % embedding_dim
    
    embeddings = torch.empty(len(word2id), embedding_dim)
    glove_embeddings = {}
    
    with open(embedding_file) as fp:
        for line in fp.readlines():
            arr = line.split()
            glove_embeddings[arr[0]] = np.array(arr[1:embedding_dim+1]).astype(float)
        
    for w, idx in word2id.items():
        if w in glove_embeddings:
            embeddings[idx] = torch.LongTensor(glove_embeddings[w])
        else:
            embeddings[idx] = torch.randn(embedding_dim)
            
    return embeddings
            
            

In [34]:
# **** FOR TRAINING AND TESTING ON OLID TRAINING SET ****

# extract data from file and format accordingly
data = pd.read_csv('data/OLIDv1.0/olid-training-v1.0.tsv', '\t')
X_raw = data['tweet'].values
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

# build vocab
print('building vocabulary...')
vocab = Counter()
for tweet in data['tweet']:
    vocab.update(w for w in tknzr.tokenize(tweet))
    
vocab = sorted(vocab, key=vocab.get, reverse=True)

word2id = {w:i for i,w in enumerate(vocab)}
id2word = {i:w for i,w in enumerate(vocab)}

def convert_to_idx(tweet):
    return [word2id[w] for w in tknzr.tokenize(tweet)]

data['tweet_tokenized'] = data['tweet'].apply(convert_to_idx)
data['length'] = data['tweet_tokenized'].apply(len)
max_tweet_len = int(data['length'].max())

data_without_short_tweets = data.loc[data['length'] >= 3]

X = [x for x in data_without_short_tweets['tweet_tokenized']]


Y_raw = data_without_short_tweets['subtask_a'].values
Y = np.zeros(len(Y_raw))
Y[np.where(Y_raw == 'OFF')] = 1.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2)

# convert Xs to tensors
X_train = [torch.LongTensor(x) for x in X_train]
X_test = [torch.LongTensor(x) for x in X_test]
Y_train = torch.Tensor(Y_train)
Y_test = torch.Tensor(Y_test)

# this prevents printing to predictions/cnn-validation.csv
idx_test = None

building vocabulary...


In [46]:
# **** FOR TRAINING ON OLID TRAING AND TESTING ON OLID VALIDATION

training_df = pd.read_csv('data/OLIDv1.0/olid-training-v1.0.tsv', '\t')

test_df = pd.read_csv('data/OLIDv1.0/testset-levela.tsv', '\t')
Y_test_df = pd.read_csv('data/OLIDv1.0/labels-levela.csv', names=['id', 'label'])

test_df['label'] = Y_test_df['label']


tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

# build vocab over both training and test sets
print('building vocabulary...')
vocab = Counter()
for tweet in training_df['tweet'].append(test_df['tweet']):
    vocab.update(w for w in tknzr.tokenize(tweet))
    
vocab = sorted(vocab, key=vocab.get, reverse=True)

word2id = {w:i for i,w in enumerate(vocab)}
id2word = {i:w for i,w in enumerate(vocab)}

def convert_to_idx(tweet):
    return [word2id[w] for w in tknzr.tokenize(tweet)]

print('converting tweets to IDs...')
training_df['tweet_tokenized'] = training_df['tweet'].apply(convert_to_idx)
training_df['length'] = training_df['tweet_tokenized'].apply(len)

test_df['tweet_tokenized'] = test_df['tweet'].apply(convert_to_idx)
test_df['length'] = test_df['tweet_tokenized'].apply(len)

print('removing short tweets...')
max_tweet_len = int((training_df['length'].append(test_df['length'])).max())

train_without_short_tweets = training_df.loc[training_df['length'] >= 3]
test_without_short_tweets = test_df.loc[test_df['length'] >= 3]

X_train = [x for x in train_without_short_tweets['tweet_tokenized']]
X_test = [x for x in test_without_short_tweets['tweet_tokenized']]

Y_train_raw = train_without_short_tweets['subtask_a'].values
Y_train = np.zeros(len(Y_train_raw))
Y_train[np.where(Y_train_raw == 'OFF')] = 1.

Y_test_raw = test_without_short_tweets['label'].values
Y_test = np.zeros(len(Y_test_raw))
Y_test[np.where(Y_test_raw == 'OFF')] = 1.

# this means we will print to predictions/cnn-validation.csv
idx_test = test_without_short_tweets['id'].values

X_train = [torch.LongTensor(x) for x in X_train]
X_test = [torch.LongTensor(x) for x in X_test]
Y_train = torch.Tensor(Y_train)
Y_test = torch.Tensor(Y_test)
print('done prepping data')

building vocabulary...
converting tweets to IDs...
removing short tweets...
done prepping data


In [49]:
# hyper params
EPOCHS = 3
LR = .01
EMBEDDING_DIM = 10
NUM_FILTERS = 200


# embeddings = get_embeddings(word2id, EMBEDDING_DIM)
model = CNN(vocab_size=len(vocab), embedding_dim=EMBEDDING_DIM, num_filters=NUM_FILTERS, num_classes=2, max_tweet_length=max_tweet_len)
model.train(X_train, Y_train, EPOCHS, LR)
metrics = model.eval(X_test, Y_test, idx_test)

acc = metrics['acc']
f1 = metrics['f1']
prec = metrics['prec']
rec = metrics['rec']

print('Metrics:')
print(f'accuracy: {acc}')
print(f'f1: {f1}')
print(f'precision: {prec}')
print(f'recall: {rec}')


yooooo
beginning training...
total loss on epoch 0: 10224.8271484375
total loss on epoch 1: 9280.5302734375
total loss on epoch 2: 8348.5615234375
done training!
evaluating...
Metrics:
accuracy: 0.79463243873979
f1: 0.5294117647058822
precision: 0.7333333333333333
recall: 0.41422594142259417


### Results

#### self trained embeddings

* three sizes of filter (1, 2, 3)
* 0.01 learning rate
* 3 epochs

  * accuracy: 0.6874760628111835
  * f1: 0.4708171206225681
  * precision: 0.5208034433285509
  * recall: 0.42958579881656805
 
* three sizes of filter (1, 2, 3)
* 0.01 learning rate
* 4 epochs

  * accuracy: 0.7338184603600153
  * f1: 0.42514474772539296
  * precision: 0.6710182767624021
  * recall: 0.31113801452784506
  
* three sizes of filter (1, 2, 3)
* 0.01 learning rate
* 5 epochs

  * accuracy: 0.6974339333588664
  * f1: 0.5135467980295566
  * precision: 0.5567423230974633
  * recall: 0.4765714285714286
  
* three sizes of filter (1, 2, 3)
* 0.001 learning rate
* 5 epochs

  * accuracy: 0.7123707391803906
  * f1: 0.5406727828746177
  * precision: 0.5755208333333334
  * recall: 0.5098039215686274
  
* two sizes of filter (1, 2)
* 0.01 learning rate
* 4 epochs

  * accuracy: 0.7280735350440444
  * f1: 0.5069444444444444
  * precision: 0.6529516994633273
  * recall: 0.41430192962542567
  
* two sizes of filter (1, 2)
* 0.001 learning rate
* 5 epochs

  * accuracy: 0.7261585599387208
  * f1: 0.3986543313708999
  * precision: 0.7337461300309598
  * recall: 0.27367205542725176
  
* two sizes of filter (1, 2)
* 0.01 learning rate
* 3 epochs

  * accuracy: 0.7342014553810801
  * f1: 0.5475880052151239
  * precision: 0.6451612903225806
  * recall: 0.47565118912797283
  
#### pre-trained embeddings

* two sizes of filter (1, 2)
* 0.01 learning rate
* 3 epochs

  * accuracy: 0.665645346610494
  * f1: 0.3361216730038023
  * precision: 0.5224586288416075
  * recall: 0.24775784753363228