In this project, I will attempt to solve the following problem. Given a headline, predict whether or not it is from the Onion (a satire news source). I wil first attempt to do this using a Nueral Bag Of Words (NBOW model). Then, I'll attempt to use a LSTM and hopefully get better results.

### Part 1. Loading and Preprocessing Data 
The following cell loads the OnionOrNot dataset

In [None]:
!curl https://raw.githubusercontent.com/lukefeilberg/onion/master/OnionOrNot.csv > OnionOrNot.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 1903k  100 1903k    0     0  9019k      0 --:--:-- --:--:-- --:--:-- 9019k


In [None]:
import torch
import random
import numpy as np

RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import re
import html

def spec_add_spaces(t: str) -> str:
    "Add spaces around / and # in `t`. \n"
    return re.sub(r"([/#\n])", r" \1 ", t)

def rm_useless_spaces(t: str) -> str:
    "Remove multiple spaces in `t`."
    return re.sub(" {2,}", " ", t)

def replace_multi_newline(t: str) -> str:
    return re.sub(r"(\n(\s)*){2,}", "\n", t)

def fix_html(x: str) -> str:
    "List of replacements from html strings in `x`."
    re1 = re.compile(r"  +")
    x = (
        x.replace("#39;", "'")
        .replace("amp;", "&")
        .replace("#146;", "'")
        .replace("nbsp;", " ")
        .replace("#36;", "$")
        .replace("\\n", "\n")
        .replace("quot;", "'")
        .replace("<br />", "\n")
        .replace('\\"', '"')
        .replace(" @.@ ", ".")
        .replace(" @-@ ", "-")
        .replace(" @,@ ", ",")
        .replace("\\", " \\ ")
    )
    return re1.sub(" ", html.unescape(x))

def clean_text(input_text):
    text = fix_html(input_text)
    text = replace_multi_newline(text)
    text = spec_add_spaces(text)
    text = rm_useless_spaces(text)
    text = text.strip()
    return text

In [None]:
import pandas as pd
import nltk
from tqdm import tqdm

nltk.download('punkt')
df              = pd.read_csv("OnionOrNot.csv")
df["tokenized"] = df["text"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
df.head()

Unnamed: 0,text,label,tokenized
0,Entire Facebook Staff Laughs As Man Tightens P...,1,"[entire, facebook, staff, laughs, as, man, tig..."
1,Muslim Woman Denied Soda Can for Fear She Coul...,0,"[muslim, woman, denied, soda, can, for, fear, ..."
2,Bold Move: Hulu Has Announced That They’re Gon...,1,"[bold, move, :, hulu, has, announced, that, th..."
3,Despondent Jeff Bezos Realizes He’ll Have To W...,1,"[despondent, jeff, bezos, realizes, he, ’, ll,..."
4,"For men looking for great single women, online...",1,"[for, men, looking, for, great, single, women,..."


In [None]:
df.iloc[42]

text         Customers continued to wait at drive-thru even...
label                                                        0
tokenized    [customers, continued, to, wait, at, drive-thr...
Name: 42, dtype: object

#### Split the dataset into training, validation, and testing

In [None]:
from collections import Counter
PADDING_VALUE = 0
UNK_VALUE     = 1
rame


def split_train_val_test(df, props=[.8, .1, .1]):
    assert round(sum(props), 2) == 1 and len(props) >= 2
    train_df, test_df, val_df = None, None, None
    prev = 0
    for i, prop in enumerate(props):
      props[i] = prev + prop
      prev = props[i]
    train_df, val_df, test_df  = df.iloc[0:int(props[0] * len(df))], df.iloc[int(props[0] * len(df)):int(props[1] * len(df))], df.iloc[int(props[1] * len(df)):]
    return train_df, val_df, test_df

def generate_vocab_map(df, cutoff=2):
    vocab          = {"": PADDING_VALUE, "UNK": UNK_VALUE}
    reversed_vocab = dict()
    
    freqMap = nltk.FreqDist(sum(df["tokenized"].tolist(), []))
    uniqId = 2
    for word in freqMap.keys():
      if freqMap[word] > cutoff:
        vocab[word] = uniqId
        uniqId += 1
    for word in vocab.keys():
      reversed_vocab[vocab[word]] = word
    
    return vocab, reversed_vocab

In [None]:
df                         = df.sample(frac=1)
train_df, val_df, test_df  = split_train_val_test(df, props=[.8, .1, .1])
train_vocab, reverse_vocab = generate_vocab_map(train_df)

In [None]:
(len(train_df) / len(df)), (len(val_df) / len(df)), (len(test_df) / len(df))

(0.8, 0.1, 0.1)

In [None]:
print(type(df["tokenized"][0]))
print(torch.zeros([1], dtype=torch.int32))

<class 'list'>
tensor([0], dtype=torch.int32)


#### Building a Dataset Class

PyTorch has custom Dataset Classes that have very useful extentions, we want to turn our current pandas DataFrame into a subclass of Dataset so that we can iterate and sample through it for minibatch updates. **In the following cell, fill out the HeadlineDataset class.** Refer to PyTorch documentation on [Dataset Classes](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) 
for help.

In [None]:
from torch.utils.data import Dataset

class HeadlineDataset(Dataset):
    
    
    def __init__(self, vocab, df, max_length=50):
        self.vocab = vocab
        self.df = df 
        self.df.reset_index(inplace = True)
        self.max_length = max_length
        return 
    def __len__(self):

        df_len = None  
        df_len = len(self.df)
        return df_len

    def __getitem__(self, index: int):
        tokenized_word_tensor = None
        curr_label            = None
        l = self.df["tokenized"][index]
        tokenized_word_tensor = torch.empty([len(l)], dtype=torch.long)
        for i, word in enumerate(l):
          if word in self.vocab.keys():
            tokenized_word_tensor[i] = self.vocab[word]
          else:
            tokenized_word_tensor[i] = self.vocab["UNK"]
        curr_label = self.df["label"][index]
        return tokenized_word_tensor, curr_label


In [None]:
from torch.utils.data import RandomSampler

train_dataset = HeadlineDataset(train_vocab, train_df)
val_dataset   = HeadlineDataset(train_vocab, val_df)
test_dataset  = HeadlineDataset(train_vocab, test_df)

 
train_sampler = RandomSampler(train_dataset)
val_sampler   = RandomSampler(val_dataset)
test_sampler  = RandomSampler(test_dataset)

#### Finishing DataLoader

We can now use PyTorch DataLoaders to batch our data for us. **In the following cell fill out collate_fn.** Refer to PyTorch documentation on [DataLoaders](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) for help.

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, padding_value=PADDING_VALUE):
    padded_tokens, y_labels = None, None
    maxlen = max(batch, key=len)
    padded_tokens = pad_sequence([x[0] for x in batch], batch_first = True, padding_value = PADDING_VALUE)
    y_labels = torch.Tensor([x[1] for x in batch])
    return padded_tokens, y_labels

In [None]:
from torch.utils.data import DataLoader
BATCH_SIZE = 16

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator  = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

In [None]:
for x, y in test_iterator:
    print(f'x: {x.shape}')
    print(f'y: {y.shape}')
    break
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

x: torch.Size([16, 23])
y: torch.Size([16])


### Create NBOW Model
Architecture Reference: Section 2.1 in (https://www.aclweb.org/anthology/P15-1162.pdf). 

In [None]:
import torch.nn as nn

class NBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedLayer = nn.Embedding(vocab_size, embedding_dim)
        self.linearLayer = nn.Linear(embedding_dim, 1)
        self.sigmoidLayer = nn.Sigmoid()

    def forward(self, x):
        EmbedOutput = self.embedLayer(x)
        LinearOutput = self.linearLayer(torch.mean(EmbedOutput, dim=1))
        return torch.squeeze(self.sigmoidLayer(LinearOutput))



In [None]:
model = NBOW(vocab_size    = len(train_vocab.keys()),
             embedding_dim = 300).to(device)

Loss function and Optimizer

In [None]:
from torch.optim import Adam

criterion, optimizer = None, None
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr = .001)



### Part 3: Training and Evaluation


In [None]:
def train_loop(model, criterion, optim, iterator):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator):
        output = model(x.to(device))
        loss = criterion(output, y.to(device))
        optim.zero_grad()
        loss.backward()
        optim.step()
        total_loss += loss.item()
    return total_loss

def val_loop(model, iterator):
    true, pred = [], []
    for x, y in tqdm(iterator):
      predictedVals = model(x.to(device))
      trueVals = y.to(device)
      true = [x == 1 for x in trueVals]
      pred = [x >= .5 for x in predictedVals]
    return true, pred

#Define and Use evaluation metrics

For the sake of learning, I chose to implement my own evaluation metrics.

In [None]:
def accuracy(true, pred):
    acc = None
    ## YOUR CODE STARTS HERE (~2-5 lines of code) ##
    arr = [x[0] == x[1] for x in zip(true, pred)]
    acc = sum(arr) / len(arr)
    ## YOUR CODE ENDS HERE ##
    return acc


def binary_f1(true, pred, selected_class=True):
    f1 = None
    ## YOUR CODE STARTS HERE (~10-15 lines of code) ##
    tup = zip(true, pred)
    tp = 0
    tn = 0 
    fp = 0
    fn = 0
    for t, p in tup:
      if t == p:
        if p == True:
          tp += 1
        else:
          tn += 1
      else:
        if t == True and p == False:
          fn += 1
        else:
          fp += 1

    if selected_class:
      tprecision = 0
      if tp + fp == 0:
        tprecision = 0
      else:
        tprecision = tp / (tp + fp)
      trecall = 0
      if tp + fn == 0:
        trecall = 0
      else:
        trecall = tp / (tp + fn)
      if tprecision + trecall == 0:
        return 0
      f1 = 2 * (tprecision * trecall) / (tprecision + trecall)
    else:
      fprecision = 0
      if tn + fn == 0:
        fprecision = 0
      else:
        fprecision = tn / (tn + fn)
      frecall = 0
      if tn + fp == 0:
        frecall = 0
      else:
        frecall = tn / (tn + fp)
      if fprecision + frecall == 0:
        return 0
      f1 = 2 * (fprecision * frecall) / (fprecision + frecall)
    ## YOUR CODE ENDS HERE ##
    return f1


def binary_macro_f1(true, pred):
    averaged_macro_f1 = None
    averaged_macro_f1 = (binary_f1(true, pred, selected_class=True) + binary_f1(true, pred, selected_class=False)) / 2
    return averaged_macro_f1

In [None]:
true, pred = val_loop(model, val_iterator)
print()
print(f'Binary Macro F1: {binary_macro_f1(true, pred)}')
print(f'Accuracy: {accuracy(true, pred)}')

100%|██████████| 150/150 [00:00<00:00, 553.81it/s]


Binary Macro F1: 0.375
Accuracy: 0.375





### Part 4: Training the model 

In [None]:
TOTAL_EPOCHS = 10
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, optimizer, train_iterator)
    true, pred = val_loop(model, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|██████████| 1200/1200 [00:03<00:00, 385.75it/s]
100%|██████████| 150/150 [00:00<00:00, 570.37it/s]


EPOCH: 0
TRAIN LOSS: 621.4540008604527
VAL F-1: 0.5897435897435898
VAL ACC: 0.75


100%|██████████| 1200/1200 [00:03<00:00, 384.94it/s]
100%|██████████| 150/150 [00:00<00:00, 558.99it/s]


EPOCH: 1
TRAIN LOSS: 404.4328829944134
VAL F-1: 0.8333333333333333
VAL ACC: 0.875


100%|██████████| 1200/1200 [00:03<00:00, 385.53it/s]
100%|██████████| 150/150 [00:00<00:00, 550.92it/s]


EPOCH: 2
TRAIN LOSS: 317.68711391836405
VAL F-1: 0.746031746031746
VAL ACC: 0.75


100%|██████████| 1200/1200 [00:03<00:00, 382.98it/s]
100%|██████████| 150/150 [00:00<00:00, 555.87it/s]


EPOCH: 3
TRAIN LOSS: 267.90556765161455
VAL F-1: 0.9352226720647774
VAL ACC: 0.9375


100%|██████████| 1200/1200 [00:03<00:00, 385.54it/s]
100%|██████████| 150/150 [00:00<00:00, 552.14it/s]


EPOCH: 4
TRAIN LOSS: 229.62556424643844
VAL F-1: 0.9372549019607843
VAL ACC: 0.9375


100%|██████████| 1200/1200 [00:03<00:00, 386.34it/s]
100%|██████████| 150/150 [00:00<00:00, 559.25it/s]


EPOCH: 5
TRAIN LOSS: 201.79788933508098
VAL F-1: 0.8117647058823529
VAL ACC: 0.8125


100%|██████████| 1200/1200 [00:03<00:00, 386.44it/s]
100%|██████████| 150/150 [00:00<00:00, 558.21it/s]


EPOCH: 6
TRAIN LOSS: 179.83632330223918
VAL F-1: 1.0
VAL ACC: 1.0


100%|██████████| 1200/1200 [00:03<00:00, 388.64it/s]
100%|██████████| 150/150 [00:00<00:00, 556.32it/s]


EPOCH: 7
TRAIN LOSS: 159.9260141660925
VAL F-1: 0.9352226720647774
VAL ACC: 0.9375


100%|██████████| 1200/1200 [00:03<00:00, 387.05it/s]
100%|██████████| 150/150 [00:00<00:00, 552.77it/s]


EPOCH: 8
TRAIN LOSS: 144.3743618351873
VAL F-1: 0.8545454545454546
VAL ACC: 0.875


100%|██████████| 1200/1200 [00:03<00:00, 387.43it/s]
100%|██████████| 150/150 [00:00<00:00, 562.06it/s]

EPOCH: 9
TRAIN LOSS: 131.1241482088808
VAL F-1: 0.7090909090909091
VAL ACC: 0.75





We can also look at the models performance on the held-out test set, using the same val_loop we wrote earlier.

In [None]:
true, pred = val_loop(model, test_iterator)
print()
print(f"TEST F-1: {binary_macro_f1(true, pred)}")
print(f"TEST ACC: {accuracy(true, pred)}")

100%|██████████| 150/150 [00:00<00:00, 559.35it/s]


TEST F-1: 0.8666666666666667
TEST ACC: 0.875





### Part 6: LSTM Model 

In [None]:
class RecurrentModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, \
                 num_layers=1, bidirectional=True):
        super().__init__()
        mult = 1
        if(bidirectional):
          mult = 2
        self.embedLayer = nn.Embedding(vocab_size, embedding_dim)
        self.LSTMLayer = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=bidirectional, batch_first=True)
        self.linearLayer = nn.Linear(mult * embedding_dim, 1)
        self.sigmoidLayer = nn.Sigmoid()

    def forward(self, x):
        EmbedOutput = self.embedLayer(x)
        LSTMOutput, (hidden, cell) = self.LSTMLayer(EmbedOutput)
        LinearOutput = self.linearLayer(LSTMOutput[:, -1, :])
        return torch.squeeze(self.sigmoidLayer(LinearOutput))
    

In [None]:
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator  = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

In [None]:
lstm_model = RecurrentModel(vocab_size    = len(train_vocab.keys()),
                            embedding_dim = 300,
                            hidden_dim    = 300,
                            num_layers    = 5,
                            bidirectional = False).to(device)

In [None]:
from torch.optim.adagrad import Adagrad
from torch.optim import Adam

lstm_criterion, lstm_optimizer = None, None
lstm_criterion = nn.BCELoss()
lstm_optimizer = Adam(lstm_model.parameters(), lr = .001)


### Training and Evaluation



In [None]:
true, pred = val_loop(lstm_model, val_iterator)
print()
print(f'Binary Macro F1: {binary_macro_f1(true, pred)}')
print(f'Accuracy: {accuracy(true, pred)}')

100%|██████████| 150/150 [00:00<00:00, 218.41it/s]


Binary Macro F1: 0.36
Accuracy: 0.5625





In [None]:
TOTAL_EPOCHS = 10
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(lstm_model, lstm_criterion, lstm_optimizer, train_iterator)
    true, pred = val_loop(lstm_model, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|██████████| 1200/1200 [00:12<00:00, 94.15it/s]
100%|██████████| 150/150 [00:00<00:00, 219.89it/s]


EPOCH: 0
TRAIN LOSS: 711.4798891246319
VAL F-1: 0.805668016194332
VAL ACC: 0.8125


100%|██████████| 1200/1200 [00:12<00:00, 94.81it/s]
100%|██████████| 150/150 [00:00<00:00, 221.23it/s]


EPOCH: 1
TRAIN LOSS: 601.4297215938568
VAL F-1: 0.746031746031746
VAL ACC: 0.75


100%|██████████| 1200/1200 [00:12<00:00, 94.94it/s]
100%|██████████| 150/150 [00:00<00:00, 217.84it/s]


EPOCH: 2
TRAIN LOSS: 399.8543336354196
VAL F-1: 0.8666666666666667
VAL ACC: 0.875


100%|██████████| 1200/1200 [00:12<00:00, 94.85it/s]
100%|██████████| 150/150 [00:00<00:00, 223.04it/s]


EPOCH: 3
TRAIN LOSS: 289.0646998193115
VAL F-1: 0.9352226720647774
VAL ACC: 0.9375


100%|██████████| 1200/1200 [00:12<00:00, 94.63it/s]
100%|██████████| 150/150 [00:00<00:00, 219.63it/s]


EPOCH: 4
TRAIN LOSS: 221.1596870906651
VAL F-1: 0.8117647058823529
VAL ACC: 0.8125


100%|██████████| 1200/1200 [00:12<00:00, 94.53it/s]
100%|██████████| 150/150 [00:00<00:00, 218.60it/s]


EPOCH: 5
TRAIN LOSS: 162.88612027280033
VAL F-1: 0.7681159420289854
VAL ACC: 0.8125


100%|██████████| 1200/1200 [00:12<00:00, 94.00it/s]
100%|██████████| 150/150 [00:00<00:00, 221.00it/s]


EPOCH: 6
TRAIN LOSS: 110.52352964691818
VAL F-1: 1.0
VAL ACC: 1.0


100%|██████████| 1200/1200 [00:12<00:00, 93.93it/s]
100%|██████████| 1200/1200 [00:12<00:00, 94.34it/s]
100%|██████████| 150/150 [00:00<00:00, 218.57it/s]


EPOCH: 8
TRAIN LOSS: 65.77736117457971
VAL F-1: 0.8666666666666667
VAL ACC: 0.875


100%|██████████| 1200/1200 [00:12<00:00, 94.07it/s]
100%|██████████| 150/150 [00:00<00:00, 218.84it/s]

EPOCH: 9
TRAIN LOSS: 61.73469458904583
VAL F-1: 0.8545454545454546
VAL ACC: 0.875





In [None]:
true, pred = val_loop(lstm_model, test_iterator)
print()
print(f"TEST F-1: {binary_macro_f1(true, pred)}")
print(f"TEST ACC: {accuracy(true, pred)}")

100%|██████████| 150/150 [00:00<00:00, 214.51it/s]


TEST F-1: 0.873015873015873
TEST ACC: 0.875



