In [1]:
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 torchtext==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp38-cp38-linux_x86_64.whl (1982.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 GB[0m [31m868.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.9.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torchvision-0.9.0%2Bcu111-cp38-cp38-linux_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==0.8.0
  Downloading torchaudio-0.8.0-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-c

In [2]:
!git clone https://github.com/bhargaviparanjape/clickbait.git
!gzip -d /content/clickbait/dataset/clickbait_data.gz
!gzip -d /content/clickbait/dataset/non_clickbait_data.gz

Cloning into 'clickbait'...
remote: Enumerating objects: 104, done.[K
remote: Total 104 (delta 0), reused 0 (delta 0), pack-reused 104[K
Receiving objects: 100% (104/104), 1.54 MiB | 7.65 MiB/s, done.
Resolving deltas: 100% (44/44), done.


In [4]:
import pandas as pd

In [5]:
f1 = open('/content/clickbait/dataset/clickbait_data', 'r')
clickbait = f1.readlines()

# clickbait
clickbait = list(map(lambda x:x.strip(),clickbait))
clickbait = list(filter(None, clickbait))

cb_df = pd.DataFrame(clickbait, columns = ['text'])
cb_df['label'] = 1

f2 = open('/content/clickbait/dataset/clickbait_data', 'r')
non_clickbait = f2.readlines()

non_clickbait = list(map(lambda x:x.strip(),non_clickbait))
non_clickbait = list(filter(None, non_clickbait))

ncb_df = pd.DataFrame(non_clickbait, columns = ['text'])
ncb_df['label'] = 0

raw_df = pd.concat([cb_df, ncb_df], ignore_index=True)
raw_df.to_csv('raw_data.csv')

In [6]:
import torch
from torchtext.legacy import data

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)

In [7]:
fields = [('Unnamed: 0', None), ('text', TEXT), ("label", LABEL)]

raw_data = data.TabularDataset(path="raw_data.csv",format="csv",fields=fields,skip_header=True)

In [8]:
import random

# train and validation splitting
train_data,test_data = raw_data.split(split_ratio=0.80,random_state=random.seed(SEED))
print(f'Number of training examples: {len(train_data)}')
valid_data, test_data = test_data.split(split_ratio=0.50,random_state=random.seed(SEED))
print(f'Number of testing examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25598
Number of testing examples: 3200
Number of testing examples: 3200


In [9]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Unique tokens in TEXT vocabulary: 11621
Unique tokens in LABEL vocabulary: 2
[('You', 8865), ('The', 7980), ('"', 7919), ('To', 5151), ('A', 4195), ('Your', 4082), ('Of', 3888), ("'s", 3706), ('Are', 3223), ('That', 3105), ('In', 3088), ('This', 2842), ('And', 2610), ('Is', 2605), ('On', 2359), ('For', 2265), ('What', 2101), ('Will', 2002), ('-', 1766), ('About', 1708)]
['<unk>', '<pad>', 'You', 'The', '"', 'To', 'A', 'Your', 'Of', "'s"]
defaultdict(None, {'0': 0, '1': 1})


In [10]:
# Setting up mini batching using dataloaders and collate function
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


def encode_text_pipeline(source):
  encoded_text = [TEXT.vocab[word] for word in source]
  return torch.tensor(encoded_text, dtype = torch.int64)

def encoded_label_pipeline(target):
  encoded_label = LABEL.vocab[target]
  return torch.tensor(encoded_label, dtype = torch.int64)

def pad_function(batch):
  x_data = []
  y_data = []
  x_lengths = []
  for i in range(len(batch)):
    x_lengths.append(len(batch[i].text))
    encoded_text = encode_text_pipeline(batch[i].text)
    x_data.append(encoded_text)
    encoded_label = encoded_label_pipeline(batch[i].label)
    y_data.append(encoded_label)
  padded_data = pad_sequence(x_data, batch_first = True, padding_value=0)
  return padded_data, torch.tensor(y_data), torch.tensor(x_lengths)


def create_loader(dataset):
  data_loader = DataLoader(dataset=dataset, batch_size=32, collate_fn = pad_function, drop_last = True)
  return data_loader

train_loader = create_loader(train_data)
valid_loader = create_loader(valid_data)
test_loader = create_loader(test_data)

In [11]:
# for x, y, length in train_loader:
#   print(x)
#   print(y)
#   print(length)
#   break


torch.Size([32, 17])
tensor([[    3,   601,  1067,   221,   289,     2,    60,    71,   213,     0,
             0,     0,     0,     0,     0,     0,     0],
        [   13,   600,  2109,   787,    15,     3,  2431,   433,     2,    60,
            71,  1543,     0,     0,     0,     0,     0],
        [   32,     2,    40,    29,    27,   634,   784,    39,  2182,     0,
             0,     0,     0,     0,     0,     0,     0],
        [   32,     2,    29,    27,   187,   125,    13,     4,  4438,     4,
            15,    30,     0,     0,     0,     0,     0],
        [  141,    47,  2147,  1558,   282,    88,   283,   532,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [   18,    34,     9,    49,     5,    43,   766,     3,  1604,    53,
             6,  1038,  3144,     0,     0,     0,     0],
        [   91,    97,    25,    22,    10,    45,  3210,   648,    19,   119,
           101,     0,     0,     0,     0,     0,     0],
        [   2

In [23]:
import torch.nn as nn
import torch.nn as nn
class LSTM_classifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super(LSTM_classifier, self).__init__()
        self.n_layers = 1
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, dropout=0.4)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, x_lengths):
      batch_size = text.size(0)
      embedded = self.embedding(text)
      packed_embeddings = nn.utils.rnn.pack_padded_sequence(embedded, x_lengths, batch_first=True, enforce_sorted=False)
      packed_output, (hidden, cell) = self.lstm(packed_embeddings)
      output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
      prediction = self.fc(torch.mean(output, 0))
      return prediction, hidden
    

In [24]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 1

model = LSTM_classifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM ,OUTPUT_DIM)



In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,204,661 trainable parameters


In [26]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [27]:
criterion = nn.BCEWithLogitsLoss()

In [28]:
model = model.to(device)
criterion = criterion.to(device)

In [49]:
from sklearn.metrics import f1_score

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    f1 = f1_score(rounded_preds.tolist(), y.tolist())
    return acc, f1

In [54]:
from tqdm import tqdm
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = []
    epoch_accuracy = []
    epoch_f1 = []
    for batch_text, batch_labels, batch_lengths in tqdm(iterator):
        batch_size = batch_text.size(0)
        optimizer.zero_grad()
        batch_labels = batch_labels.to(device)
        batch_predictions, hidden = model(batch_text.to(device), batch_lengths)
        hidden.detach()
        batch_loss = criterion(batch_predictions.squeeze(), batch_labels.squeeze().float())
        epoch_loss.append(batch_loss)
        batch_loss.backward()
        optimizer.step()
        batch_acc, batch_f1 = binary_accuracy(batch_predictions.squeeze(), batch_labels.squeeze())
        epoch_accuracy.append(batch_acc)
        epoch_f1.append(batch_f1)
    return sum(epoch_loss)/len(epoch_loss), sum(epoch_accuracy)/len(epoch_accuracy), sum(epoch_f1)/len(epoch_f1)

In [55]:
def evaluate(model, iterator, criterion):
    epoch_loss = []
    epoch_accuracy = []
    epoch_f1 = []
    model.eval()
    with torch.no_grad():
        for batch_text, batch_labels, batch_lengths in tqdm(iterator):
            batch_size = batch_text.size(0)
            batch_labels = batch_labels.to(device)
            batch_predictions, hidden = model(batch_text.to(device), batch_lengths)
            hidden.detach()
            batch_loss = criterion(batch_predictions.squeeze(), batch_labels.squeeze().float())
            epoch_loss.append(batch_loss)
            batch_acc, batch_f1 = binary_accuracy(batch_predictions.squeeze(), batch_labels.squeeze())
            epoch_accuracy.append(batch_acc)
            epoch_f1.append(batch_f1)
    return sum(epoch_loss)/len(epoch_loss), sum(epoch_accuracy)/len(epoch_accuracy), sum(epoch_f1)/len(epoch_f1)

In [56]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [62]:
N_EPOCHS = 20

best_valid_loss = float('inf')
tolerance = 2

for epoch in range(N_EPOCHS):
    print('Epoch: ', epoch)
    start_time = time.time()
    train_epoch_loss, train_epoch_accuracy, train_epoch_F1 = train(model, train_loader, optimizer, criterion)
    print('Loss: ', train_epoch_loss)
    print('Accuracy: ', train_epoch_accuracy)
    valid_epoch_loss, valid_epoch_accuracy, valid_epoch_F1 = evaluate(model, valid_loader, criterion)
    print('Loss: ', valid_epoch_loss)
    print('Accuracy: ', valid_epoch_accuracy)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_epoch_loss < best_valid_loss:
      best_valid_loss = valid_epoch_loss
    else:
      tolerance -= 1
      if tolerance == 0:  
        break
      
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print('Train Loss:', train_epoch_loss)
    print('Train accuracy', train_epoch_accuracy)
    print('Train F1', train_epoch_F1)
    print('Val Loss:', valid_epoch_loss)
    print('Val accuracy', valid_epoch_accuracy)
    print('Val F1', valid_epoch_F1)

Epoch:  0


100%|██████████| 799/799 [00:07<00:00, 111.64it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4969, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 235.12it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.4994, device='cuda:0')
Epoch: 01 | Epoch Time: 0m 7s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4969, device='cuda:0')
Train F1 0.512309218137563
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.4994, device='cuda:0')
Val F1 0.5159421368791764
Epoch:  1


100%|██████████| 799/799 [00:05<00:00, 144.32it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4973, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 222.66it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.4966, device='cuda:0')
Epoch: 02 | Epoch Time: 0m 6s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4973, device='cuda:0')
Train F1 0.5090896738184288
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.4966, device='cuda:0')
Val F1 0.5092660606590604
Epoch:  2


100%|██████████| 799/799 [00:06<00:00, 123.51it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4978, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 241.30it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.4981, device='cuda:0')
Epoch: 03 | Epoch Time: 0m 6s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4978, device='cuda:0')
Train F1 0.506161728623709
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.4981, device='cuda:0')
Val F1 0.5071006982286046
Epoch:  3


100%|██████████| 799/799 [00:05<00:00, 147.70it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4977, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 235.83it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.4994, device='cuda:0')
Epoch: 04 | Epoch Time: 0m 5s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4977, device='cuda:0')
Train F1 0.5036230993170404
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.4994, device='cuda:0')
Val F1 0.5065165077375664
Epoch:  4


100%|██████████| 799/799 [00:06<00:00, 129.66it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4983, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 167.22it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.4994, device='cuda:0')
Epoch: 05 | Epoch Time: 0m 6s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4983, device='cuda:0')
Train F1 0.5022720210936129
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.4994, device='cuda:0')
Val F1 0.5052092214237011
Epoch:  5


100%|██████████| 799/799 [00:05<00:00, 150.15it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4984, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 244.63it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.5003, device='cuda:0')
Epoch: 06 | Epoch Time: 0m 5s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4984, device='cuda:0')
Train F1 0.5007967461648647
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.5003, device='cuda:0')
Val F1 0.5044930781584153
Epoch:  6


100%|██████████| 799/799 [00:05<00:00, 138.24it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4980, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 168.61it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.5009, device='cuda:0')
Epoch: 07 | Epoch Time: 0m 6s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4980, device='cuda:0')
Train F1 0.4993325909721835
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.5009, device='cuda:0')
Val F1 0.5038398136047496
Epoch:  7


100%|██████████| 799/799 [00:05<00:00, 136.39it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4980, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 242.85it/s]


Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.5019, device='cuda:0')
Epoch: 08 | Epoch Time: 0m 6s
Train Loss: tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.4980, device='cuda:0')
Train F1 0.4980019513088975
Val Loss: tensor(0.6931, device='cuda:0')
Val accuracy tensor(0.5019, device='cuda:0')
Val F1 0.5035955911930291
Epoch:  8


100%|██████████| 799/799 [00:05<00:00, 146.84it/s]


Loss:  tensor(0.6933, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.4978, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 181.27it/s]

Loss:  tensor(0.6931, device='cuda:0')
Accuracy:  tensor(0.5012, device='cuda:0')





In [63]:
# model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc, test_f1 = evaluate(model, test_loader, criterion)

print('Test Loss',test_loss)
print('Test accuracy', test_acc)
print('Test F1', test_f1)

100%|██████████| 100/100 [00:00<00:00, 185.78it/s]

Test Loss tensor(0.6928, device='cuda:0')
Test accuracy tensor(0.5150, device='cuda:0')
Test F1 0.5214085798885922



