In [1]:
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 torchtext==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp38-cp38-linux_x86_64.whl (1982.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 GB[0m [31m874.9 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.9.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torchvision-0.9.0%2Bcu111-cp38-cp38-linux_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==0.8.0
  Downloading torchaudio-0.8.0-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtext==0.9.0
  Downloading torchtext-0.9.0-c

In [2]:
!git clone https://github.com/bhargaviparanjape/clickbait.git
!gzip -d /content/clickbait/dataset/clickbait_data.gz
!gzip -d /content/clickbait/dataset/non_clickbait_data.gz

Cloning into 'clickbait'...
remote: Enumerating objects: 104, done.[K
remote: Total 104 (delta 0), reused 0 (delta 0), pack-reused 104[K
Receiving objects: 100% (104/104), 1.54 MiB | 9.33 MiB/s, done.
Resolving deltas: 100% (44/44), done.


In [3]:
import pandas as pd

In [4]:
f1 = open('/content/clickbait/dataset/clickbait_data', 'r')
clickbait = f1.readlines()

# clickbait
clickbait = list(map(lambda x:x.strip(),clickbait))
clickbait = list(filter(None, clickbait))

cb_df = pd.DataFrame(clickbait, columns = ['text'])
cb_df['label'] = 1

f2 = open('/content/clickbait/dataset/non_clickbait_data', 'r')
non_clickbait = f2.readlines()

non_clickbait = list(map(lambda x:x.strip(),non_clickbait))
non_clickbait = list(filter(None, non_clickbait))

ncb_df = pd.DataFrame(non_clickbait, columns = ['text'])
ncb_df['label'] = 0

raw_df = pd.concat([cb_df, ncb_df], ignore_index=True)
raw_df.to_csv('raw_data.csv')

In [5]:
print('Percentage of clickbait data: ', round(len(raw_df[raw_df['label']== 1])/len(raw_df)*100))

Percentage of clickbait data:  50


In [None]:
import torch
from torchtext.legacy import data

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
LABEL = data.LabelField(dtype = torch.float)

In [None]:
fields = [('Unnamed: 0', None), ('text', TEXT), ("label", LABEL)]

raw_data = data.TabularDataset(path="raw_data.csv",format="csv",fields=fields,skip_header=True)

In [None]:
import random

# train and validation splitting
train_data,test_data = raw_data.split(split_ratio=0.80,random_state=random.seed(SEED))
print(f'Number of training examples: {len(train_data)}')
valid_data, test_data = test_data.split(split_ratio=0.50,random_state=random.seed(SEED))
print(f'Number of testing examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25600
Number of testing examples: 3200
Number of testing examples: 3200


In [None]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(TEXT.vocab.freqs.most_common(20))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2
[('You', 4507), ('"', 4373), ('The', 4127), ('in', 3514), (',', 3273), ('to', 2716), ('To', 2590), ("'s", 2450), ('A', 2272), ('of', 2124), ('-', 2092), ('Your', 2074), ('Of', 1936), ('Are', 1713), ('In', 1692), ('Is', 1642), ('That', 1570), ('This', 1447), ('for', 1368), ('And', 1326)]
['<unk>', '<pad>', 'You', '"', 'The', 'in', ',', 'to', 'To', "'s"]
defaultdict(None, {'1': 0, '0': 1})


In [None]:
TEXT.vocab.stoi['<pad>']

1

In [None]:
# Setting up mini batching using dataloaders and collate function
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


def encode_text_pipeline(source):
  encoded_text = [TEXT.vocab[word] for word in source]
  return torch.tensor(encoded_text, dtype = torch.int64)

def encoded_label_pipeline(target):
  encoded_label = LABEL.vocab[target]
  return torch.tensor(encoded_label, dtype = torch.int64)

def pad_function(batch):
  x_data = []
  y_data = []
  x_lengths = []
  for i in range(len(batch)):
    x_lengths.append(len(batch[i].text))
    encoded_text = encode_text_pipeline(batch[i].text)
    x_data.append(encoded_text)
    encoded_label = encoded_label_pipeline(batch[i].label)
    y_data.append(encoded_label)
  padded_data = pad_sequence(x_data, batch_first = True, padding_value=1)
  return padded_data, torch.tensor(y_data), torch.tensor(x_lengths)


def create_loader(dataset):
  data_loader = DataLoader(dataset=dataset, batch_size=32, collate_fn = pad_function, drop_last = True)
  return data_loader

train_loader = create_loader(train_data)
valid_loader = create_loader(valid_data)
test_loader = create_loader(test_data)

In [None]:
import torch.nn as nn
import torch.nn as nn
class LSTM_classifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super(LSTM_classifier, self).__init__()
        self.n_layers = 1
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, dropout=0.4)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, x_lengths):
      batch_size = text.size(0)
      embedded = self.embedding(text)
      packed_embeddings = nn.utils.rnn.pack_padded_sequence(embedded, x_lengths, batch_first=True, enforce_sorted=False)
      packed_output, (hidden, cell) = self.lstm(packed_embeddings)
      output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
      prediction = self.fc(torch.mean(output, 0))
      return prediction, hidden
    

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 256
HIDDEN_DIM = 128
OUTPUT_DIM = 1

model = LSTM_classifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM ,OUTPUT_DIM)



In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,598,273 trainable parameters


In [None]:
import torch.optim as optim

# optimizer = optim.SGD(model.parameters(), lr=1e-3)
optimizer = optim.Adam(model.parameters(), lr = 1e-4 )

In [None]:
criterion = nn.BCEWithLogitsLoss()

In [None]:
model = model.to(device)
criterion = criterion.to(device)

In [None]:
from sklearn.metrics import f1_score

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    f1 = f1_score(rounded_preds.tolist(), y.tolist())
    return acc, f1

In [None]:
from tqdm import tqdm
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = []
    epoch_accuracy = []
    epoch_f1 = []
    for batch_text, batch_labels, batch_lengths in tqdm(iterator):
        batch_size = batch_text.size(0)
        optimizer.zero_grad()
        batch_labels = batch_labels.to(device)
        batch_predictions, hidden = model(batch_text.to(device), batch_lengths)
        hidden.detach()
        batch_loss = criterion(batch_predictions.squeeze(), batch_labels.squeeze().float())
        epoch_loss.append(batch_loss)
        batch_loss.backward()
        optimizer.step()
        batch_acc, batch_f1 = binary_accuracy(batch_predictions.squeeze(), batch_labels.squeeze())
        epoch_accuracy.append(batch_acc)
        epoch_f1.append(batch_f1)
    return sum(epoch_loss)/len(epoch_loss), sum(epoch_accuracy)/len(epoch_accuracy), sum(epoch_f1)/len(epoch_f1)

In [None]:
def evaluate(model, iterator, criterion):
    epoch_loss = []
    epoch_accuracy = []
    epoch_f1 = []
    model.eval()
    with torch.no_grad():
        for batch_text, batch_labels, batch_lengths in tqdm(iterator):
            batch_size = batch_text.size(0)
            batch_labels = batch_labels.to(device)
            batch_predictions, hidden = model(batch_text.to(device), batch_lengths)
            hidden.detach()
            batch_loss = criterion(batch_predictions.squeeze(), batch_labels.squeeze().float())
            epoch_loss.append(batch_loss)
            batch_acc, batch_f1 = binary_accuracy(batch_predictions.squeeze(), batch_labels.squeeze())
            epoch_accuracy.append(batch_acc)
            epoch_f1.append(batch_f1)
    return sum(epoch_loss)/len(epoch_loss), sum(epoch_accuracy)/len(epoch_accuracy), sum(epoch_f1)/len(epoch_f1)

In [None]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 10

best_valid_loss = float('inf')
tolerance = 2

for epoch in range(N_EPOCHS):
    print('Epoch: ', epoch)
    start_time = time.time()
    train_epoch_loss, train_epoch_accuracy, train_epoch_F1 = train(model, train_loader, optimizer, criterion)
    print('Loss: ', train_epoch_loss)
    print('Accuracy: ', train_epoch_accuracy)
    valid_epoch_loss, valid_epoch_accuracy, valid_epoch_F1 = evaluate(model, valid_loader, criterion)
    print('Loss: ', valid_epoch_loss)
    print('Accuracy: ', valid_epoch_accuracy)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_epoch_loss < best_valid_loss:
      best_valid_loss = valid_epoch_loss
    else:
      tolerance -= 1
      if tolerance == 0:  
        break
      
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print('Train Loss:', train_epoch_loss)
    print('Train accuracy', train_epoch_accuracy)
    print('Train F1', train_epoch_F1)
    print('Val Loss:', valid_epoch_loss)
    print('Val accuracy', valid_epoch_accuracy)
    print('Val F1', valid_epoch_F1)

Epoch:  0


100%|██████████| 800/800 [00:08<00:00, 90.74it/s] 


Loss:  tensor(0.3359, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.9037, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 223.70it/s]


Loss:  tensor(0.1770, device='cuda:0')
Accuracy:  tensor(0.9547, device='cuda:0')
Epoch: 01 | Epoch Time: 0m 9s
Train Loss: tensor(0.3359, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.9037, device='cuda:0')
Train F1 0.8812735720790761
Val Loss: tensor(0.1770, device='cuda:0')
Val accuracy tensor(0.9547, device='cuda:0')
Val F1 0.9559155593711124
Epoch:  1


100%|██████████| 800/800 [00:09<00:00, 88.12it/s]


Loss:  tensor(0.1272, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.9725, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 208.34it/s]


Loss:  tensor(0.1477, device='cuda:0')
Accuracy:  tensor(0.9600, device='cuda:0')
Epoch: 02 | Epoch Time: 0m 9s
Train Loss: tensor(0.1272, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.9725, device='cuda:0')
Train F1 0.9714909861961051
Val Loss: tensor(0.1477, device='cuda:0')
Val accuracy tensor(0.9600, device='cuda:0')
Val F1 0.9616601590853984
Epoch:  2


100%|██████████| 800/800 [00:08<00:00, 92.15it/s]


Loss:  tensor(0.0777, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.9825, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 139.61it/s]


Loss:  tensor(0.1457, device='cuda:0')
Accuracy:  tensor(0.9625, device='cuda:0')
Epoch: 03 | Epoch Time: 0m 9s
Train Loss: tensor(0.0777, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.9825, device='cuda:0')
Train F1 0.9818814750439112
Val Loss: tensor(0.1457, device='cuda:0')
Val accuracy tensor(0.9625, device='cuda:0')
Val F1 0.9636254504259256
Epoch:  3


100%|██████████| 800/800 [00:08<00:00, 92.94it/s]


Loss:  tensor(0.0525, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.9877, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 202.85it/s]


Loss:  tensor(0.1560, device='cuda:0')
Accuracy:  tensor(0.9609, device='cuda:0')
Epoch: 04 | Epoch Time: 0m 9s
Train Loss: tensor(0.0525, device='cuda:0', grad_fn=<DivBackward0>)
Train accuracy tensor(0.9877, device='cuda:0')
Train F1 0.9872620721209112
Val Loss: tensor(0.1560, device='cuda:0')
Val accuracy tensor(0.9609, device='cuda:0')
Val F1 0.9619891992449019
Epoch:  4


100%|██████████| 800/800 [00:09<00:00, 87.61it/s]


Loss:  tensor(0.0372, device='cuda:0', grad_fn=<DivBackward0>)
Accuracy:  tensor(0.9909, device='cuda:0')


100%|██████████| 100/100 [00:00<00:00, 191.36it/s]

Loss:  tensor(0.1753, device='cuda:0')
Accuracy:  tensor(0.9594, device='cuda:0')





In [None]:
# model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc, test_f1 = evaluate(model, test_loader, criterion)

print('Test Loss',test_loss)
print('Test accuracy', test_acc)
print('Test F1', test_f1)

100%|██████████| 100/100 [00:00<00:00, 216.76it/s]

Test Loss tensor(0.1648, device='cuda:0')
Test accuracy tensor(0.9666, device='cuda:0')
Test F1 0.9658104132247586



