In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so files
# stored under MyDrive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import torch.nn.functional as F

In [None]:
# Pick the compute device once: use the GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

In [None]:
# Both source files are parsed with the same options: malformed rows are
# skipped and the columns are named "headline" and "label".
# NOTE(review): the files are parsed as comma-separated; if they are really
# one headline per line, headlines containing commas would be split or
# skipped — confirm against the raw data.
csv_options = dict(on_bad_lines='skip', names=["headline", "label"])
df_cb = pd.read_csv("/content/drive/MyDrive/clickbait/clickbait_data", **csv_options)
df_ncb = pd.read_csv("/content/drive/MyDrive/clickbait/non_clickbait_data", **csv_options)

# The label column is overwritten with the class name of the file it came from.
df_cb["label"] = "click bait"
df_ncb["label"] = "non click bait"

# Stack both classes into one frame (the original row indices are kept).
frames = [df_cb, df_ncb]
df = pd.concat(frames)

# Share of clickbait rows in the combined dataset, as a percentage.
is_clickbait = df["label"] == "click bait"
percent_clickbait = (int(is_clickbait.sum()) / len(df)) * 100
print(f"The percentage of clickbait data in the entire dataset:{round(percent_clickbait,2)}%")

In [None]:
# Encode the string class names as integers: 0 = not clickbait, 1 = clickbait.
cb = {"non click bait":0,"click bait":1}
# Values that are not keys of `cb` pass through unchanged, so running this
# cell a second time (when the column already holds 0/1) leaves it intact
# instead of turning every label into NaN.
df['label'] = df['label'].map(lambda value: cb.get(value, value))
# Fail loudly if anything other than the two known classes is present.
assert df['label'].isin(list(cb.values())).all(), "unexpected label value"
df.head(10)

In [None]:
# Hold out 20% of the rows for validation. The seed is fixed so that a
# Restart & Run All reproduces the same split (and therefore the same
# vocabulary and metrics) every time.
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)
print('Train data size={}, Validation data size={}'.format(len(train_data), len(val_data)))

In [None]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
# Instantiate the blank English language class; only its vocab is used below.
nlp = English()
# Create a blank Tokenizer with just the English vocab
# NOTE(review): no prefix/suffix/infix rules are passed, so this presumably
# splits on whitespace only — confirm against the spaCy Tokenizer docs.
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Upper bound on the number of distinct tokens kept in the vocabulary.
vocab_size = 20_000
# Flat list of every token text found in the training headlines; the
# validation split is deliberately not used here.
all_tokens = []
for headline in tqdm(train_data['headline']):
  all_tokens.extend(token.text for token in tokenizer(headline))

In [None]:
# Token frequencies over the training headlines.
count = Counter(all_tokens)
# Index 0 is reserved for padding: sequences are padded with 0 when they are
# encoded, so giving id 0 to a real token (previously the most frequent one)
# would make padding indistinguishable from that token.
vocab = {'<pad>': 0}
# Keep the (vocab_size - 1) most frequent tokens so that, together with
# <pad> and <unk>, there are at most vocab_size + 1 ids in total; an
# embedding table with vocab_size + 1 rows is therefore always large enough.
for token, _ in count.most_common(vocab_size - 1):
  vocab.setdefault(token, len(vocab))
# Every token outside the vocabulary maps to this id.
vocab.setdefault('<unk>', len(vocab))

In [None]:
# Spot-check the vocabulary: print the id of the unknown-token marker and of
# one ordinary token, one per line.
for probe in ('<unk>', 'I'):
  print(vocab[probe])

In [None]:
# Preview the first 10 rows of the validation split.
val_data.head(10)

In [None]:
def get_maxlen(data):
  """Return the length of the longest headline in ``data``.

  Args:
    data: Any mapping-like object (typically a DataFrame) whose
      ``'headline'`` entry is an iterable of strings.

  Returns:
    The largest ``len(headline)`` found, or 0 when there are no headlines.

  Note:
    The length is measured in characters, not tokens.
    NOTE(review): presumably meant as a safe upper bound on the token count
    used for padding — confirm whether a token-based length was intended.
  """
  # Iterate over the frame that was passed in; the previous version ignored
  # the argument and always scanned the global training split.
  return max((len(headline) for headline in data['headline']), default=0)

# Longest headline in each split; the padding length is the larger of the two.
max_len_tr, max_len_val = get_maxlen(train_data), get_maxlen(val_data)
max_len = max_len_tr if max_len_tr >= max_len_val else max_len_val
print(max_len)
print(type(train_data))

In [None]:
#hyperparameters
# Number of rows needed in the embedding table. It is derived from the
# vocabulary that was actually built (which includes the <unk> entry) rather
# than by incrementing the previous value, so re-running this cell does not
# keep growing it.
vocab_size = len(vocab)
embedding_size=100   # dimensionality of each token embedding
hidden_size=64       # LSTM hidden state size
num_layers=2         # number of stacked LSTM layers
out_dim=2            # width of the final linear layer
batch_size = 32
# max_len (padding length) is computed in an earlier cell and used as-is.
n_epochs=5
num_classes=2

In [None]:
class clickbaitDataset(Dataset):
  """Map-style dataset that turns (headline, label) rows into tensors.

  Each item is a pair ``(token_ids, label)`` where ``token_ids`` is a 1-D
  integer tensor of exactly ``max_len`` vocabulary ids and ``label`` is a
  0-dimensional integer tensor holding 0 or 1.

  Args:
    data: Frame with a ``'headline'`` (text) and a ``'label'`` column.
    vocab: Mapping from token text to integer id; must contain ``'<unk>'``.
    max_len: Fixed length every encoded headline is padded/truncated to.
    tensor_device: Device the tensors are created on. When omitted, the
      notebook-level ``device`` variable is used.
  """

  def __init__(self, data: pd.DataFrame, vocab, max_len, tensor_device=None):
    self.data = data
    self.vocab = vocab
    # Id used for every token that is not in the vocabulary.
    self.default = self.vocab['<unk>']
    self.max_len = max_len
    # Id used to fill up short sequences; 0 unless the vocabulary says otherwise.
    self.pad_id = self.vocab.get('<pad>', 0)
    # The global is only looked up when no explicit device was given.
    self.tensor_device = device if tensor_device is None else tensor_device

  def tokenize(self, text: str):
    """Split ``text`` into a list of token strings with the notebook tokenizer."""
    return [token.text for token in tokenizer(text)]

  def encode_tokens(self, tokens):
    """Convert token strings to a fixed-length tensor of vocabulary ids.

    Sequences longer than ``max_len`` are truncated and shorter ones are
    right-padded, so every item has the same shape and can be batched.
    """
    tokens = tokens[:self.max_len]
    encoded = [self.vocab.get(token, self.default) for token in tokens]
    encoded += [self.pad_id] * (self.max_len - len(encoded))
    return torch.tensor(encoded, dtype=torch.long, device=self.tensor_device)

  def encode_label(self, label: int):
    """Return the label as a tensor: 0 stays 0, anything else becomes 1."""
    return torch.tensor(0 if label == 0 else 1, device=self.tensor_device)

  def __getitem__(self, n: int):
    """Return the encoded ``(token_ids, label)`` pair for row position ``n``."""
    # Positional access: the frame's index is not unique/contiguous after the split.
    headline = self.data['headline'].iloc[n]
    label = self.data['label'].iloc[n]
    return self.encode_tokens(self.tokenize(headline)), self.encode_label(label)

  def __len__(self):
    """Number of rows in the underlying frame."""
    return len(self.data)

# Wrap both splits in datasets that pad every headline to the same length.
train_ds = clickbaitDataset(train_data, vocab, max_len=max_len)
val_ds = clickbaitDataset(val_data, vocab,  max_len=max_len)
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Validation batches must keep the row order of val_data: predictions are
# stored by batch position and later compared against val_data['label'], so
# shuffling here would pair each prediction with the wrong label.
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [None]:
# Fetch one (inputs, labels) batch from the training loader to inspect it.
next(iter(train_loader))

In [None]:
from torch.autograd import Variable
class LSTM(nn.Module):
    """Embedding -> LSTM -> multi-head self-attention -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Dimensionality of each token embedding.
        hidden_size: LSTM hidden size; also the attention embedding size.
        out_dim: Width of the final linear layer (only its last unit is
            returned by ``forward``).
        batch_size: Stored for reference only; the forward pass works with
            any batch size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self,  vocab_size, embedding_size, hidden_size, out_dim, batch_size, num_layers=2):
        super().__init__()
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.hidden_dim = hidden_size
        self.embedding_size = embedding_size
        # One attention head per hidden unit, i.e. each head has width 1.
        # NOTE(review): unusual choice — confirm this is intended.
        self.num_heads = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # batch_first=True: inputs arrive as (batch, seq, feature). Without it
        # the LSTM would treat the batch axis as time and run its recurrence
        # across unrelated examples. num_layers is now actually applied.
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.nonlinear = nn.Sigmoid()
        self.fc = nn.Linear(hidden_size, out_dim)
        # Same layout fix for attention, so it attends over the positions of
        # one sequence instead of over the examples in the batch.
        self.attention = nn.MultiheadAttention(hidden_size, self.num_heads,
                                               batch_first=True)

    def forward(self, text):
        """Score a batch of encoded headlines.

        Args:
            text: Integer tensor of token ids with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch,) with values in (0, 1): the sigmoid of
            the last output unit for each example.
        """
        embedded = self.embedding(text)                    # (batch, seq, emb)
        lstm_out, _ = self.lstm(embedded)                  # (batch, seq, hidden)
        # Self-attention over the LSTM outputs of each sequence.
        attention_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        # Summarise each sequence by its last position.
        attention_out = attention_out[:, -1]               # (batch, hidden)
        out = self.nonlinear(self.fc(attention_out))       # (batch, out_dim)
        # Only the last output unit is used as the score.
        return out[:, -1]

# Build the network and move it to the selected device before creating the
# optimizer; using `device` (instead of .cuda()) keeps the notebook runnable
# on CPU-only runtimes.
model = LSTM(vocab_size, embedding_size, hidden_size, out_dim, batch_size).to(device)
# The model's forward pass already ends in a sigmoid, so the loss must take
# probabilities. BCEWithLogitsLoss would apply a second sigmoid on top of it.
loss_fn = nn.BCELoss(reduction="sum")
# Only parameters that require gradients are optimised.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0001)
model

In [None]:
# Per-epoch loss history (average over batches of the per-batch loss).
train_loss = []
valid_loss = []
for epoch in range(n_epochs):
  # ---- training pass ----
  model.train()
  avg_loss = 0.
  for x_batch, y_batch in tqdm(train_loader):
    y_pred = model(x_batch) #Forwardpass
    loss = loss_fn(y_pred, y_batch.float()) # calculate loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    avg_loss += loss.item() / len(train_loader)

  # ---- validation pass ----
  model.eval()
  avg_val_loss = 0.
  # One row per validation example, one column per class; rows follow the
  # iteration order of val_loader.
  val_preds = np.zeros((len(val_ds), num_classes))
  offset = 0
  # No gradients are needed for evaluation; skipping them saves memory/time.
  with torch.no_grad():
    for x_batch, y_batch in tqdm(val_loader):
      y_pred = model(x_batch)
      avg_val_loss += loss_fn(y_pred, y_batch.float()).item() / len(val_loader)
      # The model returns one sigmoid score per example, read as the score
      # for class 1. Store it per row (with its complement for class 0) so
      # that argmax(axis=1) later yields the predicted class. The previous
      # argmax() over the whole batch wrote a single scalar into every cell.
      scores = y_pred.detach().cpu().numpy().reshape(-1)
      rows = slice(offset, offset + len(scores))
      val_preds[rows, 1] = scores
      val_preds[rows, 0] = 1.0 - scores
      offset += len(scores)

  train_loss.append(avg_loss)
  valid_loss.append(avg_val_loss)
  print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}'.format(
              epoch + 1, n_epochs, avg_loss, avg_val_loss))


In [None]:
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
# Ground-truth labels of the validation split, in frame order.
# NOTE(review): assumes the rows of val_preds are in this same order — confirm.
y_true = val_data['label'].tolist()
# Predicted class = index of the larger per-class score in each row.
y_pred = val_preds.argmax(axis=1).tolist()
print(classification_report(y_true, y_pred))
# Macro-averaged F1 is shown as the cell's output.
f1_score(y_true, y_pred, average='macro')