<a href="https://colab.research.google.com/github/ryuqae/2022-meta-learning/blob/main/Clickbait_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
import pandas as pd

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, Trainer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import logging
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel
from pprint import pprint

# Let DataFrame displays show up to 1000 rows and up to 1000 characters per column.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 1000)

# Timestamped, INFO-level logging for the whole notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)


# Index of the GPU to use when CUDA is available.
GPU_NUM = 0

# Use the requested GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{GPU_NUM}')
    torch.cuda.set_device(device)  # make it the current CUDA device
    print('==================== \nCurrent cuda device ', torch.cuda.current_device())  # check
else:
    device = torch.device('cpu')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Current cuda device  0


In [None]:
import sys, os
# Folder that holds train.csv / valid.csv / test.csv for the clickbait data.
# NOTE(review): the path lives under /content/drive, so it presumably needs
# Google Drive to be mounted in Colab first; no mount call is visible in this
# notebook — confirm before relying on Restart & Run All.
DATA_DIR = '/content/drive/MyDrive/Meta Learning/kaggle_clickbait'

In [None]:
# Load each split with `id` as the index, drop rows with missing values and
# turn `id` back into a regular column. For train/valid, rows labelled
# 'other' are removed before the index is reset.
train = (
    pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col='id')
    .dropna()
    .loc[lambda frame: frame.label != 'other']
    .reset_index()
)
valid = (
    pd.read_csv(os.path.join(DATA_DIR, 'valid.csv'), index_col='id')
    .dropna()
    .loc[lambda frame: frame.label != 'other']
    .reset_index()
)
test = (
    pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), index_col='id')
    .dropna()
    .reset_index()
)

In [None]:
# Row/column counts of each split after cleaning.
train.shape, valid.shape, test.shape

((18330, 4), (2624, 4), (5629, 3))

In [None]:
# Class balance of the training split (the recorded output shows far more
# 'news' rows than 'clickbait' rows).
train.label.value_counts()

news         14606
clickbait     3724
Name: label, dtype: int64

In [None]:
# Show one randomly drawn example per label. No random_state is passed, so the
# rows shown can differ between runs.
train.groupby('label').sample(1)

Unnamed: 0,id,title,text,label
5562,5589,"Want to be neighbors with the Obamas, Ivanka Trump and Jeff Bezos? Here’s what it will cost you.","Want to become neighbors with the Obamas, Ivanka Trump and Jeff Bezos in one of the District’s hottest and most exclusive enclaves, Kalorama, but don’t want the hassle of modernizing a historic home? You are in luck. The French Embassy is selling off part of the land where its ambassador lives for $5.6 million. The 0.58-acre lot at 2221 Kalorama Road is roomy enough to build a grand mansion or up to five homes. It’s the first time in decades a parcel of land with no home on it has been for sale on the open market in this tony neighborhood. “What makes this land so unique is the zoning, R1B, which allows for a plethora of things you can build there,” said Alex Venditti of Coldwell Banker Residential Brokerage, who is co-listing the property with the Morrell-Roth team from Compass. The French Embassy acquired the 1910 Tudor Revival residence in 1936. Additional lots that overlooked Kalorama Circle were purchased in 1941 to expand the parcel to 3.6 acres. This sale will reduce the lot...",clickbait
14891,15113,"Elite donors push Democrats left on climate and immigration, but right on taxes","Bernie Sanders has identified the cancer he thinks is coursing through the Democratic Party’s bloodstream. In speech after speech, the Vermont senator has gone after the party’s donor class — and its hold on politicians — as the central impediment to both a more populist Democratic Party and its electoral success. “I believe strongly that the party must break loose from its corporate establishment ties,” Sanders said in a New York Times shortly after the election. “We must have the courage to take on the greed and power of Wall Street, the drug companies, the insurance companies and the fossil fuel industry. ” Sanders’s argument has a lot going for it. Hillary Clinton campaign’s spent much of the summer fundraising with donors, and ignored the union organizers in the Rust Belt in a way that backfired spectacularly. The WikiLeaks emails revealed that conservative donors like Israeli hawk Haim Saban were closely involved with the Clinton team’s policy shop. Her campaign was certainly...",news


Reference — PyTorch tutorial on creating a custom `Dataset`: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files

In [None]:
# Pretrained DistilBERT tokenizer used for the encoding experiments in the next
# cells (ClickbaitDataset loads its own tokenizer from `model_name`).
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Encode two short sentences separately; each call returns a (1, seq_len)
# tensor of token ids. With a single sentence per call no padding is added
# (both results are 7 tokens long in the recorded outputs).
a = tokenizer.encode("hello this is my name", max_length = 128, truncation=True, padding=True, return_tensors='pt')
b = tokenizer.encode("not my name is this", max_length = 128, truncation=True, padding=True, return_tensors='pt')

In [None]:
# Concatenating along dim=0 keeps the two encodings as separate rows (2 x 7).
torch.cat((a,b), dim=0)

tensor([[ 101, 7592, 2023, 2003, 2026, 2171,  102],
        [ 101, 2025, 2026, 2171, 2003, 2023,  102]])

In [None]:
# Concatenating along dim=1 joins the two encodings into one row (1 x 14); each
# half keeps its own leading 101 and trailing 102 ids.
torch.cat((a, b), dim=1)

tensor([[ 101, 7592, 2023, 2003, 2026, 2171,  102,  101, 2025, 2026, 2171, 2003,
         2023,  102]])

In [None]:
class ClickbaitDataset(Dataset):
    """Headline/article pairs tokenized for a transformer encoder.

    Every item is built from two encodings, each padded to ``max_seq_length``
    and then laid end to end:

    1. the headline on its own, and
    2. the headline paired with the article text.

    ``input_ids`` and ``attention_mask`` therefore both have length
    ``2 * max_seq_length``, and the padding of the first half sits in the
    middle of the sequence.

    NOTE(review): the encoder receives ``2 * max_seq_length`` positions, so
    ``max_seq_length`` presumably has to stay at or below half of the model's
    maximum position count (the default of 512 yields 1024 positions) —
    confirm against the model config.

    Args:
        headlines: Headline strings, one per example.
        texts: Article bodies, aligned with ``headlines``.
        concat_mode: Stored on the instance; not used by this class.
        labels: Optional label per example. When ``None`` the dataset is
            unlabeled and every item carries the target ``-1``.
        label_dict: Optional mapping from label value to class index. When
            omitted and ``labels`` is given, it is built from the sorted
            unique labels.
        max_seq_length: Length each of the two encodings is truncated/padded to.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self,
                 headlines:list,
                 texts:list,
                 concat_mode:str='cosine_similarity',
                 labels:list=None,
                 label_dict: dict =None,
                 max_seq_length: int=512,
                 model_name: str='distilbert-base-uncased'):

        # Materialize the inputs as lists so that __getitem__ indexes by
        # position. A pandas Series is indexed by label with `[]`, which only
        # matches the position when its index is exactly 0..n-1.
        self.headlines = list(headlines)
        self.texts = list(texts)

        self.concat_mode = concat_mode
        self.labels = list(labels) if labels is not None else None
        self.label_dict = label_dict

        if self.label_dict is None and self.labels is not None:
            # e.g. {'clickbait': 0, 'news': 1, 'other': 2}
            # Used instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values (they map to -1).
            classes = sorted(set(self.labels))
            self.label_dict = {label: idx for idx, label in enumerate(classes)}

        self.max_seq_length = max_seq_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def _pad_with_mask(self, token_ids):
        """Right-pad 1-D ``token_ids`` to ``max_seq_length``; return (ids, mask)."""
        pad_size = max(0, self.max_seq_length - token_ids.size(0))
        padding = torch.full((pad_size,), self.pad_vid, dtype=torch.long)
        padded_ids = torch.cat((token_ids, padding))
        # 1 marks a real token, 0 marks padding.
        mask = torch.cat((torch.ones_like(token_ids, dtype=torch.int8),
                          torch.zeros(pad_size, dtype=torch.int8)))
        return padded_ids, mask

    def __getitem__(self, index):
        """Return ``(features, target)`` for the example at position ``index``.

        ``features`` is a dict with ``"input_ids"`` and ``"attention_mask"``,
        each of length ``2 * max_seq_length`` (headline encoding first, then
        the headline+text pair encoding). ``target`` is a 0-dim long tensor
        holding the class index, or ``-1`` when the dataset has no labels or
        the label is missing from ``label_dict``.
        """
        text = self.texts[index]
        headline = self.headlines[index]

        pair_encoded = self.tokenizer.encode(headline, text, add_special_tokens=True,
                                             truncation=True,
                                             max_length=self.max_seq_length,
                                             return_tensors="pt").squeeze(0)

        head_encoded = self.tokenizer.encode(headline, add_special_tokens=True,
                                             truncation=True,
                                             max_length=self.max_seq_length,
                                             return_tensors="pt").squeeze(0)

        pair_ids, pair_mask = self._pad_with_mask(pair_encoded)
        head_ids, head_mask = self._pad_with_mask(head_encoded)

        output_dict = {"input_ids": torch.cat((head_ids, pair_ids)),
                       "attention_mask": torch.cat((head_mask, pair_mask))}

        # Always return a target so that unlabeled data (e.g. a test split)
        # can be indexed and batched as well; -1 is the same sentinel that is
        # used for labels missing from `label_dict`.
        if self.labels is not None:
            class_idx = self.label_dict.get(self.labels[index], -1)
        else:
            class_idx = -1
        y_encoded = torch.tensor(class_idx, dtype=torch.long)

        return output_dict, y_encoded
    

In [None]:
MODEL_NAME = 'distilbert-base-uncased' # pretrained model from Transformers
LOG_DIR = "./logdir"                   # for training logs and tensorboard visualizations
NUM_EPOCHS = 3                         # smth around 2-6 epochs is typically fine when finetuning transformers
BATCH_SIZE = 32                        # depends on your available GPU memory (in combination with max seq length)
MAX_SEQ_LENGTH = 128                   # length of EACH of the two encodings per item; model input is twice this
NUM_CLASSES = 2                        # binary problem: 'clickbait' vs 'news' ('other' rows are dropped at load time)
LEARN_RATE = 5e-5                      # learning rate is typically ~1e-5 for transformers
ACCUM_STEPS = 4                        # one optimization step for that many backward passes
                                       # NOTE(review): not referenced by the training code visible in this notebook
SEED = 17

# Apply the seed so that shuffling and the initialisation of the new
# classification layers are repeatable across runs.
torch.manual_seed(SEED)

train_dataset = ClickbaitDataset(texts=train["text"],
                                 headlines=train["title"],
                                 labels=train['label'],
                                 label_dict=None,
                                 max_seq_length=MAX_SEQ_LENGTH,
                                 model_name=MODEL_NAME)
# Validation reuses the label mapping learned from the training split.
valid_dataset = ClickbaitDataset(texts=valid["text"],
                                 headlines=valid["title"],
                                 labels=valid['label'],
                                 label_dict=train_dataset.label_dict,
                                 max_seq_length=MAX_SEQ_LENGTH,
                                 model_name=MODEL_NAME)
# The test split is built without labels.
test_dataset = ClickbaitDataset(texts=test["text"],
                                headlines=test["title"],
                                labels=None,
                                label_dict=None,
                                max_seq_length=MAX_SEQ_LENGTH,
                                model_name=MODEL_NAME)


train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

valid_dataloader = DataLoader(dataset=valid_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=False)

In [None]:
# Inspect one encoded training example: the feature dict and its target.
train_dataset[10]

({'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int8),
  'input_ids': tensor([  101,  1002,  6365,  1012,  1018,  2213,  1002, 20003,  2213

In [None]:
class ClickbaitDetector(nn.Module):
    """Pretrained transformer encoder topped with a small classification head.

    The encoder's hidden state at the first sequence position is passed
    through a linear layer, a sigmoid, dropout and a final linear layer that
    produces one logit per class.
    """

    def __init__(self, model_name, num_classes=None):
        """Load the pretrained encoder and create the head layers.

        Args:
            model_name: Name passed to ``AutoConfig`` / ``AutoModel``.
            num_classes: Output size of the final linear layer.
        """
        super().__init__()

        encoder_config = AutoConfig.from_pretrained(model_name)
        hidden_dim = encoder_config.dim

        self.model = AutoModel.from_pretrained(model_name, config=encoder_config)
        self.fclayer = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(encoder_config.seq_classif_dropout)

    def forward(self, input_ids, attention_mask=None, head_mask=None):
        """Return the class logits for a batch.

        ``attention_mask`` is required even though it defaults to ``None``;
        an ``AssertionError`` is raised when it is missing.
        """
        assert attention_mask is not None, "attention mask is none"

        encoder_output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )

        # First element of the encoder output, taken at sequence position 0.
        first_token_state = encoder_output[0][:, 0]
        features = torch.sigmoid(self.fclayer(first_token_state))
        return self.classifier(self.dropout(features))


In [None]:
# Build the classifier on the device selected at the top of the notebook, so
# this cell also runs on CPU-only runtimes (a bare .cuda() would fail there).
model = ClickbaitDetector(model_name=MODEL_NAME, num_classes=NUM_CLASSES).to(device)

loss_fn = torch.nn.CrossEntropyLoss().to(device)

# AdamW instead of plain SGD: learning rates around 1e-5..5e-5 are meant for
# Adam-style optimizers. In the recorded run with SGD at this rate, training
# accuracy stayed at about 0.80, which is the share of the majority class, so
# the model did not appear to be learning.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARN_RATE)

# ReduceLROnPlateau only takes effect when `scheduler.step(metric)` is called
# with the monitored value (e.g. once per epoch with a validation loss).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

NameError: ignored

In [None]:
def train_one_epoch(epoch_index, tb_writer, log_every=100):
    """Run one pass over ``train_dataloader`` and update ``model`` in place.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_fn``,
    ``train_dataloader`` and ``device``. The caller is expected to put the
    model into training mode beforehand.

    Args:
        epoch_index: Zero-based epoch number; used to compute the global step
            for TensorBoard.
        tb_writer: ``SummaryWriter`` that receives the running train metrics.
        log_every: Number of batches per reporting window.

    Returns:
        ``(last_loss, last_acc)``: mean batch loss and mean batch accuracy of
        the most recent full reporting window. If the epoch is shorter than
        one window, the means over all processed batches are returned instead
        (``0.0`` for both when the loader is empty).
    """
    running_loss = 0.
    running_acc = 0.
    window_batches = 0
    reported = False
    last_loss = 0.
    last_acc = 0.
    batches_per_epoch = len(train_dataloader)

    # Wrap the loader (not the enumerate object) so tqdm knows the total.
    for i, (inputs, labels) in enumerate(tqdm(train_dataloader)):
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(input_ids, attention_mask)

        # Compute the loss and its gradients, then adjust the weights
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Gather data for the current reporting window
        running_loss += loss.item()
        running_acc += (outputs.argmax(dim=1) == labels).sum().item() / labels.shape[0]
        window_batches += 1

        if window_batches == log_every:
            last_loss = running_loss / log_every  # loss per batch
            last_acc = running_acc / log_every    # acc per batch
            print('  batch {} loss: {} | acc: {}'.format(i + 1, last_loss, last_acc))

            # Global step counted in batches: batches of all previous epochs
            # plus the batches seen so far in this one.
            tb_x = epoch_index * batches_per_epoch + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            tb_writer.add_scalar('Accuracy/train', last_acc, tb_x)

            running_loss = 0.
            running_acc = 0.
            window_batches = 0
            reported = True

    # Epoch shorter than one reporting window: fall back to what was seen.
    if not reported and window_batches > 0:
        last_loss = running_loss / window_batches
        last_acc = running_acc / window_batches

    return last_loss, last_acc

In [None]:
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f'run/detector_{timestamp}')

for e in range(NUM_EPOCHS):
    # --- training ---
    model.train()
    train_loss, train_acc = train_one_epoch(e, writer)

    # --- validation: no gradient tracking, dropout disabled ---
    model.eval()
    val_loss_sum = 0.
    val_correct = 0
    val_seen = 0
    with torch.no_grad():
        for val_inputs, val_labels in valid_dataloader:
            val_labels = val_labels.to(device)
            val_outputs = model(val_inputs['input_ids'].to(device),
                                val_inputs['attention_mask'].to(device))
            batch_size = val_labels.shape[0]
            # Weight by batch size so the last (possibly smaller) batch
            # does not distort the mean.
            val_loss_sum += loss_fn(val_outputs, val_labels).item() * batch_size
            val_correct += (val_outputs.argmax(dim=1) == val_labels).sum().item()
            val_seen += batch_size

    val_loss = val_loss_sum / max(val_seen, 1)
    val_acc = val_correct / max(val_seen, 1)

    print('epoch {} | train loss: {} | train acc: {} | valid loss: {} | valid acc: {}'.format(
        e + 1, train_loss, train_acc, val_loss, val_acc))
    writer.add_scalar('Loss/valid', val_loss, e + 1)
    writer.add_scalar('Accuracy/valid', val_acc, e + 1)

    # ReduceLROnPlateau has to be fed the monitored metric to have any effect.
    scheduler.step(val_loss)

# Make sure all pending events are written to disk.
writer.flush()

100it [01:14,  1.36it/s]

  batch 100 loss: 0.5360058930516243 | acc: 0.783125


200it [02:28,  1.33it/s]

  batch 200 loss: 0.5148235127329827 | acc: 0.7990625


300it [03:42,  1.34it/s]

  batch 300 loss: 0.5056973880529404 | acc: 0.8053125


400it [04:56,  1.32it/s]

  batch 400 loss: 0.5078784504532814 | acc: 0.7996875


500it [06:10,  1.37it/s]

  batch 500 loss: 0.522824182510376 | acc: 0.791875


573it [07:04,  1.35it/s]
100it [01:14,  1.36it/s]

  batch 100 loss: 0.4995529702305794 | acc: 0.8028125


200it [02:27,  1.34it/s]

  batch 200 loss: 0.5211602938175202 | acc: 0.79125


300it [03:41,  1.36it/s]

  batch 300 loss: 0.5156631743907929 | acc: 0.7903125


400it [04:55,  1.37it/s]

  batch 400 loss: 0.49721648544073105 | acc: 0.8046875


500it [06:09,  1.36it/s]

  batch 500 loss: 0.506161393225193 | acc: 0.7965625


573it [07:03,  1.35it/s]
100it [01:13,  1.36it/s]

  batch 100 loss: 0.4919824668765068 | acc: 0.8078125


200it [02:27,  1.36it/s]

  batch 200 loss: 0.5003746378421784 | acc: 0.8025


300it [03:41,  1.37it/s]

  batch 300 loss: 0.5241887336969375 | acc: 0.7859375


400it [04:55,  1.36it/s]

  batch 400 loss: 0.5067513501644134 | acc: 0.798125


500it [06:09,  1.34it/s]

  batch 500 loss: 0.5164433017373085 | acc: 0.790625


573it [07:03,  1.35it/s]
