In [59]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torch import nn
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from utils import *



In [49]:
train_df, val_df, test_df = get_processed_data()

In [50]:
def get_cols_for_bert(df):    
    df = df.copy()
    df = df[['propaganda', 'original_sentence_no_tags']]
    return df
    
train_df= get_cols_for_bert(train_df)
val_df= get_cols_for_bert(val_df)
test_df= get_cols_for_bert(test_df)

train_df.shape
val_df.shape
test_df.shape
    

(2240, 2)

(480, 2)

(480, 2)

In [51]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [53]:
def format_and_tokenise_from_df(df):
    max_len = 160
    
    labels = list(df['propaganda'])
    sents = list(df['original_sentence_no_tags'])
    
    sents_input_embeddings = tokenizer(sents, padding='max_length', max_length=max_len, truncation=True, return_tensors='pt')
    sents_input_embeddings['labels'] = torch.tensor([label for label in labels])
    
    print(len(sents_input_embeddings['input_ids']))
    print(len(sents_input_embeddings['labels']))
    print(labels[:5])    
    print(sents[:5])
    return sents_input_embeddings
    
train_input_embeddings_labelled = format_and_tokenise_from_df(train_df)
val_input_embeddings_labelled = format_and_tokenise_from_df(val_df)
test_input_embeddings_labelled = format_and_tokenise_from_df(test_df)


2240
2240
[0, 1, 0, 0, 0]
['Vatican watchers believe the pope was influenced by Chilean Cardinal Francisco Javier  Errazuriz, a  member of the G9 group who has backed Barros and reportedly helped block moves to make Cruz a member of the abuse commission. ', 'While he is deeply concerned that British people not begin to think that resisting jihad terror means that they are in a “battle” with the “entire religion” of Islam, he appears unaware of the fact that many Muslims throughout history have considered their  entire religion  to be at war with the entire non-Muslim world. ', ' As for  her ties with Goldman Sachs, Breitbart reported the following: ', 'For decades, the county’s  public schools have offered a weekly Bible class during the school day — 30 minutes at the elementary level and  45 minutes in middle school. ', 'Douglas Haig had no reason to believe Stephen Paddock would launch  the Oct. 1 shooting in  Las Vegas that killed 58 people, attorney Marc Victor said. ']
480
480
[0,

In [54]:
class CustomPropagandaDataset(Dataset):
    def __init__(self, labelled_embeddings_dict):

        self.labelled_embeddings = labelled_embeddings_dict

    def __len__(self):
        return len(self.labelled_embeddings['input_ids'])

    def __getitem__(self, idx):
        input_ids, token_type_ids, attention_masks, label = [self.labelled_embeddings[key][idx] for key in self.labelled_embeddings.keys()]
        return {'input_ids':input_ids, 'token_type_ids': token_type_ids, 'attention_mask':attention_masks, 'labels':label}

In [55]:
train_dataset = CustomPropagandaDataset(train_input_embeddings_labelled)
test_dataset = CustomPropagandaDataset(test_input_embeddings_labelled)
dev_dataset = CustomPropagandaDataset(val_input_embeddings_labelled)

In [56]:
batch_size_train = 32
batch_size_test = 32
batch_size_dev = 32

train_dataloader = DataLoader(train_dataset, batch_size=batch_size_train, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size_test, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size_dev, shuffle=True)

In [57]:
boop = next(iter(train_dataloader))
boop['input_ids'].shape

torch.Size([32, 160])

In [58]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [60]:
epochs = 2
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

train_losses = []
train_accuracy = []
dev_losses = []
dev_accuracy = []

for epoch in range(epochs):
  train_running_losses = []
  train_total = 0
  train_correct = 0
  model.train()
  for i, batch in enumerate(tqdm(train_dataloader)):

    batch = {k: v.to(device) for k, v in batch.items()}

    outputs = model(**batch)

    loss = outputs[0]
    # print(outputs)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # Convert outputs to predicted labels (0 or 1 based on threshold 0.5)
    _, indices = torch.max(outputs['logits'], dim=1)
    predicted_labels = indices.float()

    train_total += batch['labels'].size(0)
    train_correct += (predicted_labels == batch['labels']).sum().item()
    train_running_losses.append(loss.item())

  train_losses.append(sum(train_running_losses)/len(train_running_losses))
  train_accuracy.append(train_correct/train_total)


  model.eval()
  with torch.no_grad():
    dev_running_losses = []
    dev_total = 0
    dev_correct = 0
    for i, batch in enumerate(tqdm(dev_dataloader)):
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs[0]
      # Convert outputs to predicted labels (0 or 1 based on threshold 0.5)
      _, indices = torch.max(outputs['logits'], dim=1)
      predicted_labels = indices.float()

      # Calculate accuracy
      dev_total += batch['labels'].size(0)
      dev_correct += (predicted_labels == batch['labels']).sum().item()
      dev_running_losses.append(loss.item())

  dev_losses.append(sum(dev_running_losses)/len(dev_running_losses))
  dev_accuracy.append(dev_correct/dev_total)

  print(f'TRAIN: Epoch [{epoch}/{epochs}] Loss: {sum(train_running_losses)/len(train_running_losses)} Acc: {train_correct/train_total}')
  print(f'DEV: Epoch [{epoch + 1}/{epochs}] Loss: {sum(dev_running_losses)/len(dev_running_losses)} Acc: {dev_correct/dev_total}')


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

  4%|▍         | 3/70 [04:42<1:45:19, 94.32s/it]


KeyboardInterrupt: 