In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm
from torchvision import datasets
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F

from utils import *

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

2024-05-02 09:04:30.061420: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


cpu


[nltk_data] Downloading package punkt to /Users/rfd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# --- Training configuration -------------------------------------------------
# Number of full passes over the training set.
epochs = 3
# Fine-tuning a pretrained BERT needs a very small step size; the BERT paper
# recommends values in the 2e-5 .. 5e-5 range. The previous value (0.1) is
# several orders of magnitude too large for Adam and destroys the pretrained
# weights within the first few updates.
lr = 2e-5
# Size of the classification head (number of distinct propaganda labels).
n_labels = 9
# When True the training loop computes the loss with its own criterion
# instead of using the loss returned by the model.
manual_loss= False

# Seed PyTorch's global RNG so that shuffling and the randomly initialised
# classifier head are reproducible across "Restart & Run All".
seed = 42
torch.manual_seed(seed)

In [3]:
# Forward map: label name -> integer class id. The position of each name in
# the list is its id, so the list order must not be changed.
label_to_id = {
    name: idx
    for idx, name in enumerate([
        'flag_waving',
        'exaggeration,minimisation',
        'causal_oversimplification',
        'name_calling,labeling',
        'repetition',
        'doubt',
        'not_propaganda',
        'loaded_language',
        'appeal_to_fear_prejudice',
    ])
}
# Inverse map: integer class id -> label name, derived from the forward map
# so the two can never drift apart.
id_to_label = {idx: name for name, idx in label_to_id.items()}


In [4]:
# When True, get_processed_data also returns a held-out test split and a
# test dataloader is built in addition to the train/val ones.
include_dev=False
# The tokenizer checkpoint MUST match the checkpoint the model is loaded from
# ('bert-base-uncased' in the training cell). The cased and uncased
# checkpoints ship different vocabularies, so ids produced by the cased
# tokenizer would index the wrong rows of the uncased embedding matrix.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch_size_train = 8
batch_size_test = 8
batch_size_dev = 8
# Maximum sequence length handed to the tokenising helper.
max_len = 64

# get_processed_data / get_cols_for_bert / format_and_tokenise_from_df /
# CustomPropagandaDataset come from the project's `utils` module; their exact
# behaviour is not visible in this notebook.
if include_dev:
    train_df, val_df, test_df = get_processed_data(dev=True)
else:
    train_df, val_df = get_processed_data(dev=False)

# --- Train / validation pipeline (identical in both modes) ------------------
train_df= get_cols_for_bert(train_df, 'snip')
val_df= get_cols_for_bert(val_df, 'snip')

train_input_embeddings_labelled = format_and_tokenise_from_df(train_df, tokenizer, task='snip', max_len=max_len)
val_input_embeddings_labelled = format_and_tokenise_from_df(val_df, tokenizer, task='snip', max_len=max_len)

train_dataset = CustomPropagandaDataset(train_input_embeddings_labelled)
val_dataset = CustomPropagandaDataset(val_input_embeddings_labelled)

# Only the training data is shuffled; evaluation order does not need to be
# random, and a fixed order keeps the reported metrics deterministic.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size_train, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size_dev, shuffle=False)

# --- Optional test pipeline -------------------------------------------------
if include_dev:
    test_df= get_cols_for_bert(test_df, 'snip')
    test_input_embeddings_labelled = format_and_tokenise_from_df(test_df, tokenizer, task='snip', max_len=max_len)
    test_dataset = CustomPropagandaDataset(test_input_embeddings_labelled)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size_test, shuffle=False)




2560
2560
[6, 6, 0, 6, 6]
['he', 'won’t make things any worse than they are for President Trump.', 'American people', 'and', 'Location: Westerville, Ohio']
640
640
[6, 2, 8, 6, 4]
['according to a UN estimate.', 'the country would not last long without an outside high IQ elite to run the country', 'gets Earl Warren and Sen. Richard Russel to join the Warren Commission by telling them that the assassination could lead to World War III', 'You', 'infidels']


In [6]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_labels)
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Per-epoch history; one entry is appended to each list at the end of every
# epoch.
train_losses = []
train_accuracy = []
val_losses = []
val_accuracy = []
# The test history is created once, *before* the epoch loop. Creating it
# inside the loop would discard every epoch's result except the last one.
# It stays empty when include_dev is False.
test_losses = []
test_accuracy = []

model.to(device)


def batch_loss(outputs, labels):
    """Return the loss for one batch, honouring the ``manual_loss`` switch.

    With ``manual_loss`` set, the loss is computed from the logits with the
    notebook's own ``criterion``; otherwise the loss the model itself returned
    is used.
    """
    if manual_loss:
        return criterion(outputs.logits, labels)
    return outputs.loss


def evaluate(dataloader, show_progress=True):
    """Run the model over ``dataloader`` without updating any weights.

    Args:
        dataloader: iterable of batches; each batch is a dict of tensors that
            includes a ``'labels'`` entry and can be passed to the model as
            keyword arguments.
        show_progress: wrap the loader in a tqdm progress bar when True.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of the
        per-batch losses and ``accuracy`` is correct predictions divided by
        the number of examples seen.
    """
    running_losses = []
    total = 0
    correct = 0

    model.eval()
    with torch.no_grad():
        batches = tqdm(dataloader) if show_progress else dataloader
        for batch in batches:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = batch_loss(outputs, batch['labels'])

            # The predicted class is the index of the largest logit.
            predicted_labels = outputs.logits.argmax(dim=1)
            total += batch['labels'].size(0)
            correct += (predicted_labels == batch['labels']).sum().item()
            running_losses.append(loss.item())

    return sum(running_losses) / len(running_losses), correct / total


for epoch in range(epochs):
    # ---- training phase ----------------------------------------------------
    train_running_losses = []
    train_total = 0
    train_correct = 0

    model.train()
    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients left over from the previous step before the new
        # backward pass accumulates into them.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = batch_loss(outputs, batch['labels'])

        loss.backward()
        optimizer.step()

        predicted_labels = outputs.logits.argmax(dim=1)
        train_total += batch['labels'].size(0)
        train_correct += (predicted_labels == batch['labels']).sum().item()
        train_running_losses.append(loss.item())

    train_epoch_loss = sum(train_running_losses) / len(train_running_losses)
    train_epoch_acc = train_correct / train_total
    train_losses.append(train_epoch_loss)
    train_accuracy.append(train_epoch_acc)
    print(f'TRAIN: Epoch [{epoch+1}/{epochs}] Loss: {train_epoch_loss} Acc: {train_epoch_acc}')

    # ---- validation phase --------------------------------------------------
    val_epoch_loss, val_epoch_acc = evaluate(val_dataloader)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_acc)
    print(f'VAL: Epoch [{epoch+1}/{epochs}] Loss: {val_epoch_loss} Acc: {val_epoch_acc}')

    # ---- optional test phase (only when a test split was loaded) ------------
    if include_dev:
        print('TESTING...')
        test_epoch_loss, test_epoch_acc = evaluate(test_dataloader, show_progress=False)
        test_losses.append(test_epoch_loss)
        test_accuracy.append(test_epoch_acc)
        print(f'TEST: Epoch [{epoch+1}/{epochs}] Loss: {test_epoch_loss} Acc: {test_epoch_acc}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

KeyboardInterrupt: 