In [15]:
import os
import pandas as pd

parentdir = "./propaganda_dataset_v2"
train_file= "propaganda_train.tsv"
val_file= "propaganda_val.tsv"

train_path=os.path.join(parentdir,train_file)
val_path=os.path.join(parentdir,val_file)

train_df=pd.read_csv(train_path,delimiter="\t",quotechar='|')
val_df=pd.read_csv(val_path,delimiter="\t",quotechar='|')



In [23]:
def transform_binaryify(row):
    new_value = 0 if row['label'] == 'not_propaganda' else 1
    return new_value

def transform_strip_tag(row):
    sent = row['tagged_in_context']
    cleaned_string = sent.replace("<BOS>", "")
    cleaned_string = cleaned_string.replace("<EOS>", "")
    return cleaned_string


train_df['propaganda'] = train_df.apply(transform_binaryify, axis=1)
train_df['original_without_snip_tags'] = train_df.apply(transform_strip_tag, axis=1)


val_df['propaganda'] = val_df.apply(transform_binaryify, axis=1)
val_df['original_without_snip_tags'] = val_df.apply(transform_strip_tag, axis=1)

train_df
val_df

Unnamed: 0,label,tagged_in_context,propaganda,original_without_snip_tags
0,not_propaganda,"On average, between 300 and 600 infections are...",0,"On average, between 300 and 600 infections are..."
1,causal_oversimplification,Mostly because <BOS> the country would not las...,1,Mostly because the country would not last lon...
2,appeal_to_fear_prejudice,Lyndon Johnson <BOS> gets Earl Warren and Sen....,1,Lyndon Johnson gets Earl Warren and Sen. Rich...
3,not_propaganda,<BOS> You <EOS> may opt out at anytime.,0,You may opt out at anytime.
4,repetition,It must be exacted from him directly in order ...,1,It must be exacted from him directly in order ...
...,...,...,...,...
635,not_propaganda,"NewsCatholic Church, <BOS> Family, Marriage <E...",0,"NewsCatholic Church, Family, Marriage"
636,not_propaganda,"Remember our saying, modern day fairy <BOS> ta...",0,"Remember our saying, modern day fairy tales s..."
637,not_propaganda,Why <BOS> not <EOS> open up to Iran with massi...,0,Why not open up to Iran with massive amounts...
638,flag_waving,<BOS> He also sang an Islamic State fight song...,1,He also sang an Islamic State fight song and ...


In [24]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
import torch
import numpy as np


class CustomPropagandaDataset_vanilla(Dataset):
    def __init__(self,df):
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        
        self.labels=torch.tensor([label for label in df['propaganda']])
        self.texts=[tokenizer(text,padding='max_length',max_length=150,truncation=True,return_tensors="pt") for text in df['original_without_snip_tags']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self,idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self,idx):
        return self.texts[idx]

    def __getitem__(self,idx):
        batch_texts=self.get_batch_texts(idx)
        batch_y=self.get_batch_labels(idx)

        return batch_texts,batch_y


def prepare_inputs(input1,label,device):
  label=label.to(device)
  mask=input1['attention_mask'].to(device)
  input_id=input1['input_ids'].squeeze(1).to(device)
  return (input_id,mask,label)

In [25]:
train_data = CustomPropagandaDataset_vanilla(train_df)
val_data = CustomPropagandaDataset_vanilla(val_df)

In [26]:
train_dataloader=torch.utils.data.DataLoader(train_data,batch_size=50,shuffle=True)
val_dataloader=torch.utils.data.DataLoader(val_data,batch_size=50)

In [27]:
from torch import nn

class BertClassifier(nn.Module):

    def __init__(self,dropout=0.5,num_classes=2):
        super(BertClassifier, self).__init__()

        self.bert=BertModel.from_pretrained('bert-base-uncased')
        self.dropout=nn.Dropout(dropout)
        self.linear=nn.Linear(768,num_classes)
        self.relu=nn.ReLU()

    def forward(self,input_id,mask):

        _, pooled_output = self.bert(input_ids=input_id,attention_mask=mask,return_dict=False)
        dropout_output=self.dropout(pooled_output)
        linear_output=self.linear(dropout_output)
        final_layer=self.relu(linear_output)

        return final_layer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [28]:
# wills
epochs = 16
lr = 5e-6
batch_size=50
max_len=150

In [29]:
from tqdm import tqdm


model=BertClassifier(num_classes=2).to(device)
criterion=nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

for epoch_num in range(epochs):
        total_acc_train=0
        total_loss_train=0
        model.train()
        for train_input,train_label in tqdm(train_dataloader):

            input_id,mask, train_label=prepare_inputs(train_input,train_label,device)

            output_1=model(input_id,mask)

            batch_loss_1=criterion(output_1,train_label.long())
            total_loss_train +=batch_loss_1.item()

            acc=(output_1.argmax(dim=1)==train_label).sum().item()
            total_acc_train+=acc

            model.zero_grad()
            batch_loss_1.backward()
            optimizer.step()

        total_acc_val=0
        total_loss_val=0
        model.eval()
        with torch.no_grad():
            for val_input,val_label in val_dataloader:

                input_id,mask, val_label=prepare_inputs(val_input,val_label,device)

                output_2= model(input_id,mask)

                batch_loss_2=criterion(output_2,val_label.long())

                total_loss_val+=batch_loss_2.item()

                acc=(output_2.argmax(dim=1)==val_label).sum().item()
                total_acc_val+=acc

        print(f'Epochs: {epoch_num+1} | Train Loss: {total_loss_train / len(train_data):.3f} | Train Accuracy: {total_acc_train/len(train_data):.3f}')
        print(f'Val loss: {total_loss_val/len(val_data):.3f} | Val Accuracy: {total_acc_val / len(val_data):.3f}')
        
        if epoch_num == epochs-1:
            print('____________')
            print(f'LR: {lr} FINAL ACC = {total_acc_val / len(val_data):.3f}')
            print('____________')

100%|██████████| 52/52 [00:29<00:00,  1.78it/s]


Epochs: 1 | Train Loss: 0.014 | Train Accuracy: 0.491
Val loss: 0.014 | Val Accuracy: 0.517


100%|██████████| 52/52 [00:30<00:00,  1.71it/s]


Epochs: 2 | Train Loss: 0.014 | Train Accuracy: 0.502
Val loss: 0.014 | Val Accuracy: 0.517


100%|██████████| 52/52 [00:31<00:00,  1.67it/s]


Epochs: 3 | Train Loss: 0.014 | Train Accuracy: 0.552
Val loss: 0.013 | Val Accuracy: 0.614


100%|██████████| 52/52 [00:31<00:00,  1.65it/s]


Epochs: 4 | Train Loss: 0.014 | Train Accuracy: 0.598
Val loss: 0.013 | Val Accuracy: 0.623


100%|██████████| 52/52 [00:31<00:00,  1.65it/s]


Epochs: 5 | Train Loss: 0.013 | Train Accuracy: 0.618
Val loss: 0.013 | Val Accuracy: 0.622


 79%|███████▉  | 41/52 [00:25<00:06,  1.61it/s]

: 