In [1]:
try :
  from google.colab import drive
  IN_COLAB = True
except:
  IN_COLAB = False
print('IN COLAB: ', IN_COLAB)

IN COLAB:  False


In [2]:
import os
import pandas as pd

parentdir = "./propaganda_dataset_v2"
train_file= "propaganda_train.tsv"
val_file= "propaganda_val.tsv"

train_path=os.path.join(parentdir,train_file)
val_path=os.path.join(parentdir,val_file)

if IN_COLAB:
  train_path = '/content/propaganda_train.tsv'
  val_path = '/content/propaganda_val.tsv'

train_df=pd.read_csv(train_path,delimiter="\t",quotechar='|')
val_df=pd.read_csv(val_path,delimiter="\t",quotechar='|')
train_df = train_df.drop(train_df[train_df['label'] == 'not_propaganda'].index)
val_df = val_df.drop(val_df[val_df['label'] == 'not_propaganda'].index)
train_df
val_df


  from pandas.core import (


Unnamed: 0,label,tagged_in_context
1,causal_oversimplification,Mostly because <BOS> the country would not las...
2,appeal_to_fear_prejudice,Lyndon Johnson <BOS> gets Earl Warren and Sen....
4,repetition,It must be exacted from him directly in order ...
5,"name_calling,labeling",Is it any wonder that priests and laity alike ...
6,loaded_language,Health workers have been asked to work with co...
...,...,...
629,flag_waving,"As a result, 58,177 <BOS> Americans <EOS> woul..."
631,flag_waving,<BOS> But if you are a freedom-loving American...
632,loaded_language,I heard lots of <BOS> gut-wrenching stories <E...
638,flag_waving,<BOS> He also sang an Islamic State fight song...


In [3]:
import re
experiment = True

def extract_tagged_section(row):
    pattern = r'<BOS>.*?<EOS>'
    match = re.search(pattern, row['tagged_in_context'])
    return match.group() if match else ""

def extract_text_inside_tags(row):
    pattern = r'<BOS>(.*?)<EOS>'
    match = re.search(pattern, row['tagged_in_context'])
    return match.group(1).strip() if match else ""

def transform_multi_label(row):
    new_value = label_to_id[row['label']]
    return new_value

def transform_strip_tag(row):
    sent = row['tagged_in_context']
    cleaned_string = sent.replace("<BOS>", "")
    cleaned_string = cleaned_string.replace("<EOS>", "")
    return cleaned_string



label_to_id = {'flag_waving': 0, 'exaggeration,minimisation': 1, 'causal_oversimplification': 2, 'name_calling,labeling': 3, 'repetition': 4, 'doubt': 5, 'loaded_language': 6, 'appeal_to_fear_prejudice': 7}
id_to_label = {0: 'flag_waving', 1: 'exaggeration,minimisation', 2: 'causal_oversimplification', 3: 'name_calling,labeling', 4: 'repetition', 5: 'doubt',6: 'loaded_language', 7: 'appeal_to_fear_prejudice'}

train_df['label_str'] = train_df.apply(transform_multi_label, axis=1)
train_df['extract_no_tags'] = train_df.apply(extract_text_inside_tags, axis=1)
train_df['extract_with_tags'] = train_df.apply(extract_tagged_section, axis=1)

val_df['label_str'] = val_df.apply(transform_multi_label, axis=1)
val_df['extract_no_tags'] = val_df.apply(extract_text_inside_tags, axis=1)
val_df['extract_with_tags'] = val_df.apply(extract_tagged_section, axis=1)

val_df

Unnamed: 0,label,tagged_in_context,label_str,extract_no_tags,extract_with_tags
1,causal_oversimplification,Mostly because <BOS> the country would not las...,2,the country would not last long without an out...,<BOS> the country would not last long without ...
2,appeal_to_fear_prejudice,Lyndon Johnson <BOS> gets Earl Warren and Sen....,7,gets Earl Warren and Sen. Richard Russel to jo...,<BOS> gets Earl Warren and Sen. Richard Russel...
4,repetition,It must be exacted from him directly in order ...,4,infidels,<BOS> infidels <EOS>
5,"name_calling,labeling",Is it any wonder that priests and laity alike ...,3,"the ""gay lifestyle","<BOS> the ""gay lifestyle <EOS>"
6,loaded_language,Health workers have been asked to work with co...,6,devastating communities,<BOS> devastating communities <EOS>
...,...,...,...,...,...
629,flag_waving,"As a result, 58,177 <BOS> Americans <EOS> woul...",0,Americans,<BOS> Americans <EOS>
631,flag_waving,<BOS> But if you are a freedom-loving American...,0,But if you are a freedom-loving American,<BOS> But if you are a freedom-loving American...
632,loaded_language,I heard lots of <BOS> gut-wrenching stories <E...,6,gut-wrenching stories,<BOS> gut-wrenching stories <EOS>
638,flag_waving,<BOS> He also sang an Islamic State fight song...,0,He also sang an Islamic State fight song and r...,<BOS> He also sang an Islamic State fight song...


In [4]:
# longest_string = val_df['extract_no_tags'].apply(lambda x: len(str(x))).max()
# print("Longest string length:", longest_string)

In [14]:
epochs = 24
lr = 1e-5
batch_size=25
max_len=150
n_classes = 8

sent_col = 'extract_no_tags'
target_col = 'label_str'

# sent_col = 'extract_with_tags'

In [15]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
import torch
import numpy as np



class CustomPropagandaDataset_vanilla(Dataset):
    def __init__(self,df, max_len, sent_col, target_col):
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.labels=torch.tensor([label for label in df[target_col]])
        # print(self.labels)
        self.texts=[tokenizer(text,padding='max_length',max_length=max_len,truncation=True,return_tensors="pt") for text in df[sent_col]]
        # print(self.texts)
    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self,idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self,idx):
        return self.texts[idx]

    def __getitem__(self,idx):
        batch_texts=self.get_batch_texts(idx)
        batch_y=self.get_batch_labels(idx)

        return batch_texts,batch_y


def prepare_inputs(input1,label,device):
  label=label.to(device)
  mask=input1['attention_mask'].to(device)
  input_id=input1['input_ids'].squeeze(1).to(device)
  return (input_id,mask,label)

In [16]:
train_data = CustomPropagandaDataset_vanilla(train_df, max_len, sent_col, target_col)
val_data = CustomPropagandaDataset_vanilla(val_df, max_len, sent_col, target_col)

In [17]:
train_dataloader=torch.utils.data.DataLoader(train_data,batch_size=batch_size,shuffle=True)
val_dataloader=torch.utils.data.DataLoader(val_data,batch_size=batch_size)

In [18]:
from torch import nn

class BertClassifier(nn.Module):

    def __init__(self,dropout=0.5,num_classes=8):
        super(BertClassifier, self).__init__()

        self.bert=BertModel.from_pretrained('bert-base-uncased')
        self.dropout=nn.Dropout(dropout)
        self.linear=nn.Linear(768,num_classes)
        self.relu=nn.ReLU()

    def forward(self,input_id,mask):

        _, pooled_output = self.bert(input_ids=input_id,attention_mask=mask,return_dict=False)
        dropout_output=self.dropout(pooled_output)
        linear_output=self.linear(dropout_output)
        final_layer=self.relu(linear_output)

        return final_layer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [19]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import uuid


model=BertClassifier(num_classes=8)
criterion=nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)


train_acc_list = []
train_loss_list = []
val_acc_list = []
val_loss_list = []

best_val_acc = 0
best_epoch = 0
best_model_state = None

model_id = str(uuid.uuid4())


model.to(device)
for epoch_num in range(epochs):
        total_acc_train=0
        total_loss_train=0
        model.train()
        for train_input,train_label in tqdm(train_dataloader):

            input_id,mask, train_label=prepare_inputs(train_input,train_label,device)

            output_1=model(input_id,mask)

            batch_loss_1=criterion(output_1,train_label.long())
            total_loss_train +=batch_loss_1.item()

            acc=(output_1.argmax(dim=1)==train_label).sum().item()
            total_acc_train+=acc

            model.zero_grad()
            batch_loss_1.backward()
            optimizer.step()

        total_acc_val=0
        total_loss_val=0

        y_true = []
        y_pred = []
        model.eval()
        with torch.no_grad():
            for val_input,val_label in val_dataloader:

                input_id,mask, val_label=prepare_inputs(val_input,val_label,device)

                output_2= model(input_id,mask)

                # for scoring
                predicted = output_2.argmax(dim=1)
                y_true.extend(val_label.cpu().numpy())
                y_pred.extend(predicted.cpu().numpy())

                batch_loss_2=criterion(output_2,val_label.long())

                total_loss_val+=batch_loss_2.item()

                acc=(output_2.argmax(dim=1)==val_label).sum().item()
                total_acc_val+=acc
            train_acc = total_acc_train / len(train_data)

        train_loss = total_loss_train / len(train_data)
        val_acc = total_acc_val / len(val_data)
        val_loss = total_loss_val / len(val_data)

        train_acc_list.append(train_acc)
        train_loss_list.append(train_loss)
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)

        print(f'Epochs: {epoch_num+1} | Train Loss: {total_loss_train / len(train_data):.3f} | Train Accuracy: {total_acc_train/len(train_data):.3f}')
        print(f'Val loss: {total_loss_val/len(val_data):.3f} | Val Accuracy: {total_acc_val / len(val_data):.3f}')
        if val_acc > best_val_acc:
          best_epoch = epoch_num
          y_true_best = y_true.copy()
          y_pred_best = y_pred.copy()
          best_val_acc = val_acc
          best_model_state = model.state_dict()

        if epoch_num == epochs-1:
            print(f'______{model_id}______')
            print(f'LR: {lr} FINAL ACC = {total_acc_val / len(val_data):.3f}')
            print(f'LR: {lr} BEST ACC = {best_val_acc:.3f}')
            print('____________')


# Plot the accuracy and loss curves over epochs
epochs_range = range(1, epochs+1)

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, train_acc_list, label='Training Accuracy')
plt.plot(epochs_range, val_acc_list, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.title('Accuracy Curves')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_loss_list, label='Training Loss')
plt.plot(epochs_range, val_loss_list, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.title('Loss Curves')

plt.tight_layout()
plt.show()


100%|██████████| 52/52 [01:23<00:00,  1.61s/it]


Epochs: 1 | Train Loss: 0.084 | Train Accuracy: 0.148
Val loss: 0.086 | Val Accuracy: 0.188


 13%|█▎        | 7/52 [00:12<01:19,  1.76s/it]


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import json
# analysis of best performing model
classes = list(id_to_label.values())
cm = confusion_matrix(y_true_best, y_pred_best)

disp = ConfusionMatrixDisplay(cm)
disp.plot()

# Compute precision, recall, F1-score, and other metrics for the best model
report = classification_report(y_true_best, y_pred_best, target_names=classes)
print('Classification Report:')
print(report)

TypeError: too many positional arguments

In [12]:
results_dict = {
    'model_id': model_id,
    'train_accuracy': train_acc_list,
    'train_loss': train_loss_list,
    'val_accuracy': val_acc_list,
    'val_loss': val_loss_list,
    'hyperparameters': {
        'learning_rate': lr,
        'num_epochs': epochs,
        'batch_size': batch_size,
        'max_len': max_len
    },
    'results': {
        "classes": n_classes,
        "last_acc": val_acc_list[-1],
        'best_acc': best_val_acc,
        'best_acc_epoch': best_epoch,
        'confusion_matrix': cm.tolist(),
        'classification_report': report
    }
}


# Save the results dictionary as a JSON file with the model ID
results_filename = f'./results/multiclass_results_{model_id}.json'
with open(results_filename, 'w') as f:
    json.dump(results_dict, f, indent=4)

# Save the best model state with the model ID
model_filename = f'./results/multiclass_best_model_{model_id}.pth'
torch.save(best_model_state, model_filename)

NameError: name 'cm' is not defined