# CODE FOR ENTIRE W2V MODEL BUILDING AND TESTING FOR TASK 1

In [21]:
# imports

import os
import pandas as pd 
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize as tokenize
from nltk.corpus import stopwords as stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
from torch import nn
from tqdm import tqdm
import uuid
import re
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import json

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /Users/rfd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/rfd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/rfd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# loading data if in colab/not in colab

try :
  from google.colab import drive
  IN_COLAB = True
except:
  IN_COLAB = False
print('IN COLAB: ', IN_COLAB)


parentdir = "./propaganda_dataset_v2"
train_file= "propaganda_train.tsv"
val_file= "propaganda_val.tsv"

train_path=os.path.join(parentdir,train_file)
val_path=os.path.join(parentdir,val_file)

if IN_COLAB:
  train_path = '/content/propaganda_train.tsv'
  val_path = '/content/propaganda_val.tsv'

train_df=pd.read_csv(train_path,delimiter="\t",quotechar='|')
val_df=pd.read_csv(val_path,delimiter="\t",quotechar='|')


IN COLAB:  False


In [23]:
# pre-process data for binary task 
# remove snippet tags
# create binary target variable

def transform_binaryify(row):
    new_value = 0 if row['label'] == 'not_propaganda' else 1
    return new_value

def transform_strip_tag(row):
    sent = row['tagged_in_context']
    cleaned_string = sent.replace("<BOS>", "")
    cleaned_string = cleaned_string.replace("<EOS>", "")
    return cleaned_string


train_df['propaganda'] = train_df.apply(transform_binaryify, axis=1)
train_df['original_without_snip_tags'] = train_df.apply(transform_strip_tag, axis=1)


val_df['propaganda'] = val_df.apply(transform_binaryify, axis=1)
val_df['original_without_snip_tags'] = val_df.apply(transform_strip_tag, axis=1)

# check it has worked correctly

train_df
val_df

Unnamed: 0,label,tagged_in_context,propaganda,original_without_snip_tags
0,not_propaganda,"On average, between 300 and 600 infections are...",0,"On average, between 300 and 600 infections are..."
1,causal_oversimplification,Mostly because <BOS> the country would not las...,1,Mostly because the country would not last lon...
2,appeal_to_fear_prejudice,Lyndon Johnson <BOS> gets Earl Warren and Sen....,1,Lyndon Johnson gets Earl Warren and Sen. Rich...
3,not_propaganda,<BOS> You <EOS> may opt out at anytime.,0,You may opt out at anytime.
4,repetition,It must be exacted from him directly in order ...,1,It must be exacted from him directly in order ...
...,...,...,...,...
635,not_propaganda,"NewsCatholic Church, <BOS> Family, Marriage <E...",0,"NewsCatholic Church, Family, Marriage"
636,not_propaganda,"Remember our saying, modern day fairy <BOS> ta...",0,"Remember our saying, modern day fairy tales s..."
637,not_propaganda,Why <BOS> not <EOS> open up to Iran with massi...,0,Why not open up to Iran with massive amounts...
638,flag_waving,<BOS> He also sang an Islamic State fight song...,1,He also sang an Islamic State fight song and ...


In [24]:
# load in W2V vectors
pretrained_model_path = './GoogleNews-vectors-negative300.bin.gz' 
my_model = KeyedVectors.load_word2vec_format(pretrained_model_path, binary=True)

In [30]:
# further pre-processing
# these functions carry out the pre processing describe in the assignment
# they are called as part of the custom Dataset initialisation
# both functions take in a single string and return the word2vec pooled vector representation

def transform_tokens_to_w2v_with_stems(string, model, pool_type='mean', normalise = False):
    print('string:', string)
    
    stops = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()
    snowball_stemmer = SnowballStemmer('english')
    vectors = []
    original_tokens = tokenize(string)
    print('original tokens:', original_tokens)
    cleaned_tokens = [w for w in original_tokens if len(w) > 1 and w.isalpha() and w not in stops]
    print('cleaned:',cleaned_tokens)
    
    for token in cleaned_tokens:
        if token in model.key_to_index:
            vectors.append(model.get_vector(token, norm=normalise))
        else:
            print('STEMMING')
            porter_stem = porter_stemmer.stem(token)
            snowball_stem = snowball_stemmer.stem(token)

            if porter_stem in model.key_to_index:
                print('porter', porter_stem)
                vectors.append(model.get_vector(porter_stem, norm=normalise))
            elif snowball_stem in model.key_to_index:
                print('snowball', snowball_stem)
                vectors.append(model.get_vector(snowball_stem, norm=normalise))
    
    if not vectors:
        return np.zeros(300)
    
    if pool_type == 'mean':
        return np.mean(vectors, axis=0)
    elif pool_type == 'max':
        return np.max(vectors, axis=0)
    elif pool_type == 'mult':
        result = np.ones(300)
        for vector in vectors:
            result *= vector
        return result
    else:
        raise ValueError("Invalid pool_type. Choose either 'mean' or 'max' or 'mult'.")

# specific function for computing reuseable variable for sif pooling
def precompute_sif_weights(model, a=1e-3):
    corpus_total_words = sum(model.get_vecattr(word, 'count') for word in model.key_to_index)
    word_weights = {word: a / (a + model.get_vecattr(word, 'count') / corpus_total_words) for word in model.key_to_index}
    return word_weights, corpus_total_words

def transform_tokens_to_w2v_no_stems(string, model, pool_type='mean', normalise=False, word_weights=None, corpus_total_words=None):
    print('string:', string)
    vectors = []
    legit_tokens = []
    stops = set(stopwords.words('english'))
    original_tokens = tokenize(string)
    print('original tokens:', original_tokens)
    
    cleaned_tokens = [w for w in original_tokens if len(w) > 1 and w.isalpha() and w not in stops]
    print('cleaned:',cleaned_tokens)
    for token in cleaned_tokens:
        if token in model.key_to_index:
            legit_tokens.append(token)
            vectors.append(model.get_vector(token, norm=normalise))

    if not vectors:
        return np.zeros(300)

    if pool_type == 'mean':
        return np.mean(vectors, axis=0)
    elif pool_type == 'max':
        return np.max(vectors, axis=0)
    elif pool_type == 'mult':
        result = np.ones(300)
        for vector in vectors:
            result *= vector
        return result
    elif pool_type == 'sif':
        if word_weights is None or corpus_total_words is None:
            raise ValueError("Word weights and corpus total words must be provided for SIF pooling.")

        # Compute the weighted average of word vectors
        weighted_average = np.zeros(300)
        word_weights_sum = sum(word_weights[token] for token in legit_tokens)
        for token, vector in zip(legit_tokens, vectors):
            weighted_average += word_weights[token] * vector
        weighted_average /= word_weights_sum

        # Perform common component removal
        pc = np.mean(vectors, axis=0)
        weighted_average -= np.dot(weighted_average, pc) * pc

        return weighted_average
    else:
        raise ValueError("Invalid pool_type. Choose either 'mean', 'max', 'mult', or 'sif'.")

In [31]:
# custom dataset for handling the data trasnformation and with dataloader handling batching

class CustomPropagandaDataset_vanilla(Dataset):
    def __init__(self,df, model, stems=False, norm = False, pool = 'mean'):
        self.word_weights, self.corpus_total_words = precompute_sif_weights(model)

        self.stems = stems
        self.labels=torch.tensor([label for label in df['propaganda']])
        
        if self.stems==True:
            self.texts=torch.tensor([transform_tokens_to_w2v_with_stems(string, model, normalise = norm, pool_type=pool) for string in df['original_without_snip_tags']])
        else:
            self.texts=torch.tensor([transform_tokens_to_w2v_no_stems(string, model, normalise = norm, pool_type=pool, word_weights=self.word_weights, corpus_total_words=self.corpus_total_words) for string in df['original_without_snip_tags']])
        
    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self,idx):
        texts=self.texts[idx]
        y=self.labels[idx]
        return texts,y



In [32]:
# define the NN

class Custom_MLP_W2V(nn.Module):
    def __init__(self, dropout=0.5, num_classes=2): 
        super().__init__()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.linear_1 = nn.Linear(300, 128)
        self.bn1 = nn.BatchNorm1d(128)  
        self.linear_2 = nn.Linear(128, 128)
        self.bn2 = nn.BatchNorm1d(128)  
        self.linear_3 = nn.Linear(128, num_classes)

    def forward(self, inputs):
        x = self.linear_1(inputs)
        x = self.bn1(x)  
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear_2(x)
        x = self.bn2(x)  
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear_3(x)
        return x

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [34]:
# define hyperparamaters for experimentation/refinement

epochs =50
n_classes = 2
bn = 100
lr = 5e-4
stem_v = False
pool_v = 'mean'
norm_v = True
dr = 0.5




for pool_v in ['mean']: # this first line can be used to iterate through different hyperparamater settings

    # initialise datasets
    train_data = CustomPropagandaDataset_vanilla(train_df, my_model, stems=stem_v, norm = norm_v, pool=pool_v)
    val_data = CustomPropagandaDataset_vanilla(val_df, my_model, stems=stem_v, norm = norm_v, pool=pool_v)
    # initialise dataloaders
    train_dataloader=torch.utils.data.DataLoader(train_data,batch_size=bn,shuffle=True)
    val_dataloader=torch.utils.data.DataLoader(val_data,batch_size=bn)

    # initialise model instance, loss and optimiser
    model=Custom_MLP_W2V(num_classes=n_classes, dropout=dr).to(device)
    criterion=nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

    # for accumulating data for plotting later
    train_acc_list = []
    train_loss_list = []
    val_acc_list = []
    val_loss_list = []

    # for tracking best performance
    best_val_acc = 0
    best_epoch = 0
    best_model_state = None
    
    # unique id for saving data
    model_id = str(uuid.uuid4())

    # beggining of training iteration
    for epoch_num in range(epochs):

        total_acc_train=0
        total_loss_train=0
        model.train()
        # training loop
        for train_input,train_label in train_dataloader:

            train_input = train_input.to(device).float()
            train_label = train_label.to(device)
            output_1=model(train_input)

            batch_loss_1=criterion(output_1,train_label)

            total_loss_train += batch_loss_1.item()

            acc=(output_1.argmax(dim=1)==train_label).sum().item()
            total_acc_train+=acc

            model.zero_grad()
            batch_loss_1.backward()
            optimizer.step()

        total_acc_val=0
        total_loss_val=0

        y_true = []
        y_pred = []
        model.eval()
        # validation loop
        with torch.no_grad():
            for val_input,val_label in val_dataloader:

                val_input = val_input.to(device).float()
                val_label = val_label.to(device)

                output_2=model(val_input)

                batch_loss_2=criterion(output_2,val_label)

                total_loss_val+=batch_loss_2.item()

                acc=(output_2.argmax(dim=1)==val_label).sum().item()
                total_acc_val+=acc

                # for scoring
                predicted = output_2.argmax(dim=1)
                y_true.extend(val_label.cpu().numpy())
                y_pred.extend(predicted.cpu().numpy())
        
        train_acc = total_acc_train / len(train_data)
        train_loss = total_loss_train / len(train_data)
        val_acc = total_acc_val / len(val_data)
        val_loss = total_loss_val / len(val_data)

        train_acc_list.append(train_acc)
        train_loss_list.append(train_loss)
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)
        
        # print running performance
        print(f'Epochs: {epoch_num+1} | Train Loss: {total_loss_train / len(train_data):.3f} | Train Accuracy: {total_acc_train/len(train_data):.3f}')
        print(f'Val loss: {total_loss_val/len(val_data):.3f} | Val Accuracy: {total_acc_val / len(val_data):.3f}')
        if val_acc > best_val_acc:
            best_epoch = epoch_num
            y_true_best = y_true.copy()
            y_pred_best = y_pred.copy()
            best_val_acc = val_acc
            best_model_state = model.state_dict()

        if epoch_num == epochs-1:
            print(f'______{model_id}______')
            print(f'LR: {lr} FINAL ACC = {total_acc_val / len(val_data):.3f}')
            print(f'LR: {lr} BEST ACC = {best_val_acc:.3f}')
            print('____________')
            
    # accuracy and loss curves over epochs
    epochs_range = range(1, epochs+1)

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, train_acc_list, label='Training Accuracy')
    plt.plot(epochs_range, val_acc_list, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim(0, 1)  # Set y-axis limits from 0 to 1 for accuracy
    plt.title(f'Accuracy for lr {lr}, batch size {bn}, best ac: {best_val_acc:.2f}')

    # plt.subplot(1, 2, 2)
    # plt.plot(epochs_range, train_loss_list, label='Training Loss')
    # plt.plot(epochs_range, val_loss_list, label='Validation Loss')
    # plt.xlabel('Epochs')
    # plt.ylabel('Loss')
    # plt.legend(loc='upper right')
    # plt.title('Loss Curves')

    plt.tight_layout()
    plt.show()
    
    # analysis of model at best performing
    cm = confusion_matrix(y_true_best, y_pred_best)
    disp = ConfusionMatrixDisplay(cm)
    disp.plot()

    # Compute precision, recall, F1-score, and other metrics for the best model
    report = classification_report(y_true_best, y_pred_best, target_names=['class 0', 'class 1'])
    print('Classification Report:')
    print(report)

    results_dict = {
        'model_id': model_id,
        'train_accuracy': train_acc_list,
        'train_loss': train_loss_list,
        'val_accuracy': val_acc_list,
        'val_loss': val_loss_list,
        'hyperparameters': {
            'learning_rate': lr,
            'num_epochs': epochs,
            'batch_size': bn,
            'stem_v':stem_v,
            'pool_v':pool_v,
            'norm_v':norm_v,
            'dropout_rate': dr,
        },
        'results': {
            "classes": n_classes,
            "last_acc": val_acc_list[-1],
            'best_acc': best_val_acc,
            'best_acc_epoch': best_epoch,
            'confusion_matrix': cm.tolist(),
            'classification_report': report
        }
    }


    # Save the results dictionary as a JSON file with the model ID
    results_filename = f'./results/w2v_binary/w2v_binary_results_{model_id}.json'
    with open(results_filename, 'w') as f:
        json.dump(results_dict, f, indent=4)

string: No,  he  will not be confirmed. 
original tokens: ['No', ',', 'he', 'will', 'not', 'be', 'confirmed', '.']
cleaned: ['No', 'confirmed']
string: This declassification effort  won’t make things any worse than they are for President Trump.  
original tokens: ['This', 'declassification', 'effort', 'won', '’', 't', 'make', 'things', 'any', 'worse', 'than', 'they', 'are', 'for', 'President', 'Trump', '.']
cleaned: ['This', 'declassification', 'effort', 'make', 'things', 'worse', 'President', 'Trump']
string: "The Obama administration misled the  American people  and Congress because they were desperate to get a deal with Iran," said Sen. 
original tokens: ['``', 'The', 'Obama', 'administration', 'misled', 'the', 'American', 'people', 'and', 'Congress', 'because', 'they', 'were', 'desperate', 'to', 'get', 'a', 'deal', 'with', 'Iran', ',', "''", 'said', 'Sen', '.']
cleaned: ['The', 'Obama', 'administration', 'misled', 'American', 'people', 'Congress', 'desperate', 'get', 'deal', 'Iran'