In [3]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import string
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
import re
import torch.optim as optim
from sklearn.metrics import classification_report

nltk.download('punkt')
nltk.download('stopwords')

# Read training data from CSV file
train_df = pd.read_csv("propaganda_train.tsv",delimiter='\t', names=['label', 'sentence'])
# Read validation data from CSV file
val_df = pd.read_csv("propaganda_val.tsv", delimiter='\t', names=['label', 'sentence'])



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\r25\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\r25\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print(val_df.head())

                       label  \
0                      label   
1             not_propaganda   
2  causal_oversimplification   
3   appeal_to_fear_prejudice   
4             not_propaganda   

                                            sentence  
0                                  tagged_in_context  
1  On average, between 300 and 600 infections are...  
2  Mostly because <BOS> the country would not las...  
3  Lyndon Johnson <BOS> gets Earl Warren and Sen....  
4           <BOS> You <EOS> may opt out at anytime.   


In [5]:
def preprocess_text(text):
    special_tokens = ['<BOS>', '<EOS>', 'eos', 'bos', '<eos>', '<bos>']
    for token in special_tokens:
        text = text.replace(token, '')
    tokens = word_tokenize(text.lower())
    remove_tokens = set(stopwords.words('english') + list(string.punctuation))
    stemmer = PorterStemmer()
    tokens = [word for word in tokens if word not in remove_tokens]
    words = [stemmer.stem(token) for token in tokens]
    return ' '.join(words)

# Apply preprocessing to the dataset
train_df['sentence'] = train_df['sentence'].apply(preprocess_text)
val_df['sentence'] = val_df['sentence'].apply(preprocess_text)

# Remove erroneous 'label' entries
train_df = train_df[train_df['label'] != 'label']
val_df = val_df[val_df['label'] != 'label']

# Update labels for binary classification: 1 for any propaganda technique, 0 for 'not_propaganda'
train_df['binary_label'] = (train_df['label'] != 'not_propaganda').astype(int)
val_df['binary_label'] = (val_df['label'] != 'not_propaganda').astype(int)


In [6]:
print(train_df.columns)


Index(['label', 'sentence', 'binary_label'], dtype='object')


In [7]:
print(val_df.head())

                       label  \
1             not_propaganda   
2  causal_oversimplification   
3   appeal_to_fear_prejudice   
4             not_propaganda   
5                 repetition   

                                            sentence  binary_label  
1  averag 300 600 infect record everi year among ...             0  
2  mostli countri would last long without outsid ...             1  
3  lyndon johnson get earl warren sen. richard ru...             1  
4                                     may opt anytim             0  
5  must exact directli order vilifi humili islam ...             1  


In [8]:
print(train_df.head())

            label                                           sentence  \
1  not_propaganda                                            confirm   
2  not_propaganda    declassif effort ’ make thing wors presid trump   
3     flag_waving  obama administr misl american peopl congress d...   
4  not_propaganda  “ look like ’ captur demis dark vortex ’ diffe...   
5  not_propaganda                               locat westervil ohio   

   binary_label  
1             0  
2             0  
3             1  
4             0  
5             0  


In [9]:
print(train_df.columns)


Index(['label', 'sentence', 'binary_label'], dtype='object')


In [10]:


# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(train_df['sentence'])
X_test_vec = vectorizer.transform(val_df['sentence'])

#Hyperparameter Tuning: MultinomialNB with Laplace smoothing
clf = MultinomialNB(alpha=1.0)  # Laplace smoothing (alpha=1.0)
clf.fit(X_train_vec, train_df['binary_label'])

# Predict on the training set
y_train_pred = clf.predict(X_train_vec)
# Calculate training accuracy
train_accuracy = accuracy_score(train_df['binary_label'], y_train_pred)

# Predict on the test set
y_test_pred = clf.predict(X_test_vec)
# Calculate testing accuracy
test_accuracy = accuracy_score(val_df['binary_label'], y_test_pred)

# Print training and testing accuracies
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

Training Accuracy: 0.8910521955260977
Testing Accuracy: 0.6827586206896552


In [11]:
# Traditional Machine Learning with TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_df['sentence'])
X_val_tfidf = vectorizer.transform(val_df['sentence'])

y_train = train_df['label'].tolist()
y_val = val_df['label'].tolist()

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Train SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train_encoded)

# Evaluate SVM classifier
svm_train_predictions = svm_classifier.predict(X_train_tfidf)
svm_val_predictions = svm_classifier.predict(X_val_tfidf)

svm_train_accuracy = accuracy_score(y_train_encoded, svm_train_predictions)
svm_val_accuracy = accuracy_score(y_val_encoded, svm_val_predictions)

print("SVM Classifier:")
print(f"Training Accuracy: {svm_train_accuracy:.4f}")
print(f"Testing Accuracy: {svm_val_accuracy:.4f}")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define BERT DataLoader for training and validation
batch_size = 16

train_encodings = tokenizer(list(train_df['sentence']), truncation=True, padding=True)
val_encodings = tokenizer(list(val_df['sentence']), truncation=True, padding=True)

train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(y_train_encoded, dtype=torch.long)  # Use encoded labels

val_inputs = torch.tensor(val_encodings['input_ids'])
val_masks = torch.tensor(val_encodings['attention_mask'])
val_labels = torch.tensor(y_val_encoded, dtype=torch.long)  # Use encoded labels

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    # CUDA is available, print the CUDA device
    device = torch.device("cuda")
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    # CUDA is not available, use CPU
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_df['label'].unique()))
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Train BERT model
epochs = 6
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    train_predictions = []
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().detach().numpy())

    model.eval()
    total_val_loss = 0
    val_predictions, val_true_labels = [], []
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
            outputs = model(**inputs)
            loss = outputs.loss
            total_val_loss += loss.item()
            val_predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().detach().numpy())
            val_true_labels.extend(inputs["labels"].cpu().detach().numpy())

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_val_loss = total_val_loss / len(val_dataloader)
    train_accuracy = accuracy_score(y_train_encoded, train_predictions)
    val_accuracy = accuracy_score(y_val_encoded, val_predictions)

    print(f"BERT Classifier:")
    print(f"Epoch {epoch + 1}:")
    print(f"  Training Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

    # Evaluate BERT classifier
    print("BERT Classifier:")
    print(classification_report(y_val_encoded, val_predictions, target_names=label_encoder.classes_))


SVM Classifier:
Training Accuracy: 0.7415
Testing Accuracy: 0.5655




CUDA is available. Using GPU: NVIDIA RTX A4000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 151/151 [01:46<00:00,  1.42it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


BERT Classifier:
Epoch 1:
  Training Loss: 1.7477, Accuracy: 0.4743
  Validation Loss: 1.5958, Accuracy: 0.5190
BERT Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        43
causal_oversimplification       0.00      0.00      0.00        31
                    doubt       0.00      0.00      0.00        38
exaggeration,minimisation       0.00      0.00      0.00        28
              flag_waving       0.00      0.00      0.00        39
          loaded_language       0.00      0.00      0.00        37
    name_calling,labeling       0.00      0.00      0.00        31
           not_propaganda       0.52      1.00      0.68       301
               repetition       0.00      0.00      0.00        32

                 accuracy                           0.52       580
                macro avg       0.06      0.11      0.08       580
             weighted avg       0.27      0.52      0.35       58

Epoch 2: 100%|██████████| 151/151 [01:46<00:00,  1.41it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


BERT Classifier:
Epoch 2:
  Training Loss: 1.5701, Accuracy: 0.4640
  Validation Loss: 1.5206, Accuracy: 0.5414
BERT Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.09      0.16        43
causal_oversimplification       0.12      0.13      0.12        31
                    doubt       1.00      0.03      0.05        38
exaggeration,minimisation       0.00      0.00      0.00        28
              flag_waving       0.34      0.46      0.39        39
          loaded_language       0.00      0.00      0.00        37
    name_calling,labeling       0.00      0.00      0.00        31
           not_propaganda       0.59      0.95      0.73       301
               repetition       0.00      0.00      0.00        32

                 accuracy                           0.54       580
                macro avg       0.28      0.18      0.16       580
             weighted avg       0.44      0.54      0.43       58

Epoch 3: 100%|██████████| 151/151 [01:48<00:00,  1.39it/s]


BERT Classifier:
Epoch 3:
  Training Loss: 1.3609, Accuracy: 0.3691
  Validation Loss: 1.4863, Accuracy: 0.5500
BERT Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.34      0.37      0.36        43
causal_oversimplification       0.00      0.00      0.00        31
                    doubt       0.00      0.00      0.00        38
exaggeration,minimisation       1.00      0.04      0.07        28
              flag_waving       0.57      0.54      0.55        39
          loaded_language       0.20      0.03      0.05        37
    name_calling,labeling       0.13      0.29      0.18        31
           not_propaganda       0.66      0.89      0.76       301
               repetition       0.20      0.06      0.10        32

                 accuracy                           0.55       580
                macro avg       0.34      0.25      0.23       580
             weighted avg       0.49      0.55      0.48       58

Epoch 4: 100%|██████████| 151/151 [01:48<00:00,  1.39it/s]


BERT Classifier:
Epoch 4:
  Training Loss: 1.0949, Accuracy: 0.3007
  Validation Loss: 1.5844, Accuracy: 0.5397
BERT Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.25      0.49      0.33        43
causal_oversimplification       0.22      0.16      0.19        31
                    doubt       0.67      0.05      0.10        38
exaggeration,minimisation       0.00      0.00      0.00        28
              flag_waving       0.47      0.59      0.52        39
          loaded_language       0.18      0.11      0.14        37
    name_calling,labeling       0.13      0.06      0.09        31
           not_propaganda       0.67      0.85      0.75       301
               repetition       0.00      0.00      0.00        32

                 accuracy                           0.54       580
                macro avg       0.29      0.26      0.23       580
             weighted avg       0.47      0.54      0.48       58

Epoch 5: 100%|██████████| 151/151 [01:47<00:00,  1.40it/s]


BERT Classifier:
Epoch 5:
  Training Loss: 0.8884, Accuracy: 0.2962
  Validation Loss: 1.7038, Accuracy: 0.5448
BERT Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.36      0.30      0.33        43
causal_oversimplification       0.19      0.45      0.27        31
                    doubt       0.50      0.05      0.10        38
exaggeration,minimisation       0.23      0.11      0.15        28
              flag_waving       0.53      0.51      0.52        39
          loaded_language       0.25      0.05      0.09        37
    name_calling,labeling       0.16      0.10      0.12        31
           not_propaganda       0.68      0.85      0.75       301
               repetition       0.29      0.12      0.17        32

                 accuracy                           0.54       580
                macro avg       0.35      0.28      0.28       580
             weighted avg       0.51      0.54      0.50       58

Epoch 6: 100%|██████████| 151/151 [01:48<00:00,  1.39it/s]


BERT Classifier:
Epoch 6:
  Training Loss: 0.7141, Accuracy: 0.2850
  Validation Loss: 1.7764, Accuracy: 0.5259
BERT Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.30      0.26      0.28        43
causal_oversimplification       0.15      0.26      0.19        31
                    doubt       0.17      0.13      0.15        38
exaggeration,minimisation       0.33      0.14      0.20        28
              flag_waving       0.62      0.54      0.58        39
          loaded_language       0.18      0.05      0.08        37
    name_calling,labeling       0.27      0.13      0.17        31
           not_propaganda       0.68      0.81      0.74       301
               repetition       0.22      0.22      0.22        32

                 accuracy                           0.53       580
                macro avg       0.32      0.28      0.29       580
             weighted avg       0.49      0.53      0.50       58

In [12]:
from sklearn.metrics import classification_report

# Evaluate SVM classifier
print("SVM Classifier:")
print(classification_report(y_val_encoded, svm_val_predictions, target_names=label_encoder.classes_))


SVM Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.46      0.14      0.21        43
causal_oversimplification       0.20      0.03      0.06        31
                    doubt       0.40      0.11      0.17        38
exaggeration,minimisation       0.50      0.11      0.18        28
              flag_waving       0.76      0.41      0.53        39
          loaded_language       0.00      0.00      0.00        37
    name_calling,labeling       0.20      0.03      0.06        31
           not_propaganda       0.57      0.97      0.72       301
               repetition       0.83      0.16      0.26        32

                 accuracy                           0.57       580
                macro avg       0.44      0.22      0.24       580
             weighted avg       0.50      0.57      0.46       580



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.metrics import classification_report

# Evaluate BERT classifier
print("BERT Classifier:")
print(classification_report(y_val_encoded, val_predictions, target_names=label_encoder.classes_))


BERT Classifier:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.30      0.26      0.28        43
causal_oversimplification       0.15      0.26      0.19        31
                    doubt       0.17      0.13      0.15        38
exaggeration,minimisation       0.33      0.14      0.20        28
              flag_waving       0.62      0.54      0.58        39
          loaded_language       0.18      0.05      0.08        37
    name_calling,labeling       0.27      0.13      0.17        31
           not_propaganda       0.68      0.81      0.74       301
               repetition       0.22      0.22      0.22        32

                 accuracy                           0.53       580
                macro avg       0.32      0.28      0.29       580
             weighted avg       0.49      0.53      0.50       580



2nd question

In [14]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

# Load data
train_df = pd.read_csv("propaganda_train.tsv", delimiter='\t', names=['label', 'sentence'])
val_df = pd.read_csv("propaganda_val.tsv", delimiter='\t', names=['label', 'sentence'])

# Define preprocessing functions
def extract_text_between_bos_eos(text):
    pattern = re.compile(r'<BOS>(.*?)<EOS>')
    matches = pattern.findall(text)
    return ' '.join(matches)

def preprocess_text(text):
    text = text.lower().replace('<bos>', '').replace('<eos>', '')
    tokens = word_tokenize(text)
    remove_tokens = set(stopwords.words('english') + list(string.punctuation))
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens if word not in remove_tokens]
    return ' '.join(tokens)

# Apply preprocessing
train_df['tagged_in_context'] = train_df['sentence'].apply(extract_text_between_bos_eos)
val_df['tagged_in_context'] = val_df['sentence'].apply(extract_text_between_bos_eos)

train_df['tagged_in_context'] = train_df['tagged_in_context'].apply(preprocess_text)
val_df['tagged_in_context'] = val_df['tagged_in_context'].apply(preprocess_text)

# Remove erroneous 'label' entries
train_df = train_df[train_df['label'] != 'label']
val_df = val_df[val_df['label'] != 'label']

# Filter out 'not_propaganda' entries
train_df = train_df[train_df['label'] != 'not_propaganda']
val_df = val_df[val_df['label'] != 'not_propaganda']

# Traditional Machine Learning with CountVectorizer
vectorizer = CountVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_df['tagged_in_context'])
X_val_tfidf = vectorizer.transform(val_df['tagged_in_context'])

y_train = train_df['label'].tolist()
y_val = val_df['label'].tolist()

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Train Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train_encoded)

# Evaluate Multinomial Naive Bayes classifier
nb_train_predictions = nb_classifier.predict(X_train_tfidf)
nb_val_predictions = nb_classifier.predict(X_val_tfidf)

nb_train_accuracy = accuracy_score(y_train_encoded, nb_train_predictions)
nb_val_accuracy = accuracy_score(y_val_encoded, nb_val_predictions)

print("MultinomialNB Classifier:")
print(f"Training Accuracy: {nb_train_accuracy:.4f}")
print(f"Testing Accuracy: {nb_val_accuracy:.4f}")

# Fine-tune BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_df['label'].unique()))

optimizer = AdamW(model.parameters(), lr=2e-5)

batch_size = 16
train_encodings = tokenizer(list(train_df['tagged_in_context']), truncation=True, padding=True)
val_encodings = tokenizer(list(val_df['tagged_in_context']), truncation=True, padding=True)

train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(y_train_encoded, dtype=torch.long)  # Convert to torch.long

val_inputs = torch.tensor(val_encodings['input_ids'])
val_masks = torch.tensor(val_encodings['attention_mask'])
val_labels = torch.tensor(y_val_encoded, dtype=torch.long)  # Convert to torch.long

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
epochs = 10

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    train_predictions = []
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().detach().numpy())

    model.eval()
    total_val_loss = 0
    val_predictions, val_true_labels = [], []
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
            outputs = model(**inputs)
            loss = outputs.loss
            total_val_loss += loss.item()
            val_predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().detach().numpy())
            val_true_labels.extend(inputs["labels"].cpu().detach().numpy())

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_val_loss = total_val_loss / len(val_dataloader)
    train_accuracy = accuracy_score(y_train_encoded, train_predictions)
    val_accuracy = accuracy_score(y_val_encoded, val_predictions)

    print(f"BERT Classifier:")
    print(f"Epoch {epoch + 1}:")
    print(f"  Training Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")


MultinomialNB Classifier:
Training Accuracy: 0.8659
Testing Accuracy: 0.3978


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 77/77 [00:50<00:00,  1.51it/s]


BERT Classifier:
Epoch 1:
  Training Loss: 1.9719, Accuracy: 0.1325
  Validation Loss: 1.8509, Accuracy: 0.2688


Epoch 2: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 2:
  Training Loss: 1.7341, Accuracy: 0.1308
  Validation Loss: 1.7466, Accuracy: 0.3333


Epoch 3: 100%|██████████| 77/77 [00:49<00:00,  1.55it/s]


BERT Classifier:
Epoch 3:
  Training Loss: 1.4847, Accuracy: 0.1259
  Validation Loss: 1.6573, Accuracy: 0.3978


Epoch 4: 100%|██████████| 77/77 [00:49<00:00,  1.55it/s]


BERT Classifier:
Epoch 4:
  Training Loss: 1.1980, Accuracy: 0.1235
  Validation Loss: 1.6331, Accuracy: 0.4373


Epoch 5: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 5:
  Training Loss: 0.9172, Accuracy: 0.1259
  Validation Loss: 1.6346, Accuracy: 0.4480


Epoch 6: 100%|██████████| 77/77 [00:50<00:00,  1.54it/s]


BERT Classifier:
Epoch 6:
  Training Loss: 0.6112, Accuracy: 0.1496
  Validation Loss: 1.7150, Accuracy: 0.4552


Epoch 7: 100%|██████████| 77/77 [00:50<00:00,  1.54it/s]


BERT Classifier:
Epoch 7:
  Training Loss: 0.4493, Accuracy: 0.1300
  Validation Loss: 1.7592, Accuracy: 0.4803


Epoch 8: 100%|██████████| 77/77 [00:49<00:00,  1.55it/s]


BERT Classifier:
Epoch 8:
  Training Loss: 0.2995, Accuracy: 0.1382
  Validation Loss: 1.9215, Accuracy: 0.4695


Epoch 9: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 9:
  Training Loss: 0.2279, Accuracy: 0.1161
  Validation Loss: 1.9740, Accuracy: 0.4767


Epoch 10: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 10:
  Training Loss: 0.1686, Accuracy: 0.1169
  Validation Loss: 2.1411, Accuracy: 0.4659


In [15]:

# Load data
train_df = pd.read_csv("propaganda_train.tsv", delimiter='\t', names=['label', 'sentence'])
val_df = pd.read_csv("propaganda_val.tsv", delimiter='\t', names=['label', 'sentence'])

# Define preprocessing functions
def extract_text_between_bos_eos(text):
    pattern = re.compile(r'<BOS>(.*?)<EOS>')
    matches = pattern.findall(text)
    return ' '.join(matches)

def preprocess_text(text):
    text = text.lower().replace('<bos>', '').replace('<eos>', '')
    tokens = word_tokenize(text)
    remove_tokens = set(stopwords.words('english') + list(string.punctuation))
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens if word not in remove_tokens]
    return ' '.join(tokens)

# Apply preprocessing
train_df['tagged_in_context'] = train_df['sentence'].apply(extract_text_between_bos_eos)
val_df['tagged_in_context'] = val_df['sentence'].apply(extract_text_between_bos_eos)

train_df['tagged_in_context'] = train_df['tagged_in_context'].apply(preprocess_text)
val_df['tagged_in_context'] = val_df['tagged_in_context'].apply(preprocess_text)

# Remove erroneous 'label' entries
train_df = train_df[train_df['label'] != 'label']
val_df = val_df[val_df['label'] != 'label']

# Filter out 'not_propaganda' entries
train_df = train_df[train_df['label'] != 'not_propaganda']
val_df = val_df[val_df['label'] != 'not_propaganda']

# Traditional Machine Learning with TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_df['tagged_in_context'])
X_val_tfidf = vectorizer.transform(val_df['tagged_in_context'])

y_train = train_df['label'].tolist()
y_val = val_df['label'].tolist()

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Train Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train_encoded)

# Evaluate Multinomial Naive Bayes classifier
nb_train_predictions = nb_classifier.predict(X_train_tfidf)
nb_val_predictions = nb_classifier.predict(X_val_tfidf)

nb_train_accuracy = accuracy_score(y_train_encoded, nb_train_predictions)
nb_val_accuracy = accuracy_score(y_val_encoded, nb_val_predictions)

print("MultinomialNB Classifier:")
print(f"Training Accuracy: {nb_train_accuracy:.4f}")
print(f"Testing Accuracy: {nb_val_accuracy:.4f}")

# Fine-tune BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_df['label'].unique()))

optimizer = AdamW(model.parameters(), lr=2e-5)

batch_size = 16
train_encodings = tokenizer(list(train_df['tagged_in_context']), truncation=True, padding=True)
val_encodings = tokenizer(list(val_df['tagged_in_context']), truncation=True, padding=True)

train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(y_train_encoded, dtype=torch.long)  # Convert to torch.long

val_inputs = torch.tensor(val_encodings['input_ids'])
val_masks = torch.tensor(val_encodings['attention_mask'])
val_labels = torch.tensor(y_val_encoded, dtype=torch.long)  # Convert to torch.long

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
epochs = 10

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    train_predictions = []
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().detach().numpy())

    model.eval()
    total_val_loss = 0
    val_predictions, val_true_labels = [], []
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[2]}
            outputs = model(**inputs)
            loss = outputs.loss
            total_val_loss += loss.item()
            val_predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().detach().numpy())
            val_true_labels.extend(inputs["labels"].cpu().detach().numpy())

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_val_loss = total_val_loss / len(val_dataloader)
    train_accuracy = accuracy_score(y_train_encoded, train_predictions)
    val_accuracy = accuracy_score(y_val_encoded, val_predictions)

    print(f"BERT Classifier:")
    print(f"Epoch {epoch + 1}:")
    print(f"  Training Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.4f}")


MultinomialNB Classifier:
Training Accuracy: 0.9117
Testing Accuracy: 0.4086


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 77/77 [00:49<00:00,  1.55it/s]


BERT Classifier:
Epoch 1:
  Training Loss: 1.9885, Accuracy: 0.1128
  Validation Loss: 1.8334, Accuracy: 0.3118


Epoch 2: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 2:
  Training Loss: 1.6987, Accuracy: 0.1292
  Validation Loss: 1.6497, Accuracy: 0.4158


Epoch 3: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 3:
  Training Loss: 1.4401, Accuracy: 0.1186
  Validation Loss: 1.4986, Accuracy: 0.4409


Epoch 4: 100%|██████████| 77/77 [00:50<00:00,  1.53it/s]


BERT Classifier:
Epoch 4:
  Training Loss: 1.1081, Accuracy: 0.1398
  Validation Loss: 1.4561, Accuracy: 0.4695


Epoch 5: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 5:
  Training Loss: 0.8210, Accuracy: 0.1357
  Validation Loss: 1.5338, Accuracy: 0.4516


Epoch 6: 100%|██████████| 77/77 [00:50<00:00,  1.54it/s]


BERT Classifier:
Epoch 6:
  Training Loss: 0.5599, Accuracy: 0.1243
  Validation Loss: 1.5647, Accuracy: 0.5018


Epoch 7: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 7:
  Training Loss: 0.3722, Accuracy: 0.1186
  Validation Loss: 1.7402, Accuracy: 0.4803


Epoch 8: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 8:
  Training Loss: 0.2504, Accuracy: 0.1038
  Validation Loss: 1.8036, Accuracy: 0.4946


Epoch 9: 100%|██████████| 77/77 [00:50<00:00,  1.53it/s]


BERT Classifier:
Epoch 9:
  Training Loss: 0.1728, Accuracy: 0.1267
  Validation Loss: 1.8671, Accuracy: 0.5125


Epoch 10: 100%|██████████| 77/77 [00:49<00:00,  1.54it/s]


BERT Classifier:
Epoch 10:
  Training Loss: 0.1490, Accuracy: 0.1243
  Validation Loss: 1.8736, Accuracy: 0.5125
