# Loading the dataset

In [27]:
import pandas as pd

# Augmented file produced by dataset.py when run on dataset A.
DATASET_CSV = '/content/augmented_dataset_tmp.csv'
df = pd.read_csv(filepath_or_buffer=DATASET_CSV)

# Bare last expression so the notebook renders the loaded frame as the cell output.
df

Unnamed: 0,comment_id,self_text,subreddit,created_date,post_id,author_name,source,label,dataset
0,l8hu7j0,"Hawkeye: War is not Hell. War is war, and Hell...",CombatFootage,6/13/2024,1df5t14,Not_an_alt_69_420,Israel-Palestine Comments,Undefined,A
1,l2u6bg5,"Yeah, and an actual war brainiac is one in whi...",IsraelPalestine,5/6/2024,1ck5j39,slplante78,Israel-Palestine Comments,Pro-Palestine,A
2,l0ipcsi,"Iraq war: [URL 34,144â71,544 combatants killed...",IsraelPalestine,4/20/2024,1c8hs1u,Aggravating_Key7750,Israel-Palestine Comments,Undefined,A
3,m1lxxx5,That is already what the Biden administration ...,worldnews,12/11/2024,1hc2nld,elihu,Israel-Palestine Comments,Undefined,A
4,k92915z,"""Never believe that anti-Semites are completel...",worldnews,11/13/2023,17u1i7e,IAMA_Drunk_Armadillo,Israel-Palestine Comments,Undefined,A
...,...,...,...,...,...,...,...,...,...
57570,l5um107_Augmented_2,They think Palestinian are children terrorists...,Palestine,5/27/2024,1d1dwi8,Loyal-Maker7195,Israel-Palestine Comments,Pro-Palestine,A
57571,l5um107_Augmented_3,They think Palestinian children are terrorists...,Palestine,5/27/2024,1d1dwi8,Loyal-Maker7195,Israel-Palestine Comments,Pro-Palestine,A
57572,k971ybn_Augmented_1,"So explain to me face killing 12,000 unrelated...",IsraelPalestine,11/14/2023,17ucdbi,ill-independent,Israel-Palestine Comments,Pro-Palestine,A
57573,k971ybn_Augmented_2,"So explain to me how killing 12,000 unrelated ...",IsraelPalestine,11/14/2023,17ucdbi,ill-independent,Israel-Palestine Comments,Pro-Palestine,A


In [28]:
# Number of rows per stance label; shown as the cell output.
label_counts = df['label'].value_counts()
label_counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Pro-Palestine,21444
Undefined,20415
Pro-Israel,15716


# Fine-tuning DistilBERT

The best model was trained with the configuration described below and reached a validation loss of 0.2176 at the end of epoch 3.

Train-validation split

In [4]:
from sklearn.model_selection import train_test_split


def _blank_if_not_str(value):
    """Return ``value`` unchanged when it is a ``str``; otherwise return ``""``."""
    return value if isinstance(value, str) else ""


# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split repeatable.
all_texts = df['self_text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Replace any non-string entry (e.g. NaN) with an empty string in both splits.
val_texts = list(map(_blank_if_not_str, val_texts))
train_texts = list(map(_blank_if_not_str, train_texts))

Convert labels to numerical format

In [5]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Predefined label mapping (class id as a string -> label name).
LABELS_DECODER = {
    "0": "Pro-Palestine",
    "1": "Pro-Israel",
    "2": "Undefined"
}

# Class names in id order, derived from LABELS_DECODER so the two cannot drift apart.
custom_classes = [LABELS_DECODER[str(class_id)] for class_id in range(len(LABELS_DECODER))]

# Explicit label -> id lookup used for the actual encoding.
LABEL_TO_ID = {label: class_id for class_id, label in enumerate(custom_classes)}

# Kept so later cells can still read `label_encoder.classes_` in the predefined order.
# NOTE: the classes are deliberately NOT alphabetically sorted, so `label_encoder.transform`
# is not used below; its behaviour on unsorted, manually assigned classes depends on
# scikit-learn internals.
label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(custom_classes)


def encode_labels(labels):
    """Map label names to integer class ids using the predefined order.

    Args:
        labels: Iterable of label names; each must be a key of ``LABEL_TO_ID``.

    Returns:
        ``np.ndarray`` of dtype ``int64`` with one class id per input label.

    Raises:
        ValueError: If any label is not part of the predefined mapping.
    """
    labels = list(labels)
    unknown = sorted({str(label) for label in labels if label not in LABEL_TO_ID})
    if unknown:
        raise ValueError(f"y contains previously unseen labels: {unknown}")
    return np.array([LABEL_TO_ID[label] for label in labels], dtype=np.int64)


# Transform the labels (train and validation)
train_labels = encode_labels(train_labels)
val_labels = encode_labels(val_labels)

print("Label Encoder Classes:", label_encoder.classes_)

Label Encoder Classes: ['Pro-Palestine' 'Pro-Israel' 'Undefined']


Tokenize Input

In [7]:
from transformers import DistilBertTokenizer

# The cased checkpoint is used because the dataset text is not lower-cased.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

# Same settings for both splits: truncate to at most 512 tokens and pad the encoded sequences.
encode_options = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:01<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Convert tokenized inputs into PyTorch Dataset

In [8]:
import torch

class TextDataset(torch.utils.data.Dataset):
    """Dataset pairing per-example encodings with one label per example.

    Args:
        encodings: Mapping from field name to an indexable sequence holding
            one entry per example (read via ``.items()``).
        labels: Indexable, sized sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and encoded labels in a TextDataset.
train_dataset, val_dataset = (
    TextDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)


Load DistilBERT and modify it for classification

In [33]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import os

# Folder where the best model (weights, config, tokenizer) is saved
BEST_MODEL_PATH = "/content/distilbert-finetuned"

# Option to load existing weights
load_existing_weights = True  # Set to True if you want to resume training

# Number of output classes, taken from the label encoder built in the label-encoding cell
num_labels = len(label_encoder.classes_)

if load_existing_weights and os.path.isdir(BEST_MODEL_PATH):
    # Resume: load the fine-tuned model and its tokenizer directly, without first
    # instantiating the base checkpoint only to discard it.
    print(f"Loading model from {BEST_MODEL_PATH}...")
    model = DistilBertForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
    tokenizer = DistilBertTokenizer.from_pretrained(BEST_MODEL_PATH)

    # Fail early if the saved classifier head does not match the current label set.
    if model.config.num_labels != num_labels:
        raise ValueError(
            f"Checkpoint at {BEST_MODEL_PATH} has {model.config.num_labels} labels, "
            f"expected {num_labels}"
        )
else:
    # Fresh start: pretrained encoder with a newly initialised classification head.
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-cased',
        num_labels=num_labels
    )
    print("No existing model found. Starting fresh.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No existing model found. Starting fresh.


Ensure the model gives more importance to underrepresented classes by adjusting weights in the loss function

In [30]:
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np

# Sanity check: the label column should hold the string labels ('Pro-Israel', etc.).
print(df['label'].unique())

# 'balanced' weights, requested in the label encoder's class order so that
# position i of the result corresponds to class id i.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_encoder.classes_,
    y=df['label'],  # string labels, used as-is
)

# Float tensor so the weights can be handed to the loss function.
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class Weights:", class_weights)

['Undefined' 'Pro-Palestine' 'Pro-Israel']
Class Weights: tensor([0.8950, 1.2212, 0.9401])


### Fine-Tune the DistilBERT Model

Set up training configurations

In [35]:
import torch
from torch.nn import CrossEntropyLoss
# transformers.AdamW is deprecated (and removed in recent transformers releases);
# the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_scheduler

BATCH_SIZE=32
EPOCHS=8
LR = 5e-5
PATIENCE = 2  # Stop after 2 epochs of no improvement
WARMUP_FRACTION = 0.1  # Share of all training steps spent on linear warm-up

# Detect GPU or use CPU as fallback
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Move the model to the device before building the optimizer over its parameters
model.to(device)

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Optimizer. eps and weight_decay are pinned to the defaults of the former
# transformers.AdamW so the optimisation settings stay the same after the switch
# (torch.optim.AdamW would otherwise default to eps=1e-8, weight_decay=0.01).
optimizer = AdamW(model.parameters(), lr=LR, eps=1e-6, weight_decay=0.0)

# Learning rate scheduler: linear warm-up followed by linear decay, sized for the
# full EPOCHS budget (one scheduler step per optimizer step).
num_training_steps = len(train_loader) * EPOCHS
num_warmup_steps = int(WARMUP_FRACTION * num_training_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Class-weighted cross-entropy; the weights must live on the same device as the logits
loss_fn = CrossEntropyLoss(weight=class_weights.to(device))

Using device: cuda


Training loop

In [None]:
from tqdm import tqdm
from transformers import DistilBertForSequenceClassification

# Set up early stopping parameters
best_val_loss = float('inf')
no_improvement = 0  # Consecutive epochs without a new best validation loss

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")

    # Training Loop with Progress Bar
    model.train()
    train_progress = tqdm(train_loader, desc="Training", leave=False)
    for batch in train_progress:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        # Keep the labels out of the forward pass: the model would otherwise compute
        # its own (unweighted) loss, which is never used here.
        labels = batch.pop('labels')
        outputs = model(**batch)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Update progress bar with the current loss
        train_progress.set_postfix(loss=loss.item())

    # Validation Loop with Progress Bar
    model.eval()
    val_loss = 0
    val_progress = tqdm(val_loader, desc="Validation", leave=False)
    with torch.no_grad():
        for batch in val_progress:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop('labels')
            outputs = model(**batch)
            loss = loss_fn(outputs.logits, labels)
            val_loss += loss.item()

            # Update progress bar with the current batch loss
            val_progress.set_postfix(loss=loss.item())

    # Mean of the per-batch losses (every batch counts equally, including a smaller last one)
    val_loss /= len(val_loader)
    print(f"Epoch {epoch + 1}: Validation Loss = {val_loss}")

    # Early Stopping Logic and Saving Best Model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improvement = 0

        # Save the best model
        # Save the entire model (weights, tokenizer, config) in one folder
        print(f"Saving best model at epoch {epoch + 1}")
        model.save_pretrained(BEST_MODEL_PATH)
        tokenizer.save_pretrained(BEST_MODEL_PATH)

    else:
        no_improvement += 1
        if no_improvement >= PATIENCE:
            print("Early stopping triggered")
            break

# Load the best model after training.
# BEST_MODEL_PATH is a directory written by save_pretrained(), not a state-dict file,
# so it has to be read with from_pretrained(); torch.load() on it would fail.
# The weights are copied into the existing `model` object so that `optimizer`
# keeps pointing at the live parameters.
print("Loading the best model from checkpoint...")
best_model = DistilBertForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
model.load_state_dict(best_model.state_dict())
del best_model
model.to(device)

Epoch 1/8




Epoch 1: Validation Loss = 0.4313041320691506
Saving best model at epoch 1
Epoch 2/8




Epoch 2: Validation Loss = 0.25484393255578147
Saving best model at epoch 2
Epoch 3/8




Epoch 3: Validation Loss = 0.2176127940739712
Saving best model at epoch 3
Epoch 4/8




Epoch 4: Validation Loss = 0.22716169894524177
Epoch 5/8


Training:  30%|██▉       | 431/1440 [10:09<23:46,  1.41s/it, loss=0.00175]

# Fitting a TF-IDF Representation

In [26]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the text corpus.
# The file is read with pandas' default encoding (UTF-8), the same way the dataset
# is loaded for fine-tuning; decoding it as ISO-8859-1 would turn every non-ASCII
# character into mojibake before the vocabulary is built.
corpus_path = '/content/augmented_dataset_tmp.csv'
corpus = (
    pd.read_csv(corpus_path)['self_text']
    .fillna('')   # TfidfVectorizer rejects NaN documents
    .astype(str)
)

# Fit the vectorizer: keep the 768 most frequent terms, ignoring English stop words
tfidf_vectorizer = TfidfVectorizer(max_features=768, stop_words='english')
tfidf_vectorizer.fit(corpus)

# Save the vectorizer
model_path = '/content/tfidf_vectorizer.pkl'
joblib.dump(tfidf_vectorizer, model_path)
print(f"TF-IDF vectorizer saved to {model_path}")

# Later, load the vectorizer
# NOTE: joblib.load unpickles the file; only load vectorizer files you created yourself.
loaded_vectorizer = joblib.load(model_path)
print("TF-IDF vectorizer loaded successfully")

TF-IDF vectorizer saved to /content/tfidf_vectorizer.pkl
TF-IDF vectorizer loaded successfully
