# Loading the dataset

In [2]:
import pandas as pd

# Read the tagged corpus and keep only the rows whose `dataset` tag is TRAIN or VAL.
# Both subsets stay in `df`; the later split cell separates them again.
# NOTE: the "Train Dataset" figures printed below therefore include the VAL rows.
df = (
    pd.read_csv('/content/full_research_data_tagged.csv')
      .loc[lambda frame: frame['dataset'].isin(['TRAIN', 'VAL'])]
)
print('Train Dataset Size: ', len(df))

display(df.head())
print('Train Dataset Label Distribution:\n', df['final_label'].value_counts())

Train Dataset Size:  34421


Unnamed: 0,comment_id,self_text,subreddit,created_date,post_id,author_name,source,final_label,dataset,manually_tagged
0,lpxz7ep,-1,worldnews,10/02/2024,1ftxy96,MrSynckt,Israel-Palestine Comments,Undefined,TRAIN,1
1,l8w3z61,100%,worldnews,16/06/2024,1dhby42,go3dprintyourself,Israel-Palestine Comments,Undefined,TRAIN,1
2,kc6vglc,100,IsraelPalestine,12/06/2023,18bodap,ShesARedhead82,Israel-Palestine Comments,Undefined,TRAIN,1
3,k8z38yb,1948,IsraelPalestine,11/12/2023,17t9e0f,Theloneliestmonk222,Israel-Palestine Comments,Undefined,TRAIN,1
4,l8hu7j0,"Hawkeye: War isnâ€™t Hell. War is war, and...",CombatFootage,13/06/2024,1df5t14,Not_an_alt_69_420,Israel-Palestine Comments,Undefined,TRAIN,1


Train Dataset Label Distribution:
 final_label
Undefined        24388
Pro-Palestine     5711
Pro-Israel        4322
Name: count, dtype: int64


# Fine-tuning DistilBERT

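The best model was trained using the configuration described below and reached val_loss=0.2176 on the 3rd epoch. (Note: this figure does not match the run logged in the training-loop output below, where validation loss bottoms out at 0.5532 on epoch 2 and the last checkpoint is saved on epoch 7 with val F1 = 0.840.)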

Train-validation split

In [3]:
# -------------------------------------------------
# 1.  Partition the frame using the pre-assigned `dataset` tag
# -------------------------------------------------
train_df = df.loc[df["dataset"].eq("TRAIN")]
val_df   = df.loc[df["dataset"].eq("VAL")]

# -------------------------------------------------
# 2.  Pull out texts and labels as plain Python lists
#     (missing texts become the empty string)
# -------------------------------------------------
train_texts, val_texts = (
    part["self_text"].fillna("").tolist() for part in (train_df, val_df)
)
# String labels at this point; the label-encoding cell replaces them with integer ids.
train_labels, val_labels = (
    part["final_label"].tolist() for part in (train_df, val_df)
)

Convert labels to numerical format

In [4]:
from sklearn.preprocessing import LabelEncoder
import numpy as np, pandas as pd
from collections import Counter
import torch

# Predefined label mapping
LABEL2ID = {
    "Pro-Palestine": 0,
    "Pro-Israel"  : 1,
    "Undefined"   : 2,
}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

# ────────────────────────────────────────────────────────────────
# 2️⃣  Encode the column that already holds the strings
# ────────────────────────────────────────────────────────────────

# `label_encoder` is kept because later cells read `label_encoder.classes_`
# (e.g. to size the classification head). Its `classes_` is set by hand in
# LABEL2ID order, which is NOT the sorted order a fitted LabelEncoder would have,
# so the actual string → id conversion below goes through LABEL2ID explicitly
# instead of relying on how `LabelEncoder.transform` treats unsorted classes.
label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(list(LABEL2ID.keys()))      # lock the desired order


def _encode_labels(labels):
    """Map a Series of label strings to an int64 array of ids using LABEL2ID.

    Raises:
        ValueError: if the Series contains a value (including NaN) that is not
            a key of LABEL2ID.
    """
    unknown = set(labels.unique()) - set(LABEL2ID)
    if unknown:
        raise ValueError(
            f"y contains previously unseen labels: {sorted(map(str, unknown))}"
        )
    return labels.map(LABEL2ID).to_numpy(dtype=np.int64)


# train_df / val_df already contain a 'final_label' column with the strings
y_train = _encode_labels(train_df["final_label"])
y_val   = _encode_labels(val_df["final_label"])
train_labels = y_train.tolist()
val_labels   = y_val.tolist()

# ────────────────────────────────────────────────────────────────
# 3️⃣  (Optional but recommended) quick sanity checks
# ────────────────────────────────────────────────────────────────
print("LabelEncoder classes_: ", label_encoder.classes_)
print("Sample mapping:")
for i in range(len(label_encoder.classes_)):
    print(f"  id {i}  ⇔  '{ID2LABEL[i]}'")

# class distribution
class_counts = Counter(y_train)
print("\nTrain class counts:", class_counts)

# Inverse-frequency weights need every class to appear at least once in TRAIN;
# fail with a readable message instead of a bare ZeroDivisionError.
missing_classes = [ID2LABEL[i] for i in range(len(ID2LABEL)) if class_counts[i] == 0]
if missing_classes:
    raise ValueError(
        f"Cannot compute class weights: no training rows for {missing_classes}"
    )

# class-weights in label-id order (clip later if desired)
class_weights = torch.tensor([1 / class_counts[i] for i in range(len(ID2LABEL))])
class_weights *= len(class_weights) / class_weights.sum()   # mean ≈ 1
print("Class weights:", class_weights)

for i in range(len(label_encoder.classes_)):
    print(f"id {i:>2} | label {ID2LABEL[i]:13} | count {class_counts[i]:6} | weight {class_weights[i]:.3f}")


# Detect GPU or use CPU as fallback
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class_weights = class_weights.to(device)
print(f"Using device: {device}")

LabelEncoder classes_:  ['Pro-Palestine' 'Pro-Israel' 'Undefined']
Sample mapping:
  id 0  ⇔  'Pro-Palestine'
  id 1  ⇔  'Pro-Israel'
  id 2  ⇔  'Undefined'

Train class counts: Counter({np.int64(2): 21363, np.int64(0): 4962, np.int64(1): 3815})
Class weights: tensor([1.1844, 1.5405, 0.2751])
id  0 | label Pro-Palestine | count   4962 | weight 1.184
id  1 | label Pro-Israel    | count   3815 | weight 1.540
id  2 | label Undefined     | count  21363 | weight 0.275
Using device: cuda


Tokenize Input

In [5]:
from transformers import DistilBertTokenizer

# Cased checkpoint: the comments are not lower-cased, so casing is kept to
# better catch entities and tone.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


def _tokenize(texts):
    """Tokenize a list of strings with truncation and padding, capped at 512 tokens."""
    return tokenizer(texts, truncation=True, padding=True, max_length=512)


train_encodings = _tokenize(train_texts)
val_encodings = _tokenize(val_texts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Convert tokenized inputs into PyTorch Dataset

In [6]:
import torch

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            collection (whatever the tokenizer returned).
        labels: per-example integer labels; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of int64 tensors, including a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx], dtype=torch.long)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap the tokenized TRAIN / VAL splits together with their labels so they can be fed to a DataLoader.
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)


Load DistilBERT and modify it for classification

In [7]:
from transformers import DistilBertForSequenceClassification
import os

# Filepath to save the best model
BEST_MODEL_PATH = "/content/distilbert-finetuned"

# When True and BEST_MODEL_PATH exists, resume from that checkpoint instead of
# starting from the base weights.
load_existing_weights = True

if load_existing_weights and os.path.exists(BEST_MODEL_PATH):
    # Resume: take both the model and its tokenizer from the saved checkpoint.
    print(f"Loading model from {BEST_MODEL_PATH}...")
    model = DistilBertForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
    tokenizer = DistilBertTokenizer.from_pretrained(BEST_MODEL_PATH)
    if model.config.num_labels != len(label_encoder.classes_):
        raise ValueError(
            f"Checkpoint at {BEST_MODEL_PATH} has {model.config.num_labels} labels, "
            f"expected {len(label_encoder.classes_)}"
        )
else:
    print("No existing model found. Starting fresh.")
    # Fresh start: base encoder plus a newly initialised classification head.
    # The label names are stored in the config so saved checkpoints carry them.
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-cased',
        num_labels=len(label_encoder.classes_),
        id2label=ID2LABEL,
        label2id=LABEL2ID,
    )

# Move to the target device AFTER the branch above, so a model reloaded from the
# checkpoint does not stay on the CPU while the batches are sent to `device`.
model = model.to(device)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No existing model found. Starting fresh.


Ensure the model gives more importance to underrepresented classes by adjusting weights in the loss function

### Fine-Tune the DistilBERT Model

Set up training configurations

In [8]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from torch.nn import CrossEntropyLoss

FREEZE_N_TRANSFORMER_LAYERS = 2 # keep the bottom N layers frozen
BATCH_SIZE=32
EPOCHS=10
WARMUP_RATIO = 0.1 # 10 % of total steps
LR = 3e-5
WEIGHT_DECAY = 1e-2
PATIENCE = 2  # Stop after K epochs with no improvement
GRAD_CLIP_NORM = 1.0

# Create data loaders.
# Batches are later copied to `device` with non_blocking=True; pinning host memory
# when running on CUDA is what allows those copies to be asynchronous.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    pin_memory=(device.type == "cuda"),
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    pin_memory=(device.type == "cuda"),
)

# (1) Freeze bottom layers – do this **before** creating the optimizer
for layer in model.distilbert.transformer.layer[:FREEZE_N_TRANSFORMER_LAYERS]:
    for p in layer.parameters():
        p.requires_grad = False

# (2) AdamW + weight-decay, over the trainable parameters only
#     (the frozen layers never receive gradients, so they are left out).
optimizer = AdamW(
    params=[p for p in model.parameters() if p.requires_grad],
    lr=LR,
    weight_decay=WEIGHT_DECAY
)

# (3) Cosine schedule with warm-up; the scheduler is stepped once per batch,
#     so the step budget is epochs × batches-per-epoch.
num_training_steps = EPOCHS * len(train_loader)
num_warmup_steps   = int(WARMUP_RATIO * num_training_steps)

lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

# (4) Class-weighted loss; weights are capped at 5 to avoid extreme values
#     (optional but often helpful)
loss_fn = CrossEntropyLoss(
    weight=torch.clamp(class_weights, max=5.).to(device)
)

# Ensure all tensors on the correct device
def move_to_device(batch, device):
    """Return a new dict with every value of `batch` placed on `device`.

    Tensor values are transferred with ``non_blocking=True``; any other value is
    converted into a new int64 tensor created directly on `device`.
    """
    moved = {}
    for name, value in batch.items():
        if isinstance(value, torch.Tensor):
            moved[name] = value.to(device, non_blocking=True)
        else:
            moved[name] = torch.tensor(value, dtype=torch.long, device=device)
    return moved

Training loop

In [9]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

# ──────────────────────────────────────────────────────────────────────────
#   Training loop with metric tracking & early-stopping (by best val F1 score, not loss)
#
#   Notes:
#   * The labels are removed from the batch before the forward pass so the model
#     does not compute its own (unweighted) loss; only the class-weighted
#     `loss_fn` is used.
#   * `best_val_f1` starts at 0.0 on every run, so when resuming from a saved
#     checkpoint the first epoch overwrites it regardless of the earlier score.
#   * When the loop ends, `model` holds the LAST epoch's weights; the best ones
#     are those saved at BEST_MODEL_PATH.
# ──────────────────────────────────────────────────────────────────────────
best_val_f1     = 0.0
no_improvement  = 0

history = {"train_loss": [], "val_loss": [], "val_acc": [], "val_f1": []}

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")

    # ── TRAIN ────────────────────────────────────────────────────────────
    model.train()
    running_train_loss = 0.0

    train_bar = tqdm(train_loader, desc="Training", leave=False)
    for batch in train_bar:
        optimizer.zero_grad()

        batch = move_to_device(batch, device)
        targets = batch.pop("labels")      # keep labels out of the forward pass
        outputs = model(**batch)
        loss    = loss_fn(outputs.logits, targets)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
        optimizer.step()
        lr_scheduler.step()                # scheduler advances once per batch

        running_train_loss += loss.item()
        train_bar.set_postfix(loss=f"{loss.item():.4f}"[:6])

    # Mean of the per-batch losses
    epoch_train_loss = running_train_loss / len(train_loader)
    history["train_loss"].append(epoch_train_loss)

    # ── VALIDATE ─────────────────────────────────────────────────────────
    model.eval()
    running_val_loss = 0.0
    all_preds, all_labels = [], []

    val_bar = tqdm(val_loader, desc="Validation", leave=False)
    with torch.no_grad():
        for batch in val_bar:
            batch = move_to_device(batch, device)
            targets = batch.pop("labels")
            outputs = model(**batch)
            loss    = loss_fn(outputs.logits, targets)
            running_val_loss += loss.item()

            preds  = outputs.logits.argmax(dim=1).cpu().numpy()
            labels = targets.cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)

            val_bar.set_postfix(loss=f"{loss.item():.4f}"[:6])

    epoch_val_loss = running_val_loss / len(val_loader)
    epoch_val_acc  = accuracy_score(all_labels, all_preds)
    # Support-weighted F1: the majority class dominates this figure.
    epoch_val_f1   = f1_score(all_labels, all_preds, average="weighted")

    history["val_loss"].append(epoch_val_loss)
    history["val_acc"].append(epoch_val_acc)
    history["val_f1"].append(epoch_val_f1)

    print(f"  Train loss {epoch_train_loss:.4f} | "
          f"Val loss {epoch_val_loss:.4f} | "
          f"Val acc {epoch_val_acc:.3f} | "
          f"Val F1 {epoch_val_f1:.3f}")

    # ── EARLY-STOPPING & CHECKPOINT ──────────────────────────────────────
    # Only a strict improvement in val F1 counts; ties increase the patience counter.
    if epoch_val_f1 > best_val_f1:
        best_val_f1 = epoch_val_f1
        no_improvement = 0
        print(f"  🔸 New best model → saving to {BEST_MODEL_PATH}")
        model.save_pretrained(BEST_MODEL_PATH)
        tokenizer.save_pretrained(BEST_MODEL_PATH)
    else:
        no_improvement += 1
        if no_improvement >= PATIENCE:
            print("  ⏹ Early stopping – no improvement in "
                  f"{PATIENCE} consecutive epochs.")
            break


Epoch 1/10




  Train loss 0.7865 | Val loss 0.6352 | Val acc 0.753 | Val F1 0.774
  🔸 New best model → saving to /content/distilbert-finetuned

Epoch 2/10




  Train loss 0.5524 | Val loss 0.5532 | Val acc 0.780 | Val F1 0.799
  🔸 New best model → saving to /content/distilbert-finetuned

Epoch 3/10




  Train loss 0.3968 | Val loss 0.6219 | Val acc 0.821 | Val F1 0.832
  🔸 New best model → saving to /content/distilbert-finetuned

Epoch 4/10




  Train loss 0.2621 | Val loss 0.7554 | Val acc 0.823 | Val F1 0.830

Epoch 5/10




  Train loss 0.1682 | Val loss 0.9648 | Val acc 0.834 | Val F1 0.838
  🔸 New best model → saving to /content/distilbert-finetuned

Epoch 6/10




  Train loss 0.1077 | Val loss 1.2758 | Val acc 0.833 | Val F1 0.838

Epoch 7/10




  Train loss 0.0695 | Val loss 1.5728 | Val acc 0.841 | Val F1 0.840
  🔸 New best model → saving to /content/distilbert-finetuned

Epoch 8/10




  Train loss 0.0389 | Val loss 1.7516 | Val acc 0.841 | Val F1 0.840

Epoch 9/10




  Train loss 0.0272 | Val loss 1.8224 | Val acc 0.840 | Val F1 0.840

Epoch 10/10


                                                                          

  Train loss 0.0203 | Val loss 1.8290 | Val acc 0.840 | Val F1 0.840




# Fitting a TF-IDF Representation

In [None]:
!pip install scikit-learn==1.3.2  # Lower version for compatibility with local code

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import joblib

# Paths for corpus and model
corpus_path = '/content/full_research_data_tagged.csv'
model_path = '/content/tfidf_vectorizer.pkl'

# Step 1: Load the corpus.
# The file is read with pandas' default encoding, exactly as in the first cell of
# this notebook, so the vectorizer is fitted on the same decoded text. This cell
# reads the CSV itself and does not depend on `df` from earlier cells.
try:
    corpus = pd.read_csv(corpus_path)
    corpus = corpus[corpus['dataset'] == 'TRAIN']  # fit on the TRAIN subset only
    # The vectorizer needs string documents: replace missing texts with "" and
    # coerce any non-string cell to str.
    corpus = corpus['self_text'].fillna("").astype(str)
    print(f"Corpus loaded successfully from {corpus_path}")
except Exception as e:
    raise ValueError(f"Failed to load the corpus: {e}") from e

# Step 2: Initialize and fit the TF-IDF vectorizer (same dim as DistilBERT's embedding dim)
tfidf_vectorizer = TfidfVectorizer(max_features=768, stop_words='english')
try:
    tfidf_vectorizer.fit(corpus)
    print("TF-IDF vectorizer fitted successfully")
except Exception as e:
    raise ValueError(f"Error during vectorizer fitting: {e}") from e

# Step 3: Save the fitted TF-IDF vectorizer
try:
    joblib.dump(tfidf_vectorizer, model_path)
    print(f"TF-IDF vectorizer saved to {model_path}")
except Exception as e:
    raise ValueError(f"Failed to save the TF-IDF vectorizer: {e}") from e