# mDeBERTa for polarization classification

---



## Introduction

In this part of the starter notebook, we walk you through the process for all three subtasks.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

## Subtask 1 - Polarization detection

This is a binary classification task: determine whether a post contains polarized content (Polarized or Not Polarized).

In [None]:
!unzip /content/drive/MyDrive/dev_phase.zip

## Imports

In [None]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [None]:
!pip install wandb

In [None]:
import wandb

# Disable wandb logging for this script by initialising the run in
# "disabled" mode before any training starts.
wandb.init(mode="disabled")

Using the mDeBERTa model (`microsoft/mdeberta-v3-base`) for classification


In [None]:

drive.mount('/content/drive')
import torch
torch.cuda.empty_cache()
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset


# Dataset class

class PolarizationDataset(Dataset):
    """Torch dataset that tokenizes one post per item for polarization classification.

    Args:
        df: DataFrame with a ``text`` column (missing values become ``""``)
            and, when ``require_labels`` is true, a ``polarization`` column
            whose values can be cast to ``int``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")``. It
            must return a mapping whose values support ``.squeeze(0)``.
        require_labels: When false, every item receives the dummy label ``0``
            so unlabeled data can flow through the same pipeline.
        max_length: Length each sequence is truncated/padded to. Defaults to
            128, the value that used to be hard-coded.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """

    def __init__(self, df, tokenizer, require_labels: bool = True, max_length: int = 128):
        self.texts = df["text"].fillna("").tolist()
        if require_labels:
            self.labels = df["polarization"].astype(int).tolist()
        else:
            self.labels = [0] * len(self.texts)  # dummy labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of posts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize post ``idx`` and return its tensors plus a ``labels`` entry."""
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" yields a leading batch axis of size 1; drop it so
        # the collator can stack individual items into a batch.
        item = {key: value.squeeze(0) for key, value in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Load data

# Language codes to process; each one needs subtask1/train/<code>.csv and
# subtask1/dev/<code>.csv on disk.
languages = ["eng","hin","spa","urd","zho","arb"]
# Maps language code -> {"train": DataFrame, "dev": DataFrame}.
data = {}

for lang in languages:
    train_df = pd.read_csv(f"subtask1/train/{lang}.csv")   # labeled
    dev_df   = pd.read_csv(f"subtask1/dev/{lang}.csv")     # unlabeled
    data[lang] = {"train": train_df, "dev": dev_df}

# Tokenizer for the "microsoft/mdeberta-v3-base" checkpoint, loaded once.
tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")


# Metric

def compute_metrics(p):
    """Return the macro-averaged F1 score for an evaluation result.

    ``p`` must expose ``predictions`` (per-class scores, reduced with an
    argmax over axis 1) and ``label_ids`` (the reference labels).
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(p.label_ids, predicted_classes, average="macro")
    return {"f1_macro": macro_f1}


# MAIN LOOP: TRAIN/VAL SPLIT + DEV PREDICTION

import gc

# For each language: hold out a stratified 20% of the labeled training rows
# for validation, fine-tune a fresh classifier, record validation macro-F1,
# then predict labels for the unlabeled dev set.
f1_results = []         # one {"language", "f1_macro"} dict per language
predicted_outputs = {}  # language code -> dev DataFrame with predictions added

for lang, dfs in data.items():
    print("\n====================================")
    print(f"LANGUAGE: {lang}")

    # Drop the previous language's model/trainer before loading a new one so
    # at most one fine-tuned model is alive at a time. Doing this at the top
    # of the loop keeps the last language's objects bound after the loop.
    model = None
    trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    train_df = dfs["train"]
    dev_df   = dfs["dev"]

    # Filter ONLY labeled training rows
    train_labeled = train_df.dropna(subset=["polarization"]).reset_index(drop=True)

    # Split train into train/validation (stratified so both parts keep the
    # same class balance; fixed seed for reproducibility)
    train_split, val_split = train_test_split(
        train_labeled,
        test_size=0.20,
        stratify=train_labeled["polarization"],
        random_state=42,
        shuffle=True,
    )

    print(f"Train size: {len(train_split)},  Validation size: {len(val_split)}")

    train_dataset = PolarizationDataset(train_split, tokenizer, require_labels=True)
    val_dataset   = PolarizationDataset(val_split,   tokenizer, require_labels=True)

    # Train model: a fresh binary classifier per language
    model = AutoModelForSequenceClassification.from_pretrained(
        "microsoft/mdeberta-v3-base", num_labels=2
    )

    training_args = TrainingArguments(
        output_dir=f"./model_{lang}",
        learning_rate=2e-5,
        num_train_epochs=30,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=20
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    trainer.train()

    # Compute F1 on validation
    metrics = trainer.evaluate()
    f1 = metrics["eval_f1_macro"]
    print(f"{lang} Validation F1 = {f1:.4f}")

    f1_results.append({"language": lang, "f1_macro": f1})

    # Predict on dev (UNLABELED). The emoji is written as an escape so the
    # source stays ASCII-safe; the original literal had been mis-encoded.
    print(f"\U0001F52E Predicting for dev set ({len(dev_df)} rows)...")
    dev_dataset = PolarizationDataset(dev_df, tokenizer, require_labels=False)
    preds = trainer.predict(dev_dataset)
    pred_labels = np.argmax(preds.predictions, axis=1)

    # Attach predictions in place; dev_df is the same object stored in `data`.
    dev_df["predicted_polarization"] = pred_labels
    predicted_outputs[lang] = dev_df


# SAVE PREDICTIONS

# Write each language's dev DataFrame (with its predicted_polarization column)
# to <lang>_dev_predicted.csv in the current working directory.
for lang, df_pred in predicted_outputs.items():
    df_pred.to_csv(f"{lang}_dev_predicted.csv", index=False)
    print(f"Saved: {lang}_dev_predicted.csv")


# FINAL F1 SCORES

# One row per language holding its validation macro-F1.
f1_df = pd.DataFrame(f1_results)
print("\nFINAL F1 SCORES:")
print(f1_df)
