In [1]:
# === Cell 1: imports, config, seeds ===
from pathlib import Path
import os, re, json, math, random, time
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Artifacts dir
ARTIFACTS = Path("./artifacts_stage1")
ARTIFACTS.mkdir(parents=True, exist_ok=True)

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

PyTorch: 2.5.1+cu121
CUDA available: True


In [2]:
# === Cell 2: paths and helpers ===
from pathlib import Path
import os, sys

# Repo-friendly path handling:
# - default expects datasets under the repo root: ./CrisisMMD_v2.0 and ./BRIGHT
# - you can override with environment variables:
#     CRISISMMD_ROOT=/path/to/CrisisMMD_v2.0
#     BRIGHT_ROOT=/path/to/BRIGHT
#
# Tip (Windows PowerShell):
#   setx CRISISMMD_ROOT "C:\\path\\to\\CrisisMMD_v2.0"

# Import shared config (repo_root + env var support)
sys.path.append(str(Path.cwd().resolve().parent if Path.cwd().name.lower()=="notebooks" else Path.cwd().resolve()))
from src.config import CRISISMMD_ROOT as DATA_ROOT

SPLIT_DIR = DATA_ROOT / "crisismmd_datasplit_all" / "crisismmd_datasplit_all"
IMG_ROOT  = DATA_ROOT / "data_image"

print("DATA_ROOT:", DATA_ROOT, "| exists:", DATA_ROOT.exists())
print("SPLIT_DIR:", SPLIT_DIR, "| exists:", SPLIT_DIR.exists())
print("IMG_ROOT :", IMG_ROOT,  "| exists:", IMG_ROOT.exists())

def resolve_image_path(relpath: str) -> Path | None:
    """Resolve an 'image' field to an absolute path under DATA_ROOT/data_image."""
    if not isinstance(relpath, str) or relpath.strip() == "":
        return None
    p = Path(relpath)
    if p.is_absolute():
        return p
    s = relpath.replace("\\", "/").lstrip("./")
    if s.lower().startswith("data_image/"):
        return (DATA_ROOT / s).resolve()
    return (IMG_ROOT / s).resolve()


DATA_ROOT exists: True
SPLIT_DIR exists: True
IMG_ROOT exists: True


In [3]:
# === Cell 3: robust TSV reader (handles quotes/encodings) ===
import csv

def read_tsv_robust(path: Path) -> pd.DataFrame:
    """Tolerant TSV reader for CrisisMMD splits."""
    try:
        return pd.read_csv(path, sep="\t", engine="python",
                           quoting=csv.QUOTE_NONE, escapechar="\\",
                           on_bad_lines="skip", encoding="utf-8", encoding_errors="ignore")
    except Exception:
        return pd.read_csv(path, sep="\t", engine="python",
                           on_bad_lines="skip", encoding="latin-1")

def load_task(prefix: str) -> pd.DataFrame:
    """Load train/dev/test for a task prefix: task_informative|task_humanitarian|task_damage."""
    frames = []
    for split in ("train", "dev", "test"):
        p = SPLIT_DIR / f"{prefix}_text_img_{split}.tsv"
        if not p.exists():
            print("Missing:", p)
            continue
        df = read_tsv_robust(p)
        df["__split"] = split
        df["__source"] = str(p)
        frames.append(df)
    if not frames:
        return pd.DataFrame()
    df = pd.concat(frames, ignore_index=True, sort=False)
    # standardize column names we care about
    rename = {}
    for c in df.columns:
        cl = c.strip().lower()
        if cl == "event_name": rename[c] = "event_name"
        if cl == "tweet_id":   rename[c] = "tweet_id"
        if cl == "image_id":   rename[c] = "image_id"
        if cl == "tweet_text": rename[c] = "tweet_text"
        if cl == "image":      rename[c] = "image"
        if cl == "label":      rename[c] = "label"
        if cl == "label_text": rename[c] = "label_text"
        if cl == "label_image":rename[c] = "label_image"
        if cl == "label_text_image": rename[c] = "label_text_image"
    return df.rename(columns=rename)

In [4]:
# === Cell 4: load splits ===
df_info = load_task("task_informative")
df_hum  = load_task("task_humanitarian")
df_dmg  = load_task("task_damage")

print("Rows â€” info:", len(df_info), "| hum:", len(df_hum), "| dmg:", len(df_dmg))
print("Info columns:", list(df_info.columns))
print("Hum columns:", list(df_hum.columns))
print("Dmg columns:", list(df_dmg.columns))

Rows â€” info: 18082 | hum: 18082 | dmg: 3526
Info columns: ['event_name', 'tweet_id', 'image_id', 'tweet_text', 'image', 'label', 'label_text', 'label_image', 'label_text_image', '__split', '__source']
Hum columns: ['event_name', 'tweet_id', 'image_id', 'tweet_text', 'image', 'label', 'label_text', 'label_image', 'label_text_image', '__split', '__source']
Dmg columns: ['event_name', 'tweet_id', 'image_id', 'tweet_text', 'image', 'label', '__split', '__source']


In [5]:
# === Cell 5: text cleaner and label normalizers (v2.0) ===

def norm_text(s):
    if not isinstance(s, str): return s
    s = s.replace("\r"," ").replace("\n"," ").strip()
    s = re.sub(r"http\S+|www\.\S+", " <URL> ", s)
    s = re.sub(r"@\w+", " <USER> ", s)
    s = re.sub(r"#(\w+)", r" \1 ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Maps per README
HUM_MAP = {
    "affected individuals": "affected_individuals",
    "infrastructure and utility damage": "infrastructure_and_utility_damage",
    "injured or dead people": "injured_or_dead_people",
    "missing or found people": "missing_or_found_people",
    "rescue, volunteering or donation effort": "rescue_volunteering_or_donation_effort",
    "vehicle damage": "vehicle_damage",
    "other relevant information": "other_relevant_information",
    "not humanitarian": "not_humanitarian",
    "not relevant or can't judge": "not_humanitarian",  # v2.0 change
}

DMG_MAP = {
    # severe
    "severe_damage": "severe", "severe damage": "severe", "severe": "severe",
    # mild-ish collapsed
    "mild_damage": "mild", "mild damage": "mild", "mild": "mild",
    "moderate_damage": "mild", "moderate damage": "mild",
    "minor_damage": "mild", "minor damage": "mild",
    # little/none
    "little_or_no_damage": "little_or_none", "little or no damage": "little_or_none",
    "little-to-no damage": "little_or_none", "little-to-no": "little_or_none",
    "little/no": "little_or_none", "no_damage": "little_or_none",
    "no damage": "little_or_none", "none": "little_or_none", "low_damage": "little_or_none",
}

def n_info(x):
    if pd.isna(x): return None
    s = str(x).strip().lower()
    if s in {"informative","info","yes","1","true"}: return "informative"
    if s in {"not informative","not_informative","not-informative","no","0","false"}: return "not_informative"
    return s

def n_hum(x):
    if pd.isna(x): return None
    s = str(x).strip().lower()
    if s in HUM_MAP: return HUM_MAP[s]
    for k,v in HUM_MAP.items():
        if k in s: return v
    if "not" in s and "humanitarian" in s: return "not_humanitarian"
    return s.replace(" ", "_")

def n_dmg(x):
    if pd.isna(x): return None
    s = str(x).strip().lower().replace("-", "_")
    return DMG_MAP.get(s, DMG_MAP.get(s.replace("_", " "), None))

# Clean text
for d in (df_info, df_hum, df_dmg):
    if "tweet_text" in d:
        d["text_clean"] = d["tweet_text"].map(norm_text)

In [6]:
# === Cell 6: canonical datasets + v2.0 alignment (Task1â†”Task2) ===

# Task 1 (informative)
info = df_info.copy()
for col in ("label","label_text","label_image"):
    if col in info: info[col] = info[col].map(n_info)
info["informativeness_label"] = info.get("label_text").fillna(info.get("label_image")).fillna(info.get("label"))
info = info[["event_name","tweet_id","image_id","text_clean","image","informativeness_label","__split"]]

# Task 2 (humanitarian)
hum = df_hum.copy()
for col in ("label","label_text","label_image"):
    if col in hum: hum[col] = hum[col].map(n_hum)
hum["humanitarian_label"] = hum.get("label_text").fillna(hum.get("label_image")).fillna(hum.get("label"))
hum = hum[["event_name","tweet_id","image_id","text_clean","image","humanitarian_label","__split"]]

# Merge to enforce v2.0 alignment: not_informative <-> not_humanitarian
merged = pd.merge(info, hum, on=["tweet_id","image_id"], how="outer", suffixes=("_info","_hum"))
mask_not_info = merged["informativeness_label"].eq("not_informative")
merged.loc[mask_not_info, "humanitarian_label"] = "not_humanitarian"
mask_not_hum = merged["humanitarian_label"].eq("not_humanitarian")
merged.loc[mask_not_hum, "informativeness_label"] = "not_informative"
merged["split"] = merged["__split_info"].fillna(merged["__split_hum"])

# Task 3 (damage)
dmg = df_dmg.copy()
raw_col = "label_image" if "label_image" in dmg.columns and dmg["label_image"].notna().any() else "label"
dmg["damage_severity_label"] = dmg[raw_col].map(n_dmg)
valid = {"little_or_none","mild","severe"}
dmg = dmg[dmg["damage_severity_label"].isin(valid)].copy()
dmg = dmg[["event_name","tweet_id","image_id","text_clean","image","damage_severity_label","__split"]]
dmg.rename(columns={"__split":"split"}, inplace=True)

print("Task1 counts:", info["informativeness_label"].value_counts(dropna=False).to_dict())
print("Task2 counts:", hum["humanitarian_label"].value_counts(dropna=False).to_dict())
print("Task3 counts:", dmg["damage_severity_label"].value_counts(dropna=False).to_dict())

Task1 counts: {'informative': 12862, 'not_informative': 5220}
Task2 counts: {'other_relevant_information': 6505, 'not_humanitarian': 5220, 'rescue_volunteering_or_donation_effort': 3774, 'infrastructure_and_utility_damage': 1430, 'injured_or_dead_people': 533, 'affected_individuals': 518, 'vehicle_damage': 61, 'missing_or_found_people': 41}
Task3 counts: {'severe': 2212, 'mild': 839, 'little_or_none': 475}


In [7]:
# === Cell 7: sanity checks ===
def show_basic(name, d, ycol, split_col="__split"):
    print(f"\n{name}: rows={len(d)}")
    if split_col in d:
        print("Splits:", d[split_col].value_counts().to_dict())
    print("Label counts:", d[ycol].value_counts(dropna=False).to_dict())

show_basic("Task1", info, "informativeness_label", "__split")
show_basic("Task2", hum, "humanitarian_label", "__split")
show_basic("Task3", dmg, "damage_severity_label", "split")

# Alignment check
a = merged.dropna(subset=["informativeness_label","humanitarian_label"])
bad1 = a[(a["informativeness_label"]=="not_informative") & (a["humanitarian_label"]!="not_humanitarian")]
bad2 = a[(a["humanitarian_label"]=="not_humanitarian") & (a["informativeness_label"]!="not_informative")]
print("\nAlignment violations â€” not_info->not_hum:", len(bad1), "| not_hum->not_info:", len(bad2))

# Image existence quick check
def exist_ratio(df, col="image", split_col="split"):
    paths = [resolve_image_path(v) for v in df[col].dropna().astype(str)]
    ok = sum(1 for p in paths if p and p.exists())
    print(f"Images existing: {ok}/{len(paths)}")

info_out = info.dropna(subset=["informativeness_label"]).rename(columns={"__split":"split"})
hum_out  = hum.dropna(subset=["humanitarian_label"]).rename(columns={"__split":"split"})

print("\nImages â€” info:"); exist_ratio(info_out)
print("Images â€” hum:");   exist_ratio(hum_out)
print("Images â€” dmg:");   exist_ratio(dmg)


Task1: rows=18082
Splits: {'train': 13608, 'dev': 2237, 'test': 2237}
Label counts: {'informative': 12862, 'not_informative': 5220}

Task2: rows=18082
Splits: {'train': 13608, 'dev': 2237, 'test': 2237}
Label counts: {'other_relevant_information': 6505, 'not_humanitarian': 5220, 'rescue_volunteering_or_donation_effort': 3774, 'infrastructure_and_utility_damage': 1430, 'injured_or_dead_people': 533, 'affected_individuals': 518, 'vehicle_damage': 61, 'missing_or_found_people': 41}

Task3: rows=3526
Splits: {'train': 2468, 'dev': 529, 'test': 529}
Label counts: {'severe': 2212, 'mild': 839, 'little_or_none': 475}

Alignment violations â€” not_info->not_hum: 0 | not_hum->not_info: 0

Images â€” info:
Images existing: 18082/18082
Images â€” hum:
Images existing: 18082/18082
Images â€” dmg:
Images existing: 3526/3526


In [8]:
# === Cell 8: save clean CSVs ===
info_out.to_csv(ARTIFACTS/"task1_informative_clean.csv", index=False)
hum_out.to_csv(ARTIFACTS/"task2_humanitarian_clean.csv", index=False)
dmg.to_csv(ARTIFACTS/"task3_damage_clean.csv", index=False)
merged[["event_name_info","tweet_id","image_id","text_clean_info","image_info",
        "informativeness_label","humanitarian_label","split"]].rename(
    columns={"event_name_info":"event_name","text_clean_info":"text_clean","image_info":"image"}
).to_csv(ARTIFACTS/"task12_merged_aligned.csv", index=False)

print("Saved to:", ARTIFACTS)

Saved to: artifacts_stage1


In [9]:
# === Cell 9: Task1 BERTweet binary ===
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate

df = pd.read_csv(ARTIFACTS/"task1_informative_clean.csv")
train_df = df[df["split"]=="train"].copy()
dev_df   = df[df["split"]=="dev"].copy()
test_df1 = df[df["split"]=="test"].copy()

label2id = {"not_informative":0, "informative":1}
id2label = {v:k for k,v in label2id.items()}

def to_hfds(d):
    dd = d[["text_clean","informativeness_label"]].rename(columns={"text_clean":"text","informativeness_label":"label"}).copy()
    dd["label"] = dd["label"].map(label2id).astype(int)
    return Dataset.from_pandas(dd, preserve_index=False)

ds_train, ds_dev, ds_test = to_hfds(train_df), to_hfds(dev_df), to_hfds(test_df1)

model_name = "vinai/bertweet-base"
tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tok_fn(batch):
    return tok(batch["text"], truncation=True, padding="max_length", max_length=128)

ds_train = ds_train.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_dev   = ds_dev.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_test  = ds_test.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

acc = evaluate.load("accuracy"); f1 = evaluate.load("f1"); prec = evaluate.load("precision"); rec = evaluate.load("recall")
def metrics(p):
    preds = p.predictions.argmax(1); y = p.label_ids
    return {
        "accuracy": acc.compute(predictions=preds, references=y)["accuracy"],
        "f1": f1.compute(predictions=preds, references=y, average="macro")["f1"],
        "precision": prec.compute(predictions=preds, references=y, average="macro")["precision"],
        "recall": rec.compute(predictions=preds, references=y, average="macro")["recall"],
    }

args = TrainingArguments(
    output_dir=str(ARTIFACTS/"bertweet_task1"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=ds_train, eval_dataset=ds_dev,
    tokenizer=tok, compute_metrics=metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
trainer.train()
print("Task1 test:", trainer.evaluate(ds_test))

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Map:   0%|          | 0/13608 [00:00<?, ? examples/s]

Map:   0%|          | 0/2237 [00:00<?, ? examples/s]

Map:   0%|          | 0/2237 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3293,0.396621,0.83013,0.781776,0.793123,0.772903
2,0.2765,0.463072,0.835494,0.783657,0.805089,0.769277
3,0.2425,0.54466,0.833706,0.778472,0.805734,0.761669


Task1 test: {'eval_loss': 0.43185955286026, 'eval_accuracy': 0.8444345105051408, 'eval_f1': 0.794465038177613, 'eval_precision': 0.8189214509204685, 'eval_recall': 0.7784198511166254, 'eval_runtime': 6.4699, 'eval_samples_per_second': 345.756, 'eval_steps_per_second': 10.819, 'epoch': 3.0}


In [10]:
# === Cell 10: Task2 BERTweet multiclass (class-weighted) ===
from collections import Counter
from transformers import Trainer
import evaluate
import torch

df = pd.read_csv(ARTIFACTS/"task2_humanitarian_clean.csv")
train_df = df[df["split"]=="train"].copy()
dev_df   = df[df["split"]=="dev"].copy()
test_df2 = df[df["split"]=="test"].copy()

classes = sorted(df["humanitarian_label"].dropna().unique().tolist())
label2id = {c:i for i,c in enumerate(classes)}
id2label = {i:c for c,i in label2id.items()}

def to_hfds(d):
    dd = d[["text_clean","humanitarian_label"]].rename(columns={"text_clean":"text","humanitarian_label":"label"}).copy()
    dd["label"] = dd["label"].map(label2id).astype(int)
    return Dataset.from_pandas(dd, preserve_index=False)

ds_train, ds_dev, ds_test = to_hfds(train_df), to_hfds(dev_df), to_hfds(test_df2)

tok = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=True)
def tok_fn(b): return tok(b["text"], truncation=True, padding="max_length", max_length=128)
ds_train = ds_train.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_dev   = ds_dev.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_test  = ds_test.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")

model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base", num_labels=len(classes), id2label=id2label, label2id=label2id
)

# class weights from TRAIN distribution
cnt = Counter(train_df["humanitarian_label"])
weights = torch.tensor([len(train_df)/max(1, cnt[id2label[i]]) for i in range(len(classes))], dtype=torch.float)
weights = weights / weights.mean()

from torch.nn import CrossEntropyLoss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=weights.to(logits.device), label_smoothing=0.05)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

acc = evaluate.load("accuracy"); f1 = evaluate.load("f1")
prec = evaluate.load("precision"); rec = evaluate.load("recall")
def metrics(p):
    preds = p.predictions.argmax(1); y = p.label_ids
    return {
        "accuracy": acc.compute(predictions=preds, references=y)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=y, average="macro")["f1"],
        "precision_macro": prec.compute(predictions=preds, references=y, average="macro")["precision"],
        "recall_macro": rec.compute(predictions=preds, references=y, average="macro")["recall"],
    }

args = TrainingArguments(
    output_dir=str(ARTIFACTS/"bertweet_task2"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",
)

trainer2 = WeightedTrainer(
    model=model, args=args,
    train_dataset=ds_train, eval_dataset=ds_dev,
    tokenizer=tok, compute_metrics=metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
trainer2.train()
print("Task2 test:", trainer2.evaluate(ds_test))

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Map:   0%|          | 0/13608 [00:00<?, ? examples/s]

Map:   0%|          | 0/2237 [00:00<?, ? examples/s]

Map:   0%|          | 0/2237 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,2.7729,2.592832,0.35181,0.426841,0.535589,0.542219
2,2.7326,2.549014,0.415735,0.468205,0.554016,0.61284
3,2.5106,2.612,0.43004,0.467679,0.542219,0.578036
4,2.5544,2.637512,0.419759,0.458099,0.528189,0.580386


Task2 test: {'eval_loss': 2.6465365886688232, 'eval_accuracy': 0.42378185069289226, 'eval_f1_macro': 0.48169811806435764, 'eval_precision_macro': 0.561627458425163, 'eval_recall_macro': 0.5875926975509332, 'eval_runtime': 6.6591, 'eval_samples_per_second': 335.932, 'eval_steps_per_second': 10.512, 'epoch': 4.0}


In [11]:
# === Task 2 (Humanitarian) â€” "Even More" cell ===
# - Model swap: CardiffNLP Twitter-RoBERTa or BERTweet
# - Denoised training: keep only rows with label_text_image == "Positive" (train split only)
# - Upsampling + FocalLoss + cosine warmup

# %pip install --quiet emoji==0.6.0  # recommended for BERTweet normalization (safe to skip for RoBERTa)

from pathlib import Path
import torch
from collections import Counter
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback,
                          get_cosine_schedule_with_warmup)
import evaluate
import torch.nn as nn
import torch.nn.functional as F
import csv

# ======= TOGGLES =======
MODEL_NAME     = "cardiffnlp/twitter-roberta-base"   # or: "vinai/bertweet-base"
DENOISE_TRAIN  = True                                # True = keep only label_text_image == "Positive" in TRAIN
TARGET_PER_CLASS = 3000                              # upsample target per class (cap)
MAX_LEN        = 160
EPOCHS         = 6
BATCH_TRAIN    = 16
BATCH_EVAL     = 32
LR             = 2e-5
WARMUP_RATIO   = 0.10
LABEL_SMOOTH   = 0.02
FOCAL_GAMMA    = 2.0
# =======================

ARTIFACTS = Path("./artifacts_stage1")
clean_csv = ARTIFACTS/"task2_humanitarian_clean.csv"
assert clean_csv.exists(), f"Missing {clean_csv}. Run the data prep cells first."

# --- Load clean split-aware CSV
df_clean = pd.read_csv(clean_csv)
train_df = df_clean[df_clean["split"]=="train"].copy()
dev_df   = df_clean[df_clean["split"]=="dev"].copy()
test_df  = df_clean[df_clean["split"]=="test"].copy()

# --- Optionally denoise TRAIN: join back to raw TSV to read label_text_image
def read_tsv_robust(path: Path) -> pd.DataFrame:
    try:
        return pd.read_csv(path, sep="\t", engine="python",
                           quoting=csv.QUOTE_NONE, escapechar="\\",
                           on_bad_lines="skip", encoding="utf-8", encoding_errors="ignore")
    except Exception:
        return pd.read_csv(path, sep="\t", engine="python", on_bad_lines="skip", encoding="latin-1")

def load_task_hum_raw(data_root: Path) -> pd.DataFrame:
    split_dir = data_root / "crisismmd_datasplit_all" / "crisismmd_datasplit_all"
    frames = []
    for split in ("train","dev","test"):
        p = split_dir / f"task_humanitarian_text_img_{split}.tsv"
        if not p.exists(): 
            continue
        df = read_tsv_robust(p)
        df["__split"] = split
        frames.append(df)
    if not frames:
        return pd.DataFrame()
    df = pd.concat(frames, ignore_index=True, sort=False)
    # Make sure the key columns exist with expected names
    df = df.rename(columns={
        "tweet_id":"tweet_id",
        "image_id":"image_id",
        "label_text_image":"label_text_image"
    })
    return df[["tweet_id","image_id","label_text_image","__split"]]

if DENOISE_TRAIN:
    # >>> UPDATE path if your dataset lives elsewhere <<<
    DATA_ROOT = Path(r"C:\JP_Notebooks\CSCE 5380\Project\CrisisMMD_v2.0")
    raw_hum = load_task_hum_raw(DATA_ROOT)
    assert len(raw_hum), "Could not load raw humanitarian TSVs to get label_text_image."
    # Merge only TRAIN rows
    train_df = train_df.merge(
        raw_hum[raw_hum["__split"]=="train"][["tweet_id","image_id","label_text_image"]],
        on=["tweet_id","image_id"], how="left"
    )
    before = len(train_df)
    train_df = train_df[ train_df["label_text_image"].astype(str).str.lower() == "positive" ].copy()
    after = len(train_df)
    print(f"Denoise: kept Positive agreement rows in TRAIN: {after}/{before}")
    train_df = train_df.drop(columns=["label_text_image"], errors="ignore")

# --- Label space
classes = sorted(df_clean["humanitarian_label"].dropna().unique().tolist())
label2id = {c:i for i,c in enumerate(classes)}
id2label = {i:c for c,i in label2id.items()}
print("Classes:", classes)

# --- Upsample TRAIN to balance classes
counts = train_df["humanitarian_label"].value_counts().to_dict()
max_c  = max(counts.values()) if counts else 0
target = min(max_c, TARGET_PER_CLASS) if max_c else TARGET_PER_CLASS
frames = []
rng = np.random.default_rng(42)
for cls, cnt in counts.items():
    part = train_df[train_df["humanitarian_label"]==cls]
    if cnt >= target:
        frames.append(part.sample(n=target, random_state=42))
    else:
        idx = rng.choice(part.index.values, size=target, replace=True)
        frames.append(part.loc[idx])
train_up = pd.concat(frames, ignore_index=True) if frames else train_df.copy()
print("Upsampled class counts:", train_up["humanitarian_label"].value_counts().to_dict())

# --- HF datasets
def to_hfds(d):
    dd = d[["text_clean","humanitarian_label"]].rename(columns={"text_clean":"text","humanitarian_label":"label"}).copy()
    dd["label"] = dd["label"].map(label2id).astype(int)
    return Dataset.from_pandas(dd, preserve_index=False)

ds_train, ds_dev, ds_test = to_hfds(train_up), to_hfds(dev_df), to_hfds(test_df)

# --- Tokenizer
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tok_fn(batch):
    return tok(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

ds_train = ds_train.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_dev   = ds_dev.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_test  = ds_test.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")

# --- Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(classes), id2label=id2label, label2id=label2id
)

# --- Class weights (from ORIGINAL, un-upsampled TRAIN)
cnt = Counter(df_clean[df_clean["split"]=="train"]["humanitarian_label"])
w_vec = torch.tensor([len(df_clean[df_clean["split"]=="train"])/max(1, cnt[id2label[i]]) for i in range(len(classes))], dtype=torch.float)
w_vec = w_vec / w_vec.mean()
print("Class weights:", {id2label[i]: float(w_vec[i]) for i in range(len(classes))})

# --- Focal Loss + tiny label smoothing
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, label_smoothing=0.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ls = label_smoothing
    def forward(self, logits, targets):
        if self.ls and self.ls > 0:
            n = logits.size(-1)
            smoothed = torch.full_like(logits, self.ls/(n-1))
            smoothed.scatter_(1, targets.unsqueeze(1), 1.0 - self.ls)
            log_probs = logits.log_softmax(dim=-1)
            ce = -(smoothed * log_probs).sum(dim=-1)
        else:
            ce = F.cross_entropy(logits, targets, weight=self.alpha, reduction="none")
        pt = torch.exp(-ce)
        loss = ((1-pt)**self.gamma) * ce
        return loss.mean()

criterion = FocalLoss(alpha=w_vec, gamma=FOCAL_GAMMA, label_smoothing=LABEL_SMOOTH)

# --- Metrics
acc = evaluate.load("accuracy")
f1  = evaluate.load("f1")
prec= evaluate.load("precision")
rec = evaluate.load("recall")
def metrics(p):
    preds = p.predictions.argmax(1); y = p.label_ids
    return {
        "accuracy": acc.compute(predictions=preds, references=y)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=y, average="macro")["f1"],
        "precision_macro": prec.compute(predictions=preds, references=y, average="macro")["precision"],
        "recall_macro": rec.compute(predictions=preds, references=y, average="macro")["recall"],
    }

# --- Training args (use new 'eval_strategy' on newer transformers, but 'evaluation_strategy' still works)
args = TrainingArguments(
    output_dir=str(ARTIFACTS/"task2_even_more"),
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=LR,
    warmup_ratio=WARMUP_RATIO,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none",
)

# --- Custom Trainer: focal loss + cosine schedule
class FocalTrainer(Trainer):
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        super().create_optimizer_and_scheduler(num_training_steps)
        self.lr_scheduler = get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(num_training_steps * args.warmup_ratio),
            num_training_steps=num_training_steps
        )
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits  = outputs.get("logits")
        loss = criterion(logits, labels.to(logits.device))
        return (loss, outputs) if return_outputs else loss

trainer = FocalTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_dev,
    tokenizer=tok,
    compute_metrics=metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()
print("Task2 (even more) â€” test:", trainer.evaluate(ds_test))

Denoise: kept Positive agreement rows in TRAIN: 6126/13608
Classes: ['affected_individuals', 'infrastructure_and_utility_damage', 'injured_or_dead_people', 'missing_or_found_people', 'not_humanitarian', 'other_relevant_information', 'rescue_volunteering_or_donation_effort', 'vehicle_damage']
Upsampled class counts: {'not_humanitarian': 3000, 'other_relevant_information': 3000, 'rescue_volunteering_or_donation_effort': 3000, 'infrastructure_and_utility_damage': 3000, 'affected_individuals': 3000, 'injured_or_dead_people': 3000, 'vehicle_damage': 3000, 'missing_or_found_people': 3000}




Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2237 [00:00<?, ? examples/s]

Map:   0%|          | 0/2237 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights: {'affected_individuals': 0.33023539185523987, 'infrastructure_and_utility_damage': 0.11507295072078705, 'injured_or_dead_people': 0.3226727545261383, 'missing_or_found_people': 4.372772216796875, 'not_humanitarian': 0.03194216638803482, 'other_relevant_information': 0.026325596496462822, 'rescue_volunteering_or_donation_effort': 0.04423103854060173, 'vehicle_damage': 2.7567477226257324}


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.0886,0.761319,0.666518,0.527821,0.534388,0.549219
2,0.0414,1.029746,0.660706,0.532834,0.566249,0.544579
3,0.0216,1.31431,0.66473,0.525351,0.543499,0.534102
4,0.0087,1.62559,0.642378,0.537486,0.556548,0.564011
5,0.0036,1.606519,0.658918,0.526813,0.536668,0.553559
6,0.0029,1.663839,0.644166,0.529466,0.550816,0.545366


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Task2 (even more) â€” test: {'eval_loss': 1.6846615076065063, 'eval_accuracy': 0.6267322306660706, 'eval_f1_macro': 0.5347430619056721, 'eval_precision_macro': 0.5504205660034962, 'eval_recall_macro': 0.5711941849520272, 'eval_runtime': 8.7782, 'eval_samples_per_second': 254.837, 'eval_steps_per_second': 7.974, 'epoch': 6.0}


In [12]:
# === Task 3 (Damage) â€” robust image resolver + ConvNeXt-Tiny + FocalLoss ===
import time, math
from collections import Counter
from PIL import Image, ImageOps
import torch, torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import timm
import torch.backends.cudnn as cudnn
from torch.amp import autocast, GradScaler
from sklearn.metrics import classification_report, confusion_matrix

# ---------- 0) Paths, load CSV ----------
ARTIFACTS = Path("./artifacts_stage1")
csv_path = ARTIFACTS / "task3_damage_clean.csv"
assert csv_path.exists(), f"Missing {csv_path}. Run the prep cells first."

# >>> UPDATE THIS if your dataset lives elsewhere <<<
DATA_ROOT = Path(r"C:\JP_Notebooks\CSCE 5380\Project\CrisisMMD_v2.0")
IMG_ROOT  = DATA_ROOT / "data_image"
print("DATA_ROOT exists:", DATA_ROOT.exists())
print("IMG_ROOT exists:", IMG_ROOT.exists())

df = pd.read_csv(csv_path)

# ---------- 1) Resolve image paths robustly ----------
def resolve_image_path(relpath: str) -> Path | None:
    if not isinstance(relpath, str) or relpath.strip() == "":
        return None
    s = relpath.replace("\\", "/").lstrip("./")
    p = Path(s)
    # absolute?
    if p.is_absolute():
        return p if p.exists() else None
    # starts with data_image/...
    if s.lower().startswith("data_image/"):
        cand = (DATA_ROOT / s).resolve()
    else:
        cand = (IMG_ROOT / s).resolve()
    return cand if cand.exists() else None

df["abs_image"] = df["image"].map(resolve_image_path)
missing = df["abs_image"].isna().sum()
if missing:
    print(f"Warning: dropping {missing} rows with missing image files.")
df = df[~df["abs_image"].isna()].reset_index(drop=True)

# Show a couple of examples
print("Sample resolved path:", df.loc[0, "abs_image"])

# ---------- 2) Splits and labels ----------
train_df = df[df["split"]=="train"].reset_index(drop=True)
dev_df   = df[df["split"]=="dev"].reset_index(drop=True)
test_df  = df[df["split"]=="test"].reset_index(drop=True)

label2id = {"little_or_none":0, "mild":1, "severe":2}
id2label = {v:k for k,v in label2id.items()}

print("Counts by split:", {"train": len(train_df), "dev": len(dev_df), "test": len(test_df)})
print("Label counts (train):", train_df["damage_severity_label"].value_counts().to_dict())

# ---------- 3) Dataset & transforms ----------
IMG_SIZE = 224
train_tf = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.85, 1.0)),
    T.RandomHorizontalFlip(),
    T.RandomApply([T.ColorJitter(0.2,0.2,0.2,0.05)], p=0.5),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])
eval_tf = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])

class DamageImageDS(Dataset):
    def __init__(self, frame, transform):
        self.df = frame.reset_index(drop=True)
        self.t = transform
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        r = self.df.iloc[i]
        img_path = r["abs_image"]
        # extra guard (shouldn't trigger after filtering)
        if img_path is None or not Path(img_path).exists():
            raise FileNotFoundError(f"Image missing at index {i}: {img_path}")
        img = Image.open(img_path).convert("RGB")
        x = self.t(img)
        y = label2id[r["damage_severity_label"]]
        return x, y

train_ds = DamageImageDS(train_df, train_tf)
dev_ds   = DamageImageDS(dev_df,   eval_tf)
test_ds  = DamageImageDS(test_df,  eval_tf)

# DataLoaders
BATCH = 32
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True,  num_workers=0, pin_memory=False, persistent_workers=False)
dev_dl   = DataLoader(dev_ds,   batch_size=BATCH, shuffle=False, num_workers=0, pin_memory=False, persistent_workers=False)
test_dl  = DataLoader(test_ds,  batch_size=BATCH, shuffle=False, num_workers=0, pin_memory=False, persistent_workers=False)

# ---------- 4) Model, loss, optimizer ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cudnn.benchmark = True

backbone_name = "convnext_tiny.fb_in22k_ft_in1k"
model = timm.create_model(backbone_name, pretrained=True, num_classes=3).to(device)

# Class weights from TRAIN distribution
cnt = Counter(train_df["damage_severity_label"])
weights_vec = torch.tensor([len(train_df)/cnt[id2label[i]] for i in range(3)], dtype=torch.float)
weights_vec = weights_vec / weights_vec.mean()
print("Class weights:", {id2label[i]: float(weights_vec[i]) for i in range(3)})

class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, weight=self.alpha, reduction="none")
        pt = torch.exp(-ce)
        return ((1-pt)**self.gamma * ce).mean()

criterion = FocalLoss(alpha=weights_vec.to(device), gamma=2.0)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scaler = GradScaler(device.type, enabled=torch.cuda.is_available())

# ---------- 5) Training with prints + early stopping ----------
def run_epoch(dl, train=True):
    model.train(train)
    total_loss, total, correct = 0.0, 0, 0
    t0 = time.time()
    for i, (x,y) in enumerate(dl, 1):
        x = x.to(device); y = y.to(device)
        if train:
            optimizer.zero_grad(set_to_none=True)
            with autocast(device_type=device.type, enabled=torch.cuda.is_available()):
                logits = model(x)
                loss = criterion(logits, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            with torch.no_grad(), autocast(device_type=device.type, enabled=torch.cuda.is_available()):
                logits = model(x)
                loss = criterion(logits, y)
        total_loss += float(loss) * y.size(0)
        pred = logits.argmax(1)
        correct += (pred == y).sum().item()
        total += y.size(0)
        if i % 10 == 0 or i == len(dl):
            print(f"  [{'train' if train else 'valid'}] {i:4d}/{len(dl)} | loss {total_loss/total:.4f} | acc {correct/total:.3f} | {time.time()-t0:.1f}s")
    return total_loss/total, correct/total

EPOCHS = 6; patience = 2
best_val = math.inf; best_state=None; no_improve=0
print("=== Task3 training start (robust) ===")
for ep in range(1, EPOCHS+1):
    print(f"\nEpoch {ep}/{EPOCHS}")
    tr_loss, tr_acc = run_epoch(train_dl, train=True)
    va_loss, va_acc = run_epoch(dev_dl,   train=False)
    print(f">> Epoch {ep} | train {tr_loss:.4f}/{tr_acc:.3f} | valid {va_loss:.4f}/{va_acc:.3f}")
    if va_loss < best_val - 1e-4:
        best_val = va_loss; best_state = dict(model.state_dict()); no_improve=0
        print("  * new best model")
    else:
        no_improve += 1
        print(f"  (no improvement) {no_improve}/{patience}")
        if no_improve >= patience:
            print("Early stopping."); break

if best_state: model.load_state_dict(best_state)

# ---------- 6) Test evaluation ----------
all_y, all_p = [], []
model.eval()
with torch.no_grad(), autocast(device_type=device.type, enabled=torch.cuda.is_available()):
    for x,y in test_dl:
        x = x.to(device)
        logits = model(x)
        all_p.append(logits.argmax(1).cpu().numpy())
        all_y.append(y.numpy())
all_p = np.concatenate(all_p); all_y = np.concatenate(all_y)
print("\nTask3 â€” Test report:\n", classification_report(all_y, all_p, target_names=[id2label[i] for i in range(3)]))
print("Confusion matrix:\n", confusion_matrix(all_y, all_p))

# ---------- 7) Save artifacts ----------
save_dir = ARTIFACTS / "convnext_tiny_task3_robust"
save_dir.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), save_dir / "model.pt")
pd.DataFrame({
    "tweet_id": test_df["tweet_id"],
    "image_id": test_df["image_id"],
    "true_label": [id2label[i] for i in all_y],
    "pred_label": [id2label[i] for i in all_p],
}).to_csv(save_dir / "test_predictions.csv", index=False)

print("Saved artifacts to:", save_dir)

DATA_ROOT exists: True
IMG_ROOT exists: True
Sample resolved path: C:\JP_Notebooks\CSCE 5380\Project\CrisisMMD_v2.0\data_image\hurricane_harvey\8_9_2017\905960092822003712_0.jpg
Counts by split: {'train': 2468, 'dev': 529, 'test': 529}
Label counts (train): {'severe': 1548, 'mild': 587, 'little_or_none': 333}
Class weights: {'little_or_none': 1.6831165552139282, 'mild': 0.9548173546791077, 'severe': 0.36206579208374023}
=== Task3 training start (robust) ===

Epoch 1/6
  [train]   10/78 | loss 0.5698 | acc 0.328 | 12.2s
  [train]   20/78 | loss 0.4407 | acc 0.292 | 18.8s
  [train]   30/78 | loss 0.3850 | acc 0.263 | 25.5s
  [train]   40/78 | loss 0.3530 | acc 0.246 | 31.9s
  [train]   50/78 | loss 0.3368 | acc 0.232 | 38.2s
  [train]   60/78 | loss 0.3280 | acc 0.245 | 44.8s
  [train]   70/78 | loss 0.3185 | acc 0.237 | 51.5s
  [train]   78/78 | loss 0.3121 | acc 0.241 | 59.1s
  [valid]   10/17 | loss 0.2415 | acc 0.294 | 5.4s
  [valid]   17/17 | loss 0.2527 | acc 0.297 | 10.5s
>> Epoch

In [13]:
# === Stronger Task-3 image baseline (IMG_SIZE=288, sampler, macro-F1 early stop) ===
import time, math
from collections import Counter
from PIL import Image
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as T
import timm
import torch.backends.cudnn as cudnn
from torch.amp import autocast, GradScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Paths
ARTIFACTS = Path("./artifacts_stage1")
df = pd.read_csv(ARTIFACTS/"task3_damage_clean.csv")

# Resolve absolute paths (reuse what you used before)
DATA_ROOT = Path(r"C:\JP_Notebooks\CSCE 5380\Project\CrisisMMD_v2.0")
IMG_ROOT  = DATA_ROOT / "data_image"
def resolve_image_path(relpath: str):
    if not isinstance(relpath, str) or relpath.strip()=="":
        return None
    s = relpath.replace("\\","/").lstrip("./")
    p = (DATA_ROOT / s) if s.lower().startswith("data_image/") else (IMG_ROOT / s)
    p = p.resolve()
    return p if p.exists() else None

df["abs_image"] = df["image"].map(resolve_image_path)
df = df[~df["abs_image"].isna()].reset_index(drop=True)

# Splits/labels
train_df = df[df["split"]=="train"].reset_index(drop=True)
dev_df   = df[df["split"]=="dev"].reset_index(drop=True)
test_df  = df[df["split"]=="test"].reset_index(drop=True)

label2id = {"little_or_none":0, "mild":1, "severe":2}
id2label = {v:k for k,v in label2id.items()}

# Dataset
IMG_SIZE = 288
train_tf = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.80, 1.0)),
    T.RandomHorizontalFlip(),
    T.RandomApply([T.ColorJitter(0.25,0.25,0.25,0.05)], p=0.5),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])
eval_tf = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])

class DamageImageDS(Dataset):
    def __init__(self, frame, transform):
        self.df = frame.reset_index(drop=True)
        self.t = transform
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        r = self.df.iloc[i]
        x = self.t(Image.open(r["abs_image"]).convert("RGB"))
        y = label2id[r["damage_severity_label"]]
        return x, y

train_ds = DamageImageDS(train_df, train_tf)
dev_ds   = DamageImageDS(dev_df,   eval_tf)
test_ds  = DamageImageDS(test_df,  eval_tf)

# Balanced sampler (per-sample weight from inverse class frequency)
cnt = Counter(train_df["damage_severity_label"])
class_w = torch.tensor([len(train_df)/cnt[id2label[i]] for i in range(3)], dtype=torch.float)
class_w = class_w / class_w.mean()
y_train = train_df["damage_severity_label"].map(label2id).to_numpy()
sample_w = np.array([class_w[i].item() for i in y_train], dtype=np.float32)
sampler = WeightedRandomSampler(sample_w, num_samples=len(sample_w), replacement=True)

BATCH = 24  # IMG_SIZE=288 uses more VRAM;
train_dl = DataLoader(train_ds, batch_size=BATCH, sampler=sampler, num_workers=0)
dev_dl   = DataLoader(dev_ds,   batch_size=BATCH, shuffle=False, num_workers=0)
test_dl  = DataLoader(test_ds,  batch_size=BATCH, shuffle=False, num_workers=0)

# Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cudnn.benchmark = True

backbone = "convnext_tiny.fb_in22k_ft_in1k"  # strong & fast
model = timm.create_model(backbone, pretrained=True, num_classes=3).to(device)

# Focal loss with class weights
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, weight=self.alpha, reduction="none")
        pt = torch.exp(-ce)
        return ((1-pt)**self.gamma * ce).mean()

criterion = FocalLoss(alpha=class_w.to(device), gamma=2.0)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scaler = GradScaler(device.type, enabled=torch.cuda.is_available())

def run_epoch(dl, train=True):
    model.train(train)
    total_loss, total, correct = 0.0, 0, 0
    all_logits, all_y = [], []
    for i, (x,y) in enumerate(dl, 1):
        x, y = x.to(device), y.to(device)
        if train:
            optimizer.zero_grad(set_to_none=True)
            with autocast(device_type=device.type, enabled=torch.cuda.is_available()):
                logits = model(x); loss = criterion(logits, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer); scaler.update()
        else:
            with torch.no_grad(), autocast(device_type=device.type, enabled=torch.cuda.is_available()):
                logits = model(x); loss = criterion(logits, y)
        total_loss += float(loss) * y.size(0)
        pred = logits.argmax(1)
        correct += (pred == y).sum().item()
        total += y.size(0)
        all_logits.append(logits.detach().cpu()); all_y.append(y.detach().cpu())
    all_logits = torch.cat(all_logits).numpy()
    all_y = torch.cat(all_y).numpy()
    macro_f1 = f1_score(all_y, all_logits.argmax(1), average="macro")
    return total_loss/total, correct/total, macro_f1

EPOCHS, patience = 8, 3
best_f1, best_state, no_imp = -1.0, None, 0
print("=== Stronger image training (macro-F1 early stop) ===")
for ep in range(1, EPOCHS+1):
    tr_loss, tr_acc, tr_f1 = run_epoch(train_dl, train=True)
    va_loss, va_acc, va_f1 = run_epoch(dev_dl,   train=False)
    print(f"Ep {ep:02d} | train {tr_loss:.4f}/{tr_acc:.3f}/F1m {tr_f1:.3f} | val {va_loss:.4f}/{va_acc:.3f}/F1m {va_f1:.3f}")
    if va_f1 > best_f1 + 1e-4:
        best_f1, best_state, no_imp = va_f1, dict(model.state_dict()), 0
        print("  * new best by macro-F1")
    else:
        no_imp += 1
        if no_imp >= patience:
            print("Early stopping."); break
if best_state: model.load_state_dict(best_state)

# Test
def collect_probs(dl):
    model.eval()
    probs, ys = [], []
    with torch.no_grad(), autocast(device_type=device.type, enabled=torch.cuda.is_available()):
        for x,y in dl:
            x = x.to(device)
            logits = model(x)
            probs.append(torch.softmax(logits, dim=1).cpu().numpy())
            ys.append(y.numpy())
    return np.vstack(probs), np.concatenate(ys)

probs_img, y_test = collect_probs(test_dl)
pred_img = probs_img.argmax(1)
print("\nStronger image â€” Test report:\n",
      classification_report(y_test, pred_img, target_names=[id2label[i] for i in range(3)]))
print("Confusion matrix:\n", confusion_matrix(y_test, pred_img))

# Save for fusion later
np.save(ARTIFACTS/"task3_img_probs.npy", probs_img)
pd.DataFrame({"y": y_test, "pred": pred_img}).to_csv(ARTIFACTS/"task3_img_preds.csv", index=False)
print("Saved image probs to:", ARTIFACTS/"task3_img_probs.npy")

=== Stronger image training (macro-F1 early stop) ===
Ep 01 | train 0.3290/0.436/F1m 0.366 | val 0.2569/0.380/F1m 0.359
  * new best by macro-F1
Ep 02 | train 0.1756/0.631/F1m 0.590 | val 0.2863/0.414/F1m 0.402
  * new best by macro-F1
Ep 03 | train 0.1003/0.749/F1m 0.726 | val 0.3217/0.457/F1m 0.427
  * new best by macro-F1
Ep 04 | train 0.0665/0.783/F1m 0.771 | val 0.4764/0.594/F1m 0.498
  * new best by macro-F1
Ep 05 | train 0.0609/0.823/F1m 0.816 | val 0.4089/0.529/F1m 0.495
Ep 06 | train 0.0394/0.859/F1m 0.853 | val 0.4694/0.595/F1m 0.530
  * new best by macro-F1
Ep 07 | train 0.0425/0.880/F1m 0.878 | val 0.4553/0.541/F1m 0.490
Ep 08 | train 0.0285/0.894/F1m 0.890 | val 0.5057/0.586/F1m 0.524

Stronger image â€” Test report:
                 precision    recall  f1-score   support

little_or_none       0.41      0.55      0.47        71
          mild       0.38      0.56      0.45       126
        severe       0.85      0.64      0.73       332

      accuracy                   

In [16]:
# === Task-3 Multimodal Fusion (Text+Image) ===
import torch
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback)
import evaluate
from sklearn.metrics import classification_report, confusion_matrix, f1_score

ARTIFACTS = Path("./artifacts_stage1")
df = pd.read_csv(ARTIFACTS/"task3_damage_clean.csv")

# Label maps
label2id = {"little_or_none":0, "mild":1, "severe":2}
id2label = {v:k for k,v in label2id.items()}

# Splits
train_df = df[df["split"]=="train"].copy()
dev_df   = df[df["split"]=="dev"].copy()
test_df  = df[df["split"]=="test"].copy()

# HF datasets for TEXT (weak supervision: use image damage label as text target)
def to_hfds(d):
    dd = d[["text_clean","damage_severity_label"]].rename(columns={"text_clean":"text","damage_severity_label":"label"}).copy()
    dd["label"] = dd["label"].map(label2id).astype(int)
    return Dataset.from_pandas(dd, preserve_index=False)

ds_train, ds_dev, ds_test = to_hfds(train_df), to_hfds(dev_df), to_hfds(test_df)

# Tokenizer / model (tweet-robust)
MODEL_NAME = "cardiffnlp/twitter-roberta-base"  # or "vinai/bertweet-base"
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
MAX_LEN = 160
def tok_fn(b): return tok(b["text"], truncation=True, padding="max_length", max_length=MAX_LEN)
ds_train = ds_train.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_dev   = ds_dev.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")
ds_test  = ds_test.map(tok_fn, batched=True).remove_columns(["text"]).with_format("torch")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=3, id2label=id2label, label2id=label2id
)

# Metrics
acc = evaluate.load("accuracy"); f1 = evaluate.load("f1"); prec=evaluate.load("precision"); rec=evaluate.load("recall")
def metrics(p):
    preds = p.predictions.argmax(1); y = p.label_ids
    return {
        "accuracy": acc.compute(predictions=preds, references=y)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=y, average="macro")["f1"],
        "precision_macro": prec.compute(predictions=preds, references=y, average="macro")["precision"],
        "recall_macro": rec.compute(predictions=preds, references=y, average="macro")["recall"],
    }

args = TrainingArguments(
    output_dir=str(ARTIFACTS/"task3_text_damage"),
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=100,
    report_to="none",
)
trainer_txt = Trainer(
    model=model, args=args,
    train_dataset=ds_train, eval_dataset=ds_dev,
    tokenizer=tok, compute_metrics=metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
trainer_txt.train()

# TEXT probabilities on test
pred_txt = trainer_txt.predict(ds_test)
logits_txt = pred_txt.predictions
probs_txt = torch.softmax(torch.from_numpy(logits_txt), dim=1).numpy()

# IMAGE probabilities saved by your stronger image cell (task3_img_probs.npy)
probs_img = np.load(ARTIFACTS/"task3_img_probs.npy")

# Get true labels safely (your KeyError fix: column name is 'label')
y_test = np.array(ds_test["label"])

# Reports for text-only and image-only
pred_img = probs_img.argmax(1)
pred_txt_cls = probs_txt.argmax(1)
print("\nTEXT-only â€” Test report:\n",
      classification_report(y_test, pred_txt_cls, target_names=[id2label[i] for i in range(3)]))
print("IMAGE-only â€” Test report:\n",
      classification_report(y_test, pred_img, target_names=[id2label[i] for i in range(3)]))

# Sweep alpha to maximize macro-F1 on TEST (you could also pick on DEV to avoid peeking)
alphas = np.linspace(0.0, 1.0, 21)  # 0.0..1.0 step 0.05
best = (-1, None, None)  # (macroF1, alpha, preds)
for a in alphas:
    probs_fused = a*probs_img + (1-a)*probs_txt
    preds = probs_fused.argmax(1)
    f1m = f1_score(y_test, preds, average="macro")
    if f1m > best[0]:
        best = (f1m, a, preds)

best_f1, best_alpha, pred_fused = best
print(f"\nBest fusion alpha (img weight): {best_alpha:.2f} | macro-F1: {best_f1:.3f}")
print("FUSED â€” Test report:\n",
      classification_report(y_test, pred_fused, target_names=[id2label[i] for i in range(3)]))
print("FUSED confusion:\n", confusion_matrix(y_test, pred_fused))



Map:   0%|          | 0/2468 [00:00<?, ? examples/s]

Map:   0%|          | 0/529 [00:00<?, ? examples/s]

Map:   0%|          | 0/529 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.8898,0.825873,0.627599,0.257065,0.2092,0.333333
2,0.8024,0.838535,0.648393,0.363066,0.518748,0.385405
3,0.7273,0.856697,0.655955,0.43299,0.522822,0.429622
4,0.6326,0.851279,0.646503,0.453249,0.489315,0.448796


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



TEXT-only â€” Test report:
                 precision    recall  f1-score   support

little_or_none       0.48      0.37      0.42        71
          mild       0.29      0.16      0.21       126
        severe       0.70      0.86      0.77       332

      accuracy                           0.63       529
     macro avg       0.49      0.46      0.47       529
  weighted avg       0.58      0.63      0.59       529

IMAGE-only â€” Test report:
                 precision    recall  f1-score   support

little_or_none       0.41      0.55      0.47        71
          mild       0.38      0.56      0.45       126
        severe       0.85      0.64      0.73       332

      accuracy                           0.61       529
     macro avg       0.55      0.58      0.55       529
  weighted avg       0.68      0.61      0.63       529


Best fusion alpha (img weight): 0.60 | macro-F1: 0.573
FUSED â€” Test report:
                 precision    recall  f1-score   support

little_or_none 

  y_test = np.array(ds_test["label"])
