In [3]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [4]:
# roberta_contrastive_minimal.py
"""
Memory-light contrastive + classifier script based on your original.
Main changes to reduce memory:
 - roberta-base instead of roberta-large
 - MAX_LEN is configurable (set to 512 in the config below; lower it, e.g. to 128, to cut memory)
 - single encoder forward; two projection views are generated by dropout in proj head
 - encoder frozen by default (only projection + classifier trained)
 - smaller batch size, smaller proj dim
 - gradient clipping
 - optional: unfreeze last N encoder layers (see UNFREEZE_LAST_N)
"""

import os
import math
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer, TFRobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# ---------------- CONFIG (tune these) ----------------
# Input CSVs: all three are resolved relative to DATA_DIR.
DATA_DIR = "/kaggle/input/promid-2025/promid_task_3/promid_task_3/"
MISINFO_CSV = os.path.join(DATA_DIR, "misinfo_train.csv")
NONMISINFO_CSV = os.path.join(DATA_DIR, "nonmisinfo_train.csv")
TEST_CSV = os.path.join(DATA_DIR, "test_final_merge_withoutlabel.csv")

# Output artifacts (relative paths, i.e. written under the current working directory).
OUTPUT_TRAIN_INTERIM = "train_final_llm.csv"
OUTPUT_TEST_INTERIM = "test_final_llm.csv"
PRED_CSV = "predictions.csv"
PLOTS_DIR = "plots"
MODELS_DIR = "models"

# Reference document path; in this script it is only printed at the very end.
PDF_REF_PATH = "/mnt/data/Fake_News_Detection_in_Social_Media_Hybrid_Deep_Le.pdf"

# Encoder / training hyperparams
BERT_NAME = "roberta-base"   # <-- smaller than large
MAX_LEN = 512                # every text is padded/truncated to this many tokens; lower it to save memory
BATCH_SIZE = 16               # <-- small batch
EPOCHS = 50                   # upper bound on epochs; early stopping may end training sooner
LR = 3e-4                    # initial Adam learning rate (halved on plateau, see callbacks below)
SEED = 42

# contrastive & heads
PROJ_DIM = 64         # output width of the projection head
TEMPERATURE = 0.07    # divisor applied to similarity logits in the contrastive loss
ALPHA = 1.0   # weight for contrastive loss
BETA = 1.0    # weight for classifier loss

# freeze encoder for memory savings (set False to fine-tune full model)
FREEZE_ENCODER = True
# if you want to unfreeze last N encoder layers, set FREEZE_ENCODER=True and UNFREEZE_LAST_N>0
UNFREEZE_LAST_N = 0

# callbacks (implemented by hand in the training loop, not via Keras callbacks)
lr_reduce_patience = 3     # multiply LR by lr_reduce_factor every this many epochs without val-F1 improvement
lr_reduce_factor = 0.5
early_stop_patience = 6    # stop after this many consecutive epochs without val-F1 improvement

USE_MIXED_PRECISION = False  # True requests the 'mixed_float16' policy; False keeps float32

# reproducibility: seed NumPy, TensorFlow and Python's random module
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# create dirs
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

# ---------------- helpers ----------------
def find_text_column(df):
    """Return the label of the column most likely to hold the text.

    Matching is a case-insensitive substring test on the column name, tried
    in this order:
      1. the first column whose name contains "text";
      2. otherwise, for each of "tweet", "content", "body" (in that order),
         the first column whose name contains it;
      3. otherwise the first column of the frame.

    Column labels are passed through ``str()`` before matching so frames with
    non-string labels (e.g. integer headers) do not raise AttributeError.
    The original label object is returned, not its string form.

    Raises:
        IndexError: if ``df`` has no columns at all.
    """
    lowered = [(col, str(col).lower()) for col in df.columns]
    # Keyword order is the priority order: every column is checked for one
    # keyword before the next keyword is considered.
    for keyword in ("text", "tweet", "content", "body"):
        for col, name in lowered:
            if keyword in name:
                return col
    return df.columns[0]

def safe_read_csv(path):
    """Load ``path`` with ``pd.read_csv`` after checking that it exists.

    Raises:
        FileNotFoundError: if nothing exists at ``path``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"File not found: {path}")

# ---------------- mixed precision ----------------
# Opt-in: request the mixed_float16 policy; on any failure fall back to
# float32 and clear the flag so the rest of the script sees the real state.
if USE_MIXED_PRECISION:
    try:
        from tensorflow.keras import mixed_precision
        mixed_precision.set_global_policy('mixed_float16')
    except Exception as err:
        print("Mixed precision unavailable, continuing in float32. Error:", err)
        USE_MIXED_PRECISION = False
    else:
        print("Mixed precision ENABLED.")

# ---------------- GPU memory growth (help avoid full allocation) ----------------
# Best-effort: a failure here is reported and otherwise ignored.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
    except Exception as err:
        print("Could not set memory growth:", err)
    else:
        print("Enabled GPU memory growth for", len(gpus), "GPU(s).")

# ---------------- load CSVs & preprocess ----------------
def _prepare_labeled_text(df, text_col, label):
    """Return a clean two-column frame (``text``, ``label``) built from ``df``.

    Rows whose text is missing (NaN/None) are dropped *before* the cast to
    str; casting first would turn them into the literal string "nan", which
    then survives the empty-string filter and ends up as a training example.
    Remaining text is stripped and rows that are empty after stripping are
    dropped. The index is reset to 0..n-1.
    """
    out = df[[text_col]].rename(columns={text_col: "text"}).dropna(subset=["text"]).copy()
    out["text"] = out["text"].astype(str).str.strip()
    out = out[out["text"] != ""].reset_index(drop=True)
    out["label"] = label
    return out[["text", "label"]]


print("Loading CSVs...")
df_mis = safe_read_csv(MISINFO_CSV)
df_non = safe_read_csv(NONMISINFO_CSV)
df_test = safe_read_csv(TEST_CSV)

# The text column is auto-detected per file because the header may differ.
text_col_mis = find_text_column(df_mis)
text_col_non = find_text_column(df_non)
text_col_test = find_text_column(df_test)
print("Detected text columns:", text_col_mis, text_col_non, text_col_test)

# keep only text and label; drop missing / empty text
df_mis_small = _prepare_labeled_text(df_mis, text_col_mis, "misinfo")
df_non_small = _prepare_labeled_text(df_non, text_col_non, "nonmisinfo")

# Balance the classes: draw exactly as many nonmisinfo rows as there are
# misinfo rows. Sampling is without replacement when enough nonmisinfo rows
# exist, and with replacement (oversampling) otherwise.
n_mis, n_non = len(df_mis_small), len(df_non_small)
print(f"Counts before balancing -> misinfo: {n_mis}, nonmisinfo: {n_non}")
if not n_mis:
    raise ValueError("No misinfo rows found")

needs_oversampling = n_non < n_mis
df_non_down = (
    df_non_small
    .sample(n=n_mis, replace=needs_oversampling, random_state=SEED)
    .reset_index(drop=True)
)

# Stack both classes, then shuffle the rows with a fixed seed.
train_combined = (
    pd.concat([df_mis_small, df_non_down], ignore_index=True)
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
)
print("Combined balanced train shape:", train_combined.shape)
print(train_combined['label'].value_counts())

# prepare test (ensure id)
# If the file has no 'id' column, the row position is used as the id;
# otherwise gaps in 'id' are filled from the neighbouring rows.
if 'id' not in df_test.columns:
    df_test = df_test.reset_index().rename(columns={'index':'id'})
else:
    df_test['id'] = df_test['id'].ffill().bfill()
df_test_small = df_test[['id', text_col_test]].rename(columns={text_col_test:"text"}).copy()

# Every test row must get a prediction, so rows are never dropped here.
# Missing text is blanked explicitly (a plain astype(str) would turn NaN into
# the literal string "nan"), then blank text is replaced by the "NA" placeholder.
_raw_test_text = df_test_small['text']
df_test_small['text'] = (
    _raw_test_text.astype(str).str.strip()
    .mask(_raw_test_text.isna(), "")
    .replace("", "NA")
)

# save interim csvs
train_combined[['text','label']].to_csv(OUTPUT_TRAIN_INTERIM, index=False)
df_test_small[['id','text']].to_csv(OUTPUT_TEST_INTERIM, index=False)
print("Saved interim files:", OUTPUT_TRAIN_INTERIM, OUTPUT_TEST_INTERIM)

# Integer-encode the string labels (misinfo is the positive class).
label_to_int = dict(misinfo=1, nonmisinfo=0)
train_combined['label_int'] = train_combined['label'].map(label_to_int).astype(int)

# 80/20 split, stratified on the integer label; both parts get a fresh 0..n-1 index.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train_combined,
        test_size=0.2,
        random_state=SEED,
        stratify=train_combined['label_int'],
    )
)
print("Train, Val, Test sizes:", len(train_df), len(val_df), len(df_test_small))

# ---------------- tokenizer & encoder ----------------
print("Loading tokenizer & encoder:", BERT_NAME)
tokenizer = AutoTokenizer.from_pretrained(BERT_NAME)
encoder = TFRobertaModel.from_pretrained(BERT_NAME)

# Freeze encoder if requested (massive memory saver)
if FREEZE_ENCODER:
    encoder.trainable = False
    print("Encoder frozen (FREEZE_ENCODER=True). Only heads will be trained.")
    if UNFREEZE_LAST_N > 0:
        # Best-effort partial unfreeze of the last N transformer blocks.
        # This is done through the Keras *layer* `trainable` attribute:
        # tf.Variable.trainable is read-only, so toggling it per variable
        # (the previous approach) raises and leaves everything frozen.
        # NOTE(review): the attribute layout used below
        # (encoder.roberta.{embeddings,encoder.layer,pooler}) follows the
        # Hugging Face TFRobertaModel implementation — confirm against the
        # installed transformers version.
        try:
            main_layer = encoder.roberta
            blocks = list(main_layer.encoder.layer)
            num_layers = len(blocks)
            if UNFREEZE_LAST_N >= num_layers:
                UNFREEZE_LAST_N = num_layers
            first_trainable = num_layers - UNFREEZE_LAST_N

            # Re-enable the whole model, then switch off everything that
            # must stay frozen.
            encoder.trainable = True
            main_layer.embeddings.trainable = False
            pooler = getattr(main_layer, "pooler", None)
            if pooler is not None:
                # The script pools last_hidden_state itself, so the pooler
                # would never receive a gradient; keep it frozen.
                pooler.trainable = False
            for idx, block in enumerate(blocks):
                block.trainable = idx >= first_trainable
            print(f"Unfroze last {UNFREEZE_LAST_N} encoder layers (best-effort).")
        except Exception as e:
            print("Could not partially unfreeze encoder, keeping it frozen. Error:", e)
            encoder.trainable = False

# ---------------- tokenization helper (batched & memory-friendly) ----------------
def tokenize_texts(texts, desc="tokenize", batch_size=256):
    """Tokenize ``texts`` in slices of ``batch_size`` and stack the results.

    Each slice is padded/truncated to ``MAX_LEN`` tokens, so all slices share
    the same width and can be concatenated along axis 0.

    Args:
        texts: list of strings.
        desc: label shown on the tqdm progress bar.
        batch_size: number of texts sent to the tokenizer per call.

    Returns:
        ``(input_ids, attention_mask)`` as two TensorFlow tensors.
    """
    total = len(texts)
    n_batches = math.ceil(total / batch_size)
    id_chunks, mask_chunks = [], []
    for start in tqdm(range(0, total, batch_size), total=n_batches, desc=desc, unit="batch"):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="tf",
        )
        id_chunks.append(encoded["input_ids"])
        mask_chunks.append(encoded["attention_mask"])
    return tf.concat(id_chunks, axis=0), tf.concat(mask_chunks, axis=0)

print("Tokenizing train/val/test (shows progress)...")
train_texts = train_df['text'].astype(str).tolist()
val_texts = val_df['text'].astype(str).tolist()
test_texts = df_test_small['text'].astype(str).tolist()

train_input_ids, train_attention = tokenize_texts(train_texts, desc="Tokenize train", batch_size=128)
val_input_ids, val_attention = tokenize_texts(val_texts, desc="Tokenize val", batch_size=128)
test_input_ids, test_attention = tokenize_texts(test_texts, desc="Tokenize test", batch_size=128)

train_labels = train_df['label_int'].values.astype(np.int32)
val_labels = val_df['label_int'].values.astype(np.int32)

# Every dataset yields ((input_ids, attention_mask), label) batches.
# Only the training set is shuffled.
train_ds = (
    tf.data.Dataset.from_tensor_slices(((train_input_ids, train_attention), train_labels))
    .shuffle(buffer_size=2048, seed=SEED)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

val_ds = (
    tf.data.Dataset.from_tensor_slices(((val_input_ids, val_attention), val_labels))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# The test set has no labels; zeros are supplied so it has the same element structure.
placeholder_test_labels = np.zeros((test_input_ids.shape[0],), dtype=np.int32)
test_ds = tf.data.Dataset.from_tensor_slices(
    ((test_input_ids, test_attention), placeholder_test_labels)
).batch(BATCH_SIZE)

# ---------------- Model components ----------------
def mean_pool(last_hidden, mask):
    """Average ``last_hidden`` over axis 1, counting only positions where ``mask`` is non-zero.

    Assumes ``last_hidden`` is (batch, seq, hidden) and ``mask`` is
    (batch, seq) — TODO confirm for any new caller.

    The division uses ``tf.math.divide_no_nan`` rather than adding a small
    epsilon to the count: an epsilon such as 1e-10 rounds to 0 in float16, so
    under mixed precision a fully-masked row would produce 0/0 = NaN. With
    ``divide_no_nan`` such a row yields zeros in every dtype.

    Returns:
        Tensor with the sequence axis reduced away, in ``last_hidden``'s dtype.
    """
    weights = tf.cast(tf.expand_dims(mask, -1), last_hidden.dtype)
    summed = tf.reduce_sum(last_hidden * weights, axis=1)
    counts = tf.reduce_sum(weights, axis=1)
    return tf.math.divide_no_nan(summed, counts)

# Projection head. It contains a Dropout layer, so two calls with
# training=True on the same input give two different stochastic views
# without running the encoder a second time.
def build_projection_head(proj_dim=PROJ_DIM, dropout_rate=0.1):
    """Build the contrastive projection head as a Keras functional model.

    Pipeline: Dense(hidden_size, gelu) -> Dropout -> Dense(proj_dim)
    -> LayerNormalization -> L2 normalisation along axis 1.
    All layers are pinned to float32.

    Args:
        proj_dim: width of the output embedding.
        dropout_rate: rate of the Dropout layer.

    Returns:
        ``tf.keras.Model`` named "proj_head".
    """
    layers = tf.keras.layers
    hidden_size = encoder.config.hidden_size

    features = tf.keras.Input(shape=(hidden_size,), dtype=tf.float32)
    h = layers.Dense(hidden_size, activation="gelu", dtype="float32")(features)
    h = layers.Dropout(dropout_rate)(h)
    h = layers.Dense(proj_dim, activation=None, dtype="float32")(h)
    h = layers.LayerNormalization(dtype="float32")(h)
    unit_norm = layers.Lambda(lambda t: tf.math.l2_normalize(t, axis=1), dtype="float32")(h)
    return tf.keras.Model(features, unit_norm, name="proj_head")

def build_classifier_head():
    """Build the binary classifier head as a Keras functional model.

    Pipeline: Dense(256, gelu) -> Dropout(0.2) -> Dense(1, sigmoid), so the
    output is a probability rather than a logit.

    Returns:
        ``tf.keras.Model`` named "clf_head".
    """
    layers = tf.keras.layers
    features = tf.keras.Input(shape=(encoder.config.hidden_size,), dtype=tf.float32)
    h = layers.Dense(256, activation="gelu", dtype="float32")(features)
    h = layers.Dropout(0.2)(h)
    prob = layers.Dense(1, activation="sigmoid", dtype="float32")(h)
    return tf.keras.Model(features, prob, name="clf_head")

proj_head = build_projection_head()
clf_head = build_classifier_head()

# Variables updated by the optimizer: both heads plus whatever part of the
# encoder is currently trainable. `encoder.trainable_variables` is empty when
# the encoder is fully frozen, so this covers the frozen, partially-unfrozen
# and fully-trainable cases without consulting FREEZE_ENCODER (which alone
# would skip encoder layers that were deliberately unfrozen).
trainable_vars = (
    list(encoder.trainable_variables)
    + proj_head.trainable_variables
    + clf_head.trainable_variables
)

# optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

# clf_head ends in a sigmoid, hence from_logits=False.
bce = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# ---------------- supervised contrastive loss (vectorized, dtype-safe) ----------------
@tf.function
def supervised_contrastive_loss(projections, labels, temperature=TEMPERATURE):
    """Supervised contrastive loss over a batch of projections.

    For each anchor i the loss is

        -log( sum_{p in P(i)} exp(s_ip) / sum_{a != i} exp(s_ia) )

    where ``s = projections @ projections^T / temperature`` and ``P(i)`` is
    the set of other rows carrying the same label as i. Anchors with no
    positive are excluded, and the result is the mean over the remaining
    anchors (0 if there are none).

    The ratio is evaluated in log space with ``reduce_logsumexp`` instead of
    exponentiating the raw logits, which overflows float32 when
    ``temperature`` is small (exp(1/temperature) exceeds the float32 range
    below roughly 0.0113).

    Args:
        projections: 2-D tensor, one embedding per row; the caller passes
            L2-normalised rows.
        labels: 1-D integer class ids, one per row of ``projections``.
        temperature: positive scalar divisor for the similarity logits.

    Returns:
        Scalar float32 tensor.
    """
    projections = tf.cast(projections, tf.float32)
    labels = tf.cast(labels, tf.int32)

    logits = tf.matmul(projections, projections, transpose_b=True) / tf.cast(temperature, tf.float32)
    n = tf.shape(logits)[0]

    self_mask = tf.eye(n, dtype=tf.bool)
    same_label = tf.equal(tf.expand_dims(labels, 1), tf.expand_dims(labels, 0))
    pos_mask = tf.logical_and(same_label, tf.logical_not(self_mask))

    # A large finite negative stands in for -inf: excluded entries contribute
    # exp(.) == 0, while gradients stay finite.
    very_neg = tf.fill(tf.shape(logits), tf.constant(-1e12, dtype=logits.dtype))
    all_logits = tf.where(self_mask, very_neg, logits)   # every pair except self
    pos_logits = tf.where(pos_mask, logits, very_neg)    # same-label pairs only

    log_ratio = tf.reduce_logsumexp(pos_logits, axis=1) - tf.reduce_logsumexp(all_logits, axis=1)

    has_positive = tf.reduce_any(pos_mask, axis=1)
    loss_per_anchor = tf.where(has_positive, -log_ratio, tf.zeros_like(log_ratio))
    n_valid = tf.reduce_sum(tf.cast(has_positive, logits.dtype))
    loss = tf.math.divide_no_nan(tf.reduce_sum(loss_per_anchor), n_valid)

    return tf.cast(loss, tf.float32)

# ---------------- training step (single encoder forward) ----------------
@tf.function
def train_step(input_ids, attention_mask, labels):
    """Run one optimisation step on a batch.

    The encoder is run once; its masked-mean-pooled output feeds both the
    projection head (called twice with dropout active to obtain two views
    for the contrastive loss) and the classifier head. The total loss is
    ``ALPHA * contrastive + BETA * classification``. Gradients with respect
    to ``trainable_vars`` are clipped to a global norm of 1.0 before being
    applied.

    Returns:
        ``(total_loss, contrastive_loss, classifier_loss)`` as float32 scalars.
    """
    labels = tf.cast(labels, tf.float32)
    encoder_in_training_mode = not FREEZE_ENCODER
    with tf.GradientTape() as tape:
        # single encoder forward (memory saver)
        hidden_states = encoder(
            input_ids,
            attention_mask=attention_mask,
            training=encoder_in_training_mode,
        ).last_hidden_state
        pooled = tf.cast(mean_pool(hidden_states, attention_mask), tf.float32)  # (batch, hidden)

        # two stochastic views of the same pooled vector (dropout inside proj_head)
        view_a = proj_head(pooled, training=True)
        view_b = proj_head(pooled, training=True)
        stacked_views = tf.concat([view_a, view_b], axis=0)
        stacked_labels = tf.cast(tf.concat([labels, labels], axis=0), tf.int32)
        c_loss = supervised_contrastive_loss(stacked_views, stacked_labels)

        probs = clf_head(pooled, training=True)
        cls_loss = tf.reduce_mean(bce(tf.reshape(labels, (-1, 1)), probs))

        total_loss = ALPHA * c_loss + BETA * cls_loss

    raw_grads = tape.gradient(total_loss, trainable_vars)
    # gradient clipping (avoid large steps)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, 1.0)
    optimizer.apply_gradients(zip(clipped_grads, trainable_vars))

    return (
        tf.cast(total_loss, tf.float32),
        tf.cast(c_loss, tf.float32),
        tf.cast(cls_loss, tf.float32),
    )

# ---------------- validation ----------------
def evaluate_on_val():
    """Score the classifier head on ``val_ds``.

    Probabilities are thresholded at 0.5 to obtain hard predictions.

    Returns:
        ``(precision, recall, f1, probabilities, y_true)`` where
        ``probabilities`` is a list of floats and ``y_true`` an int array.
    """
    probabilities, targets = [], []
    for (ids, mask), batch_y in val_ds:
        hidden_states = encoder(ids, attention_mask=mask, training=False).last_hidden_state
        pooled = tf.cast(mean_pool(hidden_states, mask), tf.float32)
        batch_probs = clf_head(pooled, training=False).numpy().ravel()
        probabilities += batch_probs.tolist()
        targets += batch_y.numpy().tolist()

    y_true = np.array(targets).astype(int)
    y_pred = (np.array(probabilities) >= 0.5).astype(int)
    return (
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
        probabilities,
        y_true,
    )

# ---------------- training loop ----------------
# Per epoch: train on every batch, evaluate on the validation set, checkpoint
# on a new best F1, then apply the hand-rolled early-stopping and
# reduce-on-plateau rules.
best_val_f1 = -1.0
epochs_since_improve = 0
epoch_list, total_losses, c_losses, cls_losses = [], [], [], []
val_precisions, val_recalls, val_f1s = [], [], []
lr_history = []

num_steps = math.ceil(len(train_df) / BATCH_SIZE)
print("\nStarting contrastive training for up to", EPOCHS, "epochs")

for epoch in range(1, EPOCHS + 1):
    epoch_list.append(epoch)
    batch_total_losses, batch_c_losses, batch_cls_losses = [], [], []
    with tqdm(total=num_steps, desc=f"Epoch {epoch}/{EPOCHS}", unit="batch") as pbar:
        for (batch_ids, batch_mask), batch_labels in train_ds:
            try:
                t_loss, t_c_loss, t_cls_loss = train_step(batch_ids, batch_mask, batch_labels)
            except tf.errors.ResourceExhaustedError:
                print("ResourceExhaustedError during train_step; consider reducing BATCH_SIZE or MAX_LEN.")
                raise  # bare raise keeps the original traceback
            batch_total_losses.append(float(t_loss.numpy()))
            batch_c_losses.append(float(t_c_loss.numpy()))
            batch_cls_losses.append(float(t_cls_loss.numpy()))
            # Progress bar shows the running mean over the epoch so far.
            pbar.set_postfix({
                "tot": f"{np.mean(batch_total_losses):.4f}",
                "c": f"{np.mean(batch_c_losses):.4f}",
                "cls": f"{np.mean(batch_cls_losses):.4f}"
            })
            pbar.update(1)

    avg_total = np.mean(batch_total_losses)
    avg_c = np.mean(batch_c_losses)
    avg_cls = np.mean(batch_cls_losses)
    total_losses.append(avg_total); c_losses.append(avg_c); cls_losses.append(avg_cls)

    # evaluate
    prec, rec, f1, _, _ = evaluate_on_val()
    val_precisions.append(prec); val_recalls.append(rec); val_f1s.append(f1)

    # LR history. `learning_rate` is the supported attribute name; `lr` is a
    # legacy alias that is not available on every optimizer implementation.
    try:
        current_lr = float(optimizer.learning_rate.numpy())
    except Exception:
        current_lr = float(LR)
    lr_history.append(current_lr)

    print(f"\nEpoch {epoch} summary: tot_loss={avg_total:.4f} contrastive={avg_c:.4f} cls={avg_cls:.4f}")
    print(f" Val -> Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f} | LR: {current_lr:.2e}")

    # save best
    if f1 > best_val_f1 + 1e-6:
        best_val_f1 = f1
        epochs_since_improve = 0
        try:
            proj_head.save_weights(os.path.join(MODELS_DIR, "best_proj_weights.h5"))
            clf_head.save_weights(os.path.join(MODELS_DIR, "best_clf_weights.h5"))
            # Checkpoint the encoder whenever any part of it is being trained
            # (fully or partially unfrozen), so the saved heads always have a
            # matching encoder.
            if encoder.trainable_variables:
                encoder.save_weights(os.path.join(MODELS_DIR, "best_encoder_weights.h5"))
            print("  >> New best val F1, head weights saved.")
        except Exception as e:
            print("  >> Warning: saving weights failed:", e)
    else:
        epochs_since_improve += 1
        print(f"  >> No improvement for {epochs_since_improve} epoch(s).")

    # Early stopping is checked before the LR reduction so the LR is not
    # changed on an epoch that ends training anyway.
    if epochs_since_improve >= early_stop_patience:
        print(f"\nEarly stopping: no improvement for {epochs_since_improve} epochs.")
        break

    # LR reduce (simple): every `lr_reduce_patience` stale epochs.
    if epochs_since_improve > 0 and epochs_since_improve % lr_reduce_patience == 0:
        new_lr = current_lr * lr_reduce_factor
        try:
            optimizer.learning_rate.assign(new_lr)
            print(f"  >> Reduced LR to {new_lr:.2e}")
        except Exception:
            print("  >> Could not reduce LR via backend set_value.")

# restore best weights if exist
best_proj = os.path.join(MODELS_DIR, "best_proj_weights.h5")
best_clf = os.path.join(MODELS_DIR, "best_clf_weights.h5")
best_encoder = os.path.join(MODELS_DIR, "best_encoder_weights.h5")

# Heads are restored only when both checkpoint files are present.
if os.path.exists(best_proj) and os.path.exists(best_clf):
    try:
        print("Restoring best head weights...")
        proj_head.load_weights(best_proj)
        clf_head.load_weights(best_clf)
    except Exception as e:
        print("Warning: could not restore head weights:", e)

# The encoder is restored whenever any part of it was trainable (fully or
# partially unfrozen), not only when FREEZE_ENCODER is False; a frozen
# encoder still holds its pretrained weights and needs no restore.
if encoder.trainable_variables and os.path.exists(best_encoder):
    try:
        encoder.load_weights(best_encoder)
    except Exception as e:
        print("Warning: could not restore encoder weights:", e)

# ---------------- plotting (optional) ----------------
def _save_epoch_plot(curves, ylabel, title, filename, marker=None):
    """Draw each (values, label) pair in ``curves`` against ``epoch_list`` and save a PNG in PLOTS_DIR."""
    line_style = {"marker": marker} if marker is not None else {}
    plt.figure(figsize=(8,5))
    for values, label in curves:
        plt.plot(epoch_list, values, label=label, **line_style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, filename), dpi=200)
    plt.close()


if epoch_list:
    _save_epoch_plot(
        [(total_losses, "total_loss"), (c_losses, "contrastive_loss"), (cls_losses, "classifier_loss")],
        ylabel="Loss", title="Training losses", filename="train_losses.png",
    )
    _save_epoch_plot(
        [(val_precisions, "precision"), (val_recalls, "recall"), (val_f1s, "f1")],
        ylabel="Score", title="Validation metrics", filename="val_metrics.png", marker='o',
    )
    _save_epoch_plot(
        [(lr_history, "lr")],
        ylabel="Learning rate", title="LR schedule", filename="lr_schedule.png", marker='o',
    )

print("Saved plots to", PLOTS_DIR)

# ---------------- final predictions on test ----------------
# Same inference path as validation: encoder -> masked mean pool -> classifier head.
print("Running final predictions on test set (with progress)...")
test_steps = math.ceil(len(df_test_small) / BATCH_SIZE)
test_preds = []
for (batch_ids, batch_mask), _ in tqdm(test_ds, total=test_steps, desc="Test predict", unit="batch"):
    hidden_states = encoder(batch_ids, attention_mask=batch_mask, training=False).last_hidden_state
    pooled = tf.cast(mean_pool(hidden_states, batch_mask), tf.float32)
    batch_probs = clf_head(pooled, training=False).numpy().ravel()
    test_preds.extend(batch_probs.tolist())

# Threshold the probabilities at 0.5 and map the class ids back to label strings.
test_preds = np.array(test_preds)
test_pred_bins = (test_preds >= 0.5).astype(int)
label_map_back = {1: "misinfo", 0: "nonmisinfo"}
pred_label_str = [label_map_back[int(flag)] for flag in test_pred_bins]

out_df = pd.DataFrame({
    "id": df_test_small['id'].astype(int),
    "label": pred_label_str,
})
out_df.to_csv(PRED_CSV, index=False)
print("Saved predictions to", PRED_CSV, "(columns: id,label)")

# Save the final head weights. The encoder is exported as well whenever any
# part of it was trained (fully or partially unfrozen); a fully frozen encoder
# is identical to the pretrained checkpoint and is not saved.
try:
    proj_head.save_weights(os.path.join(MODELS_DIR, "proj_head_weights.h5"))
    clf_head.save_weights(os.path.join(MODELS_DIR, "clf_head_weights.h5"))
    if encoder.trainable_variables:
        encoder.save_pretrained(os.path.join(MODELS_DIR, "encoder_saved"))
except Exception as e:
    print("Warning: saving model artifacts failed:", e)
else:
    # Report success only when every save call above completed.
    print("Saved model artifacts to", MODELS_DIR)

print("\nReference (local PDF path):", PDF_REF_PATH)

Enabled GPU memory growth for 1 GPU(s).
Loading CSVs...


  return pd.read_csv(path)


Detected text columns: text text text
Counts before balancing -> misinfo: 364, nonmisinfo: 34174
Combined balanced train shape: (728, 2)
label
misinfo       364
nonmisinfo    364
Name: count, dtype: int64
Saved interim files: train_final_llm.csv test_final_llm.csv
Train, Val, Test sizes: 582 146 2414
Loading tokenizer & encoder: roberta-base


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Encoder frozen (FREEZE_ENCODER=True). Only heads will be trained.
Tokenizing train/val/test (shows progress)...


Tokenize train: 100%|██████████| 5/5 [00:00<00:00, 63.87batch/s]
Tokenize val: 100%|██████████| 2/2 [00:00<00:00, 101.92batch/s]
Tokenize test: 100%|██████████| 19/19 [00:00<00:00, 24.14batch/s]



Starting contrastive training for up to 50 epochs


Epoch 1/50: 100%|██████████| 37/37 [00:24<00:00,  1.51batch/s, tot=1.0119, c=0.3551, cls=0.6568]



Epoch 1 summary: tot_loss=1.0119 contrastive=0.3551 cls=0.6568
 Val -> Precision: 0.6701, Recall: 0.8904, F1: 0.7647 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 2/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.6919, c=0.1383, cls=0.5536]



Epoch 2 summary: tot_loss=0.6919 contrastive=0.1383 cls=0.5536
 Val -> Precision: 0.6667, Recall: 0.8219, F1: 0.7362 | LR: 3.00e-04
  >> No improvement for 1 epoch(s).


Epoch 3/50: 100%|██████████| 37/37 [00:12<00:00,  2.94batch/s, tot=0.5536, c=0.0599, cls=0.4937]



Epoch 3 summary: tot_loss=0.5536 contrastive=0.0599 cls=0.4937
 Val -> Precision: 0.7317, Recall: 0.8219, F1: 0.7742 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 4/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.4833, c=0.0499, cls=0.4334]



Epoch 4 summary: tot_loss=0.4833 contrastive=0.0499 cls=0.4334
 Val -> Precision: 0.7111, Recall: 0.8767, F1: 0.7853 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 5/50: 100%|██████████| 37/37 [00:12<00:00,  2.94batch/s, tot=0.4307, c=0.0233, cls=0.4074]



Epoch 5 summary: tot_loss=0.4307 contrastive=0.0233 cls=0.4074
 Val -> Precision: 0.7065, Recall: 0.8904, F1: 0.7879 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 6/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.3918, c=0.0165, cls=0.3753]



Epoch 6 summary: tot_loss=0.3918 contrastive=0.0165 cls=0.3753
 Val -> Precision: 0.7191, Recall: 0.8767, F1: 0.7901 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 7/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.3694, c=0.0081, cls=0.3613]



Epoch 7 summary: tot_loss=0.3694 contrastive=0.0081 cls=0.3613
 Val -> Precision: 0.7973, Recall: 0.8082, F1: 0.8027 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 8/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.3576, c=0.0070, cls=0.3506]



Epoch 8 summary: tot_loss=0.3576 contrastive=0.0070 cls=0.3506
 Val -> Precision: 0.7619, Recall: 0.8767, F1: 0.8153 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 9/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.3278, c=0.0068, cls=0.3210]



Epoch 9 summary: tot_loss=0.3278 contrastive=0.0068 cls=0.3210
 Val -> Precision: 0.7975, Recall: 0.8630, F1: 0.8289 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 10/50: 100%|██████████| 37/37 [00:12<00:00,  2.94batch/s, tot=0.3201, c=0.0057, cls=0.3144]



Epoch 10 summary: tot_loss=0.3201 contrastive=0.0057 cls=0.3144
 Val -> Precision: 0.7683, Recall: 0.8630, F1: 0.8129 | LR: 3.00e-04
  >> No improvement for 1 epoch(s).


Epoch 11/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.3102, c=0.0049, cls=0.3053]



Epoch 11 summary: tot_loss=0.3102 contrastive=0.0049 cls=0.3053
 Val -> Precision: 0.7831, Recall: 0.8904, F1: 0.8333 | LR: 3.00e-04
  >> New best val F1, head weights saved.


Epoch 12/50: 100%|██████████| 37/37 [00:12<00:00,  2.94batch/s, tot=0.2884, c=0.0022, cls=0.2862]



Epoch 12 summary: tot_loss=0.2884 contrastive=0.0022 cls=0.2862
 Val -> Precision: 0.7738, Recall: 0.8904, F1: 0.8280 | LR: 3.00e-04
  >> No improvement for 1 epoch(s).


Epoch 13/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2893, c=0.0032, cls=0.2862]



Epoch 13 summary: tot_loss=0.2893 contrastive=0.0032 cls=0.2862
 Val -> Precision: 0.7647, Recall: 0.8904, F1: 0.8228 | LR: 3.00e-04
  >> No improvement for 2 epoch(s).


Epoch 14/50: 100%|██████████| 37/37 [00:12<00:00,  2.94batch/s, tot=0.2737, c=0.0053, cls=0.2684]



Epoch 14 summary: tot_loss=0.2737 contrastive=0.0053 cls=0.2684
 Val -> Precision: 0.7838, Recall: 0.7945, F1: 0.7891 | LR: 3.00e-04
  >> No improvement for 3 epoch(s).
  >> Reduced LR to 1.50e-04


Epoch 15/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2636, c=0.0031, cls=0.2605]



Epoch 15 summary: tot_loss=0.2636 contrastive=0.0031 cls=0.2605
 Val -> Precision: 0.7927, Recall: 0.8904, F1: 0.8387 | LR: 1.50e-04
  >> New best val F1, head weights saved.


Epoch 16/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2640, c=0.0019, cls=0.2621]



Epoch 16 summary: tot_loss=0.2640 contrastive=0.0019 cls=0.2621
 Val -> Precision: 0.7763, Recall: 0.8082, F1: 0.7919 | LR: 1.50e-04
  >> No improvement for 1 epoch(s).


Epoch 17/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2449, c=0.0015, cls=0.2434]



Epoch 17 summary: tot_loss=0.2449 contrastive=0.0015 cls=0.2434
 Val -> Precision: 0.7805, Recall: 0.8767, F1: 0.8258 | LR: 1.50e-04
  >> No improvement for 2 epoch(s).


Epoch 18/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2531, c=0.0043, cls=0.2488]



Epoch 18 summary: tot_loss=0.2531 contrastive=0.0043 cls=0.2488
 Val -> Precision: 0.7945, Recall: 0.7945, F1: 0.7945 | LR: 1.50e-04
  >> No improvement for 3 epoch(s).
  >> Reduced LR to 7.50e-05


Epoch 19/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2463, c=0.0018, cls=0.2445]



Epoch 19 summary: tot_loss=0.2463 contrastive=0.0018 cls=0.2445
 Val -> Precision: 0.7831, Recall: 0.8904, F1: 0.8333 | LR: 7.50e-05
  >> No improvement for 4 epoch(s).


Epoch 20/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2434, c=0.0018, cls=0.2416]



Epoch 20 summary: tot_loss=0.2434 contrastive=0.0018 cls=0.2416
 Val -> Precision: 0.7805, Recall: 0.8767, F1: 0.8258 | LR: 7.50e-05
  >> No improvement for 5 epoch(s).


Epoch 21/50: 100%|██████████| 37/37 [00:12<00:00,  2.95batch/s, tot=0.2387, c=0.0011, cls=0.2376]



Epoch 21 summary: tot_loss=0.2387 contrastive=0.0011 cls=0.2376
 Val -> Precision: 0.7805, Recall: 0.8767, F1: 0.8258 | LR: 7.50e-05
  >> No improvement for 6 epoch(s).
  >> Reduced LR to 3.75e-05

Early stopping: no improvement for 6 epochs.
Restoring best head weights...
Saved plots to plots
Running final predictions on test set (with progress)...


Test predict: 100%|██████████| 151/151 [00:54<00:00,  2.75batch/s]

Saved predictions to predictions.csv (columns: id,label)
Saved model artifacts to models

Reference (local PDF path): /mnt/data/Fake_News_Detection_in_Social_Media_Hybrid_Deep_Le.pdf



