In [6]:
import os
import gc
import random
import argparse
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline,
)


# -----------------------
# Configuration defaults
# -----------------------
# Where the input CSV is read from and where run artifacts are written
# when the corresponding command-line options are not given.
DEFAULT_DATA_CSV = "data/IMDB Dataset.csv"
DEFAULT_OUTPUT_DIR = "outputs"

# Checkpoints compared on the training subset; used as the --models default.
CANDIDATE_MODEL_NAMES = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base",
    "google/electra-small-discriminator",
    "albert-base-v2",
]

# Class names in class-id order; both lookup tables are derived from this
# tuple so they cannot drift apart.
_LABEL_NAMES = ("NEGATIVE", "POSITIVE")
ID2LABEL = dict(enumerate(_LABEL_NAMES))
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}


def load_and_prepare_df(csv_path: str, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load the Kaggle IMDB CSV and split it into train/val/test frames.

    The CSV must have a "review" column and a "sentiment" column whose values
    are 'positive' / 'negative'. Rows with a missing review, or with a
    sentiment that is missing or not one of those two values, are dropped.

    Args:
        csv_path: Path of the CSV file to read.
        seed: Random state used for both stratified splits.

    Returns:
        (train_df, val_df, test_df), each with columns "review" and "label"
        (1 = positive, 0 = negative) and a fresh 0..n-1 index.

    Raises:
        ValueError: If either required column is absent.
    """
    df = pd.read_csv(csv_path)
    if "review" not in df.columns or "sentiment" not in df.columns:
        raise ValueError("CSV must contain 'review' and 'sentiment' columns.")

    # Convert labels: positive -> 1, negative -> 0. Anything else becomes NaN.
    df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})

    # Drop unusable rows BEFORE casting: casting a column that still holds
    # NaN to int raises, which would make the dropna() unreachable.
    df = df[["review", "label"]].dropna().reset_index(drop=True)
    df = df.assign(label=df["label"].astype(int))

    # Train/val/test split: 80/10/10
    train_val_df, test_df = train_test_split(
        df,
        test_size=0.1,
        random_state=seed,
        stratify=df["label"],
    )
    # 0.1111 of the remaining 90% is ~10% of the whole dataset.
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=0.1111,
        random_state=seed,
        stratify=train_val_df["label"],
    )

    return train_df.reset_index(drop=True), val_df.reset_index(drop=True), test_df.reset_index(drop=True)


def make_tokenize_fn(tokenizer: AutoTokenizer, max_length: int):
    """
    Build a batch-level tokenization callable.

    The returned function reads the "review" entry of a batch and passes it
    to `tokenizer`, padding to `max_length` and truncating anything longer.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }

    def tokenize(batch):
        reviews = batch["review"]
        return tokenizer(reviews, **encode_options)

    return tokenize


@dataclass
class ModelRunResult:
    """Summary of one candidate model's fine-tuning run on the training subset."""

    model_name: str  # checkpoint name the run was started from
    eval_f1: float  # "eval_f1" reported by the trainer's evaluation
    eval_accuracy: float  # "eval_accuracy" reported by the trainer's evaluation
    output_dir: str  # directory the trained model and trainer state were saved to


def compute_metrics_builder() -> callable:
    """
    Return the metrics callback handed to the Trainer.

    The callback takes a (logits, labels) pair, picks the highest-scoring
    class per row, and reports binary F1 and accuracy.
    """
    def compute_metrics(eval_pred):
        scores, gold = eval_pred
        predicted = np.argmax(scores, axis=-1)
        return {
            "f1": f1_score(gold, predicted, average="binary"),
            "accuracy": accuracy_score(gold, predicted),
        }

    return compute_metrics


def build_trainer(
    model_name_or_path: str,
    num_labels: int,
    train_ds: Dataset,
    val_ds: Dataset,
    output_dir: str,
    learning_rate: float,
    epochs: int,
    per_device_train_batch_size: int,
    per_device_eval_batch_size: int,
    weight_decay: float,
    warmup_ratio: float,
    fp16: bool,
    seed: int,
    logging_steps: int = 50,
    eval_strategy: str = "epoch",
    save_strategy: str = "epoch",
    metric_for_best: str = "f1",
    max_length: int = 256,
) -> Tuple[Trainer, AutoTokenizer, AutoModelForSequenceClassification]:
    """
    Load a tokenizer and sequence-classification model and wrap them in a Trainer.

    Args:
        model_name_or_path: Checkpoint name or local path to load from.
        num_labels: Number of output classes for the classification head.
        train_ds: Already-tokenized training dataset.
        val_ds: Already-tokenized validation dataset.
        output_dir: Directory passed to TrainingArguments for checkpoints.
        learning_rate, epochs, per_device_train_batch_size,
        per_device_eval_batch_size, weight_decay, warmup_ratio, fp16, seed,
        logging_steps, eval_strategy, save_strategy: Forwarded to
            TrainingArguments unchanged.
        metric_for_best: Metric used to select the checkpoint reloaded at the
            end of training (higher is treated as better).
        max_length: Not used by this function; kept so existing callers that
            pass it keep working. Sequence length is fixed when the caller
            tokenizes the datasets.

    Returns:
        (trainer, tokenizer, model).
    """
    # Local import: only needed for the Trainer signature check below.
    import inspect

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        id2label=ID2LABEL,
        label2id=LABEL2ID,
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    compute_metrics = compute_metrics_builder()

    # Keep only the tensors the model is fed; everything else (raw text,
    # leftover pandas index columns, ...) is removed from both datasets.
    keep_columns = ("input_ids", "attention_mask", "label")
    train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in keep_columns])
    val_ds = val_ds.remove_columns([c for c in val_ds.column_names if c not in keep_columns])

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy=eval_strategy,
        save_strategy=save_strategy,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        logging_steps=logging_steps,
        load_best_model_at_end=True,
        metric_for_best_model=metric_for_best,
        greater_is_better=True,
        fp16=fp16,
        dataloader_num_workers=2,
        seed=seed,
        report_to="none",
    )

    # Newer transformers releases deprecate Trainer(tokenizer=...) in favour
    # of processing_class=...; use whichever the installed version accepts so
    # the deprecation warning goes away without breaking older installs.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    return trainer, tokenizer, model


def finetune_and_evaluate(
    model_name: str,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    subset_size: int,
    seed: int,
    base_output_dir: str,
    learning_rate: float,
    epochs: int,
    batch_size: int,
    max_length: int,
    fp16: bool,
) -> ModelRunResult:
    """
    Fine-tune one candidate model on a stratified subset and evaluate it.

    Args:
        model_name: Checkpoint name to fine-tune.
        train_df: Training frame with "review" and "label" columns.
        val_df: Validation frame with the same columns.
        subset_size: Number of training rows to sample (stratified by label).
            If falsy or not smaller than len(train_df), all rows are used.
        seed: Random state for the subset sampling and for training.
        base_output_dir: Parent directory; the run is written to
            "<base_output_dir>/subset_<model_name with '/' replaced by '_'>".
        learning_rate, epochs, batch_size, fp16: Training settings; the same
            batch size is used for training and evaluation.
        max_length: Token length reviews are padded/truncated to.

    Returns:
        A ModelRunResult with the validation F1/accuracy and the output
        directory. A metric missing from the evaluation output is reported
        as 0.0.
    """
    if subset_size and subset_size < len(train_df):
        train_df_sub, _ = train_test_split(
            train_df,
            train_size=subset_size,
            stratify=train_df["label"],
            random_state=seed,
        )
    else:
        train_df_sub = train_df

    # Make datasets
    tmp_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tokenize = make_tokenize_fn(tmp_tokenizer, max_length)
    train_ds = Dataset.from_pandas(train_df_sub)
    val_ds = Dataset.from_pandas(val_df)
    train_ds = train_ds.map(tokenize, batched=True, desc=f"Tokenizing train ({model_name})")
    val_ds = val_ds.map(tokenize, batched=True, desc=f"Tokenizing val ({model_name})")
    train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    out_dir = os.path.join(base_output_dir, f"subset_{model_name.replace('/', '_')}")
    os.makedirs(out_dir, exist_ok=True)

    trainer, tokenizer, model = build_trainer(
        model_name_or_path=model_name,
        num_labels=2,
        train_ds=train_ds,
        val_ds=val_ds,
        output_dir=out_dir,
        learning_rate=learning_rate,
        epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        warmup_ratio=0.06,
        fp16=fp16,
        seed=seed,
        max_length=max_length,
    )

    trainer.train()
    eval_metrics = trainer.evaluate()

    # Persist trainer state and the final model before releasing anything.
    trainer.save_state()
    trainer.save_model(out_dir)

    model_f1 = float(eval_metrics.get("eval_f1", 0.0))
    model_acc = float(eval_metrics.get("eval_accuracy", 0.0))

    # Free memory between models: drop every local reference to the trainer
    # AND the model (deleting only the trainer leaves the model alive), let
    # the garbage collector run, and only then hand cached blocks back.
    del trainer, model, tokenizer, tmp_tokenizer, train_ds, val_ds
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return ModelRunResult(
        model_name=model_name,
        eval_f1=model_f1,
        eval_accuracy=model_acc,
        output_dir=out_dir,
    )


def train_best_on_full(
    best_model_name: str,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    seed: int,
    base_output_dir: str,
    learning_rate: float,
    epochs: int,
    batch_size: int,
    max_length: int,
    fp16: bool,
) -> Tuple[str, AutoTokenizer]:
    """
    Fine-tune the selected checkpoint on the whole training frame.

    Tokenizes both frames, trains, prints the validation F1/accuracy, and
    saves the trainer state and model under
    "<base_output_dir>/full_<best_model_name with '/' replaced by '_'>".

    Returns:
        (output directory, tokenizer used to tokenize the data).
    """
    tokenizer = AutoTokenizer.from_pretrained(best_model_name, use_fast=True)
    encode_batch = make_tokenize_fn(tokenizer, max_length)
    tensor_columns = ["input_ids", "attention_mask", "label"]

    def to_dataset(frame, split_name):
        # pandas frame -> tokenized dataset exposing torch tensors
        dataset = Dataset.from_pandas(frame).map(
            encode_batch,
            batched=True,
            desc=f"Tokenizing full {split_name} ({best_model_name})",
        )
        dataset.set_format(type="torch", columns=tensor_columns)
        return dataset

    full_train = to_dataset(train_df, "train")
    full_val = to_dataset(val_df, "val")

    safe_name = best_model_name.replace('/', '_')
    out_dir = os.path.join(base_output_dir, f"full_{safe_name}")
    os.makedirs(out_dir, exist_ok=True)

    trainer = build_trainer(
        model_name_or_path=best_model_name,
        num_labels=2,
        train_ds=full_train,
        val_ds=full_val,
        output_dir=out_dir,
        learning_rate=learning_rate,
        epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        warmup_ratio=0.06,
        fp16=fp16,
        seed=seed,
        max_length=max_length,
    )[0]

    trainer.train()
    final_metrics = trainer.evaluate()
    val_f1 = final_metrics.get('eval_f1')
    val_acc = final_metrics.get('eval_accuracy')
    print(f"[Final best model eval on val] {best_model_name} -> F1: {val_f1:.4f}, Acc: {val_acc:.4f}")

    trainer.save_state()
    trainer.save_model(out_dir)
    return out_dir, tokenizer


def run_inference(
    model_dir: str,
    tokenizer: AutoTokenizer,
    test_df: pd.DataFrame,
    seed: int,
    num_samples: int = 10,
    max_length: int = 256,
):
    """
    Classify a random sample of test reviews and print predictions next to gold labels.

    Args:
        model_dir: Directory of the saved fine-tuned model.
        tokenizer: Tokenizer handed to the classification pipeline.
        test_df: Frame with "review" and integer "label" columns.
        seed: Random state for drawing the sample.
        num_samples: Upper bound on how many reviews to classify; capped at
            len(test_df).
        max_length: Token length inputs are truncated to. Defaults to 256;
            pass the value used for training if it differed.
    """
    # Draw the sample first so an empty test set never loads the model.
    samples = test_df.sample(n=min(num_samples, len(test_df)), random_state=seed)
    texts: List[str] = samples["review"].tolist()
    true_labels: List[int] = samples["label"].tolist()
    if not texts:
        print("\nNo test samples available for inference.")
        return

    device = 0 if torch.cuda.is_available() else -1

    clf = pipeline(
        task="text-classification",
        model=model_dir,
        tokenizer=tokenizer,
        batch_size=32,
        device=device,
        truncation=True,
        max_length=max_length,
    )

    preds = clf(texts)

    # Report the number of reviews actually classified rather than a fixed "10".
    print(f"\n=== Inference on {len(texts)} random test samples ===")
    for i, (txt, gold, pred) in enumerate(zip(texts, true_labels, preds), 1):
        gold_lbl = ID2LABEL[gold]
        pred_lbl = pred["label"]
        score = pred.get("score", None)
        # Truncate review for display
        short_txt = (txt[:220] + "...") if len(txt) > 220 else txt
        if score is not None:
            print(f"[{i:02d}] Pred: {pred_lbl:>8}  (score={score:.3f}) | Gold: {gold_lbl:>8}\n     {short_txt}\n")
        else:
            print(f"[{i:02d}] Pred: {pred_lbl:>8} | Gold: {gold_lbl:>8}\n     {short_txt}\n")


def parse_args() -> argparse.Namespace:
    """
    Parse the command-line options for the fine-tuning run.

    Unrecognized arguments are ignored rather than rejected, so the script
    also runs inside Jupyter, which passes extra arguments of its own.
    """
    parser = argparse.ArgumentParser(
        description="IMDB Sentiment Fine-tuning (5 models subset + best on full)"
    )

    # (flag, type, default) for every single-valued option, in display order.
    single_valued_options = (
        ("--data_csv", str, DEFAULT_DATA_CSV),
        ("--output_dir", str, DEFAULT_OUTPUT_DIR),
        ("--subset_size", int, 8000),
        ("--epochs_subset", int, 1),
        ("--epochs_full", int, 3),
        ("--learning_rate_subset", float, 2e-5),
        ("--learning_rate_full", float, 2e-5),
        ("--batch_size", int, 16),
        ("--max_length", int, 256),
        ("--seed", int, 42),
    )
    for flag, value_type, default_value in single_valued_options:
        parser.add_argument(flag, type=value_type, default=default_value)

    # The candidate list takes any number of model names.
    parser.add_argument("--models", type=str, nargs="*", default=CANDIDATE_MODEL_NAMES)

    known_args, _unrecognized = parser.parse_known_args()
    return known_args


def main():
    """
    Run the whole experiment end to end.

    Steps: parse options, seed the RNGs, load and split the data, fine-tune
    every candidate model on a subset, pick the one with the best validation
    F1, fine-tune it on the full training set, and print predictions for a
    few random test reviews.

    Raises:
        ValueError: If the list of candidate models is empty.
    """
    args = parse_args()

    # "--models" with no values parses to an empty list; fail with a clear
    # message up front instead of an IndexError after loading the data.
    if not args.models:
        raise ValueError("No candidate models given; pass at least one name to --models.")

    os.makedirs(args.output_dir, exist_ok=True)

    print("Config:")
    print(f"- data_csv: {args.data_csv}")
    print(f"- output_dir: {args.output_dir}")
    print(f"- models: {args.models}")
    print(f"- subset_size: {args.subset_size}")
    print(f"- epochs_subset: {args.epochs_subset}, epochs_full: {args.epochs_full}")
    print(f"- batch_size: {args.batch_size}, max_length: {args.max_length}")
    print(f"- seed: {args.seed}")

    # Reproducibility. The CPU torch seed is set unconditionally; only the
    # CUDA seeding depends on a GPU being present.
    set_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    # Load data
    train_df, val_df, test_df = load_and_prepare_df(args.data_csv, seed=args.seed)

    # Use mixed precision whenever a GPU is available
    fp16 = torch.cuda.is_available()

    # 1) Fine-tune each candidate model on the subset and evaluate
    results: List[ModelRunResult] = []
    for model_name in args.models:
        print(f"\n=== Subset fine-tuning: {model_name} ===")
        res = finetune_and_evaluate(
            model_name=model_name,
            train_df=train_df,
            val_df=val_df,
            subset_size=args.subset_size,
            seed=args.seed,
            base_output_dir=args.output_dir,
            learning_rate=args.learning_rate_subset,
            epochs=args.epochs_subset,
            batch_size=args.batch_size,
            max_length=args.max_length,
            fp16=fp16,
        )
        print(f"Result: {model_name} -> F1: {res.eval_f1:.4f}, Acc: {res.eval_accuracy:.4f}, saved at: {res.output_dir}")
        results.append(res)

    # 2) Pick best by F1 (on a tie, the model listed first wins)
    best = max(results, key=lambda r: r.eval_f1)
    print("\n=== Best model on subset ===")
    print(f"{best.model_name} with F1: {best.eval_f1:.4f}, Acc: {best.eval_accuracy:.4f}")

    # 3) Fine-tune best model on full training set
    best_model_dir, best_tokenizer = train_best_on_full(
        best_model_name=best.model_name,
        train_df=train_df,
        val_df=val_df,
        seed=args.seed,
        base_output_dir=args.output_dir,
        learning_rate=args.learning_rate_full,
        epochs=args.epochs_full,
        batch_size=args.batch_size,
        max_length=args.max_length,
        fp16=fp16,
    )

    # 4) Inference on 10 random samples from test set
    run_inference(
        model_dir=best_model_dir,
        tokenizer=best_tokenizer,
        test_df=test_df,
        seed=args.seed,
        num_samples=10,
    )

    print("\nDone.")


if __name__ == "__main__":
    main()

Config:
- data_csv: data/IMDB Dataset.csv
- output_dir: outputs
- models: ['distilbert-base-uncased', 'bert-base-uncased', 'roberta-base', 'google/electra-small-discriminator', 'albert-base-v2']
- subset_size: 8000
- epochs_subset: 1, epochs_full: 3
- batch_size: 16, max_length: 256
- seed: 42

=== Subset fine-tuning: distilbert-base-uncased ===


Tokenizing train (distilbert-base-uncased): 100%|██████████| 8000/8000 [00:01<00:00, 6096.43 examples/s]
Tokenizing val (distilbert-base-uncased): 100%|██████████| 5000/5000 [00:00<00:00, 6195.06 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid de

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2253,0.271427,0.890678,0.8914


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Result: distilbert-base-uncased -> F1: 0.8907, Acc: 0.8914, saved at: outputs/subset_distilbert-base-uncased

=== Subset fine-tuning: bert-base-uncased ===


Tokenizing train (bert-base-uncased): 100%|██████████| 8000/8000 [00:01<00:00, 5597.96 examples/s]
Tokenizing val (bert-base-uncased): 100%|██████████| 5000/5000 [00:00<00:00, 5857.68 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Ex

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2217,0.257187,0.897446,0.8988


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Result: bert-base-uncased -> F1: 0.8974, Acc: 0.8988, saved at: outputs/subset_bert-base-uncased

=== Subset fine-tuning: roberta-base ===


Tokenizing train (roberta-base): 100%|██████████| 8000/8000 [00:01<00:00, 6908.26 examples/s]
Tokenizing val (roberta-base): 100%|██████████| 5000/5000 [00:00<00:00, 6924.45 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- 

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1815,0.229602,0.922467,0.9218


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Result: roberta-base -> F1: 0.9225, Acc: 0.9218, saved at: outputs/subset_roberta-base

=== Subset fine-tuning: google/electra-small-discriminator ===


Tokenizing train (google/electra-small-discriminator): 100%|██████████| 8000/8000 [00:01<00:00, 5925.72 examples/s]
Tokenizing val (google/electra-small-discriminator): 100%|██████████| 5000/5000 [00:00<00:00, 5936.94 examples/s]
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has 

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3595,0.371767,0.867659,0.8648


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Result: google/electra-small-discriminator -> F1: 0.8677, Acc: 0.8648, saved at: outputs/subset_google_electra-small-discriminator

=== Subset fine-tuning: albert-base-v2 ===


Tokenizing train (albert-base-v2): 100%|██████████| 8000/8000 [00:01<00:00, 5129.79 examples/s]
Tokenizing val (albert-base-v2): 100%|██████████| 5000/5000 [00:00<00:00, 5128.91 examples/s]
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitl

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.216,0.250282,0.906741,0.9062


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Result: albert-base-v2 -> F1: 0.9067, Acc: 0.9062, saved at: outputs/subset_albert-base-v2

=== Best model on subset ===
roberta-base with F1: 0.9225, Acc: 0.9218


Tokenizing full train (roberta-base): 100%|██████████| 40000/40000 [00:05<00:00, 6685.34 examples/s]
Tokenizing full val (roberta-base): 100%|██████████| 5000/5000 [00:00<00:00, 6979.68 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid dead

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1646,0.219551,0.933964,0.933
2,0.1793,0.224442,0.937937,0.9378
3,0.1391,0.291123,0.938939,0.939


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[Final best model eval on val] roberta-base -> F1: 0.9389, Acc: 0.9390


Device set to use cuda:0



=== Inference on 10 random test samples ===
[01] Pred: NEGATIVE  (score=0.997) | Gold: NEGATIVE
     A plane carrying a rich scientist's daughter goes down in thick wilderness. He assembles a group to go and find her and the others, but the rescue party soon suspects that something is stalking them. Then ulterior motive...

[02] Pred: NEGATIVE  (score=0.999) | Gold: NEGATIVE
     If any movie ever made Italians look bad, this is it.<br /><br />Duke Mitchell - what an A--HOLE. Duke Mitchell, I s--t on your grave. Seeing as practically every person gunned down in this film by the cowardly Mimi is e...

[03] Pred: POSITIVE  (score=0.998) | Gold: NEGATIVE
     I desperately want to give this movie a 10...I really do. Some movies, especially horror movies are so budget that they are good. A wise-cracking ninja scarecrow who can implement corn cobs as lethal weaponry...definitel...

[04] Pred: NEGATIVE  (score=1.000) | Gold: NEGATIVE
     It seems that several of the people who have reviewe