In [None]:
# 1. INSTALL & IMPORTS

print("STEP 1: INSTALLING / IMPORTING PACKAGES")

import os
import subprocess
import sys

# Enable HuggingFace tokenizers parallelism for speed
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Install core packages (comment out if already installed)
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "transformers", "sentencepiece", "scikit-learn", "emoji==0.6.0"
])

import warnings
warnings.filterwarnings("ignore")

import re
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup
)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Mixed precision
from torch.cuda.amp import autocast, GradScaler


# 2. CONFIG & SEED
CONFIG = {
    # Models
    "model_name": "vinai/bertweet-large",   # Twitter-optimized model
    "fallback_model": "roberta-large",

    # Data
    "data_path": "training.1600000.processed.noemoticon.csv",
    "sample_size": None, #used up whole available data for training as we need variety of samples
    "max_len": 128,

    # Training
    "batch_size": 32,
    "gradient_accumulation_steps": 2,
    "learning_rate": 1e-5,
    "epochs": 3,
    "warmup_ratio": 0.1,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,

    # Dataloader
    "num_workers": 8,

    # Misc
    "seed": 42,
    "use_fp16": True
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # For speed on 5090, allow fast kernels
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

set_seed(CONFIG["seed"])

# 3. CLEANING TEXT

NEGATION_WORDS = [r"\bnot\b", r"\bnever\b", r"\bno\b"]

def clean_text(text):
    if not isinstance(text, str):
        return ""

    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", " ", text)

    # Normalize mentions
    text = re.sub(r"@\w+", "@user", text)

    # Remove '#' but keep the word
    text = re.sub(r"#(\w+)", r"\1", text)

    # Normalize repeated characters (3+ -> 2)
    # soooo -> soo, niiiice -> niice
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)

    # Highlight negations
    text = re.sub(r"\bnot\b", "NOT_", text, flags=re.IGNORECASE)
    text = re.sub(r"\bnever\b", "NEVER_", text, flags=re.IGNORECASE)
    text = re.sub(r"\bno\b", "NO_", text, flags=re.IGNORECASE)

    # Collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# 4. LOAD & PREPARE DATA
print("STEP 2: LOADING SENTIMENT140 DATASET")
cols = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(CONFIG["data_path"], encoding="latin-1", names=cols, header=None)
print(f"Total tweets in raw file: {len(df):,}")
# Map labels: {0, 4} -> {0: negative, 1: positive}
df["target"] = df["target"].map({0: 0, 4: 1})

# Drop NAs in text just in case
df = df.dropna(subset=["text"]).reset_index(drop=True)

print("Applying light cleaning...")
df["text_clean"] = df["text"].apply(clean_text)
df = df[df["text_clean"].str.len() > 0].reset_index(drop=True)

print(f"After cleaning: {len(df):,} tweets")

# Sample a subset (dataset is roughly balanced already)
if CONFIG["sample_size"] is not None and CONFIG["sample_size"] < len(df):
    df_sampled = df.sample(
        n=CONFIG["sample_size"],
        random_state=CONFIG["seed"]
    ).reset_index(drop=True)
else:
    df_sampled = df.copy()

print(f"\nUsing {len(df_sampled):,} tweets for training/validation/testing")

# Train / Val / Test split: 90 / 5 / 5  (4k val, 4k test)
train_end = len(df_sampled) - 8000
val_end = train_end + 4000
# last 4k is test

train_df = df_sampled.iloc[:train_end].reset_index(drop=True)
val_df   = df_sampled.iloc[train_end:val_end].reset_index(drop=True)
test_df  = df_sampled.iloc[val_end:].reset_index(drop=True)

print("\nDataset splits BEFORE augmentation:")
print(f"  Train: {len(train_df):,}")
print(f"  Val:   {len(val_df):,}")
print(f"  Test:  {len(test_df):,}")


# 5. TARGETED DATA AUGMENTATION

POSITIVE_WORDS = [
    "good", "great", "awesome", "amazing", "love",
    "like", "happy", "fantastic", "excellent", "cool",
    "nice", "fun", "enjoy"
]

NEGATIVE_WORDS = [
    "bad", "terrible", "awful", "hate", "sad",
    "horrible", "worst", "sucks", "ugly", "annoying"
]

def augment_negation_examples(df, max_pos_aug=8000, max_neg_aug=8000):
    """
    Creates synthetic negation-based contrastive examples:
      - For positive tweets with positive words -> add a "NOT_" version labeled negative
      - For negative tweets with negative words -> add a "NOT_" version labeled positive
    Limits to ~max_pos_aug + max_neg_aug synthetic samples.
    """
    aug_rows = []

    # Positive -> create negative via NOT_
    pos_df = df[df["target"] == 1].copy()
    pos_candidates = pos_df[pos_df["text_clean"].str.contains("|".join(POSITIVE_WORDS), case=False, na=False)]
    pos_candidates = pos_candidates.sample(
        n=min(max_pos_aug, len(pos_candidates)),
        random_state=CONFIG["seed"]
    )

    for _, row in pos_candidates.iterrows():
        txt = row["text_clean"]
        # replace first positive word with "NOT_..."
        for w in POSITIVE_WORDS:
            pattern = re.compile(rf"\b{re.escape(w)}\b", flags=re.IGNORECASE)
            if pattern.search(txt):
                new_txt = pattern.sub("NOT_" + w, txt, count=1)
                new_txt = clean_text(new_txt)
                if len(new_txt) > 0:
                    aug_rows.append({"text_clean": new_txt, "target": 0})
                break

    # Negative -> create positive via NOT_
    neg_df = df[df["target"] == 0].copy()
    neg_candidates = neg_df[neg_df["text_clean"].str.contains("|".join(NEGATIVE_WORDS), case=False, na=False)]
    neg_candidates = neg_candidates.sample(
        n=min(max_neg_aug, len(neg_candidates)),
        random_state=CONFIG["seed"]
    )

    for _, row in neg_candidates.iterrows():
        txt = row["text_clean"]
        for w in NEGATIVE_WORDS:
            pattern = re.compile(rf"\b{re.escape(w)}\b", flags=re.IGNORECASE)
            if pattern.search(txt):
                new_txt = pattern.sub("NOT_" + w, txt, count=1)
                new_txt = clean_text(new_txt)
                if len(new_txt) > 0:
                    aug_rows.append({"text_clean": new_txt, "target": 1})
                break

    aug_df = pd.DataFrame(aug_rows)
    if len(aug_df) > 0:
        print(f"\n[Augmentation] Negation-based synthetic examples added: {len(aug_df):,}")
        df_out = pd.concat(
            [df[["text_clean", "target"]], aug_df],
            ignore_index=True
        ).sample(frac=1.0, random_state=CONFIG["seed"]).reset_index(drop=True)
    else:
        df_out = df[["text_clean", "target"]].copy()

    return df_out

def add_sarcasm_examples():
    """
    Adds a small set of synthetic sarcastic negative tweets,
    addressing common error pattern: sarcastic but negative sentiment.
    """
    texts = [
        "Yeah right, exactly what I needed today",
        "Great, just great... NOT_ happy at all",
        "Love when everything goes wrong, totally awesome",
        "Perfect timing, as always... could not be more thrilled",
        "Best day ever... NO_ joke",
    ]
    rows = []
    for t in texts:
        rows.append({
            "text_clean": clean_text(t),
            "target": 0  # negative
        })
    return pd.DataFrame(rows)

print("\nApplying targeted augmentation on TRAIN set only...")

# Start from original train_df columns
train_aug = train_df[["text_clean", "target"]].copy()

# 1) Negation-based augmentation
train_aug = augment_negation_examples(train_aug, max_pos_aug=8000, max_neg_aug=8000)

# 2) Sarcasm augmentation
sarcasm_df = add_sarcasm_examples()
train_aug = pd.concat([train_aug, sarcasm_df], ignore_index=True)
train_aug = train_aug.sample(frac=1.0, random_state=CONFIG["seed"]).reset_index(drop=True)

print(f"Final TRAIN size after augmentation: {len(train_aug):,}")

# For val/test we use original split, no augmentation
val_aug = val_df[["text_clean", "target"]].copy()
test_aug = test_df[["text_clean", "target"]].copy()

print("\nDataset splits AFTER augmentation:")
print(f"  Train: {len(train_aug):,}")
print(f"  Val:   {len(val_aug):,}")
print(f"  Test:  {len(test_aug):,}")


# 6. DATASET CLASS

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text  = str(self.texts[idx])
        label = int(self.labels[idx])

        enc = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# 7. LOAD TOKENIZER & MODEL (BERTweet)
print(f"STEP 3: LOADING MODEL: {CONFIG['model_name']}")
try:
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"], use_fast=False)
    model = AutoModelForSequenceClassification.from_pretrained(
        CONFIG["model_name"],
        num_labels=2
    ).to(device)
    print(f"✓ Loaded model: {CONFIG['model_name']}")
except Exception as e:
    print(f"Error loading {CONFIG['model_name']}: {e}")
    print(f"Falling back to {CONFIG['fallback_model']}...")
    tokenizer = AutoTokenizer.from_pretrained(CONFIG["fallback_model"])
    model = AutoModelForSequenceClassification.from_pretrained(
        CONFIG["fallback_model"],
        num_labels=2
    ).to(device)
    CONFIG["model_name"] = CONFIG["fallback_model"]
    print(f"✓ Loaded fallback model: {CONFIG['model_name']}")

total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params:,}")

# Mixed precision scaler
scaler = GradScaler(enabled=CONFIG["use_fp16"])

# 8. DATALOADERS

print("\nCreating datasets & dataloaders...")
train_dataset = SentimentDataset(
    train_aug["text_clean"].values,
    train_aug["target"].values,
    tokenizer,
    max_len=CONFIG["max_len"]
)
val_dataset = SentimentDataset(
    val_aug["text_clean"].values,
    val_aug["target"].values,
    tokenizer,
    max_len=CONFIG["max_len"]
)
test_dataset = SentimentDataset(
    test_aug["text_clean"].values,
    test_aug["target"].values,
    tokenizer,
    max_len=CONFIG["max_len"]
)

train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=True,
    num_workers=CONFIG["num_workers"],
    pin_memory=True,
    persistent_workers=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=False,
    num_workers=CONFIG["num_workers"],
    pin_memory=True,
    persistent_workers=True
)
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG["batch_size"],
    shuffle=False,
    num_workers=CONFIG["num_workers"],
    pin_memory=True,
    persistent_workers=True
)

# 9. OPTIMIZER & SCHEDULER
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"]
)

num_training_steps = (
    len(train_loader)
    * CONFIG["epochs"]
    // CONFIG["gradient_accumulation_steps"]
)
num_warmup_steps = int(CONFIG["warmup_ratio"] * num_training_steps)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

print("\nTraining configuration:")
print(f"  Steps:  {num_training_steps:,}")
print(f"  Warmup: {num_warmup_steps:,}")
print(f"  Epochs: {CONFIG['epochs']}")
print(f"  LR:     {CONFIG['learning_rate']}")
print(f"  Batch:  {CONFIG['batch_size']} (x{CONFIG['gradient_accumulation_steps']} accum)")


# 10. TRAINING & EVAL FUNCTIONS

def train_one_epoch(
    model,
    dataloader,
    optimizer,
    scheduler,
    device,
    scaler,
    accumulation_steps=1,
    use_fp16=True
):
    model.train()
    total_loss = 0.0
    preds_all = []
    labels_all = []

    optimizer.zero_grad(set_to_none=True)
    progress = tqdm(dataloader, desc="Training", leave=False)

    for step, batch in enumerate(progress):
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        labels = batch["labels"].to(device, non_blocking=True)

        with autocast(enabled=use_fp16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model.parameters(), CONFIG["max_grad_norm"])
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        total_loss += loss.item() * accumulation_steps

        with torch.no_grad():
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

        progress.set_postfix({
            "loss": f"{loss.item() * accumulation_steps:.4f}",
            "lr": f"{scheduler.get_last_lr()[0]:.2e}"
        })

    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(labels_all, preds_all, average="binary")
    acc = accuracy_score(labels_all, preds_all)
    return avg_loss, f1, acc

def evaluate(model, dataloader, device, use_fp16=True):
    model.eval()
    total_loss = 0.0
    preds_all = []
    labels_all = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["labels"].to(device, non_blocking=True)

            with autocast(enabled=use_fp16):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss

            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            preds_all.extend(preds.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    acc = accuracy_score(labels_all, preds_all)
    prec = precision_score(labels_all, preds_all, average="binary")
    rec = recall_score(labels_all, preds_all, average="binary")
    f1 = f1_score(labels_all, preds_all, average="binary")

    return {
        "loss": avg_loss,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    }

# 11. TRAINING LOOP

print("STEP 4: TRAINING")

best_val_f1 = 0.0
history = {"train_loss": [], "train_f1": [], "val_f1": []}
best_model_path = "best_model_s140_bertweet.pt"

for epoch in range(CONFIG["epochs"]):

    print(f"Epoch {epoch + 1}/{CONFIG['epochs']}")
    train_loss, train_f1, train_acc = train_one_epoch(
        model,
        train_loader,
        optimizer,
        scheduler,
        device,
        scaler,
        accumulation_steps=CONFIG["gradient_accumulation_steps"],
        use_fp16=CONFIG["use_fp16"]
    )

    print(f" Train: Loss={train_loss:.4f}, F1={train_f1:.4f} ({train_f1*100:.2f}%), "
          f"Acc={train_acc:.4f}")

    history["train_loss"].append(train_loss)
    history["train_f1"].append(train_f1)

    val_metrics = evaluate(
        model,
        val_loader,
        device,
        use_fp16=CONFIG["use_fp16"]
    )
    val_f1 = val_metrics["f1"]
    history["val_f1"].append(val_f1)

    print(f" Val:   Loss={val_metrics['loss']:.4f}, "
          f"F1={val_f1:.4f} ({val_f1*100:.2f}%), "
          f"Acc={val_metrics['accuracy']:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        print(f"New best model saved (Val F1={best_val_f1:.4f})")


# 12. FINAL TEST EVALUATION

print("FINAL TEST EVALUATION")

# Load best model
model.load_state_dict(torch.load(best_model_path, map_location=device))

test_metrics = evaluate(
    model,
    test_loader,
    device,
    use_fp16=CONFIG["use_fp16"]
)

print("\n FINAL TEST RESULTS")
print("-" * 70)
print(f" Accuracy:  {test_metrics['accuracy']:.4f} ({test_metrics['accuracy']*100:.2f}%)")
print(f" Precision: {test_metrics['precision']:.4f} ({test_metrics['precision']*100:.2f}%)")
print(f" Recall:    {test_metrics['recall']:.4f} ({test_metrics['recall']*100:.2f}%)")
print(f" F1 Score:  {test_metrics['f1']:.4f} ({test_metrics['f1']*100:.2f}%)")

f1_pct = test_metrics["f1"] * 100
if f1_pct >= 96:
    bonus = 6
    status = "MAXIMUM F1 score ACHIEVED!"
elif f1_pct >= 93:
    bonus = 4
    status = "Excellent"
elif f1_pct >= 90:
    bonus = 2
    status = "Good job"
else:
    bonus = 0
    status = "Below target"


print(f" {status}")
print(f"\nBest model saved to: {best_model_path}")
print(f"Model used: {CONFIG['model_name']}")

STEP 1: INSTALLING / IMPORTING PACKAGES


[33m  DEPRECATION: Building 'emoji' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'emoji'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

Using device: cuda
STEP 2: LOADING SENTIMENT140 DATASET
Total tweets in raw file: 1,600,000
Applying light cleaning...
After cleaning: 1,599,978 tweets

Using 1,599,978 tweets for training/validation/testing

Dataset splits BEFORE augmentation:
  Train: 1,591,978
  Val:   4,000
  Test:  4,000

Applying targeted augmentation on TRAIN set only...

[Augmentation] Negation-based synthetic examples added: 14,284
Final TRAIN size after augmentation: 1,606,267

Dataset splits AFTER augmentation:
  Train: 1,606,267
  Val:   4,000
  Test:  4,000
STEP 3: LOADING MODEL: vinai/bertweet-large


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Loaded model: vinai/bertweet-large
Model parameters: 355,361,794

Creating datasets & dataloaders...

Training configuration:
  Steps:  75,294
  Warmup: 7,529
  Epochs: 3
  LR:     1e-05
  Batch:  32 (x2 accum)
STEP 4: TRAINING
Epoch 1/3




 Train: Loss=0.2891, F1=0.8717 (87.17%), Acc=0.8738




 Val:   Loss=0.2718, F1=0.9306 (93.06%), Acc=0.8702
New best model saved (Val F1=0.9306)
Epoch 2/3




 Train: Loss=0.2252, F1=0.9075 (90.75%), Acc=0.9084




 Val:   Loss=0.2746, F1=0.9419 (94.19%), Acc=0.8902
New best model saved (Val F1=0.9419)
Epoch 3/3




 Train: Loss=0.1787, F1=0.9290 (92.90%), Acc=0.9297




 Val:   Loss=0.3132, F1=0.9369 (93.69%), Acc=0.8812
FINAL TEST EVALUATION


                                                             


 FINAL TEST RESULTS
----------------------------------------------------------------------
 Accuracy:  0.8902 (89.03%)
 Precision: 1.0000 (100.00%)
 Recall:    0.8902 (89.03%)
 F1 Score:  0.9419 (94.19%)
 Excellent!

Best model saved to: best_model_s140_bertweet.pt
Model used: vinai/bertweet-large




In [None]:
# 12. THRESHOLD OPTIMIZATION + FINAL TEST EVALUATION

print("THRESHOLD OPTIMIZATION ON VALIDATION SET")

model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

val_probs = []
val_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        with autocast(enabled=CONFIG["use_fp16"]):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()

        val_probs.extend(probs)
        val_labels.extend(labels)

val_probs = np.array(val_probs)
val_labels = np.array(val_labels)

best_f1 = 0
best_thresh = 0.5

print("\nSearching for best threshold...")
for t in np.arange(0.10, 0.91, 0.01):
    preds = (val_probs >= t).astype(int)
    f1 = f1_score(val_labels, preds)

    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"\nBest Validation Threshold = {best_thresh:.2f}")
print(f"Best Validation F1 = {best_f1:.4f} ({best_f1*100:.2f}%)")

# 13. FINAL TEST EVALUATION USING OPTIMAL THRESHOLD

print("FINAL TEST EVALUATION (THRESHOLD)")

test_probs = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].cpu().numpy()

        with autocast(enabled=CONFIG["use_fp16"]):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()
        test_probs.extend(probs)
        test_labels.extend(labels)

test_probs = np.array(test_probs)
test_labels = np.array(test_labels)

test_preds = (test_probs >= best_thresh).astype(int)

test_accuracy = accuracy_score(test_labels, test_preds)
test_precision = precision_score(test_labels, test_preds)
test_recall = recall_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds)

print("\n FINAL TEST RESULTS (WITH OPTIMIZED THRESHOLD)")
print("-" * 70)
print(f" Accuracy:  {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f" Precision: {test_precision:.4f} ({test_precision*100:.2f}%)")
print(f" Recall:    {test_recall:.4f} ({test_recall*100:.2f}%)")
print(f" F1 Score:  {test_f1:.4f} ({test_f1*100:.2f}%)")

print("\nOptimal threshold applied:", best_thresh)


THRESHOLD OPTIMIZATION ON VALIDATION SET

Searching for best threshold...

Best Validation Threshold = 0.10
Best Validation F1 = 0.9848 (98.48%)
FINAL TEST EVALUATION (THRESHOLD)

 FINAL TEST RESULTS (WITH OPTIMIZED THRESHOLD)
----------------------------------------------------------------------
 Accuracy:  0.9685 (96.85%)
 Precision: 1.0000 (100.00%)
 Recall:    0.9685 (96.85%)
 F1 Score:  0.9840 (98.40%)

Optimal threshold applied: 0.1


In [None]:
print(" FINAL F1 SCORE With OPTIMIZED CODE")
print(f" Final F1:  {test_f1:.4f} ({test_f1*100:.2f}%)")
print(f" Best threshold used: {best_thresh:.2f}")

 FINAL F1 SCORE With OPTIMIZED CODE
 Final F1:  0.9840 (98.40%)
 Best threshold used: 0.10
