In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import optuna
import wandb
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# SECURITY: an API key must never be stored in the notebook. Called without a
# `key`, wandb.login() takes the credential from the WANDB_API_KEY environment
# variable (or a previously stored login / interactive prompt).
# The key that used to be hard-coded here has been exposed and should be
# revoked and regenerated in the W&B account settings.
wandb.login()

ModuleNotFoundError: No module named 'wandb'

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib  # לשמירת ה-LabelEncoder (אופציונלי)

# --- Load the data ---
# The CSV location can be overridden through the TWEETS_CSV_PATH environment
# variable so the notebook is not tied to one machine; the fallback is the
# path this notebook was originally written against.
import os

DATA_CSV_PATH = os.environ.get("TWEETS_CSV_PATH", r"C:\Users\rabea\Desktop\my_eda.csv")
df = pd.read_csv(DATA_CSV_PATH, encoding='latin1')

# Fail early, before any file is written, if a column used below is absent.
required_columns = {'normalized_tweet', 'Sentiment'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"{DATA_CSV_PATH} is missing required columns: {sorted(missing_columns)}")

# Rename the text columns: keep the raw tweet as 'Original' and expose the
# normalized text under the name 'Tweet', which the rest of the notebook reads.
df = df.rename(columns={'Tweet': 'Original', 'normalized_tweet': 'Tweet'})

# Encode the string labels as integers
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Sentiment'])

# Persist the mapping for later use (optional)
joblib.dump(label_encoder, "label_encoder.pkl")

# Show the integer -> label mapping
for i, label in enumerate(label_encoder.classes_):
    print(f"{i} → {label}")

# --- Split into Train / Eval ---
# Stratified on the label so both splits keep the same class proportions.
train_df, eval_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

# Keep only the columns the model needs
train_df = train_df[['Tweet', 'label']]
eval_df = eval_df[['Tweet', 'label']]

# Save the splits
train_df.to_csv("train_data.csv", index=False)
eval_df.to_csv("eval_data.csv", index=False)



0 → Extremely Negative
1 → Extremely Positive
2 → Negative
3 → Neutral
4 → Positive


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Name of the pretrained checkpoint
model_name = "digitalepidemiologylab/covid-twitter-bert"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from the fitted LabelEncoder instead of a
# hard-coded constant, so it cannot drift from the label set in the data.
num_labels = len(label_encoder.classes_)

# Load the model with a classification head for our label set
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True  # only relevant if the head was re-sized
).to(device)

# Show the model architecture
print(model)

# Spot-check one normalized tweet
print(df['Tweet'].iloc[80])


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [4]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        dataframe: frame with a 'Tweet' text column and a 'label' column.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result supports ``in`` and ``[...]`` lookups by field name.
        max_length: length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, dataframe, tokenizer, max_length=96):
        # Missing tweets become empty strings; everything is coerced to str.
        tweet_column = dataframe['Tweet'].fillna("").astype(str)
        self.texts = tweet_column.tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of tweets held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized tweet at `idx` together with its label tensor."""
        # Whitespace-only / empty text falls back to the literal "[PAD]".
        cleaned = self.texts[idx].strip() or "[PAD]"
        target = int(self.labels[idx])

        encoded = self.tokenizer(
            cleaned,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch axis; drop it per field.
        sample = {name: encoded[name].squeeze(0) for name in ("input_ids", "attention_mask")}
        sample["labels"] = torch.tensor(target, dtype=torch.long)

        # Some tokenizers (BERT) emit token_type_ids and some (RoBERTa) do not.
        if "token_type_ids" in encoded:
            sample["token_type_ids"] = encoded["token_type_ids"].squeeze(0)

        return sample


In [5]:
def early_stop_check_acc(patience, best_acc, best_acc_epoch, current_acc, current_epoch):
    """
    Early-stopping bookkeeping driven by validation accuracy only.

    A strictly higher `current_acc` becomes the new best and never triggers a
    stop. Otherwise a stop is signalled once the number of epochs since the
    best one is strictly greater than `patience`.

    Returns: (best_acc, best_acc_epoch, early_stop_flag)
    """
    if current_acc > best_acc:
        # New best: record it; an improving epoch never stops training.
        return current_acc, current_epoch, False

    epochs_since_best = current_epoch - best_acc_epoch
    return best_acc, best_acc_epoch, bool(epochs_since_best > patience)


In [6]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
from torch import nn

# Compute per-class weights from the observed label frequencies — on the TRAIN split only
train_labels = train_df['label'].values
classes = np.unique(train_labels)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_labels
)

# Convert to a float tensor on the training device for use inside CrossEntropyLoss
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)

# Loss function with class weights
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Optional: print the result to see what was computed
print("Classes:", classes)
print("Class weights:", class_weights)


Classes: [0 1 2 3 4]
Class weights: [1.49423428 1.23755402 0.8287368  1.08166195 0.71861982]


In [7]:
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over `loader`: a training pass if `optimizer` is given, else an evaluation pass.

    Returns:
        (mean_loss, accuracy, labels, preds). The loss is averaged per sample.
        `labels` and `preds` are only collected on evaluation passes and are
        empty lists on training passes.

    Raises:
        ValueError: if the loader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is the same as eval()

    loss_sum = 0.0
    n_samples = 0
    n_correct = 0
    seen_labels, seen_preds = [], []

    # Gradients are only needed while training.
    with torch.set_grad_enabled(is_training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if is_training:
                optimizer.zero_grad()

            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            preds = logits.argmax(dim=1)
            batch_count = input_ids.size(0)
            # Weight the batch loss by its size so the epoch mean is per-sample.
            loss_sum += loss.item() * batch_count
            n_samples += batch_count
            n_correct += (preds == labels).sum().item()

            if not is_training:
                seen_labels.extend(labels.cpu().numpy())
                seen_preds.extend(preds.cpu().numpy())

    if n_samples == 0:
        raise ValueError("DataLoader produced no samples; cannot compute epoch metrics.")

    return loss_sum / n_samples, n_correct / n_samples, seen_labels, seen_preds


def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial):
    """Train `model` for up to `epochs` epochs with early stopping on validation accuracy.

    Each epoch runs one training pass and one validation pass, logs the metrics
    to the active W&B run, and snapshots the weights whenever validation
    accuracy improves. After the loop the best snapshot (if any) is loaded back
    into `model` and saved to ``best_model_trial_{trial.number}.pt``.

    Args:
        model: model whose forward output exposes `.logits`.
        train_loader, val_loader: loaders yielding dicts with 'input_ids',
            'attention_mask' and 'labels'.
        optimizer: optimizer stepping the trainable parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        epochs: maximum number of epochs.
        patience: forwarded to `early_stop_check_acc`.
        trial: object with a `.number` attribute, used only for the file name.

    Returns:
        The best validation accuracy observed (0.0 if it never exceeded 0).

    Raises:
        ValueError: if either loader yields no samples.
    """
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    best_model_state = None

    for epoch in range(1, epochs + 1):
        # ===== Training =====
        train_loss, train_accuracy, _, _ = _run_epoch(model, train_loader, criterion, optimizer)

        # ===== Validation =====
        val_loss, val_accuracy, all_val_labels, all_val_preds = _run_epoch(model, val_loader, criterion)

        # Extra metrics (logged only; not used for stopping)
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_recall    = recall_score(all_val_labels, all_val_preds,   average='weighted', zero_division=0)
        val_f1        = f1_score(all_val_labels, all_val_preds,       average='weighted', zero_division=0)

        # === Early stopping on accuracy ===
        prev_best = best_val_accuracy
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check_acc(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )
        if best_val_accuracy > prev_best:
            # Keep a CPU copy of the best weights seen so far
            best_model_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        })

        if early_stop_flag:
            print(f"Early stopping at epoch {epoch} (best Accuracy={best_val_accuracy:.4f} @ epoch {best_val_accuracy_epoch})")
            break

    # Restore the best weights and persist them
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        torch.save(model.state_dict(), f"best_model_trial_{trial.number}.pt")

    return best_val_accuracy


In [9]:
# Objective Function for Optuna (maximize Validation Accuracy)
def objective(trial):
    """Optuna objective: fine-tune CT-BERT with sampled hyperparameters.

    Samples learning rate, weight decay, early-stopping patience, batch size
    and the number of final encoder blocks to unfreeze. It then trains on
    `train_df`, evaluates on `eval_df`, and tracks the run in W&B.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The best validation accuracy reached during training.
    """
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np

    checkpoint = "digitalepidemiologylab/covid-twitter-bert"

    # === Hyperparameter suggestions ===
    learning_rate = trial.suggest_float("learning_rate", 5e-5, 5e-4, log=True)
    weight_decay  = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    patience      = trial.suggest_int("patience", 2, 4)
    batch_size    = trial.suggest_categorical("batch_size", [32, 64, 128])
    num_layers    = trial.suggest_int("num_layers", 2, 4)  # number of encoder blocks to unfreeze

    # === Tokenizer and Dataset ===
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    train_dataset = TweetDataset(train_df, tokenizer)  # max_length: class default
    val_dataset   = TweetDataset(eval_df,  tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)

    # === Define loss with class weights (computed from TRAIN ONLY) ===
    train_labels = train_df['label'].values
    classes = np.unique(train_labels)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_labels)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    # === Load CT-BERT Model ===
    # The head size is taken from the same class list as the loss weights so
    # the two can never disagree (the weight vector must have one entry per logit).
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=len(classes)
    ).to(device)

    # === Freeze all layers first ===
    for p in model.bert.parameters():
        p.requires_grad = False

    # === Unfreeze the last `num_layers` encoder blocks ===
    for p in model.bert.encoder.layer[-num_layers:].parameters():
        p.requires_grad = True

    # === Unfreeze the classification head ===
    for p in model.classifier.parameters():
        p.requires_grad = True

    # === Optimizer (Adam) — only trainable params ===
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

    # === Initialize W&B for tracking ===
    wandb.init(
        project="ctbert-project-2nd-run",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers": num_layers,
            "architecture": "CT-BERT",
            "dataset": "covid-tweets",
            "early_stop_metric": "val_accuracy"
        },
        name=f"trial_{trial.number}",
        reinit=True
    )

    # === Train and evaluate (returns best Validation Accuracy) ===
    try:
        best_val_accuracy = train_model_with_hyperparams(
            model, train_loader, val_loader, optimizer, criterion,
            epochs=10, patience=patience, trial=trial
        )
    finally:
        # Always close the W&B run, even if training raised, so a failed
        # trial does not leave a run open.
        wandb.finish()

    return best_val_accuracy


In [10]:
# Create the Optuna study - maximize Validation Accuracy
study = optuna.create_study(
    study_name="CTBERT_Accuracy_Study",
    direction="maximize"
)

# Run 5 trials
study.optimize(objective, n_trials=5)

# Print the best result
print(f"\nBest Validation Accuracy: {study.best_value:.4f}")
print("Best hyperparameters:", study.best_params)

# Save the study results (optional)
joblib.dump(study, "optuna_ctbert_accuracy_study.pkl")


[I 2025-08-14 10:47:29,930] A new study created in memory with name: CTBERT_Accuracy_Study
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▇▇▇███
Train Loss,█▆▄▃▂▂▂▁▁▁
Validation Accuracy,▁▅▅▇▇██▇▇█
Validation F1,▁▆▅▇▇██▇▇█
Validation Loss,▃▁▂▁▃▄▄▇▇█
Validation Precision,▁▅▆▇▇██▇▇█
Validation Recall,▁▅▅▇▇██▇▇█

0,1
Epoch,10.0
Train Accuracy,0.95673
Train Loss,0.11071
Validation Accuracy,0.72598
Validation F1,0.72704
Validation Loss,1.17402
Validation Precision,0.7301
Validation Recall,0.72598


[I 2025-08-14 11:46:50,815] Trial 0 finished with value: 0.7313615252994378 and parameters: {'learning_rate': 6.340401056296099e-05, 'weight_decay': 6.459585710110239e-06, 'patience': 4, 'batch_size': 32, 'num_layers': 4}. Best is trial 0 with value: 0.7313615252994378.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Early stopping at epoch 8 (best Accuracy=0.7045 @ epoch 5)


0,1
Epoch,▁▂▃▄▅▆▇█
Train Accuracy,▁▃▅▆▇▇██
Train Loss,█▆▄▃▃▂▁▁
Validation Accuracy,▁▃▆▇████
Validation F1,▁▃▆▇████
Validation Loss,▅▂▁▂▃▄██
Validation Precision,▁▄▆▇████
Validation Recall,▁▃▆▇████

0,1
Epoch,8.0
Train Accuracy,0.9134
Train Loss,0.21083
Validation Accuracy,0.69567
Validation F1,0.69516
Validation Loss,1.00992
Validation Precision,0.69755
Validation Recall,0.69567


[I 2025-08-14 12:28:41,332] Trial 1 finished with value: 0.7044732339281349 and parameters: {'learning_rate': 5.4366470029815594e-05, 'weight_decay': 5.243390698983217e-06, 'patience': 2, 'batch_size': 32, 'num_layers': 2}. Best is trial 0 with value: 0.7313615252994378.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▅▄▄▃▂▂▂▁▁
Validation Accuracy,▁▄▄▆▇▇▇███
Validation F1,▁▅▄▆▇▇▇███
Validation Loss,█▄▄▂▁▁▃▁▄▄
Validation Precision,▁▄▄▆▇▇▇███
Validation Recall,▁▄▄▆▇▇▇███

0,1
Epoch,10.0
Train Accuracy,0.8504
Train Loss,0.36713
Validation Accuracy,0.71816
Validation F1,0.71582
Validation Loss,0.8017
Validation Precision,0.72426
Validation Recall,0.71816


[I 2025-08-14 13:22:30,742] Trial 2 finished with value: 0.72647274505011 and parameters: {'learning_rate': 0.00013253403203122914, 'weight_decay': 0.0010926603377014957, 'patience': 2, 'batch_size': 128, 'num_layers': 3}. Best is trial 0 with value: 0.7313615252994378.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▅▄▄▃▃▂▂▁▁
Validation Accuracy,▁▄▄▅▇▆▅▇█▆
Validation F1,▁▃▄▅▇▇▆▇█▆
Validation Loss,█▅▃▃▁▁▁▁▁▃
Validation Precision,▁▃▄▆▇▇▆██▇
Validation Recall,▁▄▄▅▇▆▅▇█▆

0,1
Epoch,10.0
Train Accuracy,0.82285
Train Loss,0.43038
Validation Accuracy,0.70178
Validation F1,0.70061
Validation Loss,0.76092
Validation Precision,0.71044
Validation Recall,0.70178


[I 2025-08-14 14:17:10,393] Trial 3 finished with value: 0.7362503055487656 and parameters: {'learning_rate': 9.217205392464745e-05, 'weight_decay': 0.001433326821150192, 'patience': 3, 'batch_size': 64, 'num_layers': 3}. Best is trial 3 with value: 0.7362503055487656.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▅▆▇▇▇███
Train Loss,█▆▄▃▃▂▂▁▁▁
Validation Accuracy,▁▄▅▇▇██▇▇█
Validation F1,▁▅▅▇▇██▇▇█
Validation Loss,▃▁▁▁▂▄▅▅▅█
Validation Precision,▁▅▆▇▇▇█▇▇█
Validation Recall,▁▄▅▇▇██▇▇█

0,1
Epoch,10.0
Train Accuracy,0.94158
Train Loss,0.14578
Validation Accuracy,0.72061
Validation F1,0.72171
Validation Loss,1.05208
Validation Precision,0.72708
Validation Recall,0.72061


[I 2025-08-14 15:13:03,999] Trial 4 finished with value: 0.7272060620875092 and parameters: {'learning_rate': 6.425278352545687e-05, 'weight_decay': 0.00014081001679032353, 'patience': 4, 'batch_size': 32, 'num_layers': 3}. Best is trial 3 with value: 0.7362503055487656.



Best Validation Accuracy: 0.7363
Best hyperparameters: {'learning_rate': 9.217205392464745e-05, 'weight_decay': 0.001433326821150192, 'patience': 3, 'batch_size': 64, 'num_layers': 3}


['optuna_ctbert_accuracy_study.pkl']