In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import optuna
import wandb
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

wandb.login(key="d12da696b882ebdf6b786d182d46febc1a77dcdb")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdiab55[0m ([33mdiab55-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib  # לשמירת ה-LabelEncoder (אופציונלי)

# --- טעינת הדאטה ---
df = pd.read_csv("/kaggle/input/123444/processed_train.csv")

# שינוי שם עמודת הטקסט
df = df.rename(columns={'fully_clean_text': 'Tweet'})

# קידוד התוויות ממחרוזות למספרים
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Sentiment'])

# שמירת המיפוי לשימוש עתידי (אופציונלי)
joblib.dump(label_encoder, "label_encoder.pkl")

# הצגת המיפוי
for i, label in enumerate(label_encoder.classes_):
    print(f"{i} → {label}")

# --- חלוקה ל-Train / Eval / Test ---
train_df, eval_df = train_test_split(
    df[['Tweet', 'label']],          # keep only what the model needs
    test_size=0.2,                   # 80/20 split; change if you like
    random_state=42,
    stratify=df['label']             # keep class balance
)


# שמירה של רק העמודות הדרושות למודל
train_df = train_df[['Tweet', 'label']]
eval_df = eval_df[['Tweet', 'label']]

# שמירת קבצים
train_df.to_csv("train_data.csv", index=False)
eval_df.to_csv("eval_data.csv", index=False)



0 → Extremely Negative
1 → Extremely Positive
2 → Negative
3 → Neutral
4 → Positive


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# שם המודל
model_name = "digitalepidemiologylab/covid-twitter-bert"

# טעינת הטוקנייזר
tokenizer = AutoTokenizer.from_pretrained(model_name)

# טעינת המודל עם מספר התוויות שלך
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,  # לשנות בהתאם לסט
    ignore_mismatched_sizes=True  # רק אם הראש הותאם מחדש
).to(device)

# הצגת מבנה המודל
print(model)


config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

2025-08-11 13:26:22.754230: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754918782.930123      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754918782.984850      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [4]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=160):
        self.texts = dataframe['Tweet'].fillna("").astype(str).tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx].strip()
        if not text:
            text = "[PAD]"  # גיבוי לטקסט ריק

        label = int(self.labels[idx])

        enc = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

        # חלק מהמודלים (BERT) מחזירים token_type_ids וחלק לא (RoBERTa)
        if "token_type_ids" in enc:
            item["token_type_ids"] = enc["token_type_ids"].squeeze(0)

        return item


In [5]:
def early_stop_check_acc(patience, best_acc, best_acc_epoch, current_acc, current_epoch):
    """
    עצירה מוקדמת לפי Val Accuracy בלבד.
    מחזיר: best_acc, best_acc_epoch, early_stop_flag
    """
    early_stop_flag = False
    if current_acc > best_acc:
        best_acc = current_acc
        best_acc_epoch = current_epoch
    elif current_epoch - best_acc_epoch > patience:
        early_stop_flag = True
    return best_acc, best_acc_epoch, early_stop_flag


In [6]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
from torch import nn

# חישוב משקלי תוויות לפי הופעה בפועל — על ה-TRAIN בלבד
train_labels = train_df['label'].values
classes = np.unique(train_labels)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_labels
)

# הפיכה לטנסור לשימוש בתוך CrossEntropyLoss
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)

# פונקציית הפסד עם משקלים
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

# אופציונלי: להדפיס כדי לדעת מה קיבלת
print("Classes:", classes)
print("Class weights:", class_weights)


Classes: [0 1 2 3 4]
Class weights: [1.49848801 1.2401726  0.82850966 1.07523382 0.71978462]


In [7]:
def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial):
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None

    for epoch in range(1, epochs + 1):
        # ===== Training =====
        model.train()
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        # ===== Validation =====
        model.eval()
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0
        all_val_labels, all_val_preds = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (logits.argmax(dim=1) == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(logits.argmax(dim=1).cpu().numpy())

        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples

        # מטריקות נוספות (לא על עצירה)
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)
        val_recall    = recall_score(all_val_labels, all_val_preds,   average='weighted', zero_division=0)
        val_f1        = f1_score(all_val_labels, all_val_preds,       average='weighted', zero_division=0)

        # === Early Stopping לפי Accuracy ===
        prev_best = best_val_accuracy
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check_acc(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )
        if best_val_accuracy > prev_best:
            # שומרים את מצב המודל הטוב ביותר עד כה
            best_model_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

        wandb.log({
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        })

        if early_stop_flag:
            print(f"Early stopping at epoch {epoch} (best Accuracy={best_val_accuracy:.4f} @ epoch {best_val_accuracy_epoch})")
            break

    # טעינת המודל הטוב ביותר ושמירתו
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        torch.save(model.state_dict(), f"best_model_trial_{trial.number}.pt")

    return best_val_accuracy


In [8]:
# Objective Function for Optuna (maximize Validation Accuracy)
def objective(trial):
    # === Hyperparameter suggestions ===
    learning_rate = trial.suggest_float("learning_rate", 5e-5, 5e-4, log=True)
    weight_decay  = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    patience      = trial.suggest_int("patience", 2, 4)
    batch_size    = trial.suggest_categorical("batch_size", [32, 64, 128])
    num_layers    = trial.suggest_int("num_layers", 2, 4)  # מספר שכבות להפשיר

    # === Tokenizer and Dataset ===
    tokenizer = AutoTokenizer.from_pretrained("digitalepidemiologylab/covid-twitter-bert")
    train_dataset = TweetDataset(train_df, tokenizer)  # max_length ברירת מחדל מהמחלקה
    val_dataset   = TweetDataset(eval_df,  tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)

    # === Load CT-BERT Model ===
    model = AutoModelForSequenceClassification.from_pretrained(
        "digitalepidemiologylab/covid-twitter-bert", num_labels=5
    ).to(device)

    # === Freeze all layers first ===
    for p in model.bert.parameters():
        p.requires_grad = False

    # === Unfreeze the last `num_layers` encoder blocks ===
    for p in model.bert.encoder.layer[-num_layers:].parameters():
        p.requires_grad = True

    # === Unfreeze the classification head ===
    for p in model.classifier.parameters():
        p.requires_grad = True

    # === Define loss with class weights (computed from TRAIN ONLY) ===
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np
    train_labels = train_df['label'].values
    classes = np.unique(train_labels)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_labels)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    # === Optimizer (Adam) — only trainable params ===
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=learning_rate, weight_decay=weight_decay)

    # === Initialize W&B for tracking ===
    wandb.init(
        project="ctbert-project-2nd-run",
        config={
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "patience": patience,
            "batch_size": batch_size,
            "num_layers": num_layers,
            "architecture": "CT-BERT",
            "dataset": "covid-tweets",
            "early_stop_metric": "val_accuracy"
        },
        name=f"trial_{trial.number}",
        reinit=True
    )

    # === Train and evaluate (returns best Validation Accuracy) ===
    best_val_accuracy = train_model_with_hyperparams(
        model, train_loader, val_loader, optimizer, criterion,
        epochs=10, patience=patience, trial=trial
    )

    wandb.finish()
    return best_val_accuracy


In [9]:
# יצירת Study של Optuna - למקסם Validation Accuracy
study = optuna.create_study(
    study_name="CTBERT_Accuracy_Study",
    direction="maximize"
)

# הרצה של 5 ניסויים
study.optimize(objective, n_trials=5)

# הדפסת התוצאה הטובה ביותר
print(f"\nBest Validation Accuracy: {study.best_value:.4f}")
print("Best hyperparameters:", study.best_params)

# שמירת התוצאות (אופציונלי)
joblib.dump(study, "optuna_ctbert_accuracy_study.pkl")


[I 2025-08-11 13:27:23,061] A new study created in memory with name: CTBERT_Accuracy_Study
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▇▇▇▇██
Train Loss,█▄▄▃▂▂▂▁▁▁
Validation Accuracy,▁▃▅▄▆▅▆▆██
Validation F1,▁▃▅▄▇▅▆▇█▇
Validation Loss,█▅▃▄▂▂▁▁▁▁
Validation Precision,▁▃▅▄▇▆▇▇██
Validation Recall,▁▃▅▄▆▅▆▆██

0,1
Epoch,10.0
Train Accuracy,0.64643
Train Loss,0.79664
Validation Accuracy,0.61106
Validation F1,0.6009
Validation Loss,0.88822
Validation Precision,0.6105
Validation Recall,0.61106


[I 2025-08-11 15:05:58,324] Trial 0 finished with value: 0.6178762786166585 and parameters: {'learning_rate': 0.00010549257415300259, 'weight_decay': 0.0054956360048080055, 'patience': 2, 'batch_size': 128, 'num_layers': 2}. Best is trial 0 with value: 0.6178762786166585.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▆▇▇▇▇███
Train Loss,█▆▃▃▂▂▂▁▁▁
Validation Accuracy,▁▅▇▇█▇▇███
Validation F1,▁▅▇▇█▇▇███
Validation Loss,█▄▃▂▂▂▂▁▁▁
Validation Precision,▁▆▇▇██▇███
Validation Recall,▁▅▇▇█▇▇███

0,1
Epoch,10.0
Train Accuracy,0.68975
Train Loss,0.72681
Validation Accuracy,0.63955
Validation F1,0.63217
Validation Loss,0.87094
Validation Precision,0.64072
Validation Recall,0.63955


[I 2025-08-11 16:58:26,164] Trial 1 finished with value: 0.6395518753044326 and parameters: {'learning_rate': 0.0004378631158079637, 'weight_decay': 0.0006020576704235517, 'patience': 4, 'batch_size': 32, 'num_layers': 4}. Best is trial 1 with value: 0.6395518753044326.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▃▄▅▆▇▇▇██
Train Loss,█▆▅▄▃▃▂▂▁▁
Validation Accuracy,▁▂▅▆▆▄████
Validation F1,▁▂▅▆▆▄████
Validation Loss,█▅▂▃▂▄▃▁▂▂
Validation Precision,▁▂▅▆▆▅█▇▇█
Validation Recall,▁▂▅▆▆▄████

0,1
Epoch,10.0
Train Accuracy,0.83424
Train Loss,0.41193
Validation Accuracy,0.71627
Validation F1,0.716
Validation Loss,0.76451
Validation Precision,0.71972
Validation Recall,0.71627


[I 2025-08-11 18:44:17,787] Trial 2 finished with value: 0.7162688748173405 and parameters: {'learning_rate': 8.738156512522717e-05, 'weight_decay': 0.0009944317276910349, 'patience': 3, 'batch_size': 32, 'num_layers': 3}. Best is trial 2 with value: 0.7162688748173405.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇██
Train Loss,█▅▄▃▃▂▂▂▁▁
Validation Accuracy,▁▃▁▆▅▆▇▅▇█
Validation F1,▂▄▁▆▆▇▇▆▇█
Validation Loss,█▆▆▅▅▄▂▃▂▁
Validation Precision,▁▄▃▇▆▆▇▆▇█
Validation Recall,▁▃▁▆▅▆▇▅▇█

0,1
Epoch,10.0
Train Accuracy,0.7085
Train Loss,0.67297
Validation Accuracy,0.66318
Validation F1,0.65964
Validation Loss,0.783
Validation Precision,0.66389
Validation Recall,0.66318


[I 2025-08-11 20:29:03,360] Trial 3 finished with value: 0.6631758402338042 and parameters: {'learning_rate': 0.00022177393596707737, 'weight_decay': 0.002801550385827614, 'patience': 2, 'batch_size': 128, 'num_layers': 3}. Best is trial 2 with value: 0.7162688748173405.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▂▃▃▄▅▆▆▇█
Train Accuracy,▁▄▅▆▆▇▇▇▇█
Train Loss,█▅▄▃▃▂▂▂▁▁
Validation Accuracy,▁▆▅▆▅▂▇███
Validation F1,▁▇▅▆▅▂████
Validation Loss,█▄▄▄▄▆▃▃▁▁
Validation Precision,▁▅▅▆▅▅███▇
Validation Recall,▁▆▅▆▅▂▇███

0,1
Epoch,10.0
Train Accuracy,0.70492
Train Loss,0.68484
Validation Accuracy,0.64661
Validation F1,0.6426
Validation Loss,0.83244
Validation Precision,0.64615
Validation Recall,0.64661


[I 2025-08-11 22:07:34,754] Trial 4 finished with value: 0.6468582562104238 and parameters: {'learning_rate': 0.0001571216817173418, 'weight_decay': 0.002123357971205705, 'patience': 2, 'batch_size': 64, 'num_layers': 2}. Best is trial 2 with value: 0.7162688748173405.



Best Validation Accuracy: 0.7163
Best hyperparameters: {'learning_rate': 8.738156512522717e-05, 'weight_decay': 0.0009944317276910349, 'patience': 3, 'batch_size': 32, 'num_layers': 3}


['optuna_ctbert_accuracy_study.pkl']