In [1]:
!pip -q install -U transformers accelerate


In [2]:
import os, re, random
import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)


In [3]:
DATA_PATH = "/kaggle/input/datasets/sajaahamasha/arabic-dialect5/Arabic_dialect.csv"
df = pd.read_csv(DATA_PATH)

df = df[["t", "result"]].dropna()
df["t"] = df["t"].astype(str)
df["result"] = df["result"].astype(str)

df.head()


Unnamed: 0,t,result
0,ÿØŸá ÿ±ŸÖŸÑ ÿ≠ÿ∂ÿ±Ÿä,E
1,ŸÖÿπŸÑŸáÿ¥ ŸäÿπŸÜŸä ÿ≠ÿ∂ÿ±ÿ™ŸÉ ÿ≠ÿ∑Ÿäÿ™ ÿµŸàÿ±ÿ™ŸÉ ŸÖŸÉÿßŸÜ ÿßŸÑŸÉÿ≠ŸÉ ŸÅŸä ÿµŸàÿ± ...,E
2,ÿØŸá ÿØŸÑŸäŸÑ ÿπŸÑŸâ ÿßŸÜ ÿ≥ŸäÿßÿØÿ© ÿßŸÑÿ±Ÿäÿ≥ ŸÜÿ¨ÿ≠ ŸÅŸä ÿßŸÑŸÇÿ∂ÿßÿ° ÿπŸÑŸâ ÿß...,E
3,ŸÖÿßÿ™ŸÇŸÑŸÇÿ¥ ÿπ ÿßŸÑŸ§Ÿ• ŸÅŸä ŸÖÿµÿ± Ÿà ŸÑÿ£ŸàŸÑ ŸÖÿ±ÿ© ŸÖÿπÿß,E
4,Ÿäÿßÿ±ÿ®Ÿä ŸäÿπŸÜŸä ŸäŸàŸÖ ŸÖÿß ŸÖÿµÿ± ŸäŸÜÿ≤ŸÑ ŸÅŸäŸáÿß ÿ™ÿ±ÿßŸÜÿ≥ŸÅŸàÿ±ŸÖÿ± ŸäŸÉŸà...,E


In [None]:
URL_RE = re.compile(r"http\S+|www\.\S+")
MENTION_RE = re.compile(r"@\w+")
MULTISPACE_RE = re.compile(r"\s+")
AR_NUM_MAP = str.maketrans("Ÿ†Ÿ°Ÿ¢Ÿ£Ÿ§Ÿ•Ÿ¶ŸßŸ®Ÿ©", "0123456789")

NOISE_PATTERNS = [
    r"ŸÇÿßŸÖ ŸÖÿ≥ÿ™ÿÆÿØŸÖ ÿ®ŸÖÿ™ÿßÿ®ÿπÿ™ŸÉ",
    r"ŸäŸÖŸÉŸÜŸÉ ŸÖÿπÿ±ŸÅÿ™ŸáŸÖ",
    r"ÿßÿ¥ÿ™ÿ±ŸÉ",
    r"ÿ™ÿßÿ®ÿπŸÜÿß",
    r"ÿ¥ÿßŸáÿØ ÿßŸÑŸÖÿ≤ŸäÿØ",
    r"ÿßÿ∂ÿ∫ÿ∑ ŸáŸÜÿß",
    r"ÿ™ŸÖÿ™ ŸÖÿ¥ÿßÿ±ŸÉŸá",
]
noise_re = re.compile("|".join(NOISE_PATTERNS))

def basic_clean(text: str) -> str:
    text = str(text)
    text = URL_RE.sub(" ", text)
    text = MENTION_RE.sub(" ", text)
    text = text.replace("#", " ")
    text = re.sub(r"[‚Ä¢‚Äì‚ÄîŸÄ]", " ", text)
    text = MULTISPACE_RE.sub(" ", text).strip()
    return text

def normalize_digits(text: str) -> str:
    return str(text).translate(AR_NUM_MAP)

def normalize_ar(text: str, map_taa_marboota: bool = False) -> str:
    text = str(text)
    text = re.sub(r"[ÿ•ÿ£ÿ¢ÿß]", "ÿß", text)
    text = re.sub(r"Ÿâ", "Ÿä", text)
    text = re.sub(r"ÿ§", "Ÿà", text)
    text = re.sub(r"ÿ¶", "Ÿä", text)
    if map_taa_marboota:
        text = re.sub(r"ÿ©", "Ÿá", text)
    text = re.sub(r"ŸÄ", "", text)  
    return text

def deelongate(text: str) -> str:
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

def keep_useful_chars(text: str) -> str:
    text = re.sub(r"[^0-9A-Za-z\u0600-\u06FF\s]", " ", str(text))
    text = MULTISPACE_RE.sub(" ", text).strip()
    return text

def preprocess(text: str, map_taa_marboota: bool = False) -> str:
    text = basic_clean(text)
    text = normalize_digits(text)
    text = normalize_ar(text, map_taa_marboota=map_taa_marboota)
    text = deelongate(text)
    text = keep_useful_chars(text)
    return text

def is_noise(text: str) -> bool:
    return bool(noise_re.search(str(text)))


In [None]:
tmp = df.copy()
tmp["base_clean"] = tmp["t"].apply(lambda x: preprocess(x, map_taa_marboota=False))

tmp["len"] = tmp["base_clean"].str.split().apply(len)
tmp = tmp[tmp["len"] >= 3].copy()

tmp = tmp[~tmp["base_clean"].apply(is_noise)].copy()

print("After base filtering:", len(tmp))

dfA = tmp.copy()
dfA["text_clean"] = dfA["t"].apply(lambda x: preprocess(x, map_taa_marboota=False))

dfB = tmp.copy()
dfB["text_clean"] = dfB["t"].apply(lambda x: preprocess(x, map_taa_marboota=True))

print(len(dfA), len(dfB))
dfA[["t", "text_clean", "result"]].head()


After base filtering: 33826
33826 33826


Unnamed: 0,t,text_clean,result
0,ÿØŸá ÿ±ŸÖŸÑ ÿ≠ÿ∂ÿ±Ÿä,ÿØŸá ÿ±ŸÖŸÑ ÿ≠ÿ∂ÿ±Ÿä,E
1,ŸÖÿπŸÑŸáÿ¥ ŸäÿπŸÜŸä ÿ≠ÿ∂ÿ±ÿ™ŸÉ ÿ≠ÿ∑Ÿäÿ™ ÿµŸàÿ±ÿ™ŸÉ ŸÖŸÉÿßŸÜ ÿßŸÑŸÉÿ≠ŸÉ ŸÅŸä ÿµŸàÿ± ...,ŸÖÿπŸÑŸáÿ¥ ŸäÿπŸÜŸä ÿ≠ÿ∂ÿ±ÿ™ŸÉ ÿ≠ÿ∑Ÿäÿ™ ÿµŸàÿ±ÿ™ŸÉ ŸÖŸÉÿßŸÜ ÿßŸÑŸÉÿ≠ŸÉ ŸÅŸä ÿµŸàÿ± ...,E
2,ÿØŸá ÿØŸÑŸäŸÑ ÿπŸÑŸâ ÿßŸÜ ÿ≥ŸäÿßÿØÿ© ÿßŸÑÿ±Ÿäÿ≥ ŸÜÿ¨ÿ≠ ŸÅŸä ÿßŸÑŸÇÿ∂ÿßÿ° ÿπŸÑŸâ ÿß...,ÿØŸá ÿØŸÑŸäŸÑ ÿπŸÑŸä ÿßŸÜ ÿ≥ŸäÿßÿØÿ© ÿßŸÑÿ±Ÿäÿ≥ ŸÜÿ¨ÿ≠ ŸÅŸä ÿßŸÑŸÇÿ∂ÿßÿ° ÿπŸÑŸä ÿß...,E
3,ŸÖÿßÿ™ŸÇŸÑŸÇÿ¥ ÿπ ÿßŸÑŸ§Ÿ• ŸÅŸä ŸÖÿµÿ± Ÿà ŸÑÿ£ŸàŸÑ ŸÖÿ±ÿ© ŸÖÿπÿß,ŸÖÿßÿ™ŸÇŸÑŸÇÿ¥ ÿπ ÿßŸÑ45 ŸÅŸä ŸÖÿµÿ± Ÿà ŸÑÿßŸàŸÑ ŸÖÿ±ÿ© ŸÖÿπÿß,E
4,Ÿäÿßÿ±ÿ®Ÿä ŸäÿπŸÜŸä ŸäŸàŸÖ ŸÖÿß ŸÖÿµÿ± ŸäŸÜÿ≤ŸÑ ŸÅŸäŸáÿß ÿ™ÿ±ÿßŸÜÿ≥ŸÅŸàÿ±ŸÖÿ± ŸäŸÉŸà...,Ÿäÿßÿ±ÿ®Ÿä ŸäÿπŸÜŸä ŸäŸàŸÖ ŸÖÿß ŸÖÿµÿ± ŸäŸÜÿ≤ŸÑ ŸÅŸäŸáÿß ÿ™ÿ±ÿßŸÜÿ≥ŸÅŸàÿ±ŸÖÿ± ŸäŸÉŸà...,E


In [6]:
le = LabelEncoder()
dfA["label"] = le.fit_transform(dfA["result"])
dfB["label"] = le.transform(dfB["result"])

num_labels = len(le.classes_)
print("Classes:", le.classes_)

def split_data(dfX):
    return train_test_split(
        dfX["text_clean"].values,
        dfX["label"].values,
        test_size=0.2,
        random_state=42,
        stratify=dfX["label"].values
    )

trainA, valA, ytrainA, yvalA = split_data(dfA)
trainB, valB, ytrainB, yvalB = split_data(dfB)


Classes: ['E' 'G' 'J' 'Y']


In [7]:
from transformers import AutoTokenizer
model_name = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class DialectDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=max_len
        )
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

def train_final(train_texts, val_texts, y_train, y_val, run_name="marbert_dialect_B"):
    train_ds = DialectDataset(train_texts, y_train, tokenizer, max_len=128)
    val_ds   = DialectDataset(val_texts, y_val, tokenizer, max_len=128)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    args = TrainingArguments(
        output_dir=f"./{run_name}",
        num_train_epochs=5,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_steps=100,
        save_strategy="epoch",
        eval_strategy="epoch",  
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
    )

    trainer.train()

    pred = trainer.predict(val_ds)
    preds = np.argmax(pred.predictions, axis=1)

    acc = accuracy_score(y_val, preds)
    f1  = f1_score(y_val, preds, average="weighted")
    print(f"FINAL -> ACC: {acc:.4f}, F1: {f1:.4f}")
    print(classification_report(y_val, preds, target_names=le.classes_))

    trainer.save_model(f"./{run_name}/best_model")
    tokenizer.save_pretrained(f"./{run_name}/best_model")

    return trainer, preds


In [11]:
print("Starting Final Training (Best preprocessing = B)")

trainer, preds_final = train_final(
    trainB,
    valB,
    ytrainB,
    yvalB,
    run_name="marbert_dialect_final"
)

Starting Final Training (Best preprocessing = B)


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: UBC-NLP/MARBERT
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Con

Epoch,Training Loss,Validation Loss
1,0.615803,0.548168
2,0.386329,0.485148
3,0.193352,0.772322


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

FINAL -> ACC: 0.9261, F1: 0.9262
              precision    recall  f1-score   support

           E       0.95      0.93      0.94      1792
           G       0.89      0.95      0.92      2004
           J       0.90      0.87      0.88      1498
           Y       0.98      0.94      0.96      1472

    accuracy                           0.93      6766
   macro avg       0.93      0.92      0.93      6766
weighted avg       0.93      0.93      0.93      6766



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
label_map = {i: c for i, c in enumerate(le.classes_)}  # {0:'E',1:'G',2:'J',3:'Y'}

def predict_dialect(text, model, map_taa_marboota=True):
    text_clean = preprocess(text, map_taa_marboota=map_taa_marboota)
    enc = tokenizer(text_clean, return_tensors="pt", truncation=True, padding=True, max_length=128)
    enc = {k: v.to(model.device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits
    pred_id = int(torch.argmax(logits, dim=1).cpu().numpy()[0])
    return label_map[pred_id], text_clean


In [None]:
import os, json, shutil
import numpy as np
import torch

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

BEST_DIR = "./best_model"
os.makedirs(BEST_DIR, exist_ok=True)

label_map = {i: c for i, c in enumerate(le.classes_)} 
with open(os.path.join(BEST_DIR, "label_map.json"), "w", encoding="utf-8") as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

def run_one_experiment(
    train_texts, val_texts, y_train, y_val,
    lr=2e-5, batch_size=16, epochs=3, max_len=128, weight_decay=0.01, warmup_ratio=0.1,
    exp_name="exp"
):
    train_ds = DialectDataset(train_texts, y_train, tokenizer, max_len=max_len)
    val_ds   = DialectDataset(val_texts, y_val, tokenizer, max_len=max_len)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    args = TrainingArguments(
        output_dir=f"./runs/{exp_name}",
        num_train_epochs=epochs,
        learning_rate=lr,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_steps=100,
        save_strategy="no",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds
    )

    trainer.train()

    pred = trainer.predict(val_ds)
    preds = np.argmax(pred.predictions, axis=1)

    acc = accuracy_score(y_val, preds)
    f1  = f1_score(y_val, preds, average="weighted")

    return trainer, {"acc": acc, "f1": f1, "preds": preds}

In [None]:
experiments = [
    {"lr": 2e-5, "batch_size": 16, "epochs": 3, "max_len": 128},
    {"lr": 1e-5, "batch_size": 16, "epochs": 3, "max_len": 192},
    {"lr": 2e-5, "batch_size": 8,  "epochs": 4, "max_len": 192},
    {"lr": 1e-5, "batch_size": 8,  "epochs": 4, "max_len": 256}, 

best = {"f1": -1, "acc": -1, "config": None}

for i, cfg in enumerate(experiments, 1):
    exp_name = f"exp_{i}_lr{cfg['lr']}_bs{cfg['batch_size']}_ep{cfg['epochs']}_len{cfg['max_len']}"
    print("\n==============================")
    print("Running:", exp_name)
    print("==============================")

    trainer, metrics = run_one_experiment(
        trainB, valB, ytrainB, yvalB,
        lr=cfg["lr"],
        batch_size=cfg["batch_size"],
        epochs=cfg["epochs"],
        max_len=cfg["max_len"],
        exp_name=exp_name
    )

    print(f"ACC={metrics['acc']:.4f} | F1={metrics['f1']:.4f}")
    if metrics["f1"] > best["f1"]:
 
        if os.path.exists(BEST_DIR):
            for item in os.listdir(BEST_DIR):
                p = os.path.join(BEST_DIR, item)
                if os.path.isdir(p):
                    shutil.rmtree(p)
                elif item != "label_map.json":
                    os.remove(p)

        trainer.save_model(BEST_DIR)
        tokenizer.save_pretrained(BEST_DIR)

        with open(os.path.join(BEST_DIR, "best_config.json"), "w", encoding="utf-8") as f:
            json.dump(best, f, ensure_ascii=False, indent=2)



Running: exp_1_lr2e-05_bs16_ep3_len128




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: UBC-NLP/MARBERT
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Con

Step,Training Loss
100,2.344087
200,0.972475
300,0.784493
400,0.712596
500,0.695763
600,0.609337
700,0.589009
800,0.596206
900,0.451705
1000,0.382774


ACC=0.9258 | F1=0.9261


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Saved NEW BEST model to: ./best_model

Running: exp_2_lr1e-05_bs16_ep3_len192


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: UBC-NLP/MARBERT
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Con

Step,Training Loss
100,2.607818
200,1.303933
300,0.884734
400,0.745858
500,0.743595
600,0.663139
700,0.620375
800,0.605954
900,0.49805
1000,0.446169


ACC=0.9231 | F1=0.9233

Running: exp_3_lr2e-05_bs8_ep4_len192


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: UBC-NLP/MARBERT
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Con

Step,Training Loss
100,2.702538
200,1.707862
300,0.93279
400,0.892376
500,0.859874
600,0.819942
700,0.759462
800,0.714885
900,0.720932
1000,0.73654


ACC=0.9265 | F1=0.9266


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Saved NEW BEST model to: ./best_model

Running: exp_4_lr1e-05_bs8_ep4_len256


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: UBC-NLP/MARBERT
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Con

Step,Training Loss
100,2.747143
200,2.260273
300,1.272639
400,0.985719
500,0.921277
600,0.84725
700,0.718042
800,0.743515
900,0.672857
1000,0.728884


ACC=0.9242 | F1=0.9243

üèÜ BEST RESULT: {'f1': 0.9266096444299694, 'acc': 0.9265444871415903, 'config': {'lr': 2e-05, 'batch_size': 8, 'epochs': 4, 'max_len': 192}}
