In [1]:
from datasets import load_dataset

dataset = load_dataset(
    "csv",
    data_files={
        "train": "data/data-train-prototype.csv",
        "test": "data/data-test-prototype.csv"
    }
)

print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 21000000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 9000000
    })
})


In [2]:
from tokenizers import Tokenizer, models
from tokenizers.normalizers import NFKC, Sequence as NormalizerSequence
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast

# Step 1: Create and train tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = NormalizerSequence([NFKC()])
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=32000,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>", "<mask>"]
)

tokenizer.train(files=["ma3bani-prototype-corpus.txt"], trainer=trainer)

# Step 2: Save to Hugging Face-compatible tokenizer
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>"
)

hf_tokenizer.save_pretrained("ma3bani-prototype-tokenizer")

print("✅ Hugging Face tokenizer created and saved.")


✅ Hugging Face tokenizer created and saved.


In [3]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.normalizers import Sequence, NFKC
from transformers import PreTrainedTokenizerFast

# Step 1: Initialize tokenizer
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# Step 2: Normalize and pre-tokenize
tokenizer.normalizer = Sequence([NFKC()])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Step 3: Set training parameters
trainer = trainers.BpeTrainer(
    vocab_size=32000,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>", "<mask>"]
)

# Step 4: Train tokenizer
tokenizer.train(["ma3bani-prototype-corpus.txt"], trainer)

# Step 5: Wrap with Hugging Face and save
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    mask_token="<mask>"
)

hf_tokenizer.save_pretrained("ma3bani-prototype-tokenizer")

print("✅ Tokenizer successfully trained and saved to 'ma3bani-tokenizer/'")


✅ Tokenizer successfully trained and saved to 'ma3bani-tokenizer/'


In [4]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("ma3bani-prototype-tokenizer")
tokenizer.save_pretrained("ma3bani-prototype-tokenizer")

('ma3bani-prototype-tokenizer\\tokenizer_config.json',
 'ma3bani-prototype-tokenizer\\special_tokens_map.json',
 'ma3bani-prototype-tokenizer\\tokenizer.json')

In [5]:
from transformers import PreTrainedTokenizerFast, DataCollatorForLanguageModeling
from functools import partial

tokenizer = PreTrainedTokenizerFast.from_pretrained("ma3bani-prototype-tokenizer")

def tokenize_function(examples, tokenizer):
    return tokenizer(examples["text"], truncation=True, max_length=256, padding="max_length")

tokenized = dataset.map(
    partial(tokenize_function, tokenizer=tokenizer),
    batched=True,
    num_proc=4,
    remove_columns=["text"]
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)


In [None]:
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=384, #500
    num_hidden_layers=4, #6
    num_attention_heads=8, #16,12
    intermediate_size=2048,
    max_position_embeddings=512,
    type_vocab_size=2
)

model = BertForMaskedLM(config)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [None]:
from transformers import TrainingArguments, Trainer
import math

# def compute_metrics(eval_pred):
#     loss =eval_pred.metrics["eval_loss"]                                            #Saving for later
#     return {"perplexity": math.exp(loss)} if loss else {}

training_args = TrainingArguments(
    output_dir="./ma3bani-prototype",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=2000,
    save_steps=2000,
    save_total_limit=2,
    warmup_steps=500,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    eval_accumulation_steps=8,
    # predict_with_generate=False,
    logging_steps=1000,
    logging_dir='./logs',
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

trainer.train()
# trainer.train(resume_from_checkpoint=True)

  trainer = Trainer(


Step,Training Loss,Validation Loss
2000,7.4811,7.626669
4000,7.5685,7.447351
6000,7.4349,7.23632
8000,7.3242,7.141355
10000,7.2055,7.113891
12000,7.091,6.912715
14000,7.0006,6.674437
16000,6.8857,6.641129
18000,6.7778,6.531026
20000,6.6826,6.435897
