<a href="https://colab.research.google.com/github/shahriarivari/Diabete_RL/blob/main/Persian_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install transformers
%pip install tokenizers
%pip install datasets

# BERT pre-training

## Initial imports

In [None]:
import json
import logging
import os

from datasets import load_dataset
from tokenizers import BertWordPieceTokenizer
from transformers import BertForMaskedLM, BertConfig
from transformers import BertTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import TrainerCallback

## Loading the dataset from the Hugging Face Hub

In [None]:
# Shard numbers of the SLPL/naab corpus to download for each split.
# This mapping is the only thing to edit in order to train on a different
# part of the corpus.
indices = {
    "train": [5, 1, 2],
    "test": [0, 2]
}

# Total number of shard files published for each split.
N_FILES = {
    "train": 126,
    "test": 3
}
_BASE_URL = "https://huggingface.co/datasets/SLPL/naab/resolve/main/data/"

# URL of every shard, per split. Shard i of a split is named
# "<split>-<i>-of-<total>.txt" with both numbers zero-padded to five digits.
data_url = {
    split: [
        _BASE_URL + "{}-{:05d}-of-{:05d}.txt".format(split, x, total)
        for x in range(total)
    ]
    for split, total in N_FILES.items()
}

# Validate the requested shard numbers before any download starts. The lower
# bound matters: a negative number would otherwise silently index from the
# end of the URL list instead of failing.
for split, chosen in indices.items():
    for index in chosen:
        assert 0 <= index < N_FILES[split], (
            "shard {} is out of range for split '{}' (0..{})".format(
                index, split, N_FILES[split] - 1
            )
        )

# Keep only the URLs of the requested shards, in the requested order.
data_files = {
    split: [data_url[split][i] for i in chosen]
    for split, chosen in indices.items()
}
print(data_files)

# The corpus is public, so no authentication argument is passed. The old
# `use_auth_token` keyword is deprecated and rejected by newer `datasets`
# releases, which the unpinned install above may pull in.
dataset = load_dataset('text', data_files=data_files)

## Training a WordPiece Tokenizer

In [None]:
# Raw training lines, read from the 'text' column of the train split.
texts = dataset['train']['text']

# The tokenizer trainer below is fed from files on disk, so write the corpus
# to a scratch file, one example per line.
temp_file_path = 'temp_dataset_file.txt'
with open(temp_file_path, 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(text + '\n' for text in texts)

# WordPiece training settings: target vocabulary size and the special tokens
# that must be present in the vocabulary.
vocab_size = 30000
special_tokens = ["[PAD]", "[MASK]", "[CLS]", "[SEP]", "[UNK]"]
files = [temp_file_path]

# NOTE(review): the tokenizer is built with the library's default
# normalisation options - confirm those defaults suit Persian text.
tokenizer = BertWordPieceTokenizer()
tokenizer.train(files=files, vocab_size=vocab_size, special_tokens=special_tokens)

In [None]:
# Directory that receives the trained tokenizer's vocabulary files.
model_path = "pretrained_bert_tokenizer"
# exist_ok=True makes this a no-op when the directory is already there and,
# unlike os.mkdir, also creates missing parent directories.
os.makedirs(model_path, exist_ok=True)

tokenizer.save_model(model_path)

# Clean up the temporary corpus file. The existence check lets this cell be
# re-run without failing once the file has already been deleted.
if os.path.exists(temp_file_path):
    os.remove(temp_file_path)

In [None]:
# Persist the tokenizer settings (special tokens, lower-casing and maximum
# sequence length) next to the vocabulary so they are restored on reload.
max_length = 512

tokenizer_cfg = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}

# Tokenizer settings are read from `tokenizer_config.json` when a tokenizer is
# loaded with `from_pretrained`; `config.json` is the *model* configuration
# file, so settings written there are not applied to the tokenizer.
with open(os.path.join(model_path, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    json.dump(tokenizer_cfg, f)

## Tokenizing the dataset

## Model Configuration

In [None]:
# Reload the files saved under `model_path` as a BertTokenizerFast. From here
# on `tokenizer` refers to this object rather than the WordPiece trainer.
tokenizer = BertTokenizerFast.from_pretrained(
    model_path,
)

In [None]:
# Architecture of the BERT model that will be pre-trained from scratch.
model_config = BertConfig(
    # Size the embedding matrix from the tokenizer itself. The special tokens
    # are already part of the trained vocabulary, so adding
    # len(special_tokens) on top would create rows no token id ever uses.
    vocab_size=len(tokenizer),
    hidden_size=768,  # Width of the transformer hidden states
    num_attention_heads=12,  # Attention heads per layer
    num_hidden_layers=12,  # Number of transformer layers
    intermediate_size=3072,  # Width of the feed-forward layer
    # Keep the positional table in step with the tokenizer's maximum length.
    max_position_embeddings=max_length,
)

# Model initialization: randomly initialised weights, masked-LM head on top.
model = BertForMaskedLM(config=model_config)

## Data Collator

In [None]:
# Collator for the masked-language-modelling objective. `mlm_probability` is
# the masking probability and can be tuned.
masking_options = {"mlm": True, "mlm_probability": 0.15}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **masking_options)

## Training Arguments

In [None]:
# Hyper-parameters and checkpointing policy for the pre-training run, gathered
# in one mapping so they can be inspected or tweaked before use.
training_options = {
    "output_dir": 'path/to/output_dir',  # placeholder - point at a real directory
    "overwrite_output_dir": True,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "save_steps": 10_000,  # checkpoint interval, in optimisation steps
    "save_total_limit": 3,  # cap on the number of checkpoints kept
    "learning_rate": 5e-5,
    "weight_decay": 0.01,  # regularisation strength
}
training_args = TrainingArguments(**training_options)

## Trainer Initialization

In [None]:
# Split used for training. This name was referenced but never defined, which
# made the cell fail with a NameError.
dataset_split = "train"


def tokenize_batch(batch):
    """Convert a batch of raw text lines into model inputs.

    Sequences are truncated to `max_length` so they never exceed the model's
    positional range. The special-tokens mask is returned so the MLM collator
    can tell which positions are special tokens.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=True,
    )


# The Trainer needs token ids, not raw strings: tokenize the training split
# and drop the original 'text' column, which the model cannot consume.
tokenized_train_dataset = dataset[dataset_split].map(
    tokenize_batch,
    batched=True,
    remove_columns=["text"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Run the pre-training loop. The returned summary is kept in a variable and
# left as the last expression so it is still shown as the cell output.
train_result = trainer.train()
train_result