<a href="https://colab.research.google.com/github/shahriarivari/Persian_sentiment_analysis/blob/main/BERT_notebooks/Pretrain_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pip installs

In [None]:
!pip install tokenizers
!pip install datasets
!pip install -U accelerate
!pip install -U transformers

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [

# Import libraries

In [None]:
import os
from transformers import BertTokenizerFast
from transformers import BertConfig , BertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from transformers import TrainerCallback
import logging

## Import dataset

In [None]:
# Choose which shards of the SLPL/naab corpus to download.
# This dictionary is the only thing you need to edit to change the amount
# of data: add or remove shard numbers for each split.
indices = {
    # More train shards can be added, e.g. 94, 35, 41, 28, 67, 55, 79.
    "train": [81, 14, 3],
    # More test shards can be added, e.g. 1.
    "test": [0],
}

# Number of shard files that exist for each split.
N_FILES = {
    "train": 126,
    "test": 3,
}
_BASE_URL = "https://huggingface.co/datasets/SLPL/naab/resolve/main/data/"

# URL of every shard of every split; position in the list == shard number.
data_url = {
    split: [
        _BASE_URL + "{}-{:05d}-of-{:05d}.txt".format(split, shard, total)
        for shard in range(total)
    ]
    for split, total in N_FILES.items()
}

# Validate the selection. The lower bound matters: a negative number would
# otherwise be accepted and silently pick a shard counted from the end.
for split, selected in indices.items():
    for index in selected:
        assert 0 <= index < N_FILES[split], (
            f"{split} shard index {index} is outside 0..{N_FILES[split] - 1}"
        )

# The URLs that will actually be downloaded, in the order listed in `indices`.
data_files = {
    split: [data_url[split][i] for i in selected]
    for split, selected in indices.items()
}

In [None]:
# Download the selected shards and load them as a line-per-example text dataset.
# `token=False` replaces the deprecated `use_auth_token=False`: the corpus is
# fetched from a public URL, so no Hugging Face credentials are sent.
dataset = load_dataset('text', data_files=data_files, token=False)



Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# Set paths and file names

In [None]:
# Set your paths and file names.
# `exist_ok=True` makes this cell safe to re-run and, importantly, lets the
# tokenizer directory already exist (the tokenizer is loaded from it below).
tokenizer_output_dir = "bert_tokenizer"
os.makedirs(tokenizer_output_dir, exist_ok=True)
pretrained_model_output_dir = "bert_pretrained_model"
os.makedirs(pretrained_model_output_dir, exist_ok=True)

## Load tokenizer

In [None]:
# Load the tokenizer stored in `tokenizer_output_dir`.
# NOTE(review): this presumes the tokenizer files were already placed in that
# directory (nothing in this notebook trains or writes them) - confirm.
# `model_max_length` is the tokenizer setting for the maximum sequence length;
# the previous `max_length` keyword is not the argument that controls it.
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_output_dir, model_max_length=256)

## Tokenize the dataset

In [None]:
def batch_tokenize_function(batch, max_length=256):
    """Tokenize a batch of raw text for masked-language-model pre-training.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the list of strings
            to tokenize.
        max_length: Every sequence is truncated or padded to exactly this
            many tokens. Defaults to 256.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"``, one entry per
        input string.
    """
    # No `return_tensors`: `Dataset.map` stores plain lists, so building
    # torch tensors here would only be converted straight back. The special
    # tokens mask is not requested either because it was never returned.
    encoded = tokenizer(
        batch["text"],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }

# Tokenize both splits batch-wise.
# To experiment on a subset first, call `.select(range(n))` on a split
# before `.map(...)`.
tokenized_train_dataset, tokenized_test_dataset = [
    dataset[split_name].map(batch_tokenize_function, batched=True)
    for split_name in ("train", "test")
]

# Expose only the model inputs, returned as torch tensors.
for tokenized_split in (tokenized_train_dataset, tokenized_test_dataset):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/9183678 [00:00<?, ? examples/s]

## BERT model configuration

In [None]:
vocab_size = 10_000
max_length = 256

# Fail early with a clear message if the tokenizer can emit token ids the
# embedding table cannot hold; otherwise this only surfaces during training
# as an opaque index error.
if len(tokenizer) > vocab_size:
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens but the model is configured "
        f"with vocab_size={vocab_size}; increase vocab_size to match."
    )

# Model configuration: a deliberately small BERT. Adjust the sizes as needed.
model_config = BertConfig(
    vocab_size=vocab_size,
    hidden_size=64,                       # width of every hidden state
    num_attention_heads=8,                # 64 / 8 = 8 dimensions per head
    num_hidden_layers=4,                  # number of encoder layers
    max_position_embeddings=max_length,   # longest supported sequence
)

# Model initialization: randomly initialised weights, to be pre-trained
# from scratch with the masked-language-model objective.
model = BertForMaskedLM(config=model_config)

## Set the data collator

In [None]:
# Collator for masked-language-model training, with the masking
# probability set to 0.2.
mlm_collator_options = dict(mlm=True, mlm_probability=0.2)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **mlm_collator_options)

## Training arguments

In [None]:
# Training Arguments
training_args = TrainingArguments(
    output_dir=pretrained_model_output_dir, # output directory to where save model checkpoint
    overwrite_output_dir=True,
    num_train_epochs=1,                     # number of training epochs, feel free to tweak
    per_device_train_batch_size=64,         # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=100,        # accumulating the gradients before updating the weights
    per_device_eval_batch_size=64,          # evaluation batch size
    logging_steps=1000,                     # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000,
    save_total_limit=3,                     # whether you don't have much space so you let only 3 model weights saved in the disk
    evaluation_strategy="steps",            # evaluate each `logging_steps` steps
    eval_steps=1000,
)

## Train the model

In [13]:
# Assemble the Trainer from the model, its arguments, both tokenized
# splits and the masking collator.
trainer_components = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_components)

# Run pre-training.
trainer.train()

# Store the final pre-trained weights in a sub-folder of the checkpoint directory.
final_model_dir = os.path.join(pretrained_model_output_dir, "final_model")
trainer.save_model(final_model_dir)

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
1000,8.1494,7.65985


## Saving the model in Google Drive

In [15]:
import shutil
# First zip the model directory; the next cell moves the zip to Google Drive.

# Derive the folder to archive from the configured output directory instead
# of hard-coding an absolute path, so both always refer to the same folder.
source_dir = os.path.abspath(pretrained_model_output_dir)
output_filename = pretrained_model_output_dir

# Create `<output_filename>.zip` in the working directory; the returned
# archive path is shown as the cell output.
shutil.make_archive(output_filename, 'zip', source_dir)

'/content/bert_pretrained_model.zip'

In [None]:
# Mount Google Drive and move the zipped model into it.
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Move the zip file to Google Drive.
# The archive path is built from `pretrained_model_output_dir` rather than
# from `output_filename` itself, so re-running this cell does not keep
# prepending '/content/' and appending '.zip' to an already-complete path.
drive_path = '/content/drive/MyDrive/'
output_filename = f'/content/{pretrained_model_output_dir}.zip'
shutil.move(output_filename, drive_path)