<a href="https://colab.research.google.com/github/shahriarivari/Persian_sentiment_analysis/blob/main/Persian_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tokenizers
!pip install datasets
!pip install -U accelerate
!pip install -U transformers

# BERT pre-training

## Initial imports

In [2]:
import os
import json
from tokenizers import BertWordPieceTokenizer
from transformers import BertForMaskedLM, BertConfig
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import TrainerCallback
import logging

## Importing dataset from huggingface hub

In [3]:
from datasets import load_dataset

# You should just change this part in order to download your
# parts of corpus.
indices = {
    "train": [5],
    "test": [0]
}

N_FILES = {
    "train": 126,
    "test": 3
}
_BASE_URL = "https://huggingface.co/datasets/SLPL/naab/resolve/main/data/"
data_url = {
    "train": [_BASE_URL + "train-{:05d}-of-{:05d}.txt".format(x, N_FILES["train"]) for x in range(N_FILES["train"])],
    "test": [_BASE_URL + "test-{:05d}-of-{:05d}.txt".format(x, N_FILES["test"]) for x in range(N_FILES["test"])],
}
for index in indices['train']:
    assert index < N_FILES['train']
for index in indices['test']:
    assert index < N_FILES['test']
data_files = {
    "train": [data_url['train'][i] for i in indices['train']],
    "test": [data_url['test'][i] for i in indices['test']]
}
print(data_files)
dataset = load_dataset('text', data_files=data_files, use_auth_token=False)

{'train': ['https://huggingface.co/datasets/SLPL/naab/resolve/main/data/train-00005-of-00126.txt'], 'test': ['https://huggingface.co/datasets/SLPL/naab/resolve/main/data/test-00000-of-00003.txt']}




Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## Setting paths and file names

In [4]:
# Set your paths and file names
# data_files = ["path/to/your_dataset_file.txt"]
# os.mkdir(model_path)
tokenizer_output_dir = "bert_tokenizer"
os.mkdir(tokenizer_output_dir)
pretrained_model_output_dir = "bert_pretrained_model"
os.mkdir(pretrained_model_output_dir)

## Making files for trainging the tokenizer

In [None]:
# Extract text data from the dataset
texts = dataset['train']['text']  # Assuming you have a 'text' column in your dataset

# Save the text data to a temporary file
temp_file_path = 'temp_dataset_file.txt'
with open(temp_file_path, 'w', encoding='utf-8') as file:
    for text in texts:
        file.write(text + '\n')

## Training a WordPiece Tokenizer

In [None]:
# Training a WordPiece Tokenizer
files = [temp_file_path]
# Parameters for Tokenizer Training
vocab_size = 30_522
min_frequency = 2
max_length = 512
special_tokens = ["[PAD]", "[MASK]", "[CLS]", "[SEP]", "[UNK]"]

# Initialize the WordPiece tokenizer for BERT
tokenizer = BertWordPieceTokenizer()

# Train the tokenizer
tokenizer.train(
    files=files,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    show_progress=True,
    special_tokens=special_tokens,
)

# enable truncation up to the maximum 512 tokens
tokenizer.enable_truncation(max_length=max_length)

# Save the trained tokenizer
# model_path = "pretrained_bert_tokenizer"
# # make the directory if not already there
# if not os.path.isdir(model_path):
#     os.mkdir(model_path)

# Save the trained tokenizer
tokenizer.save_model(tokenizer_output_dir)

# Clean up: Remove the temporary file
os.remove(temp_file_path)

['bert_tokenizer/vocab.txt']

In [None]:
import json
# dumping some of the tokenizer config to config file,
# including special tokens, whether
# to lower case and the maximum sequence length

with open(os.path.join(model_path, "config.json"), "w") as f:
    tokenizer_cfg = {
        "do_lower_case": True,
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
        "model_max_length": max_length,
        "max_len": max_length,
    }
    json.dump(tokenizer_cfg, f)

## tokenizing the dataset

In [5]:
vocab_size = 30_522
min_frequency = 2
max_length = 512
special_tokens = ["[PAD]", "[MASK]", "[CLS]", "[SEP]", "[UNK]"]

In [None]:
# Load tokenizer after training
# tokenizer = BertWordPieceTokenizer(tokenizer_output_dir + "/" + "vocab.txt")

In [6]:
from transformers import BertTokenizerFast
# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_output_dir)

In [None]:
# max_sequence_length = max_length

# # Define a custom tokenize function
# def tokenize_function(example):

#   # Ensure that the "text" field is a string
#   if not isinstance(example["text"], str):
#     raise TypeError("Input 'text' must be a string.")

#   # Tokenize the text
#   encoded = tokenizer.encode(example["text"])

#   # Truncate or pad to fit the maximum sequence length
#   if len(encoded.ids) > max_sequence_length:
#     encoded.truncate(max_sequence_length)
#   else:
#     encoded.pad(max_sequence_length - len(encoded.ids))

#   return {
#     'input_ids': encoded.ids,
#     'attention_mask': encoded.attention_mask,
# }

# # Use the map function to tokenize the "train" split
# tokenized_train_dataset = dataset['train'].select(list(range(10000))).map(tokenize_function,num_proc=4)

# Use the map function to tokenize the "test" split
# tokenized_test_dataset = dataset['test'].map(tokenize_function)


In [7]:
max_sequence_length = max_length

# Define a custom tokenize function
def tokenize_function(example):
    # Ensure that the "text" field is a string
    if not isinstance(example["text"], str):
        raise TypeError("Input 'text' must be a string.")

    # Tokenize the text using BertTokenizerFast
    encoded = tokenizer(example["text"], truncation=True, padding='max_length',
                        return_special_tokens_mask=True,
                        max_length=max_sequence_length, return_tensors='pt')

    return {
        'input_ids': encoded['input_ids'].squeeze(),
        'attention_mask': encoded['attention_mask'].squeeze(),
    }

# Use the map function to tokenize the "train" split
tokenized_train_dataset = dataset['train'].select(list(range(10000))).map(tokenize_function, num_proc=4)
tokenized_test_dataset = dataset['test'].select(list(range(10000))).map(tokenize_function, num_proc=4)

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

## Model Configuration

In [9]:
# Model Configuration
model_config = BertConfig(
    vocab_size=vocab_size,
    hidden_size=768,  # Adjust as needed
    num_attention_heads=12,  # Adjust as needed
    num_hidden_layers=12,  # Adjust as needed
    max_position_embeddings=max_length,
)
# Model Initialization
model = BertForMaskedLM(config=model_config)

## Data Collator
Load your dataset using Hugging Face datasets library

In [10]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
    )
# class CustomDataCollator(DataCollatorForLanguageModeling):
#     def collate_batch(self, batch):
#         input_ids = torch.stack([torch.tensor(example['input_ids']) for example in batch])
#         attention_mask = torch.stack([torch.tensor(example['attention_mask']) for example in batch])
#         labels = input_ids.clone()  # Use input_ids as labels for MLM

#         # Mask out some tokens for MLM training
#         probability_matrix = torch.full(labels.shape, 0.15)
#         special_tokens_mask = [
#             self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in input_ids.tolist()
#         ]
#         probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)

#         masked_indices = torch.bernoulli(probability_matrix).bool()
#         labels[masked_indices] = self.tokenizer.mask_token_id

#         return {
#             'input_ids': input_ids,
#             'attention_mask': attention_mask,
#             'labels': labels
#         }


## Callback

In [15]:
from transformers import TrainerCallback
import logging

# Define a custom callback for monitoring
class CustomCallback(TrainerCallback):
    def __init__(self):
        super().__init__()

    def on_step_end(self, args, state, control, model, *extra_args, **kwargs):
        if state.global_step % args.logging_steps == 0:
            logging.info(f"Step {state.global_step}: Loss = {state.log_metrics['loss']}")


## Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=10, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=64,  # evaluation batch size
    logging_steps=1000,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000,
    load_best_model_at_end=True,    # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

In [12]:
# Training Arguments
training_args = TrainingArguments(
    output_dir=pretrained_model_output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=10,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=64,
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=3,
    evaluation_strategy="steps",
    eval_steps=500,
)

## Trainer Initialization

In [17]:
# Trainer Initialization
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset,
    data_collator = data_collator,
    #data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=True, mlm_probability=0.15),
    #callbacks = [CustomCallback()],
)

In [18]:
# Train the model
trainer.train()

# Save the final pre-trained model
trainer.save_model(os.path.join(pretrained_model_output_dir, "final_model"))

Step,Training Loss,Validation Loss


In [19]:
import shutil

# Zip the directory
shutil.make_archive("pretrained_model_output", 'zip', pretrained_model_output_dir)

# Move the zip file to /content
shutil.move("pretrained_model_output.zip", "pretrained_model_output.zip")


'pretrained_model_output.zip'

In [20]:
import torch

# Clear GPU memory
torch.cuda.empty_cache()


In [None]:
from transformers import BertForPreTraining

# Load the saved BERT model
model = BertForPreTraining.from_pretrained(os.path.join(pretrained_model_output_dir, "final_model"))


In [24]:
import os

# Specify the path to the directory
directory_path = "/content/bert_pretrained_model"

# Get the size of the directory
directory_size = sum([os.path.getsize(os.path.join(directory_path, file)) for file in os.listdir(directory_path)])

# Convert to MB
directory_size_mb = directory_size / (1024 * 1024)

print(f"Size of the directory: {directory_size_mb:.2f} MB")


Size of the directory: 0.01 MB


# XLnet pre-training

## Initial imports

In [None]:
import os
from tokenizers import XLNetWordPieceTokenizer
from transformers import XLNetLMHeadModel, XLNetConfig
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import TrainerCallback
from torch.utils.data import DataLoader
import torch
import logging

## Importing dataset from huggingface hub

In [None]:
from datasets import load_dataset

# You should just change this part in order to download your
# parts of corpus.
indices = {
    "train": [5, 1, 2],
    "test": [0, 2]
}

N_FILES = {
    "train": 126,
    "test": 3
}
_BASE_URL = "https://huggingface.co/datasets/SLPL/naab/resolve/main/data/"
data_url = {
    "train": [_BASE_URL + "train-{:05d}-of-{:05d}.txt".format(x, N_FILES["train"]) for x in range(N_FILES["train"])],
    "test": [_BASE_URL + "test-{:05d}-of-{:05d}.txt".format(x, N_FILES["test"]) for x in range(N_FILES["test"])],
}
for index in indices['train']:
    assert index < N_FILES['train']
for index in indices['test']:
    assert index < N_FILES['test']
data_files = {
    "train": [data_url['train'][i] for i in indices['train']],
    "test": [data_url['test'][i] for i in indices['test']]
}
print(data_files)
dataset = load_dataset('text', data_files=data_files, use_auth_token=False)

## Setting paths and file names

In [None]:
# Set your paths and file names
data_files = ["path/to/your_dataset_file.txt"]
tokenizer_output_dir = "path/to/xlnet_tokenizer"
pretrained_model_output_dir = "path/to/xlnet_pretrained_model"

## Making files for trainging the tokenizer

In [None]:
# Extract text data from the dataset
texts = dataset['train']['text']  # Assuming you have a 'text' column in your dataset

# Save the text data to a temporary file
temp_file_path = 'temp_dataset_file.txt'
with open(temp_file_path, 'w', encoding='utf-8') as file:
    for text in texts:
        file.write(text + '\n')

# Training a WordPiece Tokenizer
files = [temp_file_path]

## Training a wordPiece Tokenizer

In [None]:
# Training a WordPiece Tokenizer
files = [temp_file_path]

# Parameters for Tokenizer Training
vocab_size = 30000
min_frequency = 2
max_length = 128
special_tokens = ["<pad>", "<mask>", "<cls>", "<sep>", "<unk>"]

# Initialize the WordPiece tokenizer for XLNet
tokenizer = XLNetWordPieceTokenizer()

# Train the tokenizer
tokenizer.train(
    files=files,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    show_progress=True,
    special_tokens=special_tokens,
)

# Save the trained tokenizer
tokenizer.save_model(tokenizer_output_dir)

# Save the trained tokenizer
model_path = "pretrained_xlnet_tokenizer"
# make the directory if not already there
if not os.path.isdir(model_path):
    os.mkdir(model_path)

# Save the trained tokenizer
tokenizer.save_model(tokenizer_output_dir)

# Clean up: Remove the temporary file
os.remove(temp_file_path)

## Tokenizing the dataset

In [None]:
# Load tokenizer after training
tokenizer = XLNetWordPieceTokenizer(f"{tokenizer_output_dir}/vocab.json", f"{tokenizer_output_dir}/merges.txt")

In [None]:
# Tokenizing the data using dataset.map()
def tokenize_function(examples):
    return tokenizer(examples['text'], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

## Model Configuration

In [None]:
# Model Configuration
model_config = XLNetConfig(
    vocab_size=vocab_size,
    d_model=768,  # Adjust as needed
    n_head=12,  # Adjust as needed
    num_layers=12,  # Adjust as needed
    max_position_embeddings=max_length,
)

# Model Initialization
model = XLNetLMHeadModel(config=model_config)

## Data Collator

In [None]:
# Data Collator for Language Modeling
class CustomDataCollator(DataCollatorForLanguageModeling):
  def collate_batch(self, batch):
    input_ids = torch.stack([torch.tensor(example['input_ids']) for example in batch])
    attention_mask = torch.stack([torch.tensor(example['attention_mask']) for example in batch])
    labels = torch.stack([torch.tensor(example['input_ids']) for example in batch])
    return {
      'input_ids': input_ids,
      'attention_mask': attention_mask,
      'labels': labels
    }


## Callback

In [None]:
# Define a custom callback for monitoring
class CustomCallback(TrainerCallback):
  def __init__(self):
    super().__init__()

  def on_step_end(self, args, state, control, model, optimizer, scheduler, **kwargs):
    if state.global_step % args.logging_steps == 0:
      logging.info(f"Step {state.global_step}: Loss = {state.log_metrics['loss']}")

## Training Arguments

In [None]:
# Training Arguments
training_args = TrainingArguments(
    output_dir=pretrained_model_output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=3,
    evaluation_strategy="steps",
    eval_steps=500,
)

In [None]:
# Tokenized data for training
train_inputs = {
    "input_ids": tokenized_dataset["input_ids"],
    "attention_mask": tokenized_dataset["attention_mask"],
}

## Trainer Initialazation

In [None]:
# Trainer Initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_inputs,
    data_collator=CustomDataCollator(tokenizer=tokenizer, mlm=True, mlm_probability=0.15),
    callbacks=[CustomCallback()],
)

In [None]:
# Train the model
trainer.train()

# Save the final pre-trained model
trainer.save_model(os.path.join(pretrained_model_output_dir, "final_model"))