<a href="https://colab.research.google.com/github/shahriarivari/Persian_sentiment_analysis/blob/main/BERT_notebooks/Train_BERT_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [1]:
!pip install tokenizers
!pip install datasets
!pip install stopwords_guilannlp
!pip install hazm

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting stopwords_guilannlp
  Downloading stopwords_guilannlp-13.2019.3.5-py3-none-any.

In [1]:
# some imports
import os
from tokenizers import BertWordPieceTokenizer
from datasets import load_dataset, Dataset

import numpy as np
import pandas as pd

# Preprocessing
from stopwords_guilannlp import stopwords_output
from hazm import *


## Loading the labeled dataset

In [2]:
# Read the four labeled CSV files from the working directory.
# header=None makes pandas treat the first row as data, so the resulting
# columns are addressed by integer position (0, 1, ...).
csv_options = dict(index_col=None, header=None, encoding="utf-8")
test = pd.read_csv('test.csv', **csv_options)
original = pd.read_csv('original.csv', **csv_options)
balanced = pd.read_csv('balanced.csv', **csv_options)
translation = pd.read_csv('translation.csv', **csv_options)

In [15]:
# Join column 0 of every labeled frame into one flat NumPy array
# (presumably the text column -- confirm against the CSV layout).
labeled_frames = (test, original, balanced, translation)
stacked_array = np.concatenate([frame[0] for frame in labeled_frames])

### Preprocessing

In [17]:
# Hazm helpers used by clean_doc: a single lemmatizer and a single normalizer
# are created once here and shared by every call.
lemmatizer = Lemmatizer()
normalizer = Normalizer()

# Punctuation characters that clean_doc deletes from every token.
puncs = [
    '،',
    '.', ',', ':', ';',
    '"',
]

# Convert one raw document into a cleaned, space-separated token string.
def clean_doc(doc):
    """Normalize, tokenize, filter and lemmatize ``doc``.

    Steps, in order:
      1. normalize the text with the module-level Hazm ``normalizer``;
      2. split it with ``word_tokenize``;
      3. delete every character listed in ``puncs`` from each token;
      4. drop tokens that are at most one character long or purely digits;
      5. lemmatize what is left with the module-level ``lemmatizer``.

    Stop-word removal is currently disabled.

    Returns:
        The surviving lemmas joined by single spaces ('' when nothing survives).
    """
    def strip_punctuation(token):
        # Remove each listed punctuation mark wherever it occurs in the token.
        for mark in puncs:
            token = token.replace(mark, '')
        return token

    normalized = normalizer.normalize(doc)
    stripped = (strip_punctuation(piece) for piece in word_tokenize(normalized))
    # One pass applies both filters: too-short tokens first, then digit-only ones.
    kept = [word for word in stripped if len(word) > 1 and not word.isdigit()]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)

In [18]:
# Run clean_doc over every raw document and store each cleaned string in the
# matching slot of a new array allocated with the same shape/dtype as stacked_array.
stacked_array_processed = np.empty_like(stacked_array)
for position in range(len(stacked_array)):
    stacked_array_processed[position] = clean_doc(stacked_array[position])

In [21]:
# Save the labeled texts as a plain .txt file (one array element per row).

# Destination of the .txt file
file_path = 'fine_tune_data.txt'

# NOTE(review): this writes the raw `stacked_array`; the cleaned
# `stacked_array_processed` is not used anywhere visible in this notebook --
# confirm that feeding unprocessed text to the tokenizer is intended.
#
# The encoding is passed explicitly so the Persian text is written as UTF-8 on
# every platform instead of depending on the default encoding, matching the
# utf-8 used for the other files read and written in this notebook.
np.savetxt(file_path, stacked_array, fmt='%s', encoding='utf-8')

print(f'The stacked array has been written to {file_path}.')

The stacked array has been written to fine_tune_data.txt.


# Loading the dataset
The corpus is downloaded from the Hugging Face Hub (the `SLPL/naab` dataset).

## Loading the data

In [24]:
# Edit only `indices` to choose which shard files of the corpus are downloaded.
indices = {
    "train": [
        81, 14, 3,
        # 94, 35,
        # 41, 28, 67, 55, 79
    ],
    "test": [
        0,
        # 1
    ],
}

# Number of shard files per split: used for the "-of-NNNNN" part of every file
# name and as the upper bound for the shard numbers listed in `indices`.
N_FILES = {"train": 126, "test": 3}

_BASE_URL = "https://huggingface.co/datasets/SLPL/naab/resolve/main/data/"

# Every shard URL of every split, e.g. <base>train-00081-of-00126.txt
data_url = {
    split: [
        f"{_BASE_URL}{split}-{shard:05d}-of-{total:05d}.txt"
        for shard in range(total)
    ]
    for split, total in N_FILES.items()
}

# Each requested shard number must be below its split's file count.
for split, requested in indices.items():
    for index in requested:
        assert index < N_FILES[split]

# Keep only the URLs of the requested shards, in the order they were listed.
data_files = {
    split: [data_url[split][index] for index in requested]
    for split, requested in indices.items()
}


In [25]:
# Download the selected shard files with the generic 'text' loader. Each call
# receives a plain list of URLs; the results are later read through their
# 'train' split.
# `token=False` (send no authentication token) replaces `use_auth_token=False`,
# which is deprecated in datasets >= 2.14 and no longer accepted by newer
# releases -- and the install cell does not pin the datasets version.
dataset_train = load_dataset('text', data_files=data_files["train"], token=False)
dataset_test = load_dataset('text', data_files=data_files["test"], token=False)

Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

## Turn each dataset into a .txt file to feed to the tokenizer

In [26]:
# Dump the 'text' column of each loaded corpus to dataset1.txt / dataset2.txt,
# one entry per line, so the tokenizer trainer can read plain text files.
for number, dataset in enumerate((dataset_train, dataset_test), start=1):
    texts = dataset['train']['text']
    out_name = f'dataset{number}.txt'
    print(f"starting writing dataset{number}")
    with open(out_name, 'w', encoding='utf-8') as out:
        out.writelines(text + '\n' for text in texts)

    print(f"finished writing dataset{number}")
    # Release the in-memory column before moving on to the next dataset
    del texts

starting writing dataset1
finished writing dataset1
starting writing dataset2
finished writing dataset2


In [27]:
# Directory that receives the trained tokenizer files.
tokenizer_output_dir = "bert_tokenizer"
# makedirs(exist_ok=True) is a no-op when the directory already exists, avoids
# the check-then-create race of isdir() + mkdir(), and also creates missing
# parent directories should this path ever be changed to a nested one.
os.makedirs(tokenizer_output_dir, exist_ok=True)

## Training the tokenizer

In [28]:
# Train a BERT WordPiece tokenizer on the exported text files.

# Text files the vocabulary is learned from: the two corpus dumps plus the
# labeled fine-tuning texts.
files = ["dataset1.txt", "dataset2.txt", "fine_tune_data.txt"]

# Training hyper-parameters
special_tokens = ["[PAD]", "[MASK]", "[CLS]", "[SEP]", "[UNK]"]
min_frequency = 20
vocab_size = 10_000

# NOTE(review): the tokenizer is created with its default options -- confirm
# that the default text normalization suits Persian text.
tokenizer = BertWordPieceTokenizer()
print("starting to train the toeknizer")

# Collect the keyword arguments first, then run the training in one call.
training_options = dict(
    files=files,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    show_progress=True,
    special_tokens=special_tokens,
)
tokenizer.train(**training_options)
print("training is done now saving...")

# Write the vocabulary into the output directory; as the last expression of the
# cell, the list of written file paths is displayed.
tokenizer.save_model(tokenizer_output_dir)

starting to train the toeknizer
training is done now saving...


['bert_tokenizer/vocab.txt']