In [3]:
import json
import torch
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
from torch.utils.data import Dataset


In [2]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers[torch] -U


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting transformers[torch]
  Downloading transformers-4.38.1-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed transformers-4.38.1


In [3]:
!pip show accelerate


Name: accelerate
Version: 0.27.2
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [1]:
# For Colab


from google.colab import files

# Select and upload the dataset file

uploaded = files.upload()



Saving training_data.zip to training_data.zip


In [2]:
# For Colab

import zipfile

# unzip the dataset file

with zipfile.ZipFile('training_data.zip', 'r') as zip_ref:
    zip_ref.extractall()


In [1]:
import torch


if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3070


In [4]:
# Function: load data set
def load_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()
        data = [json.loads(line) for line in lines]
    return data

In [5]:
# Custom PyTorch dataset class
class YueDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx].clone().detach() if torch.is_tensor(val[idx]) else torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

In [6]:
# Load the dataset
train_data = load_dataset('training_data/train.json')
validation_data = load_dataset('training_data/validation.json')
test_data = load_dataset('training_data/test.json')

In [7]:
from transformers import BertTokenizer
import json
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

#Load the dataset
with open('training_data/train.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

# Calculate the token length of each sentence
token_lengths = []
for item in data:
    tokens = tokenizer.encode(item['translation']['yue'], add_special_tokens=True)
    token_lengths.append(len(tokens))

# Find the maximum token length
max_token_length = max(token_lengths)
print(f"The maximum length in tokens is: {max_token_length}")


Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors


The maximum length in tokens is: 1336


In [8]:
sorted_lengths = sorted(token_lengths, reverse=True)
sorted_lengths

[1336,
 1200,
 984,
 970,
 874,
 780,
 771,
 731,
 625,
 622,
 545,
 532,
 531,
 490,
 486,
 470,
 466,
 458,
 456,
 450,
 447,
 442,
 438,
 435,
 431,
 427,
 414,
 411,
 408,
 404,
 404,
 404,
 401,
 399,
 392,
 389,
 389,
 373,
 372,
 365,
 362,
 361,
 358,
 357,
 353,
 353,
 352,
 349,
 349,
 347,
 345,
 340,
 339,
 338,
 337,
 333,
 333,
 333,
 332,
 331,
 330,
 328,
 325,
 319,
 318,
 318,
 316,
 316,
 314,
 313,
 312,
 311,
 310,
 310,
 310,
 309,
 309,
 308,
 306,
 306,
 305,
 304,
 304,
 304,
 303,
 301,
 301,
 300,
 297,
 295,
 295,
 294,
 293,
 290,
 290,
 287,
 287,
 286,
 285,
 285,
 283,
 282,
 280,
 278,
 277,
 277,
 275,
 275,
 274,
 274,
 274,
 273,
 273,
 273,
 272,
 267,
 266,
 266,
 265,
 264,
 263,
 263,
 263,
 262,
 262,
 260,
 260,
 260,
 260,
 260,
 260,
 259,
 259,
 258,
 258,
 257,
 257,
 256,
 255,
 255,
 254,
 254,
 254,
 253,
 253,
 252,
 251,
 251,
 251,
 250,
 250,
 249,
 249,
 248,
 247,
 247,
 247,
 246,
 244,
 244,
 243,
 243,
 241,
 241,
 241,
 240,
 2

In [9]:
# Prepare data set
def prepare_dataset(data):
    texts = [item['translation']['yue'] for item in data]
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    return YueDataset(encodings)

In [8]:
train_dataset = prepare_dataset(train_data)
validation_dataset = prepare_dataset(validation_data)
test_dataset = prepare_dataset(test_data)  # Can be used for subsequent testing and evaluation



In [9]:
train_dataset[0]

{'input_ids': tensor([  101,  2072,  7069,  2527,  2724,  3472,  2301,  4953,  4889, 10064,
          2475,  7069,  2527,   100,  3472,  2301,  4914,  5252,  1882,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [14]:
# Data sorting
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [15]:
#Load model
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased').cuda()  # Make sure you have a CUDA environment available

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
#Training parameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Modify batch size here
    learning_rate=5e-5,
    logging_dir='./logs',
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch", #Evaluate at the end of each epoch
    fp16=True
)



In [17]:
#Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,  # Use validation set
)


In [18]:
import time
# Record training start time
start_time = time.time()

# Start training
trainer.train()

# Record the training end time
end_time = time.time()

# Calculate training duration
training_duration = end_time - start_time

# Convert training duration from seconds to minutes
minutes, seconds = divmod(training_duration, 60)



Epoch,Training Loss,Validation Loss
1,3.4532,
2,3.1317,
3,2.7775,


In [19]:
print(f"Training completed in {int(minutes)} minutes and {int(seconds)} seconds.")

Training completed in 494 minutes and 52 seconds.


In [1]:
# Optional: evaluate the model on the test set
trainer.evaluate(test_dataset)

NameError: name 'trainer' is not defined

In [20]:
# Save the Bert model

model.save_pretrained('FYP-Bert_model(multiple)')
tokenizer.save_pretrained('FYP-Bert_model(multiple)')


('FYP-Bert_model(multiple)\\tokenizer_config.json',
 'FYP-Bert_model(multiple)\\special_tokens_map.json',
 'FYP-Bert_model(multiple)\\vocab.txt',
 'FYP-Bert_model(multiple)\\added_tokens.json')

In [22]:
torch.save(model.state_dict(), 'FYP-Bert_model(multiple).pth')

In [35]:
# Masked Language Modeling testing

from transformers import BertForMaskedLM, BertTokenizer
import torch

model = BertForMaskedLM.from_pretrained('FYP-Bert_model(multiple)')
tokenizer = BertTokenizer.from_pretrained('FYP-Bert_model(multiple)')

# Prepare text marked with [MASK]
texts = [
    '請教 巴打 們 買 部 手提電腦 做 文書 野 [MASK]下 片 有時 用下 電量 要求 無咩 所謂 周圍 有得 叉 最緊要 靚 舒服 重量 最好 輕 需要 打機 推介 睇過 麻煩 各位 巴打',
    '我[MASK]天很開心，因為今天是我的生日。',
    '明天的天氣[MASK]會下雨。',
    '曾是「香港手信」 自由行客拿起又放[MASK]低 報販：無人敢買',
    '曾 是 香港 手信 自由行 客 拿起 放 [MASK] 低 報販 無人 敢 買'
    '買 6D 定 [MASK] 6Dmkii',
    '[MASK](全名HyperText Markup Language)，又譯超文字標記語言，係目前用喺網頁嘅主流標記語言。HTML元素係網頁嘅基本組成模組'
]
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

# Use the model to predict the word at [MASK] position
with torch.no_grad():
    predictions = model(**inputs).logits

# Get the indices of [MASK] tokens
mask_token_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)

# Get the IDs of the predicted most likely words for each [MASK]
predicted_token_ids = torch.argmax(predictions, dim=-1)[mask_token_indices]

# Convert IDs to words
predicted_tokens = [tokenizer.decode(token_id.item()) for token_id in predicted_token_ids]

# Print the predicted tokens for each example
for i, predicted_token in enumerate(predicted_tokens):
    print(f"Predicted token for text {i+1}: {predicted_token}")

Predicted token for text 1: 睇
Predicted token for text 2: 今
Predicted token for text 3: 就
Predicted token for text 4: 低
Predicted token for text 5: 放
Predicted token for text 6: 買
Predicted token for text 7: H T M L


In [26]:
print(model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_