In [None]:
!pip install transformers datasets accelerate



In [None]:
!pip install --upgrade transformers



In [None]:
from google.colab import drive
# expose Google Drive to this runtime; the data and output paths used in
# this notebook all live under this mount point
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Google Drive folder (under the /content/drive mount) that holds the chess
# corpus. It is passed to load_all_texts, which reads the *.txt files in it.
TEXT_FOLDER = "/content/drive/MyDrive/chess_text_data/files/"

def load_all_texts(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path``.

    Files are visited in sorted filename order. ``os.listdir`` returns
    entries in arbitrary order, so without sorting the list handed to a
    seeded train/validation split could differ from run to run.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[str]: One string per ``.txt`` file, ordered by filename. Bytes
        that are not valid UTF-8 are dropped silently (``errors='ignore'``).
    """
    texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            texts.append(f.read())
    return texts

# pull the whole corpus into memory: one string per .txt file in TEXT_FOLDER
texts = load_all_texts(folder_path=TEXT_FOLDER)
print(f"Num documents loaded: {len(texts)}")

Loaded 11 documents.


In [None]:
from itertools import chain

# break up long text into smaller pieces
def chunk_text(text, max_length=512):
    """Tokenize ``text`` in one pass and cut the ids into consecutive pieces.

    Returns a list of token-id lists. Every piece holds ``max_length`` ids
    except possibly the last one, which holds whatever remains.
    """
    # the whole document is encoded at once, without truncation or padding
    token_ids = tokenizer(text, truncation=False, padding=False)['input_ids']

    pieces = []
    for start in range(0, len(token_ids), max_length):
        pieces.append(token_ids[start:start + max_length])
    return pieces

def tokenize_function(examples):
    """Turn a batch of raw documents into padded rows of token ids.

    Every document in ``examples["text"]`` is split with ``chunk_text`` and
    the resulting chunks are padded to a common length with ``tokenizer.pad``.

    Labels are intentionally NOT created here. Masking and label generation
    belong to ``DataCollatorForLanguageModeling(mlm=True)``, which draws a
    fresh mask for every batch and overwrites any ``labels`` column. Building
    labels in this function was therefore dead work, and it also required
    ``torch`` to be imported before this function ran.

    Args:
        examples: Batch mapping with a ``"text"`` list of strings.

    Returns:
        The padded encoding returned by ``tokenizer.pad`` (``input_ids`` plus
        whatever mask columns the tokenizer adds).
    """
    # flatten the per-document chunk lists into one list of chunks
    all_chunks = list(chain.from_iterable(chunk_text(text) for text in examples["text"]))

    # pad the (shorter) final chunk of each document up to the longest chunk
    return tokenizer.pad({'input_ids': all_chunks}, padding=True)

In [None]:
from datasets import Dataset
from transformers import BertTokenizerFast, BertForMaskedLM

# load tokenizer and model from one and the same pretrained checkpoint
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint)

from sklearn.model_selection import train_test_split

# hold out 10% of the documents for validation and train on the other 90%;
# the split is made per document, not per token
VAL_FRACTION = 0.1
SPLIT_SEED = 42
train_texts, val_texts = train_test_split(
    texts, test_size=VAL_FRACTION, random_state=SPLIT_SEED
)

from datasets import Dataset

# wrap each split in a Dataset with a single "text" column
train_dataset, val_dataset = (
    Dataset.from_dict({'text': split_texts}) for split_texts in (train_texts, val_texts)
)

# tokenize both splits the same way; the raw "text" column is dropped afterwards
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True, remove_columns=["text"])
    for split in (train_dataset, val_dataset)
)

# divide tokens into blocks of size 512
block_size = 512

def group_texts(examples):
    """Concatenate every column of a batch and re-cut it into fixed-size rows.

    Args:
        examples: Batch mapping from column name to a list of per-row lists.

    Returns:
        dict: The same keys; each value is a list of ``block_size``-long
        slices of that column's concatenation. The usable length is taken
        from the first column and rounded down to a multiple of
        ``block_size``, so a trailing remainder is dropped. An empty batch
        (no columns) yields ``{}``.
    """
    column_names = list(examples.keys())
    if not column_names:
        return {}

    # Flatten with one pass per column. sum(list_of_lists, []) copies the
    # growing accumulator for every row, which is quadratic in the row count.
    concatenated = {
        name: [item for row in examples[name] for item in row]
        for name in column_names
    }
    total_length = (len(concatenated[column_names[0]]) // block_size) * block_size
    return {
        name: [flat[start:start + block_size] for start in range(0, total_length, block_size)]
        for name, flat in concatenated.items()
    }

# regroup both tokenized splits into block_size-long rows, train split first
lm_train, lm_val = (
    split.map(group_texts, batched=True) for split in (tokenized_train, tokenized_val)
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (46961 > 512). Running this sequence through the model will result in indexing errors
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/705 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

In [None]:
# SOME CHECKS TO ENSURE CORRECT TOKENIZATION AND DIVISION INTO TRAIN AND VAL

def _count_input_ids(dataset):
    """Add up the lengths of the ``input_ids`` rows of ``dataset``."""
    return sum(len(example['input_ids']) for example in dataset)

# check length of the first tokenized input
print(len(tokenized_train[0]['input_ids']))

# get number of tokens in the train and val dataset
train_tokens_count = _count_input_ids(tokenized_train)
val_tokens_count = _count_input_ids(tokenized_val)

print(f"Total tokens in train dataset: {train_tokens_count}")
print(f"Total tokens in validation dataset: {val_tokens_count}")

512
Total tokens in train dataset: 360960
Total tokens in validation dataset: 77312


In [None]:
import torch

In [None]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments

# Collator configured for masked-language-model batches (mlm=True) with a
# masking probability of 0.15; it is handed to the Trainer as `data_collator`.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# directory in drive to save the model (checkpoints during training; the
# final model is later written to a "final_model" sub-folder of it)
output_dir = "/content/drive/MyDrive/chess-bert-mlm"

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    eval_strategy="epoch",  # validation loss is reported once per epoch
    num_train_epochs=20,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    # NOTE(review): the recorded run took 1780 steps over 20 epochs (~89 per
    # epoch), so with logging every 100 steps epoch 1 shows "No log".
    logging_steps=100,
    learning_rate=3e-5,
    weight_decay=0.01,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # fp16 is requested only when CUDA is available
)

In [None]:
from transformers import Trainer

# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated (it triggers a FutureWarning) and is slated for
# removal, which would break this cell because transformers is installed
# unpinned at the top of the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_train,
    eval_dataset=lm_val,
    processing_class=tokenizer,
    data_collator=data_collator,  # performs the MLM masking for every batch
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.433528
2,2.748200,2.367669
3,2.350000,2.292325
4,2.219200,2.202871
5,2.123200,2.180906
6,2.063400,2.178564
7,1.968700,2.162492
8,1.933900,2.14732
9,1.891000,2.113254
10,1.891000,2.092436


TrainOutput(global_step=1780, training_loss=1.9394332800018654, metrics={'train_runtime': 591.1817, 'train_samples_per_second': 23.851, 'train_steps_per_second': 3.011, 'total_flos': 3711187860480000.0, 'train_loss': 1.9394332800018654, 'epoch': 20.0})

In [None]:
# keep the final weights and the tokenizer side by side in one sub-folder
# of output_dir so the pair can be reloaded from a single path
final_path = f"{output_dir}/final_model"
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"Success, saved to {final_path}")

Model saved to /content/drive/MyDrive/chess-bert-mlm/final_model


In [None]:
from transformers import pipeline

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

prompts = [
    "The most aggressive opening is the [MASK] defense.",
    "The [MASK] attack catches many novices off guard in chess.",
]

# A notebook cell renders only its final expression, so two bare
# fill_mask(...) calls would display the second result and silently drop the
# first. Gather the predictions for every prompt into one mapping and make
# that mapping the cell's last expression.
{prompt: fill_mask(prompt) for prompt in prompts}

Device set to use cuda:0


[{'score': 0.07478274405002594,
  'token': 3313,
  'token_str': 'double',
  'sequence': 'the double attack catches many novices off guard in chess.'},
 {'score': 0.06943342834711075,
  'token': 4474,
  'token_str': 'surprise',
  'sequence': 'the surprise attack catches many novices off guard in chess.'},
 {'score': 0.030931485816836357,
  'token': 3722,
  'token_str': 'simple',
  'sequence': 'the simple attack catches many novices off guard in chess.'},
 {'score': 0.026560653001070023,
  'token': 5000,
  'token_str': 'knight',
  'sequence': 'the knight attack catches many novices off guard in chess.'},
 {'score': 0.02193371020257473,
  'token': 4525,
  'token_str': 'resulting',
  'sequence': 'the resulting attack catches many novices off guard in chess.'}]

In [None]:
# MLM pipeline
mlm_pipeline = pipeline("fill-mask", model=model, tokenizer=tokenizer)

example_sentence = "The chess game started with the [MASK]."
predictions = mlm_pipeline(example_sentence)

# format one report line per candidate token, then emit them in order
report_lines = [
    f"Token: {prediction['token_str']} | Score: {prediction['score']}"
    for prediction in predictions
]
for line in report_lines:
    print(line)

Device set to use cuda:0


Token: diagram | Score: 0.11852438747882843
Token: king | Score: 0.05426441878080368
Token: queen | Score: 0.03489914909005165
Token: game | Score: 0.02870725840330124
Token: ending | Score: 0.024650724604725838


In [None]:
# test sentences for prediction: each prompt contains exactly one [MASK];
# most are chess-specific, a few are general English
test_sentences = [
    "The most aggressive opening is the [MASK] defense.",
    "The [MASK] attack catches many novices off guard in chess.",
    "The [MASK] is the most valuable chess piece.",
    "The pawn can [MASK] the knight.",
    "The knight moves to the [MASK] square on the chess board.",
    "In chess, [MASK] plays first.",
    "The pawn can [MASK] to a queen when it reaches the end of the chess board.",
    "I think, therefore I [MASK].",
    "I like [MASK] notation.",
    "He likes to play [MASK] because he is smart."
]

for sentence in test_sentences:
    # run the model first, then print the prompt followed by its candidates
    predictions = mlm_pipeline(sentence)
    report_lines = [f"Input: {sentence}"]
    report_lines += [
        f"Predicted Token: {pred['token_str']} | Score: {pred['score']}"
        for pred in predictions
    ]
    for line in report_lines:
        print(line)

Input: The most aggressive opening is the [MASK] defense.
Predicted Token: sicilian | Score: 0.3536076247692108
Predicted Token: indian | Score: 0.15814310312271118
Predicted Token: french | Score: 0.09819328784942627
Predicted Token: scandinavian | Score: 0.03905859217047691
Predicted Token: american | Score: 0.018449973315000534
Input: The [MASK] attack catches many novices off guard in chess.
Predicted Token: double | Score: 0.07478274405002594
Predicted Token: surprise | Score: 0.06943342834711075
Predicted Token: simple | Score: 0.030931485816836357
Predicted Token: knight | Score: 0.026560653001070023
Predicted Token: resulting | Score: 0.02193371020257473
Input: The [MASK] is the most valuable chess piece.
Predicted Token: bishop | Score: 0.1851297914981842
Predicted Token: knight | Score: 0.1105460450053215
Predicted Token: rook | Score: 0.07221522927284241
Predicted Token: queen | Score: 0.06810544431209564
Predicted Token: king | Score: 0.05515346676111221
Input: The pawn can