I adapted this from a tutorial

## Setup


In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.1/88.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m78.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.1 MB/s[0m eta [36m0

## Load Dataset

In [None]:
from datasets import load_from_disk
from google.colab import drive

# Make Google Drive available to the Colab runtime under /content/drive.
drive.mount('/content/drive')

# Read the dataset stored on Drive and print a summary of what was loaded.
dataset = load_from_disk(
    "/content/drive/MyDrive/ProjectNLP/data/final_datasets/classify_into_checkmate_check_none"
)
print(dataset)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DatasetDict({
    fen_mate_in_one_train: Dataset({
        features: ['input', 'label', 'clean_input'],
        num_rows: 181892
    })
    fen_mate_in_one_test: Dataset({
        features: ['input', 'label', 'clean_input'],
        num_rows: 9574
    })
})


In [None]:
# Keep only the splits whose name ends with "_train".
train_datasets = {
    split_name: split
    for split_name, split in dataset.items()
    if split_name.endswith('_train')
}


## Load the model

In this section we load the [ChessGPT 2.7B model](https://huggingface.co/Waterhorse/chessgpt-base-v1) and attach LoRA adapters to it.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from google.colab import drive

# Hub identifier of the base checkpoint. It is the single source of truth for
# both the model and the tokenizer, so the two cannot drift apart.
model_name = "Waterhorse/chessgpt-base-v1"

# Load the base causal language model with float16 weights.
# trust_remote_code=True permits custom code shipped in the checkpoint
# repository to be executed while loading.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
# Turn off the generation cache flag on the model config before training.
model.config.use_cache = False

Downloading (…)lve/main/config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/42.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/10.1G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Let's also load the tokenizer below

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Use the end-of-sequence token for padding as well.
# NOTE(review): presumably this tokenizer ships without a pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Below we define the LoRA configuration used to create the adapted model. According to the QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance. Therefore, in addition to the mixed query/key/value layer (`query_key_value`), we add the `dense`, `dense_h_to_4h` and `dense_4h_to_h` layers to the target modules.

In [None]:
from peft import LoraConfig

# LoRA hyper-parameters, kept as notebook-level names so they are easy to tweak.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Adapter configuration for causal language modelling. The target modules are
# the mixed query/key/value projection plus the three dense layers named in
# the text above; no bias terms are trained.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
)

## Load the trainer

Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer), which provides a wrapper around the transformers `Trainer` to easily fine-tune models on instruction-based datasets using PEFT adapters. Let's first set up the training arguments below.

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from datasets import concatenate_datasets

# Directory on Google Drive handed to the trainer as its output directory.
output_dir = "/content/drive/MyDrive/ProjectNLP/saved_models/temp_new_A"
per_device_train_batch_size = 32
# 32 samples * 4 accumulation steps = 128 samples per optimizer step per device.
gradient_accumulation_steps = 4
optim = "paged_adamw_8bit"
save_steps = 500
logging_steps = 2
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03

# No max_steps is passed, so the run length is left to the TrainingArguments
# defaults. max_seq_length is assigned further down in this cell, right before
# the sequence-length check and the trainer that consume it.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
)

# Shuffle every training split, merge them into one dataset and shuffle again
# so that rows from different splits are interleaved. The loop variable is
# deliberately not called `dataset`, to avoid shadowing the loaded DatasetDict.
shuffled_datasets = [split.shuffle() for split in train_datasets.values()]
concatenated_dataset = concatenate_datasets(shuffled_datasets).shuffle()


def formatting_prompts_func(example):
    """Build one question/answer training string per row of a batch.

    Args:
        example: Mapping with parallel ``'input'`` and ``'label'`` columns of
            equal length, such as a batched slice of the dataset.

    Returns:
        A list with one string per row, in row order. Each string is the
        question prefixed with "### Q: ", then a newline, then the label
        prefixed with " ### A: ".
    """
    return [
        f"### Q: {question}\n ### A: {answer}"
        for question, answer in zip(example['input'], example['label'])
    ]
# Marker that introduces the answer inside every formatted prompt; it is
# handed to the completion-only collator together with the tokenizer.
response_template = " ### A:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Sanity check: format the first 100000 rows and count how many of them
# tokenize to more than max_seq_length tokens.
max_seq_length = 82
mixed = formatting_prompts_func(concatenated_dataset[:100000])
print(mixed[:20])
over_max_length = sum(
    len(tokenizer.encode(row)) > max_seq_length
    for row in mixed
)
print(f"Number of sequences longer than max sequence length: {over_max_length}")


['### Q: Is this fen position 2Q1k1rr/5q2/2p2P2/1pPpB1b1/pP1Pp1P1/P3P2p/7P/5RRK a mate, a check or neither for White:\n ### A: mate', '### Q: Is this fen position 7r/pb2qk2/1p1pNpQ1/2pP1P2/4P3/8/PP3KP1/4R3 a mate, a check or neither for White:\n ### A: mate', '### Q: Is this fen position 3Q1b1r/6p1/3p1k1p/4pPq1/1P5N/3P3b/P4PPP/RN3RK1 a mate, a check or neither for White:\n ### A: check', '### Q: Is this fen position 6R1/7p/1p1p2pk/p3bp2/7Q/8/5P1P/5K2 a mate, a check or neither for White:\n ### A: mate', '### Q: Is this fen position 8/Q6p/R3k1p1/3p1p2/3P1P1P/2P1B3/6K1/8 a mate, a check or neither for White:\n ### A: mate', '### Q: Is this fen position 1r3rk1/ppp1qp2/2np1n1B/4p3/2P1P3/P1P2Q1P/2P3P1/R4RK1 a mate, a check or neither for White:\n ### A: neither', '### Q: Is this fen position r1b2r2/1pp1nNk1/pbn4p/4N2Q/2pq4/6BP/PP3PP1/R4RK1 a mate, a check or neither for White:\n ### A: neither', '### Q: Is this fen position 8/4QRk1/pp1P2p1/7p/4p3/PP5P/6P1/6K1 a mate, a check or neither for 

Then finally pass everything to the trainer.

In [None]:

# Assemble the supervised fine-tuning trainer from the pieces built above:
# base model and tokenizer, LoRA configuration, training arguments, the merged
# training data with its prompt formatter, and the completion-only collator.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=concatenated_dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_seq_length,
)


In [None]:
# Show the raw 'input' column of the first ten rows of the merged dataset.
print(concatenated_dataset.select(range(10))['input'])

['Is this fen position 2Q1k1rr/5q2/2p2P2/1pPpB1b1/pP1Pp1P1/P3P2p/7P/5RRK a mate, a check or neither for White:', 'Is this fen position 7r/pb2qk2/1p1pNpQ1/2pP1P2/4P3/8/PP3KP1/4R3 a mate, a check or neither for White:', 'Is this fen position 3Q1b1r/6p1/3p1k1p/4pPq1/1P5N/3P3b/P4PPP/RN3RK1 a mate, a check or neither for White:', 'Is this fen position 6R1/7p/1p1p2pk/p3bp2/7Q/8/5P1P/5K2 a mate, a check or neither for White:', 'Is this fen position 8/Q6p/R3k1p1/3p1p2/3P1P1P/2P1B3/6K1/8 a mate, a check or neither for White:', 'Is this fen position 1r3rk1/ppp1qp2/2np1n1B/4p3/2P1P3/P1P2Q1P/2P3P1/R4RK1 a mate, a check or neither for White:', 'Is this fen position r1b2r2/1pp1nNk1/pbn4p/4N2Q/2pq4/6BP/PP3PP1/R4RK1 a mate, a check or neither for White:', 'Is this fen position 8/4QRk1/pp1P2p1/7p/4p3/PP5P/6P1/6K1 a mate, a check or neither for White:', 'Is this fen position 5kr1/p7/2p1Pp2/2Np4/5Q2/8/P1P2K2/5B2 a mate, a check or neither for White:', 'Is this fen position 8/p3p3/8/1k6/8/qP5P/P1P3Q1/1K2R

We will also pre-process the model by upcasting the layer norms to float32 for more stable training.

In [None]:
# Cast every sub-module whose qualified name contains "norm" to float32.
for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    # Module.to converts the parameters in place and returns the same module,
    # so the result does not need to be assigned back.
    module.to(torch.float32)

## Train the model

Now let's train the model! Simply call `trainer.train()`

In [None]:
# Run the fine-tuning loop with the trainer configured above; the returned
# value is displayed as the cell output.
trainer.train()

Step,Training Loss
2,6.609
4,6.7589
6,6.856
8,6.7566
10,6.2032
12,6.0736
14,5.3273
16,4.7595
18,3.8141
20,2.9394


TrainOutput(global_step=4263, training_loss=0.3379636500227376, metrics={'train_runtime': 7387.946, 'train_samples_per_second': 73.86, 'train_steps_per_second': 0.577, 'total_flos': 6.431027540116685e+17, 'train_loss': 0.3379636500227376, 'epoch': 3.0})

In [None]:
 # @title Save the model
 # Write the trained model to a Google Drive directory through the trainer.
 # NOTE(review): since the trainer was built with a peft_config, this
 # presumably stores the LoRA adapter weights rather than a full model - confirm.
 trainer.save_model("/content/drive/MyDrive/ProjectNLP/saved_models/chessgpt_trained_check_mate_none_new")