In [4]:
import os
import logging
from datetime import datetime
from datasets import load_dataset
from pprint import pprint
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer


os.environ["CUDA_VISIBLE_DEVICES"] = "7"

save_dir = f'training-{datetime.now().strftime("%Y%m%d_%H:%M:%S")}'
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

logging.basicConfig(filename=os.path.join(save_dir, 'training.log'),
                    format="%(asctime)s || %(message)s", 
                    datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


OSError: [WinError 123] 文件名、目录名或卷标语法不正确。: 'training-20241201_00:49:55'

In [2]:
pretrained_file_path = '../../DataCollection/officials/gpt2'

# config = AutoConfig(
#     vocab_size=50257,        # Typically, GPT-2 uses a vocabulary size of 50257
#     n_positions=1024,        # Maximum number of tokens in a sequence
#     n_ctx=1024,              # Context window size
#     n_embd=768,              # Embedding dimension
#     n_layer=12,              # Number of transformer layers
#     n_head=12,               # Number of attention heads
# )

config = AutoConfig.from_pretrained(pretrained_file_path)
tokenizer = AutoTokenizer.from_pretrained(pretrained_file_path)

In [3]:
print(tokenizer.special_tokens_map)
if not tokenizer.pad_token:
    print('Using eos token as pad token')
    tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.special_tokens_map)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}
Using eos token as pad token
{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


In [4]:
model = AutoModelForCausalLM.from_config(config)

## Step 1. Load the dataset

In [5]:
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
# datasets = load_dataset("wikimedia/wikipedia", "20231101.en")
pprint(datasets)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


Each entry in dataset["train"] contains the full text of a Wikipedia article. For language modeling, we need to concatenate these texts and split them into chunks that match the model’s context length.

## Step 2. Tokenize the Text Data

We directly tokenize each text entry. 
The output is input_ids and attention_mask with different length. 

In [6]:
def tokenize_function(data):
    return tokenizer(data["text"])

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

In [7]:
tokenizer.decode(tokenized_datasets['train'][1]['input_ids'], skip_special_tokens=False)

' = Valkyria Chronicles III = \n'

## Step 3. Group Text into Chunks of Fixed Length

Why we didn't chunk text before converting to token ids? Because after tokenization, the lengths will definetly change. And chunking in this scenario makes no sense, since it cannot provide data of equal lengths, which is ideal for less padding as possible. The leftovers shorter than chunk size is discarded. 

In [8]:
# Define a function to group tokens into chunks of the model's context length
block_size = 1024  # Context length for GPT-2

def group_texts(examples):
    # Concatenate all texts (technically speaking, input_ids and attention_mask)
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated["input_ids"])
    # We drop the last chunk if it's smaller than block_size
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Apply the grouping function to form fixed-length chunks
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4
)
lm_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 272
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2314
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 240
    })
})

This preprocessing results in a dataset (lm_dataset) where each entry contains chunks of block_size tokens, formatted for language model training.

## Step 4. Create a Data Collator for Language Modeling

In [9]:
from transformers import DataCollatorForLanguageModeling

# Define the data collator for language modeling with batch-level padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Set mlm=False for GPT-2, as it’s an autoregressive model
    pad_to_multiple_of=None  # No fixed length; pad each batch independently
)

An example of using DataCollator.

In [10]:
batch = [
    {"input_ids": [101, 102, 103, 104]},             # Sequence 1
    {"input_ids": [201, 202]},                       # Sequence 2
    {"input_ids": [301, 302, 303, 304, 305, 306]}    # Sequence 3
]

batch = [{'input_ids':item['input_ids'], 'attention_mask':[1 for i in item['input_ids']]} for item in batch]
pprint(batch, width=100)
pprint(data_collator(batch), width=100)

[{'attention_mask': [1, 1, 1, 1], 'input_ids': [101, 102, 103, 104]},
 {'attention_mask': [1, 1], 'input_ids': [201, 202]},
 {'attention_mask': [1, 1, 1, 1, 1, 1], 'input_ids': [301, 302, 303, 304, 305, 306]}]
{'attention_mask': tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,   102,   103,   104, 50256, 50256],
        [  201,   202, 50256, 50256, 50256, 50256],
        [  301,   302,   303,   304,   305,   306]]),
 'labels': tensor([[ 101,  102,  103,  104, -100, -100],
        [ 201,  202, -100, -100, -100, -100],
        [ 301,  302,  303,  304,  305,  306]])}


If your data already has fixed-length chunks and requires no padding, the data collator's role becomes minimal. Its primary functions in this case are: 
- Batch Conversion to Tensors: It automatically converts lists of examples into PyTorch tensors, which is required for efficient batch processing in training.
- Batch Management: It helps ensure that the model gets consistent inputs by organizing data in batches and handling minor tasks like device placement (if specified).

## Step 5. Create Trainer

In [11]:
training_args = TrainingArguments(
    output_dir=save_dir,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    load_best_model_at_end=True,
    logging_strategy='steps',
    logging_steps=1,
    evaluation_strategy='steps',
    eval_steps=5,
    report_to=[], 
)



In [None]:

trainer = Trainer(
    model=model,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['validation'],
    data_collator=data_collator,
    args=training_args,
  )
trainer.train()

[2024-11-24 03:49:28,027] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/data02/hyzhang10/miniconda3/envs/xp-nlp/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/data02/hyzhang10/miniconda3/envs/xp-nlp/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/data02/hyzhang10/miniconda3/envs/xp-nlp/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/data02/hyzhang10/miniconda3/envs/xp-nlp/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/data02/hyzhang10/miniconda3/envs/xp-nlp/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/data02/hyzhang10/miniconda3/envs/xp-nlp/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: unde

Step,Training Loss,Validation Loss
5,9.4735,9.473883
10,9.252,9.155459
15,9.0101,8.740193
20,8.6458,8.511891
25,8.4383,8.311181
30,8.1774,8.141772
35,8.0332,7.984874
40,7.8178,7.840292
45,7.742,7.712692
50,7.6792,7.592248


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1450, training_loss=6.276560823177469, metrics={'train_runtime': 2187.0714, 'train_samples_per_second': 5.29, 'train_steps_per_second': 0.663, 'total_flos': 6046297620480000.0, 'train_loss': 6.276560823177469, 'epoch': 5.0})

In [17]:
model_inputs = tokenizer('I enjoy walking with my cute dog', return_tensors='pt').to('cuda')
tokenizer.decode(model.generate(model_inputs['input_ids'], max_new_tokens=40, top_p=0.92, temperature=0.7,)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'I enjoy walking with my cute dog , and the first time of the first time of the game . \n = = = = = = = = = = = = = = = = = = = = = = = = ='

In [18]:
model.generate(model_inputs['input_ids'], max_new_tokens=40)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


tensor([[   40,  2883,  6155,   351,   616, 13779,  3290,   837,   290,   262,
           717,   640,   286,   262,   717,   640,   286,   262,   983,   764,
           220,   198,   796,   796,   796,   796,   796,   796,   796,   796,
           796,   796,   796,   796,   796,   796,   796,   796,   796,   796,
           796,   796,   796,   796,   796,   796,   796]], device='cuda:0')

In [None]:
trainer.