In [1]:
# !apt-get install cuda

In [2]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [3]:
from transformers import GPT2Tokenizer, GPT2TokenizerFast, TextDataset, DataCollatorForLanguageModeling, \
                         GPT2LMHeadModel, pipeline, Trainer, TrainingArguments
import torch

In [5]:
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [6]:
def load_dataset(file_path, tokenizer, block_size = 32):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [9]:
pds_data = load_dataset(file_path = '/content/pds.txt', tokenizer = tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (128380 > 1024). Running this sequence through the model will result in indexing errors


In [10]:
pds_data[0], pds_data[0].shape

(tensor([47231,  6418,   286,  6060,  5800,   628,   198, 15269, 10673, 48609,
          6400,    83, 23499,   198,   198,  3237,  2489, 10395,    13,  1400,
           636,   286,   428,  1492,   743,   307, 31759,    11,  8574,   287,
           257, 45069]),
 torch.Size([32]))

In [11]:
print(tokenizer.decode(pds_data[0]))

Principles of Data Science


Copyright © 2024 Packt Publishing

All rights reserved. No part of this book may be reproduced, stored in a retrieval


In [12]:
# Define a padding token for the tokenizer
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [13]:
data_collactor = DataCollatorForLanguageModeling(
        tokenizer = tokenizer,
        mlm=False, # MLM is mask modeling
)

In [14]:
collactor_example = data_collactor([tokenizer('I am an input'), tokenizer('So am i')])
collactor_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,  1312, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716, 1312, -100]])}

In [15]:
collactor_example.input_ids

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,  1312, 50256]])

In [16]:
tokenizer.pad_token_id

50256

In [17]:
collactor_example.attention_mask # note the 0 mask where we have a pad

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [18]:
collactor_example.labels

tensor([[  40,  716,  281, 5128],
        [2396,  716, 1312, -100]])

In [19]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer=tokenizer,
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [20]:
print('-----------')
for generated_sequence in pretrained_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('-------------')

-----------
A dataset shows the relationships between various groups and group differences in academic achievement in the general population. We used three datasets and developed a statistical procedure to classify them under the broad umbrella of two different data sources: the US Bureau of Economic Analysis and the American
-------------
A dataset shows the relationships between the number of new entries in the dictionary and the number of previous entries in the dictionary. There are three kinds of new entries: those whose information was last updated in the last five years, those whose information was last updated
-------------
A dataset shows the relationships between the CPE (curve) width and its relative density during the time the population was sampled in various geographical areas, such as India from South to North. The distribution of CPE (curve) density in regions
-------------


In [21]:
checkpoint_dir = "./gpt2_pds_checkpoint"

training_args = TrainingArguments(
    output_dir="./gpt2_pds", #The output directory
    overwrite_output_dir=True, # foverwrite the content of the output directory
    num_train_epochs = 1, # number of training epochs
    per_device_train_batch_size=32, #batch size for training
    per_device_eval_batch_size=32, #batch size for evaluation
    warmup_steps = len(pds_data.examples) // 5, #number of warmup stepa for learn
    logging_steps = 50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collactor,
    train_dataset=pds_data.examples[:int (len(pds_data.examples)*.8)],
    eval_dataset=pds_data.examples[int (len(pds_data.examples)*.8):]
)

trainer.evaluate()

{'eval_loss': 4.388331413269043,
 'eval_runtime': 104.0926,
 'eval_samples_per_second': 7.714,
 'eval_steps_per_second': 0.25}

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.9828,4.144012


Non-default generation parameters: {'max_length': 50, 'do_sample': True}
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=101, training_loss=4.0871305748967846, metrics={'train_runtime': 1340.6029, 'train_samples_per_second': 2.393, 'train_steps_per_second': 0.075, 'total_flos': 52389052416000.0, 'train_loss': 4.0871305748967846, 'epoch': 1.0})

In [23]:
trainer.save_model()

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


In [24]:
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

finetuned_generator = pipeline(
    'text-generation',
    model=loaded_model,
    tokenizer=tokenizer,
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

In [25]:
print('-----------')
for generated_sequence in finetuned_generator('A dataset shows the relationships', num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('-------------')

-----------
A dataset shows the relationships between two groups of genes in a region. We can analyze the data using the following equation:

We see that:

All the data points are consistent across all three variables that correlate with our study. The model
-------------
A dataset shows the relationships between the number of covariates including the variable (1, 2, 9, 16, 32) and the degree to which those covariates were associated with the associated variables (1-3). Furthermore, because the variance associated
-------------
A dataset shows the relationships between data sets in a variety of data sets and their average across these datasets. In this paper, we use a dataset of all data set (i.e., the SVM1 dataset) at the end and show that
-------------


# CheckPoint
---

In [None]:
# Tentukan lokasi penyimpanan checkpoint
checkpoint_dir = "./gpt2_pds_checkpoint"

# Muat model dari checkpoint terakhir
model = Trainer.load_model(checkpoint_dir)

In [None]:
# Lanjutkan pelatihan
trainer.train()

In [None]:
# Evaluasi model jika diperlukan
trainer.evaluate()