In [1]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm
2024-02-11 19:46:53.645727: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-11 19:46:53.645784: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-11 19:46:53.645806: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-11 19:46:53.650416: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


In [4]:
pds_data = TextDataset(
    tokenizer = tokenizer,
    file_path = 'BERT_LLM/BERT_LLM/data/data/PDS2.txt', # Principles of Data Science - Sinan Ozdemir
    block_size = 32 # Length of each chunk of text to use as a datapoint
)



In [5]:
pds_data[0], pds_data[0].shape # inspect the first point

(tensor([  200, 47231,  6418,   286,  6060,  5800,   198, 12211,  5061,   198,
           198,    32, 31516,   338,  5698,   284, 13905,  7605,   290,  4583,
           284,   198, 11249,   304,   171,   105,   222, 13967,  1366,    12,
         15808,  5479]),
 torch.Size([32]))

In [6]:
print(tokenizer.decode(pds_data[0]))

Principles of Data Science
Second Edition

A beginner's guide to statistical techniques and theory to
build eﬀective data-driven applications


In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer, mlm = False, # MLM is Masked Language Modelling
)

In [12]:
# Adding PAD token since the current tokenizer does not have pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    #model.resize_token_embeddings(len(tokenizer))

In [13]:
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50257]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [14]:
collator_example.input_ids

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50257]])

In [15]:
tokenizer.pad_token_id

50257

In [16]:
collator_example.attention_mask # Note the 0 in the attention mask where we have a pad token

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [17]:
collator_example.labels # Note the -100 to ignore loss calculation for the padded token
# Reminder that labels are shifted *inside* the GPT model so we don't need to worry about that

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [18]:
model = GPT2LMHeadModel.from_pretrained('gpt2') # Load up a GPT2 model

pretrained_generator = pipeline(
    'text-generation', model = model, tokenizer = 'gpt2',
    config = {'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

In [19]:
print('---------')
for generated_sequence in pretrained_generator('A dataset shows the relationships', num_return_sequences = 3):
    print(generated_sequence['generated_text'])
    print('---------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


---------
A dataset shows the relationships between those 3 data points and trends across time. (B) Trends in time series within the last 5 years for the five US Census tracts.

As shown in figure 1 above, differences in the frequency and strength of
---------
A dataset shows the relationships of the three sex sites and the three sex groups. The sex-specific analyses (Figure 1) are made of two groups (M, C, and P respectively) but the group that contains P was classified by an analysis
---------
A dataset shows the relationships such that the first two relationships are strongly associated, and the final ones are strongly associated. This means that when you're looking at data, your visual comprehension is in full swing.

It's helpful to know what's
---------


In [20]:
training_args = TrainingArguments(
    output_dir = "./gpt2_pds", # The output directory
    overwrite_output_dir = True, # Overwrite the content of the output directory
    num_train_epochs = 3, # Number of training epochs
    per_device_train_batch_size = 32, # Batch size for training
    per_device_eval_batch_size = 32, # Batch size for evaluation
    warmup_steps = len(pds_data.examples) //5, # Number of warmup steps dor learning rate scheduler
    logging_steps = 50,
    load_best_model_at_end = True,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch'
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = pds_data.examples[:int(len(pds_data.examples)*0.8)],
    eval_dataset = pds_data.examples[int(len(pds_data.examples)*0.8):]
)

trainer.evaluate()

{'eval_loss': 4.955997943878174,
 'eval_runtime': 3.6911,
 'eval_samples_per_second': 254.666,
 'eval_steps_per_second': 8.128}

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.2968,4.095623
2,3.7894,3.864108
3,3.4068,3.775156


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=354, training_loss=3.897096394145556, metrics={'train_runtime': 94.7122, 'train_samples_per_second': 118.971, 'train_steps_per_second': 3.738, 'total_flos': 184014913536000.0, 'train_loss': 3.897096394145556, 'epoch': 3.0})

In [22]:
trainer.evaluate()

{'eval_loss': 3.775155782699585,
 'eval_runtime': 1.9812,
 'eval_samples_per_second': 474.455,
 'eval_steps_per_second': 15.142,
 'epoch': 3.0}

In [23]:
trainer.save_model()

In [25]:
loaded_model = GPT2LMHeadModel.from_pretrained("./gpt2_pds")

finetuned_generator = pipeline(
    'text-generation', model = loaded_model, tokenizer = tokenizer,
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

In [26]:
print('---------')
for generated_sequence in finetuned_generator('A dataset shows the relationships', num_return_sequences = 3):
    print(generated_sequence['generated_text'])
    print('---------')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


---------
A dataset shows the relationships among the data. As shown in the following diagram (in blue), the
data represents whether there were positive and negative correlation coefficients between the data
and the chi-square test. To calculate the mean and standard deviation,
---------
A dataset shows the relationships between two variables, such as
population size.
Let's begin by looking at population size and mean
size:



fig. 3

The two values in the x axis are all the values in our y
---------
A dataset shows the relationships between categorical variables and
order of the categorical variables.

[ 182 ]

Problems of Data Science

Chapter 5

We have looked at many different kinds of issues with data science, but
---------
