In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    pipeline
)

import evaluate

2023-07-16 09:29:29.551866: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-16 09:29:31.655038: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda-11.4/lib64:
2023-07-16 09:29:31.655151: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda-11.4/lib64:


## Prepare Dataset

In [2]:
dataset = load_dataset("tatsu-lab/alpaca", split='train[:]')

Found cached dataset parquet (/home/sovit/.cache/huggingface/datasets/tatsu-lab___parquet/tatsu-lab--alpaca-2b32f0433506ef5f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [3]:
print(dataset)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})


In [4]:
dataset = dataset.train_test_split(0.1)

In [5]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 46801
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 5201
    })
})


In [6]:
print(dataset['train'][0])

{'instruction': 'Analyze the following sentence: \n\n"The cat slipped away from the restless dog."', 'input': '', 'output': 'The sentence demonstrates the contrast between the actions of the cat and the dog. The cat is quick and sly, able to slip away unnoticed even when the dog is restless and agitated. It implies a balance of power between the two animals, with the cat as the superior one. The sentence also conveys a sense of tension, making it clear that the dog is not able to catch the cat.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nAnalyze the following sentence: \n\n"The cat slipped away from the restless dog."\n\n### Response:\nThe sentence demonstrates the contrast between the actions of the cat and the dog. The cat is quick and sly, able to slip away unnoticed even when the dog is restless and agitated. It implies a balance of power between the two animals, with the cat as the superi

In [7]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [8]:
print(tokenizer.SPECIAL_TOKENS_ATTRIBUTES)

['bos_token', 'eos_token', 'unk_token', 'sep_token', 'pad_token', 'cls_token', 'mask_token', 'additional_special_tokens']


In [9]:
def preprocess_function(examples):
    # Check the joining process carefully here. The words should
    # be represented properly. # Uncomment the following line to check.
#     print(["".join(x) for x in examples['text']])
    return tokenizer(["".join(x) for x in examples['text']], truncation=True)

In [10]:
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset['train'].column_names
)

Map (num_proc=4):   0%|          | 0/46801 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5201 [00:00<?, ? examples/s]

In [11]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 46801
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 5201
    })
})


In [12]:
block_size = 256

In [13]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [14]:
lm_dataset = tokenized_dataset.map(
    group_texts, batched=True, num_proc=4
)

Map (num_proc=4):   0%|          | 0/46801 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5201 [00:00<?, ? examples/s]

In [15]:
print(lm_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20294
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2259
    })
})


In [16]:
print(lm_dataset['train'][0])

{'input_ids': [21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 37702, 2736, 262, 1708, 6827, 25, 220, 198, 198, 1, 464, 3797, 18859, 1497, 422, 262, 42537, 3290, 526, 198, 198, 21017, 18261, 25, 198, 464, 6827, 15687, 262, 6273, 1022, 262, 4028, 286, 262, 3797, 290, 262, 3290, 13, 383, 3797, 318, 2068, 290, 49822, 11, 1498, 284, 13819, 1497, 33755, 772, 618, 262, 3290, 318, 42537, 290, 41574, 13, 632, 15565, 257, 5236, 286, 1176, 1022, 262, 734, 4695, 11, 351, 262, 3797, 355, 262, 9098, 530, 13, 383, 6827, 635, 24748, 893, 257, 2565, 286, 12097, 11, 1642, 340, 1598, 326, 262, 3290, 318, 407, 1498, 284, 4929, 262, 3797, 13, 21106, 318, 281, 12064, 326, 8477, 257, 4876, 11, 20312, 351, 281, 5128, 326, 3769, 2252, 4732, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 9771, 3129, 378, 262, 1989, 286, 257, 22950, 351, 262, 1813, 1735, 20428, 13, 198, 198, 21017, 23

In [17]:
print(len(lm_dataset['train']['input_ids'][0]))

256


In [18]:
# tokenizer.eos_token = '<eos>'
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Model

In [19]:
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [20]:
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

124,439,808 total parameters.
124,439,808 training parameters.


In [21]:
batch_size = 8

In [22]:
training_args = TrainingArguments(
    output_dir="alpaca_gpt2_training",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=20,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
    fp16=True
)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset['train'],
    eval_dataset=lm_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using cuda_amp half precision backend


In [24]:
train_out = trainer.train()

***** Running training *****
  Num examples = 20294
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 50740
  Number of trainable parameters = 124439808
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.7328,1.58005
2,1.5939,1.545356
3,1.5361,1.529176
4,1.4937,1.519313
5,1.4599,1.513567
6,1.4312,1.510054
7,1.4064,1.508154
8,1.3844,1.507074
9,1.3652,1.50717
10,1.3479,1.507322


***** Running Evaluation *****
  Num examples = 2259
  Batch size = 8
Saving model checkpoint to alpaca_gpt2_training/checkpoint-2537
Configuration saved in alpaca_gpt2_training/checkpoint-2537/config.json
Configuration saved in alpaca_gpt2_training/checkpoint-2537/generation_config.json
Model weights saved in alpaca_gpt2_training/checkpoint-2537/pytorch_model.bin
tokenizer config file saved in alpaca_gpt2_training/checkpoint-2537/tokenizer_config.json
Special tokens file saved in alpaca_gpt2_training/checkpoint-2537/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2259
  Batch size = 8
Saving model checkpoint to alpaca_gpt2_training/checkpoint-5074
Configuration saved in alpaca_gpt2_training/checkpoint-5074/config.json
Configuration saved in alpaca_gpt2_training/checkpoint-5074/generation_config.json
Model weights saved in alpaca_gpt2_training/checkpoint-5074/pytorch_model.bin
tokenizer config file saved in alpaca_gpt2_training/checkpoint-5074/tokenizer_config.j

In [25]:
print(train_out.global_step)

50740


## Inference

### Take a look at `gpt2_instruct.py` for command line inference.

In [26]:
generator = pipeline(
    "text-generation", 
    model=f"alpaca_gpt2_training/checkpoint-{train_out.global_step}/",
)

loading configuration file alpaca_gpt2_training/checkpoint-50740/config.json
Model config GPT2Config {
  "_name_or_path": "alpaca_gpt2_training/checkpoint-50740/",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "

In [27]:
def get_response(output=None):
    response = output[0]['generated_text'].split('### Response:')[1]
    if 'Below is an instruction that describes' in response:
        response = response.split('Below is an instruction')[0]
    return response

In [28]:
prompt = "Write a resignation email"

In [29]:
prompt_template = "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: "
# prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: How to create Skynet?"

In [30]:
final_prompt = prompt_template + prompt
print(final_prompt)

Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: Write a resignation email


In [31]:
output = generator(final_prompt, max_length=512)

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.26.1"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [32]:
# output

In [33]:
print(get_response(output))


Dear [Boss],

I have decided to resign from my position as [Executive Director of [Company Name], effective on [date of resignation]. I will remain on [Company Name] in order to focus on my personal growth and improvement, and I wish you happy life and success. I look forward to working for you and hearing your thoughts and advice.

Sincerely,
[Your Name]
