In [1]:
import transformers 
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, ClassLabel, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

W0701 02:01:47.848019 18948 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
with open("simple.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

lines = [line.strip() for line in lines]
df = pd.DataFrame(lines, columns=["text"])

In [3]:
simple_data = Dataset.from_pandas(df)

In [4]:
simple_data[:5]

{'text': ['M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people youâ€™d expect to be involved in anything strange or mysterious, because they just didnâ€™t hold with such nonsense.',
  '',
  'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.',
  '',
  'The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didnâ€™t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursleyâ€™s sis

In [5]:
train_test_split = simple_data.train_test_split(test_size=0.2)

In [6]:
dataset = DatasetDict({
    "train" :train_test_split['train'],
    "test" : train_test_split["test"]
})

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5256
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1315
    })
})

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')



In [9]:
tokenizer.pad_token = tokenizer.eos_token

In [10]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [11]:
# tokenizer.add_special_tokens = tokenizer({'pad_token':'[PAD]'})

In [23]:
def tokenize_function(example):
    tokenized = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    tokenized["labels"] = tokenized["input_ids"]
    return tokenized


In [24]:
tokenizer_data = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5256 [00:00<?, ? examples/s]

Map:   0%|          | 0/1315 [00:00<?, ? examples/s]

In [25]:
model = GPT2LMHeadModel.from_pretrained('gpt2',
                                        pad_token_id = tokenizer.eos_token_id)

In [26]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [31]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    save_steps=1000,
    max_steps=500,
    fp16=True,
)

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_data['train'],
    eval_dataset=tokenizer_data['test']
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [34]:
trainer.train()

Step,Training Loss
10,8.1976
20,4.0626
30,0.2186
40,0.273
50,0.0538
60,0.1137
70,0.1994
80,0.3124
90,0.1834
100,0.086


TrainOutput(global_step=500, training_loss=0.37109557196497917, metrics={'train_runtime': 206.2604, 'train_samples_per_second': 2.424, 'train_steps_per_second': 2.424, 'total_flos': 130646016000000.0, 'train_loss': 0.37109557196497917, 'epoch': 0.1})

In [35]:
trainer.save_model("output_dir")
tokenizer.save_pretrained("output_dir")

('output_dir\\tokenizer_config.json',
 'output_dir\\special_tokens_map.json',
 'output_dir\\vocab.json',
 'output_dir\\merges.txt',
 'output_dir\\added_tokens.json')

In [36]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("output_dir")  
tokenizer = GPT2Tokenizer.from_pretrained("output_dir")
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [37]:
def generate_text(prompt, max_length=50):
    inputs = tokenizer.encode(prompt, return_tensors="pt") 
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id 
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [38]:
prompt = "Once upon a time"
generated = generate_text(prompt)
print(generated)

Once upon a time, the headmaster had already given a long glance at the back of the room. Harry was too busy looking up at him to look at this. He had taken a deep breath and stood up.
