In [None]:
!pip install transformers datasets torch evaluate accelerate

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/2

In [None]:
from datasets import load_dataset

# Fetch the Reuters articles dataset from the Hub and display its splits.
DATASET_ID = "ingeniumacademy/reuters_articles"
dataset = load_dataset(DATASET_ID)
dataset

Downloading readme:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.13M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/827k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/17262 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2158 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2158 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [None]:
def create_full_article_col(example):
  """Combine an example's title and body into one training string.

  Args:
    example: Mapping with 'title' and 'body' entries.

  Returns:
    A dict with a single 'full_article' key holding the title prefixed with
    "TITLE:" and the body prefixed with "BODY:", separated by a blank line.
  """
  template = "TITLE:{}\n\nBODY:{}"
  combined = template.format(example['title'], example['body'])
  return {'full_article': combined}

# Apply the helper one example at a time (no batched=True) to every split,
# adding the 'full_article' column alongside 'title' and 'body'.
dataset = dataset.map(create_full_article_col)
dataset

Map:   0%|          | 0/17262 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
})

In [None]:
# Peek at the combined text of the first training example.
dataset['train'][0]['full_article']

'TITLE:BAHIA COCOA REVIEW\n\nBODY:Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are doubts as

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published on the Hub for this Reuters corpus.
TOKENIZER_ID = "ingeniumacademy/gpt2-reuters-tokenizer"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)

Downloading (…)okenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/819k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/465k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
CONTEXT_LENGTH = 512

def tokenize(element):
    """Tokenize the ``full_article`` text for language-model training.

    Args:
        element: Mapping with a ``"full_article"`` entry, passed straight to
            the tokenizer (a batch of rows when called through
            ``dataset.map(..., batched=True)`` as in this notebook).

    Returns:
        The tokenizer's encoding with every sequence truncated to at most
        ``CONTEXT_LENGTH`` tokens. Tokens past the limit are dropped
        (``return_overflowing_tokens=False``) rather than emitted as
        additional rows.
    """
    return tokenizer(
        element["full_article"],
        max_length=CONTEXT_LENGTH,
        truncation=True,
        return_overflowing_tokens=False,
    )


# Tokenize every split in batches; the original columns are dropped so that
# only the tokenizer's outputs remain.
source_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    tokenize,
    batched=True,
    remove_columns=source_columns,
)
tokenized_datasets

Map:   0%|          | 0/17262 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2158
    })
})

### Preparing Model For Training

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock GPT-2 architecture, resized for our tokenizer and
# context window.
#
# GPT2Config sizes the position-embedding table from `n_positions`; `n_ctx` is
# only carried along as an extra attribute and is not read by the model. With
# `n_ctx` alone the config kept n_positions=1024 even though inputs are
# truncated to CONTEXT_LENGTH, so `n_positions` is set explicitly. `n_ctx` is
# kept in sync for anything that still reads it from the saved config.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_positions=CONTEXT_LENGTH,
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Display the resulting configuration to check the overrides.
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.33.1",
  "use_cache": true,
  "vocab_size": 52000
}

In [None]:
# Instantiate GPT-2 from the config alone (no pretrained weights are loaded
# here) and report the total parameter count in millions.
model = GPT2LMHeadModel(config)
model_size = sum(
    param.numel()
    for param in model.parameters()
)
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 125.8M parameters


In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token so the collator can pad
# batches of variable-length sequences.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### Training Model

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the training cells below set
# push_to_hub=True and therefore need write access.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the from-scratch run.
training_args = TrainingArguments(
    # Where results are written
    output_dir="./reuters-gpt2-text-gen",  # local checkpoint directory
    hub_model_id="ingeniumacademy/reuters-gpt2-text-gen",  # target repo on the Hub
    push_to_hub=True,
    # Run length and batching
    num_train_epochs=2,
    auto_find_batch_size=True,
    gradient_accumulation_steps=8,
    # Optimisation
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.1,
    fp16=True,
    # Reporting
    evaluation_strategy="epoch",
    logging_steps=10,
)

# Wire the model, tokenizer, collator and tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run the training loop configured above (2 epochs, evaluated on the
# validation split after each epoch).
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,5.235,5.313089
1,4.8859,5.011938


TrainOutput(global_step=538, training_loss=5.455267271587839, metrics={'train_runtime': 1856.3213, 'train_samples_per_second': 18.598, 'train_steps_per_second': 0.29, 'total_flos': 6937233023232000.0, 'train_loss': 5.455267271587839, 'epoch': 1.99})

In [None]:
# Upload the trained model to the Hub.
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/503M [00:00<?, ?B/s]

'https://huggingface.co/ingeniumacademy/reuters-gpt2-text-gen/tree/main/'

### Using Our Model In Pipeline

In [None]:
import torch
from transformers import pipeline

# Load the checkpoint from the Hub into a text-generation pipeline.
pipe = pipeline(task="text-generation", model="ingeniumacademy/reuters-gpt2-text-gen")

Downloading (…)lve/main/config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/503M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/819k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/465k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

In [None]:
# Pick one article from the test split to use as a prompt source.
sample = dataset['test'][2]
sample

{'title': 'CHEFS <CHEF.O> COMPLETES PRIVATE SALE',
 'body': "Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03",
 'full_article': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03"}

In [None]:
# Prompt in the same "TITLE:...", blank line, "BODY:" layout used to build
# `full_article`, stopping right after "BODY:" so the model writes the body.
prompt = f"TITLE:{sample['title']}\n\nBODY:"
pipe(prompt, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[{'generated_text': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Qtly div 16 cts vs 16 cts\n    Pay March 1,000 vs 27\n    Record February 17\n Reuter\n\x03 April 23,000 vs 18.2 mln\n    Record a share of <AMO> of 2.75\n\x03 given prior a loss gain of profit quarter 31 and loss\n\x03 due to repay 10.6 mln\n\x03 qtr.\n\x03 due 10 mln dlrs a year.\n    It bought two-year price with a share from 28.\n\x03 net profit results include six pct for March 20 cts\n\x03-date of 4.\n    Other issue of 10 to the company's-plus or\n\x03\nfor-"}]

In [None]:
# Same title, but without the trailing "BODY:" marker: the model has to
# continue from the bare title.
prompt = f"TITLE:{sample['title']}"
pipe(prompt, max_new_tokens=128)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[{'generated_text': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Moody's Group Inc said its venture\nhas increased the company's <C.Ca proposed shares outstanding mln dlr\nquarter. In denominations of its sale outstanding of the first 11.3\nstock net quarter.\n    Under the new common shares as it said the first quarter will\nwill be priced.\n    A company's dividend is an dividend of the loss\nby the net 31, it may be an\nfirst 1.\n    The offer from its first quarter and 1986.2 billion dlrs to be used to purchase for the\nstock period the issue for an debt.\n    The payment offer in the company added that"}]

### Continue Training the Pre-Trained Model

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload tokenizer and weights from the same Hub repository.
CHECKPOINT_ID = "ingeniumacademy/reuters-gpt2-text-gen"
pre_tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_ID)
pre_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_ID)

In [None]:
# Training configuration for one additional epoch on the reloaded checkpoint.
# NOTE: this rebinds `training_args` and `trainer` from the earlier run.
training_args = TrainingArguments(
    # Where results are written
    output_dir="./reuters-gpt2-text-gen",  # local checkpoint directory
    hub_model_id="ingeniumacademy/reuters-gpt2-text-gen",  # target repo on the Hub
    push_to_hub=True,
    # Run length and batching
    num_train_epochs=1,
    auto_find_batch_size=True,
    gradient_accumulation_steps=8,
    # Optimisation
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.1,
    fp16=True,
    # Reporting
    evaluation_strategy="epoch",
    logging_steps=10,
)

# NOTE(review): `data_collator` is the one built earlier from `tokenizer`,
# not from `pre_tokenizer`; the tokenized splits are likewise reused as-is.
trainer = Trainer(
    model=pre_model,
    args=training_args,
    tokenizer=pre_tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Train the reloaded model for the single extra epoch configured above.
trainer.train()

Epoch,Training Loss,Validation Loss
0,4.8282,4.253104


TrainOutput(global_step=234, training_loss=5.001323007110857, metrics={'train_runtime': 847.5428, 'train_samples_per_second': 17.698, 'train_steps_per_second': 0.276, 'total_flos': 3058415069184000.0, 'train_loss': 5.001323007110857, 'epoch': 1.0})

In [None]:
# Upload the further-trained model to the Hub.
trainer.push_to_hub()

pytorch_model.bin:   0%|          | 0.00/503M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/rb05751/reuters-gpt2-text-gen/tree/main/'