In [1]:
!pip install transformers torch evaluate accelerate
!pip install -U datasets



In [2]:
from datasets import load_dataset

dataset = load_dataset("ingeniumacademy/reuters_articles")

dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [3]:
def create_full_article_col(example):
  return {'full_article': f"TITLE:{example['title']}\n\nBODY:{example['body']}"}

dataset = dataset.map(create_full_article_col)

dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
})

In [4]:
dataset['train'][0]['full_article']

'TITLE:BAHIA COCOA REVIEW\n\nBODY:Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are doubts as

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ingeniumacademy/gpt2-reuters-tokenizer")

tokenizer_config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [6]:
CONTEXT_LENGTH = 512

def tokenize(element):
    outputs = tokenizer(
        element["full_article"],
        truncation=True,
        max_length=CONTEXT_LENGTH,
        return_overflowing_tokens=False
    )

    return outputs


tokenized_datasets = dataset.map(tokenize, batched=True, remove_columns=dataset["train"].column_names)

tokenized_datasets

Map:   0%|          | 0/17262 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

Map:   0%|          | 0/2158 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2158
    })
})

### Preparing Model For Training

In [8]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    report_to="none"
)

config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.55.0",
  "use_cache": true,
  "vocab_size": 52000
}

In [9]:
model = GPT2LMHeadModel(config)

model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 125.8M parameters


In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

### Training Model

In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
from transformers import Trainer, TrainingArguments
import os

os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./reuters-gpt2-text-gen",  # local directory
    hub_model_id="rebirthmonkey/reuters-gpt2-text-gen",  # identifier on the Hub
    eval_strategy="epoch",
    auto_find_batch_size=True,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=True,
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

  trainer = Trainer(


In [14]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,5.1909,5.285306
2,4.7733,4.993577


TrainOutput(global_step=540, training_loss=5.394092164216218, metrics={'train_runtime': 965.8353, 'train_samples_per_second': 35.745, 'train_steps_per_second': 0.559, 'total_flos': 6920172490752000.0, 'train_loss': 5.394092164216218, 'epoch': 2.0})

In [15]:
trainer.push_to_hub()

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ers-gpt2-text-gen/training_args.bin: 100%|##########| 5.37kB / 5.37kB            

  ...ers-gpt2-text-gen/model.safetensors:   8%|8         | 41.9MB /  503MB            

CommitInfo(commit_url='https://huggingface.co/rebirthmonkey/reuters-gpt2-text-gen/commit/d1999222508e251233bceec9e8c153d7d84ca211', commit_message='End of training', commit_description='', oid='d1999222508e251233bceec9e8c153d7d84ca211', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rebirthmonkey/reuters-gpt2-text-gen', endpoint='https://huggingface.co', repo_type='model', repo_id='rebirthmonkey/reuters-gpt2-text-gen'), pr_revision=None, pr_num=None)

### Using Our Model In Pipeline

In [16]:
import torch
from transformers import pipeline

pipe = pipeline("text-generation", model="rebirthmonkey/reuters-gpt2-text-gen")

config.json:   0%|          | 0.00/871 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/503M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/470 [00:00<?, ?B/s]

Device set to use cuda:0


In [17]:
sample = dataset['test'][2]

sample

{'title': 'CHEFS <CHEF.O> COMPLETES PRIVATE SALE',
 'body': "Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03",
 'full_article': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03"}

In [18]:
prompt = f"TITLE:{sample['title']}\n\nBODY:"

pipe(prompt, max_new_tokens=128)

[{'generated_text': 'TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Oper stock with the sale of the\n<General Bank, said it plans to\nagreement.\n    The company said it is raising a share to\na unit and chief executive securities.\n    The company said it are able to go to\nit will be made a share price of the\nwere in a result.\n    The company said that the sale and Exchange said it said the company has\nwould be made.\n Reuter\n\x03 of the company said that it was a share to the time will be reached with the\n\x03 to use.\n    The company of the company said its\n\x03 of the proposed stock of an initial\n\x03'}]

In [19]:
prompt = f"TITLE:{sample['title']}"

pipe(prompt, max_new_tokens=128)

[{'generated_text': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Canadian Bank Inc said it has\nto help the sale of its common shares of its\nof the common stock offer of\nmln dlr debt, the company said.\n    It said the two-for the company said it will be\nwill be used to take the annual\nfor the company's stock, the company said.\n    The company said the company said the company said it will be\nthe offer.\n    The company said the company said it is not be used to\n Reuter\n\x03 of the company's debt of the contract.\n    The company's stock is the offer to\n\x03 of its contract for\n"}]

### Keep training pre-trained model

In [20]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

pre_tokenizer = AutoTokenizer.from_pretrained("rebirthmonkey/reuters-gpt2-text-gen")
pre_model = AutoModelForCausalLM.from_pretrained("rebirthmonkey/reuters-gpt2-text-gen")

In [22]:
import os

os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./reuters-gpt2-text-gen",  # local directory
    hub_model_id="rebirthmonkey/reuters-gpt2-text-gen",  # identifier on the Hub
    eval_strategy="epoch",
    auto_find_batch_size=True,
    num_train_epochs=1,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=True,
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=pre_model,
    tokenizer=pre_tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

  trainer = Trainer(


In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.6447,4.812656


TrainOutput(global_step=270, training_loss=4.840468936496311, metrics={'train_runtime': 485.6795, 'train_samples_per_second': 35.542, 'train_steps_per_second': 0.556, 'total_flos': 3475886247936000.0, 'train_loss': 4.840468936496311, 'epoch': 1.0})

In [24]:
trainer.push_to_hub()

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ers-gpt2-text-gen/training_args.bin: 100%|##########| 5.37kB / 5.37kB            

  ...ers-gpt2-text-gen/model.safetensors:   8%|8         | 41.9MB /  503MB            

CommitInfo(commit_url='https://huggingface.co/rebirthmonkey/reuters-gpt2-text-gen/commit/d03aea42535b29b5f2555c4f4bb798eaa94ec848', commit_message='End of training', commit_description='', oid='d03aea42535b29b5f2555c4f4bb798eaa94ec848', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rebirthmonkey/reuters-gpt2-text-gen', endpoint='https://huggingface.co', repo_type='model', repo_id='rebirthmonkey/reuters-gpt2-text-gen'), pr_revision=None, pr_num=None)