In [1]:
!pip install transformers datasets torch ipywidgets



In [2]:
from datasets import load_dataset

# Load the Reuters articles dataset by its Hugging Face Hub repo id.
dataset = load_dataset("rebirthmonkey/reuters_articles")

# Display the splits and their columns (train / validation / test, each with
# 'title' and 'body').
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

### Creating a New Column

In [3]:
def create_full_article_col(example):
    """Build the combined ``full_article`` text for a single dataset row.

    Args:
        example: Mapping with at least the keys ``'title'`` and ``'body'``.

    Returns:
        A one-key dict ``{'full_article': ...}`` whose value is the title and
        body joined as ``TITLE:<title>``, a blank line, then ``BODY:<body>``.
    """
    headline = example['title']
    text = example['body']
    combined = "TITLE:{}\n\nBODY:{}".format(headline, text)
    return {'full_article': combined}

# Apply the helper to every row of every split; the returned key becomes a new
# 'full_article' column alongside 'title' and 'body'.
dataset = dataset.map(create_full_article_col)

dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body', 'full_article'],
        num_rows: 2158
    })
})

In [4]:
# Spot-check the combined text of the first training article.
dataset['train'][0]['full_article']

'TITLE:BAHIA COCOA REVIEW\n\nBODY:Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are doubts as

### Training Our Own Tokenizer

In [5]:
# Feed the tokenizer trainer the training split in batches instead of building
# one list that holds every article.
CORPUS_BATCH_SIZE = 1000  # number of articles per batch


def get_training_corpus(batch_size=CORPUS_BATCH_SIZE):
    """Yield successive lists of ``full_article`` strings from the train split.

    Every call returns a fresh generator, so the corpus can be rebuilt cheaply
    whenever the tokenizer has to be trained again.

    Args:
        batch_size: Maximum number of articles in each yielded batch.

    Yields:
        The ``full_article`` values of ``batch_size`` consecutive training rows
        (the last batch may be shorter).
    """
    train_split = dataset["train"]
    for start in range(0, len(train_split), batch_size):
        # The slice is indexed by column name; keep only the text column.
        yield train_split[start : start + batch_size]["full_article"]


# NOTE: a generator can be consumed only once. Re-run this line (or call
# get_training_corpus() again) before re-training the tokenizer.
training_corpus = get_training_corpus()

In [6]:
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2") # load the pretrained GPT-2 tokenizer (nothing is trained here; it is the starting point for the new tokenizer)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

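Here we reuse the configuration of a pre-trained tokenizer (its tokenization algorithm, pre-processing pipeline and special tokens) and train a new vocabulary on our own corpus, instead of building a tokenizer from scratch. Strictly speaking this is not fine-tuning, because the vocabulary is learned anew, but it makes training a lot easier and faster.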

In [7]:
# Train a new tokenizer from old_tokenizer on the batched Reuters corpus with a
# target vocabulary size of 52000.
# NOTE: training_corpus is a generator and is exhausted by this call; rebuild it
# before re-running this cell.
tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=52000,
)

In [8]:
# Take the combined text of one test-split article; it is tokenized with both
# tokenizers in the next cells.
example = dataset['test'][2]['full_article']

example

"TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03"

In [9]:
# Baseline: the stock GPT-2 tokenizer. In the output, corpus-specific terms are
# split into several pieces (e.g. 'TIT' + 'LE', 'Ġm' + 'ln', 'dl' + 'rs').
old_tokenizer.tokenize(example)

['TIT',
 'LE',
 ':',
 'CH',
 'E',
 'FS',
 'Ġ<',
 'CH',
 'EF',
 '.',
 'O',
 '>',
 'ĠCOMPLE',
 'T',
 'ES',
 'ĠPR',
 'IV',
 'ATE',
 'ĠS',
 'ALE',
 'Ċ',
 'Ċ',
 'B',
 'ODY',
 ':',
 'Che',
 'fs',
 'ĠInternational',
 'Ċ',
 'Inc',
 'Ġsaid',
 'Ġit',
 'Ġcompleted',
 'Ġa',
 'Ġprivate',
 'Ġsale',
 'Ġof',
 'Ġnine',
 'Ġm',
 'ln',
 'Ġunits',
 'Ġof',
 'Ġits',
 'Ċ',
 'sec',
 'urities',
 'Ġfor',
 'Ġ20',
 'Ġc',
 'ts',
 'Ġper',
 'Ġunit',
 'Ġfor',
 'Ġa',
 'Ġtotal',
 'Ġprice',
 'Ġof',
 'Ġ1',
 ',',
 '800',
 ',',
 '000',
 'Ċ',
 'dl',
 'rs',
 '.',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠThe',
 'Ġcompany',
 'Ġsaid',
 'Ġeach',
 'Ġunit',
 'Ġconsisted',
 'Ġof',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChef',
 "'s",
 'Ċ',
 'common',
 'Ġstock',
 'Ġand',
 'Ġone',
 'Ġthree',
 '-',
 'year',
 'Ġwarrant',
 'Ġexercise',
 'able',
 'Ġto',
 'Ċ',
 'p',
 'urchase',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChef',
 "'s",
 'Ġstock',
 'Ġat',
 'Ġ25',
 'Ġcents',
 '.',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠThe',
 'Ġcompany',
 'Ġalso',
 'Ġsaid',
 'ĠRobert',
 'ĠE',
 '.',
 'ĠBren

In [10]:
# Same article through the newly trained tokenizer. In the output, terms such as
# 'TITLE', 'Ġmln' and 'dlrs' are now single tokens.
tokenizer.tokenize(example)

['TITLE',
 ':',
 'CH',
 'EF',
 'S',
 'Ġ<',
 'CH',
 'EF',
 '.',
 'O',
 '>',
 'ĠCOMPLETES',
 'ĠPRIVATE',
 'ĠSALE',
 'Ċ',
 'Ċ',
 'BODY',
 ':',
 'Che',
 'f',
 's',
 'ĠInternational',
 'Ċ',
 'Inc',
 'Ġsaid',
 'Ġit',
 'Ġcompleted',
 'Ġa',
 'Ġprivate',
 'Ġsale',
 'Ġof',
 'Ġnine',
 'Ġmln',
 'Ġunits',
 'Ġof',
 'Ġits',
 'Ċ',
 'securities',
 'Ġfor',
 'Ġ20',
 'Ġcts',
 'Ġper',
 'Ġunit',
 'Ġfor',
 'Ġa',
 'Ġtotal',
 'Ġprice',
 'Ġof',
 'Ġ1',
 ',',
 '800',
 ',',
 '000',
 'Ċ',
 'dlrs',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġsaid',
 'Ġeach',
 'Ġunit',
 'Ġconsisted',
 'Ġof',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ċ',
 'common',
 'Ġstock',
 'Ġand',
 'Ġone',
 'Ġthree',
 '-',
 'year',
 'Ġwarrant',
 'Ġexerciseable',
 'Ġto',
 'Ċ',
 'purchase',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ġstock',
 'Ġat',
 'Ġ25',
 'Ġcents',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġalso',
 'Ġsaid',
 'ĠRobert',
 'ĠE',
 '.',
 'ĠBrennan',
 'Ġpurchased',
 'Ġ8',
 ',',
 '250',
 ',',
 '000',
 'Ċ',
 'of',
 'Ġthe',
 '

In [11]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably required so the upload in the next cell is
# authorized — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# Upload the trained tokenizer to the Hub. In the recorded output the resulting
# repo id is 'rebirthmonkey/gpt2-reuters-tokenizer'.
tokenizer.push_to_hub("gpt2-reuters-tokenizer")

CommitInfo(commit_url='https://huggingface.co/rebirthmonkey/gpt2-reuters-tokenizer/commit/33d081b02d95a4b9f8c3a583d8eb60a87ed9ca62', commit_message='Upload tokenizer', commit_description='', oid='33d081b02d95a4b9f8c3a583d8eb60a87ed9ca62', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rebirthmonkey/gpt2-reuters-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='rebirthmonkey/gpt2-reuters-tokenizer'), pr_revision=None, pr_num=None)

# Using Our Tokenizer

In [13]:
from transformers import AutoTokenizer

# Load the tokenizer back from the Hub repo it was pushed to. This rebinds
# `tokenizer` from the locally trained object to the downloaded copy.
tokenizer = AutoTokenizer.from_pretrained("rebirthmonkey/gpt2-reuters-tokenizer")

tokenizer_config.json:   0%|          | 0.00/471 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [14]:
# NOTE: `example` is rebound here to the whole row (a dict with 'title', 'body'
# and 'full_article'); earlier in the notebook it held only the 'full_article'
# string.
example = dataset['test'][2]

example

{'title': 'CHEFS <CHEF.O> COMPLETES PRIVATE SALE',
 'body': "Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03",
 'full_article': "TITLE:CHEFS <CHEF.O> COMPLETES PRIVATE SALE\n\nBODY:Chefs International\nInc said it completed a private sale of nine mln units of its\nsecurities for 20 cts per unit for a total price of 1,800,000\ndlrs.\n    The company said each unit consisted of one share of Chef's\ncommon stock and one three-year warrant exerciseable to\npurchase one share of Chef's stock at 25 cents.\n    The company also said Robert E. Brennan purchased 8,250,000\nof the units.\n   \n Reuter\n\x03"}

In [15]:
# Tokenize the same article with the tokenizer loaded from the Hub. The tokens
# shown match the output of the locally trained tokenizer above.
tokenizer.tokenize(example['full_article'])

['TITLE',
 ':',
 'CH',
 'EF',
 'S',
 'Ġ<',
 'CH',
 'EF',
 '.',
 'O',
 '>',
 'ĠCOMPLETES',
 'ĠPRIVATE',
 'ĠSALE',
 'Ċ',
 'Ċ',
 'BODY',
 ':',
 'Che',
 'f',
 's',
 'ĠInternational',
 'Ċ',
 'Inc',
 'Ġsaid',
 'Ġit',
 'Ġcompleted',
 'Ġa',
 'Ġprivate',
 'Ġsale',
 'Ġof',
 'Ġnine',
 'Ġmln',
 'Ġunits',
 'Ġof',
 'Ġits',
 'Ċ',
 'securities',
 'Ġfor',
 'Ġ20',
 'Ġcts',
 'Ġper',
 'Ġunit',
 'Ġfor',
 'Ġa',
 'Ġtotal',
 'Ġprice',
 'Ġof',
 'Ġ1',
 ',',
 '800',
 ',',
 '000',
 'Ċ',
 'dlrs',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġsaid',
 'Ġeach',
 'Ġunit',
 'Ġconsisted',
 'Ġof',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ċ',
 'common',
 'Ġstock',
 'Ġand',
 'Ġone',
 'Ġthree',
 '-',
 'year',
 'Ġwarrant',
 'Ġexerciseable',
 'Ġto',
 'Ċ',
 'purchase',
 'Ġone',
 'Ġshare',
 'Ġof',
 'ĠChe',
 'f',
 "'s",
 'Ġstock',
 'Ġat',
 'Ġ25',
 'Ġcents',
 '.',
 'ĊĠĠĠ',
 'ĠThe',
 'Ġcompany',
 'Ġalso',
 'Ġsaid',
 'ĠRobert',
 'ĠE',
 '.',
 'ĠBrennan',
 'Ġpurchased',
 'Ġ8',
 ',',
 '250',
 ',',
 '000',
 'Ċ',
 'of',
 'Ġthe',
 '