# load hugging face model

In [146]:
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    top_k_top_p_filtering)

In [2]:
model_name = 'distilgpt2'

In [3]:
gn_cfg = AutoConfig.from_pretrained(model_name)

In [4]:
gn_cfg

GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.10.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [5]:
tok = AutoTokenizer.from_pretrained(model_name)

In [6]:
tok

PreTrainedTokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [7]:
tok.pad_token = tok.eos_token

In [8]:
model =  AutoModelForCausalLM.from_pretrained(model_name)

# Load finetuning data

In [9]:
import pandas as pd

In [50]:
import datasets

In [61]:
hg_df = datasets.load_dataset('csv', data_files="../data/book_dscrptions.csv")

Using custom data configuration default-0e4d2a8ce410dd40
Reusing dataset csv (/Users/sadhbh/.cache/huggingface/datasets/csv/default-0e4d2a8ce410dd40/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [75]:
hg_df = hg_df.rename_column('0', 'text')

In [77]:
hg_df['train']['text'][-1]

'Poor People. William T. Vollmann. Examines and analyzes the diversity of poverty, using a series of interviews with impoverished people from around the world, who draw on their belief systems to account for their financial disadvantages.'

# Tokenize data

In [14]:
import itertools

## add eos to data

In [80]:
def add_eos(sample, eos_token=tok.eos_token):
    return {'text': "".join([sample['text'], eos_token])}

In [83]:
hg_df['train'] = hg_df['train'].map(add_eos)

Loading cached processed dataset at /Users/sadhbh/.cache/huggingface/datasets/csv/default-0e4d2a8ce410dd40/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff/cache-523d841baf0d1c0d.arrow


In [84]:
hg_df['train']['text'][-1]

'Poor People. William T. Vollmann. Examines and analyzes the diversity of poverty, using a series of interviews with impoverished people from around the world, who draw on their belief systems to account for their financial disadvantages.<|endoftext|>'

## tokenize texts

In [90]:
def tokenize_function(text):
    return tok(text['text'])

In [91]:
tokenized = hg_df.map(tokenize_function)

HBox(children=(FloatProgress(value=0.0, max=7779.0), HTML(value='')))




In [100]:
tokenized = tokenized['train'].remove_columns('text')

## concat texts and batch

In [93]:
tok.pad_token_id

50256

In [102]:
block_size = 1024
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [103]:
lm_input = tokenized.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

   

HBox(children=(FloatProgress(value=0.0, description=' #1', max=2.0, style=ProgressStyle(description_width='ini…

 

HBox(children=(FloatProgress(value=0.0, description=' #2', max=2.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #0', max=2.0, style=ProgressStyle(description_width='ini…

HBox(children=(FloatProgress(value=0.0, description=' #3', max=2.0, style=ProgressStyle(description_width='ini…







In [110]:
"".join([tok.decode(idi) for idi in lm_input[2]['input_ids']])

' to America after two decades of living abroad and his disconcerting reunion with his homeland as he discusses motels, tax-return instructions, and hardware stores.<|endoftext|>The Lost Continent: Travels in Small Town America. Bill Bryson. An unsparing and hilarious account of one man\'s rediscovery of America and his search for the perfect small town.<|endoftext|>Neither Here nor There: Travels in Europe. Bill Bryson. Like many of his generation, Bill Bryson backpacked across Europe in the early seventies -- in search of enlightenment, beer, and women. Twenty years later he decided to retrace the journey he undertook in the halcyon days of his youth. The result is Neither Here Nor There, an affectionate and riotously funny pilgrimage from the frozen wastes of Scandinavia to the chaotic tumult of Istanbul, with stops along the way in Europe\'s most diverting and historic locales. Like many of his generation, Bill Bryson backpacked across Europe in the early seventies--in search of en

# Finetune model

In [111]:
trainer = Trainer(
    model=model,
    train_dataset=lm_input,
    tokenizer=tok
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [112]:
trainer.train()

***** Running training *****
  Num examples = 780
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 294


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=294, training_loss=3.735368170705782, metrics={'train_runtime': 51449.2764, 'train_samples_per_second': 0.045, 'train_steps_per_second': 0.006, 'total_flos': 611434396385280.0, 'train_loss': 3.735368170705782, 'epoch': 3.0})

In [114]:
!ls

data_source.ipynb          [1m[36mtmp_trainer[m[m
finetune_transformer.ipynb


In [117]:
model.save_pretrained(save_directory='finetuned_model')

Configuration saved in finetuned_model/config.json
Model weights saved in finetuned_model/pytorch_model.bin


In [213]:
tok.save_pretrained(save_directory='finetuned_model/tokenizer')

tokenizer config file saved in finetuned_model/tokenizer/tokenizer_config.json
Special tokens file saved in finetuned_model/tokenizer/special_tokens_map.json


('finetuned_model/tokenizer/tokenizer_config.json',
 'finetuned_model/tokenizer/special_tokens_map.json',
 'finetuned_model/tokenizer/vocab.json',
 'finetuned_model/tokenizer/merges.txt',
 'finetuned_model/tokenizer/added_tokens.json',
 'finetuned_model/tokenizer/tokenizer.json')

# generate output

## token by token

In [160]:
from torch.nn import functional as F
from torch import multinomial

In [195]:
test_input = tok("Brave New World.", return_tensors='pt')

In [196]:
test_input

{'input_ids': tensor([[39787,   968,  2159,    13]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [172]:
output = model(test_input['input_ids'], attention_mask=test_input['attention_mask'])

In [173]:
last_state_logits = output['logits'][:, -1, :]

In [174]:
filtered_logits = top_k_top_p_filtering(last_state_logits, top_k=5, top_p=1.0)

In [175]:
gen_token = multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)

In [176]:
multinomial?

In [177]:
tok.decode(gen_token.tolist()[0])

' Richard'

## using generate func

In [188]:
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [189]:
tok.decode(output.tolist()[0])

"A Brave New World. David Lynch. A remarkable new novel in a collection of people's stories and experiences and a story that's sure to be a delight to watch.<|endoftext|>"

In [190]:
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)

tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"A Brave New World. The author of The Book and Its Tales explores the life and death of the 'good man' and its aftermath.<|endoftext|>"

In [199]:
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)

tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Brave New World. John Wootman. After three days as a reporter, a young man is killed when his wife's home is raided, a young man's wife takes on the role of detective in the first episode of its first episode. But a mysterious, enigmatic investigator has also become a danger when a young woman is caught hiding among the rich-people. Reprint. 100,000 first printing.<|endoftext|>"

In [200]:
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)

tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Brave New World. Dorsett. When a pair of men arrive at the Skeligoth Library for a magical world, they discover the true nature of their dreams, and must choose between a new world of immortality and an unimaginable fate. Reprint. 25,000 first printing.<|endoftext|>'

In [201]:
test_input = tok("The", return_tensors='pt')
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)
tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'The ausine: The Death and the Life of a Species in a World in the Twenty Years of an American Biographer, Elizabeth Taylor.<|endoftext|>'

In [202]:
test_input = tok("The", return_tensors='pt')
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)
tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"The2. It's the way of life, not the way of money, it's how people think about it. It's about finding a way out of a place of need, and knowing the place that is yours for you.<|endoftext|>"

In [203]:
test_input = tok("The", return_tensors='pt')
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)
tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"The.\nThe Church of England. Richard V. Williams. A biography of the Church of England's leading theologian. Contains the book's cover in English.<|endoftext|>"

In [205]:
test_input = tok("What I Want?", return_tensors='pt')
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, max_length=200)
tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"What I Want? (For the Love of Divers  #2). Joseph Gussman. The story of the vampire/murder-dancer's quest to discover the secrets of a dying child is one of the many tales of the vampire, with a unique blend of wit and fantasy.<|endoftext|>"

In [212]:
test_input = tok("Turtle", return_tensors='pt')
output = model.generate(**test_input, do_sample=True, top_p=0.84, top_k=100, min_length=10, max_length=200)
tok.decode(output.tolist()[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Turtle. In the title novel, "Elric," the giant of the sea, he becomes the first to see his friend, the famous Elric.<|endoftext|>'

In [194]:
df[df['text'].apply(lambda x: 'Brave New World' in x)]

Unnamed: 0,text
1036,Brave New World. Aldous Huxley. Huxley's class...
1040,Moksha: Writings on Psychedelics & the Visiona...
1041,Point Counter Point. Aldous Huxley. Aldous Hux...
1121,Brave New World / Brave New World Revisited. A...
1122,Brave New World and Brave New World Revisited....
1123,Brave New World Revisited. Aldous Huxley. When...
1124,Brave New World. Aldous Huxley. Huxley's class...
7185,The Handmaid's Tale. Margaret Atwood. (Book Ja...
