# load hugging face model

In [1]:
import transformers
from transformers import (    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer)

In [2]:
model_name = 'distilgpt2'

In [3]:
gn_cfg = AutoConfig.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




In [4]:
gn_cfg

GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.10.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [5]:
tok = AutoTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




In [6]:
tok

PreTrainedTokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})

In [58]:
tok.pad_token = tok.eos_token

In [7]:
model =  AutoModelForCausalLM.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=352833716.0, style=ProgressStyle(descri…




# Load finetuning data

In [8]:
import pandas as pd

In [24]:
df = pd.read_csv("../data/book_dscrptions.csv", header=0)

In [31]:
df.columns = ['text']
df.columns, df.shape

(Index(['text'], dtype='object'), (7779, 1))

In [40]:
df.iloc[0], type(df.iloc[0]['text'])

(text    Harry Potter and the Half-Blood Prince (Harry ...
 Name: 0, dtype: object,
 str)

In [33]:
for item in df.sample(5).values:
    print(item)
    print('-'*10)
    print('\n')

["Candy. Mian Mian/Andrea Lingenfelter. An international literary phenomenon-now available for the first time in English translation-Candy is a hip, harrowing tale of risk and desire, the story of a young Chinese woman forging a life for herself in a world seemingly devoid of guidelines. Hong, who narrates the novel, and whose life in many ways parallels the author's own, drops out of high school and runs away at age 17 to the frontier city of Shenzen. As Hong navigates the temptations of the city, she quickly falls in love with a young musician and together they dive into a cruel netherworld of alcohol, drugs, and excess, a life that fails to satisfy Hong's craving for an authentic self, and for a love that will define her. This startling and subversive novel is a blast of sex, drugs, and rock 'n' roll that opens up to us a modern China we've never seen before. - Banned in China-with Mian Mian labeled the 'poster child for spiritual pollution'-CANDY still managed to sell 60,000 copies

# Tokenize data

In [96]:
import itertools

## add eos to data

In [65]:
def add_eos(df, eos_token):
    df['text'] = df['text'].apply(lambda x: "".join([x, eos_token]))
    return df

In [66]:
df = add_eos(df, tok.eos_token)

## tokenize texts

In [67]:
def tokenize_function(text):
    return tok(text)

In [68]:
df['text'].str.len().describe()

count    7779.000000
mean      478.926469
std       415.579814
min        40.000000
25%       244.000000
50%       320.000000
75%       557.000000
max      9036.000000
Name: text, dtype: float64

In [76]:
tokenized = df['text'].apply(tokenize_function)

## concat texts and batch

In [103]:
batch_df = tokenized.iloc[0:10].values
dict_keys=['input_ids', 'attention_mask']

{'input_ids': [18308,
  14179,
  290,
  262,
  13139,
  12,
  21659,
  9005,
  357,
  18308,
  14179,
  220,
  1303,
  21,
  737,
  449,
  13,
  42,
  13,
  41558,
  14,
  24119,
  5675,
  6836,
  2634,
  13,
  1649,
  5850,
  14179,
  290,
  262,
  13139,
  12,
  21659,
  9005,
  9808,
  11,
  262,
  1175,
  1028,
  26111,
  468,
  9258,
  13,
  383,
  16884,
  278,
  995,
  468,
  6626,
  866,
  262,
  3504,
  11,
  290,
  355,
  262,
  18499,
  3817,
  11,
  262,
  3048,
  772,
  19431,
  625,
  4291,
  262,
  337,
  6837,
  829,
  13,
  27442,
  318,
  1497,
  422,
  30922,
  329,
  890,
  9574,
  11,
  290,
  262,
  8284,
  286,
  262,
  9643,
  468,
  6989,
  21634,
  516,
  9089,
  13,
  843,
  1865,
  11,
  355,
  287,
  477,
  9976,
  11,
  1204,
  2925,
  319,
  13,
  5850,
  11,
  6575,
  11,
  290,
  19959,
  11,
  1719,
  3804,
  511,
  440,
  13,
  54,
  13,
  43,
  13,
  1241,
  26420,
  11,
  923,
  319,
  511,
  15670,
  399,
  13,
  36,
  13,
  54,
  13,
  51,
  13,
 

In [77]:
def aggregate_and_tokenize(examples, block_size=128, padding_token=tok.pad_token,
                           dict_keys=['input_ids', 'attention_mask']):
#     agg_and_batch
    concatenated_examples = {k: list(itertools.chain(*[d.get(k) for d in examples])) for k in dict_keys}
    total_length = len(concatenated_examples[list(examples[0].keys())[0]])
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # add padding if last split is smaller than block size
    if len(result[list(examples.keys())[0]][-1]) < block_size:
        result[list(examples.keys())[0]][-1].append([padding_token]*(block_size-len(result[list(examples.keys())[0]][-1])))
    result["labels"] = result["input_ids"].copy()
    return result

In [88]:
model_input = []
bsize = 1000
for batch in range(0, tokenized.shape[0], bsize):
    batch_df = tokenized.iloc[batch-bsize:batch].values
    result = aggregate_and_tokenize(batch_df)
    model_input.append(result)

AttributeError: 'numpy.ndarray' object has no attribute 'keys'

In [87]:
sum([[1, 2, 3]], [])

[1, 2, 3]

In [85]:
[type(example[k]) for k in example.keys()]

[list, list]

In [78]:
input_text = tokenized.apply(aggregate_and_tokenize)

TypeError: can only concatenate list (not "int") to list

# Finetune model

In [None]:
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tok,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
)