In [16]:
from datasets import load_dataset

from tokenizers import Tokenizer
from tokenizers.models import BPE 
from tokenizers.trainers import BpeTrainer 
from tokenizers.pre_tokenizers import Whitespace 
import os 

import evaluate

from transformers import (
    AutoTokenizer, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments, 
    AutoModelForCausalLM, 
    AutoModelForSequenceClassification
)

import time

In [17]:
raw_datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

In [18]:
print(raw_datasets)

train_split = raw_datasets['train']
print(train_split)
print(train_split.features)
print(raw_datasets['train'][1])

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})
Dataset({
    features: ['text'],
    num_rows: 36718
})
{'text': Value(dtype='string', id=None)}
{'text': ' = Valkyria Chronicles III = \n'}


In [19]:
def get_training_corpus():
    batch_size = 1000 
    for i in range(0, len(raw_datasets['train']), batch_size): 
        yield raw_datasets['train'][i: i + batch_size]['text']


text_iterator = get_training_corpus()
print(next(text_iterator)[5:10])

[" It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . \n", '', ' = = Gameplay = = \n', '', " As with previous Valkyira Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces . Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced te

In [20]:
# blank tokenizer with BPE model 
tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

# pre tokenizer which splits text into words 
tokenizer.pre_tokenizer = Whitespace()

# vocab_size = # subword units the tokenizer can have 
trainer = BpeTrainer(vocab_size = 25000, special_tokens = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]'])
tokenizer.train_from_iterator(get_training_corpus(), trainer = trainer)
print('training complete')

training complete


In [21]:
os.makedirs('custom_tokenizer', exist_ok = True)
tokenizer.save('custom_tokenizer/tokenizer.json')

In [22]:
loaded_tokenizer = Tokenizer.from_file('custom_tokenizer/tokenizer.json')

sentence = "This is a test of our new tokenizer."
output = loaded_tokenizer.encode(sentence)
print('sentence : ', sentence)
print('tokens : ', output.tokens)
print('token ids : ', output.ids)

sentence :  This is a test of our new tokenizer.
tokens :  ['This', 'is', 'a', 'test', 'of', 'our', 'new', 'to', 'ken', 'izer', '.']
token ids :  [1514, 1034, 69, 2319, 1031, 1165, 1366, 1036, 3201, 14114, 18]


### Finetuning GPT2 

In [23]:
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train[:5000]')

dataset = dataset.filter(lambda ex: len(ex['text']) > 0)
split = dataset.train_test_split(test_size = 0.1, seed = 42)
train_raw = split['train']
val_raw = split['test']

In [24]:
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name) # tokenizer used for gpt2
print('old vocab size ', len(tokenizer))
# adding pad token for bacthing 
if tokenizer.pad_token is None: 
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})    

print('new vocab size ', len(tokenizer))
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer)) # match new vocab size 

old vocab size  50257
new vocab size  50258


Embedding(50258, 768)

In [25]:
# tokenize text 
max_length = 512 
def tokenize_fn(examples): 
    return tokenizer(examples['text'], truncation=True, max_length=max_length)

train_tok = train_raw.map(tokenize_fn, batched=True, remove_columns=['text'])
val_tok = val_raw.map(tokenize_fn, batched=True, remove_columns=['text'])

train_tok.set_format(type='torch', columns = ['input_ids', 'attention_mask'])
val_tok.set_format(type='torch', columns = ['input_ids', 'attention_mask'])

In [26]:
# batches tokenizer data and pads sequence in the batch to the same length 
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False) # clm

training_args = TrainingArguments(
    output_dir = './gpt2-wikitext-finetuned', 
    num_train_epochs = 10, 
    per_device_train_batch_size = 4, 
    learning_rate = 5e-5, 
    weight_decay=0.01, 
    warmup_steps = 500, 
    eval_strategy = 'steps', 
    eval_steps = 500, 
    save_strategy = 'steps', 
    save_steps = 500, 
    load_best_model_at_end = True, 
    save_total_limit=3, 
    fp16 = True, # mix precision for speed 
    report_to = 'none'
)

In [27]:
trainer = Trainer(
    model = model, 
    args = training_args, 
    train_dataset=train_tok, 
    eval_dataset=val_tok,  
    data_collator=data_collator, 
    # processing_class = tokenizer
)

In [28]:
start_time = time.perf_counter()
trainer.train()

end_time = time.perf_counter()
print((end_time-start_time)//60, ' m', (end_time-start_time)%60, 'secs')

  0%|          | 0/7260 [00:00<?, ?it/s]

{'loss': 13.1262, 'grad_norm': 6.765242099761963, 'learning_rate': 4.92e-05, 'epoch': 0.69}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.4197072982788086, 'eval_runtime': 16.9849, 'eval_samples_per_second': 19.017, 'eval_steps_per_second': 2.414, 'epoch': 0.69}
{'loss': 3.3448, 'grad_norm': 6.078546524047852, 'learning_rate': 4.636094674556213e-05, 'epoch': 1.38}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.1987719535827637, 'eval_runtime': 15.4836, 'eval_samples_per_second': 20.861, 'eval_steps_per_second': 2.648, 'epoch': 1.38}
{'loss': 3.1319, 'grad_norm': 4.227630138397217, 'learning_rate': 4.2662721893491124e-05, 'epoch': 2.07}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.1648731231689453, 'eval_runtime': 15.4974, 'eval_samples_per_second': 20.842, 'eval_steps_per_second': 2.646, 'epoch': 2.07}
{'loss': 2.8552, 'grad_norm': 10.874784469604492, 'learning_rate': 3.896449704142012e-05, 'epoch': 2.75}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.135662078857422, 'eval_runtime': 15.5619, 'eval_samples_per_second': 20.756, 'eval_steps_per_second': 2.635, 'epoch': 2.75}
{'loss': 2.6875, 'grad_norm': 5.875648021697998, 'learning_rate': 3.5266272189349114e-05, 'epoch': 3.44}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.1644887924194336, 'eval_runtime': 15.5246, 'eval_samples_per_second': 20.806, 'eval_steps_per_second': 2.641, 'epoch': 3.44}
{'loss': 2.5883, 'grad_norm': 6.053055286407471, 'learning_rate': 3.1568047337278106e-05, 'epoch': 4.13}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.1842610836029053, 'eval_runtime': 15.5, 'eval_samples_per_second': 20.839, 'eval_steps_per_second': 2.645, 'epoch': 4.13}
{'loss': 2.4332, 'grad_norm': 8.307164192199707, 'learning_rate': 2.7869822485207102e-05, 'epoch': 4.82}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.1779539585113525, 'eval_runtime': 15.4646, 'eval_samples_per_second': 20.886, 'eval_steps_per_second': 2.651, 'epoch': 4.82}
{'loss': 2.2933, 'grad_norm': 5.00571346282959, 'learning_rate': 2.4171597633136094e-05, 'epoch': 5.51}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.221693992614746, 'eval_runtime': 15.4693, 'eval_samples_per_second': 20.88, 'eval_steps_per_second': 2.65, 'epoch': 5.51}
{'loss': 2.236, 'grad_norm': 15.666465759277344, 'learning_rate': 2.047337278106509e-05, 'epoch': 6.2}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.2799105644226074, 'eval_runtime': 15.4679, 'eval_samples_per_second': 20.882, 'eval_steps_per_second': 2.651, 'epoch': 6.2}
{'loss': 2.1419, 'grad_norm': 6.070513725280762, 'learning_rate': 1.6775147928994085e-05, 'epoch': 6.89}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.2689168453216553, 'eval_runtime': 15.7307, 'eval_samples_per_second': 20.533, 'eval_steps_per_second': 2.606, 'epoch': 6.89}
{'loss': 2.0368, 'grad_norm': 6.930351734161377, 'learning_rate': 1.3076923076923078e-05, 'epoch': 7.58}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.315659523010254, 'eval_runtime': 16.3186, 'eval_samples_per_second': 19.793, 'eval_steps_per_second': 2.512, 'epoch': 7.58}
{'loss': 2.0123, 'grad_norm': 8.146483421325684, 'learning_rate': 9.378698224852072e-06, 'epoch': 8.26}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.3545081615448, 'eval_runtime': 16.0478, 'eval_samples_per_second': 20.127, 'eval_steps_per_second': 2.555, 'epoch': 8.26}
{'loss': 1.9532, 'grad_norm': 6.600475311279297, 'learning_rate': 5.680473372781065e-06, 'epoch': 8.95}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.3555212020874023, 'eval_runtime': 15.9897, 'eval_samples_per_second': 20.2, 'eval_steps_per_second': 2.564, 'epoch': 8.95}
{'loss': 1.9068, 'grad_norm': 6.371081829071045, 'learning_rate': 1.9822485207100593e-06, 'epoch': 9.64}


  0%|          | 0/41 [00:00<?, ?it/s]

{'eval_loss': 3.386094093322754, 'eval_runtime': 15.7731, 'eval_samples_per_second': 20.478, 'eval_steps_per_second': 2.599, 'epoch': 9.64}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 4944.6942, 'train_samples_per_second': 5.873, 'train_steps_per_second': 1.468, 'train_loss': 3.149932844514032, 'epoch': 10.0}
82.0  m 27.058065599994734 secs


In [29]:
prompt = "Himalaya mountains are "

encoding = tokenizer(prompt, return_tensors = 'pt', padding = True).to('cuda')
model.config.pad_token_id = tokenizer.pad_token_id

outputs = model.generate(
    input_ids = encoding['input_ids'], 
    attention_ids = encoding['input_ids'], 
    attention_mask = encoding['attention_mask'], 
    max_length = 100, 
    num_return_sequences = 1, 
    temperature = 0.7, 
    top_k = 50, 
    repetition_penalty = 1.2, 
    do_sample = True 
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

ValueError: The following `model_kwargs` are not used by the model: ['attention_ids'] (note: typos in the generate arguments will also show up in this list)

### Fine tune BERT

In [None]:
raw_datasets = load_dataset('imdb')

small_train_dataset = raw_datasets['train'].shuffle(seed=42).select(range(1000))
small_test_dataset = raw_datasets['test'].shuffle(seed=42).select(range(1000))

small_train_dataset[0]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...',
 'label': 1}

In [None]:
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(example): 
    return tokenizer(example['text'], padding='max_length', truncation=True)

tokenized_train_dataset = small_train_dataset.map(tokenize_function, batched = True)
tokenized_test_dataset = small_test_dataset.map(tokenize_function, batched = True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
import numpy as np 
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred): 
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir='sentiment_model', 
    eval_strategy='epoch', 
    num_train_epochs=2, 
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8, 
    learning_rate=5e-5, 
    weight_decay=0.01, 
    fp16=True, 
    report_to='none'
)

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
trainer = Trainer(
    model = model, 
    train_dataset=tokenized_train_dataset, 
    eval_dataset= tokenized_test_dataset, 
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/375 [00:00<?, ?it/s]

{'train_runtime': 149.0811, 'train_samples_per_second': 20.123, 'train_steps_per_second': 2.515, 'train_loss': 0.29058148193359373, 'epoch': 3.0}


TrainOutput(global_step=375, training_loss=0.29058148193359373, metrics={'train_runtime': 149.0811, 'train_samples_per_second': 20.123, 'train_steps_per_second': 2.515, 'total_flos': 397402195968000.0, 'train_loss': 0.29058148193359373, 'epoch': 3.0})

In [None]:
from transformers import pipeline 
sentiment_pipeline = pipeline('sentiment-analysis', model = trainer.model, tokenizer=tokenizer, device = 'cuda')

print(sentiment_pipeline('this movie was fantastic, really loved it'))
print(sentiment_pipeline('this movie was boring, really hated it'))

[{'label': 'LABEL_1', 'score': 0.9904590845108032}]
[{'label': 'LABEL_0', 'score': 0.9930827617645264}]
