In [1]:

## !pip install datasets
## !pip install transformers[torch]


In [2]:

import torch

# Confirm that PyTorch can see a CUDA-capable GPU before doing any training.
torch.cuda.is_available()


True

In [3]:

# Number of CUDA devices visible to this process.
torch.cuda.device_count()


1

In [4]:

# Index of the CUDA device PyTorch currently has selected.
torch.cuda.current_device()


0

In [5]:

# Builds a torch.cuda.device object for GPU index 0 and displays it.
# NOTE(review): per the PyTorch docs this is a context manager, so the bare
# call presumably does not switch the active device unless used in a `with`
# block — confirm.
torch.cuda.device(0)


<torch.cuda.device at 0x2b26d9b657c0>

In [6]:

# Human-readable name of GPU 0.
torch.cuda.get_device_name(0)


'Tesla V100-PCIE-32GB'

In [7]:

from datasets import load_dataset

# Load the raw WikiText-2 corpus (all of its splits) by dataset path and
# configuration name.
# NOTE: the variable below shares its name with the `datasets` package; later
# `from datasets import ...` statements still work, but a plain
# `import datasets` would rebind it.
datasets = load_dataset(path="wikitext", name="wikitext-2-raw-v1")


In [8]:

# To train on your own text files instead, load them with the "text" builder:
# datasets = load_dataset("text", data_files={"train": "path_to_train.txt", "validation": "path_to_validation.txt"})


In [9]:

# Peek at one raw training record; each record is a dict with a "text" field.
datasets["train"][10]


{'text': ' The game \'s battle system , the BliTZ system , is carried over directly from Valkyira Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique to each character . They are divided into " Personal Potential " , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede

In [10]:

from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()``, indexing with a list of row
            indices, and exposes a ``features`` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` is larger than the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are distinct
    # without needing a draw-and-retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show class names instead of integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))


In [11]:

# Show 10 random training rows (the function's default num_examples).
show_random_elements(datasets["train"])


Unnamed: 0,text
0,"Olivier 's honours included a knighthood ( 1947 ) , a life peerage ( 1970 ) and the Order of Merit ( 1981 ) . For his on @-@ screen work he received four Academy Awards , two British Academy Film Awards , five Emmy Awards and three Golden Globe Awards . The National Theatre 's largest auditorium is named in his honour , and he is commemorated in the Laurence Olivier Awards , given annually by the Society of London Theatre . He was married three times , to the actresses Jill Esmond from 1930 to 1940 , Vivien Leigh from 1940 to 1960 , and Joan Plowright from 1961 until his death . \n"
1,
2,
3,"Microsoft Word does not treat sentences differently by default , but the grammar checking can be set to prefer a specific number of spaces between sentences . \n"
4,Credits are adapted from the liner notes of the album Destiny Fulfilled . \n
5,
6,
7,"Anderson dated ( and frequently collaborated with ) singer Fiona Apple for several years during the late 1990s and early 2000s . He has been in a relationship with actress and comedian Maya Rudolph since 2001 . They live together in the San Fernando Valley with their four children : daughters Pearl Bailey ( born October 2005 ) , Lucille ( born November 2009 ) , and Minnie Ida ( born August 2013 ) and son Jack ( born July 2011 ) . \n"
8,"As the hydrogen shell burning produces more helium , the core increases in mass and temperature . In a red giant of up to 2 @.@ 25 M ☉ , the mass of the helium core becomes degenerate prior to helium fusion . Finally , when the temperature increases sufficiently , helium fusion begins explosively in what is called a helium flash , and the star rapidly shrinks in radius , increases its surface temperature , and moves to the horizontal branch of the HR diagram . For more massive stars , helium core fusion starts before the core becomes degenerate , and the star spends some time in the red clump , slowly burning helium , before the outer convective envelope collapses and the star then moves to the horizontal branch . \n"
9,


In [12]:

# Checkpoint whose configuration defines the model architecture; it is also
# used to name the training output directory.
model_checkpoint = "gpt2"
# Separate checkpoint that supplies the tokenizer.
tokenizer_checkpoint = "sgugger/gpt2-like-tokenizer"


In [13]:

from transformers import AutoTokenizer
    
# Load the tokenizer from tokenizer_checkpoint (not from model_checkpoint).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


In [14]:

def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with the notebook's tokenizer.

    Returns whatever the tokenizer call produces for that list of texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts)
    return encoded


In [15]:

# Tokenize every split in batches across 4 worker processes; the raw "text"
# column is dropped so only the tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)




Map (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/36718 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3760 [00:00<?, ? examples/s]

In [16]:

# Inspect one tokenized training example: the "text" column has been replaced
# by the tokenizer's input_ids and attention_mask.
tokenized_datasets["train"][1]



{'input_ids': [238, 8576, 9441, 2987, 238, 252],
 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [17]:

# Length, in tokens, of each training block produced by group_texts.
# The commented-out alternative would use the tokenizer's model_max_length.
# block_size = tokenizer.model_max_length
block_size = 128


In [18]:

def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Every column of ``examples`` (a mapping of column name to a list of token
    lists) is flattened into one long list and split into consecutive chunks
    of ``block_size`` tokens, where ``block_size`` is a notebook-level global.
    Tokens left over after the last full block are dropped.

    Returns:
        dict mapping each input column name, plus ``"labels"`` (a copy of the
        ``"input_ids"`` blocks), to a list of ``block_size``-long lists.
    """
    # Flatten each column in a single pass. ``sum(lists, [])`` re-copies the
    # accumulated list on every addition, which is quadratic in batch size.
    concatenated_examples = {
        k: [token for sequence in examples[k] for token in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal language modelling the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result


In [19]:

# Regroup the tokenized splits into fixed-length blocks, handing group_texts
# 1000 examples at a time on 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)


Map (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/36718 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3760 [00:00<?, ? examples/s]

In [20]:

# Decode one grouped block back to text to sanity-check the chunking.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])


' the " Nameless ", a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven ". \n The game began development in 2010, carrying over a large portion of the work done on Valkyria Chronicles II. While it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcomers. Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries, along with Valkyria Chronicles II director Takeshi Ozawa. A large'

In [21]:

from transformers import AutoConfig, AutoModelForCausalLM

# Fetch only the configuration for model_checkpoint, then build the model from
# that configuration.
# NOTE(review): from_config is documented to initialise weights rather than
# load pretrained ones, i.e. the model is trained from scratch — consistent
# with the high loss values reported during training; confirm.
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_config(config)


In [22]:

from transformers import Trainer, TrainingArguments


In [23]:

import os

# Root directory for checkpoints. It defaults to the original scratch location
# but can be overridden with the LM_OUTPUT_ROOT environment variable, so the
# notebook also runs on machines where that path does not exist.
output_root = os.environ.get("LM_OUTPUT_ROOT", "/scratch/scholar/rcalix")

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir=f"{output_root}/{model_checkpoint}-wikitext2",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)


In [24]:

# Wire the model, the training arguments and the grouped train/validation
# splits into a Trainer. No data_collator or tokenizer is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [25]:

# Run training; training_args sets evaluation_strategy="epoch", and the output
# below reports a validation loss for each epoch.
trainer.train()


Epoch,Training Loss,Validation Loss
1,6.5516,6.474864
2,6.1931,6.20037
3,6.0178,6.114839


TrainOutput(global_step=6747, training_loss=6.392374855404485, metrics={'train_runtime': 1042.5124, 'train_samples_per_second': 51.772, 'train_steps_per_second': 6.472, 'total_flos': 3525678710784000.0, 'train_loss': 6.392374855404485, 'epoch': 3.0})

In [26]:

import math

# Perplexity is the exponential of the evaluation loss reported by the Trainer.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")


Perplexity: 452.52



The perplexity is still quite high because, for this demo, the model was trained from scratch on a small dataset for only a small number of epochs. For real language-model training, you would need a larger dataset and more epochs.
