In [None]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [None]:
# Name of the pretrained checkpoint whose tokenizer we load.
BASE_CHECKPOINT = 'gpt2'

# Load the tokenizer (not the model) that belongs to the base GPT-2 checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT)

# Reuse the end-of-sequence token as the padding token, so that shorter sequences
# in a batch can be filled up to the length of the longest one.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Tokenize the raw book text and cut it into fixed-size training examples.
#   file_path  -> Principles of Data Science - Sinan Ozdemir
#   block_size -> number of tokens in each chunk that is used as one datapoint
# NOTE(review): TextDataset is reported as deprecated in recent transformers
# releases - consider migrating to the `datasets` library; confirm against the
# installed version.
pds_data = TextDataset(tokenizer=tokenizer, file_path='/content/PDS2.txt', block_size=64)



In [None]:
# Look at the first datapoint: its token ids and its shape (one block of tokens).
first_block = pds_data[0]
first_block, first_block.shape

(tensor([  198,   464,  1366,  3783,   569,  1697, 16362,   198,   464, 10688,
           198,   198, 16281,   784, 10922,   263,    12,  8344,  4872,  4981,
           198,   198, 34556,  8300,   198,  5195, 11361,    30,   198,   198,
         37906,  6593,   198, 16281,   286,  4096, 11361,   198, 16281,   784,
         32096,   257,  2060,  6126,   198,   198, 43961,  3725,   198,   198,
          4366,   517, 29191,   198,  6601,  3783,  1339,  3640,   198,   198,
         20448,  2050,   784,  3557]),
 torch.Size([64]))

In [None]:
# Turn the first block of token ids back into text to sanity-check the dataset.
decoded_first_block = tokenizer.decode(pds_data[0])
print(decoded_first_block)


The data science Venn diagram
The math

Example – spawner-recruit models

Computer programming
Why Python?

Python practices
Example of basic Python
Example – parsing a single tweet

Domain knowledge

Some more terminology
Data science case studies

Case study – autom


In [None]:
# Collator that batches examples for causal (left-to-right) language modelling.
# mlm=False switches off Masked Language Modelling, which is the objective used
# for BERT-style auto-encoding tasks rather than for GPT.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Demonstrate dynamic padding: two inputs of different token length are
# tokenized separately and then merged into one padded batch by the collator.
sample_encodings = [tokenizer(text) for text in ('I am an input', 'So am I')]
collator_example = data_collator(sample_encodings)

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [None]:
# Token ids of the padded batch: the shorter input is filled on the right with the pad id.
collator_example.input_ids  # 50256 is our pad token id (the eos token, reused as padding)

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [None]:
# Confirm the id of the pad token; it equals the eos id because pad_token was set to eos_token.
tokenizer.pad_token_id

50256

In [None]:
# Attention mask of the padded batch: 1 marks a real token, 0 marks padding.
collator_example.attention_mask  # Note the 0 in the attention mask where we have a pad token

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [None]:
# Labels produced by the collator: a copy of the input ids, with padded positions set to -100.
collator_example.labels  # note the -100 to ignore loss calculation for the padded token
# Labels are shifted inside the GPT model so we don't need to worry about that

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # load the pretrained GPT-2 weights with the LM head

# Baseline text generator built on the not-yet-fine-tuned model.
#
# The sampling settings are passed as keyword arguments so that the pipeline
# forwards them to generation. The `config` argument of `pipeline` is meant for
# the *model* configuration, so a dict of generation settings given there is not
# applied (the previously recorded samples stopped at roughly the checkpoint's
# default length instead of the intended 200 tokens).
#
# The tokenizer object configured earlier (pad token = eos token) is reused, so
# this pipeline is set up the same way as the fine-tuned one later on.
#
# NOTE: `model` is the same object that is later handed to the Trainer, so run
# this generator before training to see the pretrained behaviour.
pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer=tokenizer,
    max_length=200, do_sample=True, top_p=0.9, temperature=0.7, top_k=10
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Sample three continuations of the prompt from the pretrained model and print
# each one, framed by separator lines.
separator = '----------'
print(separator)
pretrained_samples = pretrained_generator('This dataset shows the relationship', num_return_sequences=3)
for sample in pretrained_samples:
    print(sample['generated_text'], separator, sep='\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
This dataset shows the relationship between the ratio of men in the highest social class to females: men were more likely than women to say that women should be forced to marry (57% higher than the ratio of men in the lowest social class to women,
----------
This dataset shows the relationship between the relative contribution of a taxon's taxonomic group and the contribution of its taxonomic gene pool. Here I'll show two general trends: there can be no better predictor of global biodiversity than the taxonomic group of
----------
This dataset shows the relationship between C-level hierarchical clustering and the likelihood of the association between each individual's school, educational level and the amount of money spent by the individual on school activities. We also use the method of SPSS version 11
----------


In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./gpt2_pds",  # the output directory (checkpoints + final model)
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    logging_steps=10,  # log the training loss every 10 optimizer steps
    load_best_model_at_end=True,  # reload the best checkpoint once training finishes
    # `eval_strategy` is the current name of this option; the older spelling
    # `evaluation_strategy` is deprecated and not accepted by newer releases.
    eval_strategy='epoch',
    # Checkpoints are saved on the same schedule as evaluation so that the best
    # evaluated checkpoint exists on disk for load_best_model_at_end.
    save_strategy='epoch'
)

# Hold out the last 20% of the tokenized blocks for evaluation (sequential split,
# no shuffling). The split index is computed once so both slices always agree.
split_index = int(len(pds_data.examples) * .8)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:split_index],
    eval_dataset=pds_data.examples[split_index:]
)

# Baseline: evaluation loss of the pretrained model before any fine-tuning.
trainer.evaluate()



{'eval_loss': 4.50043249130249,
 'eval_model_preparation_time': 0.0103,
 'eval_runtime': 2.4542,
 'eval_samples_per_second': 189.471,
 'eval_steps_per_second': 6.112}

In [None]:
# Fine-tune the model for the configured number of epochs; evaluation and
# checkpointing are both set to run once per epoch in the training arguments.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,3.3162,3.477132,0.0103
2,2.9909,3.444185,0.0103
3,2.909,3.437735,0.0103


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=177, training_loss=3.158395788763876, metrics={'train_runtime': 157.8267, 'train_samples_per_second': 35.336, 'train_steps_per_second': 1.121, 'total_flos': 182153207808000.0, 'train_loss': 3.158395788763876, 'epoch': 3.0})

In [None]:
# Evaluate again after training to compare against the pre-training baseline.
# In the recorded run the validation loss improved less with every epoch.
trainer.evaluate()  # loss decrease is slowing down so we are hitting our limit

{'eval_loss': 3.437734603881836,
 'eval_model_preparation_time': 0.0103,
 'eval_runtime': 2.5388,
 'eval_samples_per_second': 183.156,
 'eval_steps_per_second': 5.908,
 'epoch': 3.0}

In [None]:
# Persist the fine-tuned model; no path is given here, and the next cell loads it
# back from './gpt2_pds' (the output_dir configured in the training arguments).
trainer.save_model()

In [None]:
# Reload the fine-tuned weights from the training output directory.
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

# Text generator built on the fine-tuned model.
#
# The sampling settings are passed as keyword arguments so that the pipeline
# forwards them to generation. The `config` argument of `pipeline` is meant for
# the *model* configuration, so a dict of generation settings given there is not
# applied (the previously recorded samples stopped well short of 200 tokens).
finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    max_length=200, do_sample=True, top_p=0.9, temperature=0.7, top_k=10
)

Device set to use cuda:0


In [None]:
# After fine-tuning, the sampled continuations are substantially about data
# (and mimic the layout of the book the model was trained on).
divider = '----------'
finetuned_samples_prompt = 'This dataset shows the relationship'
print(divider)
for sample in finetuned_generator(finetuned_samples_prompt, num_return_sequences=3):
    print(sample['generated_text'], divider, sep='\n')

----------
This dataset shows the relationship between education level as measured by BMI. The relationships are as follows:
A − B = Age,
C + D = Age,
E + F = Age,
G + H = Age,
I +
----------
This dataset shows the relationship between the two classes (in other words, what is the correlation between each class) from
our original dataset.
[ 73 ]

Predictions Don't Grow on Trees – or Do They?

Chapter 11
----------
This dataset shows the relationship between
people's consumption levels and how frequently they shop or watch movies. This represents a
trend to the store's sales.

[ 39 ]

Communicating Data

Chapter 7

So far
----------
