In [None]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # load the stock GPT-2 tokenizer (not the model weights)

tokenizer.pad_token = tokenizer.eos_token
# Reuse the end-of-sequence token as the pad token so shorter sequences in a batch
# can be filled out to a common length.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Tokenize the Unit 1 text file and cut it into fixed-length chunks for training.
corpus_path = '/content/K_UNIT 1.txt'  # raw text file (path looks like a Colab upload location)
tokens_per_example = 64  # number of tokens in each datapoint

pds_data = TextDataset(tokenizer=tokenizer, file_path=corpus_path, block_size=tokens_per_example)



In [None]:
first_example = pds_data[0]  # first chunk of token ids
first_example, first_example.shape  # show the ids and confirm the chunk length

(tensor([  171,   119,   123,  4944,  2043,   220,   314,   198,    33,  1921,
         19505,  3963, 34958,  3563,  8120,   198, 43467,   262, 14392,  5313,
          7712, 25161,   532, 11787,   532, 34270,   784, 47736, 18497,   532,
          5157,   437,  6168,   784,   337, 15922, 29778,   532, 28491,   262,
          1180, 24285,   784,   464,  2597,   286, 10604,   784, 28147,    12,
         19081,   784, 42591, 20137,   784,  3041,   529,    13,   628,   198,
            16,    13, 43467,   262]),
 torch.Size([64]))

In [None]:

first_example_text = tokenizer.decode(pds_data[0])  # turn the token ids back into text
print(first_example_text)

﻿UNIT  I
BASICS OF FULL STACK
Understanding the Basic Web Development Framework - User - Browser – Webserver - Backend Services – MVC Architecture - Understanding the different stacks –The role of Express – Angular- Node – Mongo DB –React.


1.Understanding the


In [None]:
# mlm=False disables Masked Language Modelling (the BERT / auto-encoding objective),
# which is not what we want when training GPT-2.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# Two inputs with different token counts, to show the collator padding the shorter one
sample_encodings = [tokenizer(text) for text in ('I am an input', 'So am I')]
collator_example = data_collator(sample_encodings)

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [None]:
# The shorter sequence is filled with 50256, which is our pad token id
collator_example['input_ids']

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [None]:
tokenizer.pad_token_id  # id of the pad token; same as the eos id because pad_token was set to eos_token

50256

In [None]:
# The mask holds a 0 at the position that was filled with the pad token
collator_example['attention_mask']

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [None]:
# The padded position gets the label -100 so it is ignored in the loss calculation.
# No manual shifting is needed here: labels are shifted inside the GPT model.
collator_example['labels']

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')  # pretrained GPT-2 with its language-modelling head

# Baseline text generator built on the pretrained weights.
# NOTE: this pipeline wraps the same `model` object that is handed to the Trainer later,
# so sample from it before training if you want the pretrained behaviour.
#
# The sampling options are given as keyword arguments so the pipeline forwards them to
# generation. `config=` is the model-configuration argument; a dict of sampling options
# passed there is not applied as generation settings.
# The tokenizer object prepared earlier (with its pad token set) is reused so this
# generator is configured the same way as the fine-tuned one.
pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer=tokenizer,
    max_length=200, do_sample=True, top_p=0.9, temperature=0.7, top_k=10
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Draw three continuations of the prompt from the pretrained model, separated by rules
separator = '----------'
print(separator)
for sample in pretrained_generator('explain web server', num_return_sequences=3):
    print(sample['generated_text'], separator, sep='\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
explain web server (like WordPress/MyGoneAdmin or WordPress/Postfix) as well as the REST APIs.

Here are a few examples of this project:

A web site in Rails

A web service (like
----------
explain web server side configuration with JavaScript

JavaScript with high performance

The web server model provides an excellent opportunity to integrate with your web apps, making them easier on your development team.

For a long time we've been using
----------
explain web server. I'm not sure if there are many more.

This project starts with the Angular CLI, which brings together all the major libraries.

Angular provides a lot of features : framework, reducers, and helpers
----------


In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch under ./gpt2_pds.
training_args = TrainingArguments(
    output_dir="./gpt2_pds",  # the output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    # Log the training loss once per epoch. A fixed step interval (previously 10) can be
    # larger than the total number of optimisation steps on a small corpus, in which case
    # the training loss is never reported ("No log").
    logging_strategy='epoch',
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

# 80/20 split of the tokenized chunks: first 80% for training, the rest held out for evaluation.
split_index = int(len(pds_data.examples) * .8)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=pds_data.examples[:split_index],
    eval_dataset=pds_data.examples[split_index:]
)

trainer.evaluate()  # baseline loss on the held-out chunks before any fine-tuning

{'eval_loss': 3.432868480682373,
 'eval_model_preparation_time': 0.0024,
 'eval_runtime': 4.1621,
 'eval_samples_per_second': 3.364,
 'eval_steps_per_second': 0.24}

In [None]:
trainer.train()  # run the fine-tuning loop configured above; returns a TrainOutput with the final training loss

Epoch,Training Loss,Validation Loss,Model Preparation Time
1,No log,3.335754,0.0024
2,No log,3.290323,0.0024
3,No log,3.273681,0.0024


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=6, training_loss=3.179828643798828, metrics={'train_runtime': 312.9068, 'train_samples_per_second': 0.518, 'train_steps_per_second': 0.019, 'total_flos': 5291163648000.0, 'train_loss': 3.179828643798828, 'epoch': 3.0})

In [None]:
trainer.evaluate()  # held-out loss after fine-tuning; the per-epoch improvement is shrinking, so more epochs would likely add little

{'eval_loss': 3.273681402206421,
 'eval_model_preparation_time': 0.0024,
 'eval_runtime': 4.2624,
 'eval_samples_per_second': 3.285,
 'eval_steps_per_second': 0.235,
 'epoch': 3.0}

In [None]:
trainer.save_model()  # persist the fine-tuned model (no path given, so presumably the Trainer's output_dir, ./gpt2_pds)

In [None]:
# Reload the fine-tuned weights from the directory the Trainer saved into.
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_pds')

# Text generator on top of the fine-tuned model.
# The sampling options are given as keyword arguments so the pipeline forwards them to
# generation. `config=` is the model-configuration argument; a dict of sampling options
# passed there is not applied as generation settings.
finetuned_generator = pipeline(
    'text-generation', model=loaded_model, tokenizer=tokenizer,
    max_length=200, do_sample=True, top_p=0.9, temperature=0.7, top_k=10
)

Device set to use cpu


In [None]:
# Draw three continuations from the fine-tuned model to compare with the pretrained samples
separator = '----------'
print(separator)
for sample in finetuned_generator('Explain Web server', num_return_sequences=3):
    print(sample['generated_text'], separator, sep='\n')

----------
Explain Web server logic. Learn more. Web hosting consists of the creation of servers that process requests and respond to them. For the purposes of creating web pages, web servers need to be able to serve the page without being overwhelmed, caching and responsive
----------
Explain Web server requirements: The following will describe the required web application services for web servers:
Web client (Web UI): HTML, CSS, H2O, and Javascript.
Web server (DOM): CSS, HTML, and KV
----------
Explain Web server state by defining each service as part of its main application. For example, a browser can view the data requests on a specific page and execute a query based on the response. The state is preserved by defining data types as well as
----------
