In [73]:
from transformers import GPT2Tokenizer
#load GPT2Tokenizer from transformers.

# do_lower_case=False because programming codes are case sensitive for Python and Java
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)

tokenizer.pad_token = tokenizer.eos_token

#add special tokens, namely, the control codes <python> and <java>
special_words_to_add={"additional_special_tokens": ["<python>", "<java>"]}
tokenizer.add_special_tokens(special_words_to_add)


2

In [74]:
import pandas as pd
from datasets import Dataset

pandas_df = pd.DataFrame({})

pandas_df['text'] = text

pandas_df['text'] = pandas_df['text'].map(lambda x: f'<python> {x}')

pandas_df.head()


Dataset({
    features: ['text'],
    num_rows: 348
})

In [96]:
pandas_df.sample(10)

Unnamed: 0,text
120,<python> t('TAXJAR_API_KEY')\n\nENV = os.envir...
331,"<python> \n 'notifications',\n 'art',\n ..."
39,<python> )\nBASE_DIR = os.path.dirname(os.path...
295,"<python> Pagination',\n 'PAGE_SIZE': 100,\n..."
198,"<python> ://preprod.getshiba.com', 'https://pr..."
67,<python> URL)\n\nDEFAULT_AUTO_FIELD = 'django....
65,"<python> BASE_URL', API_BASE_URL)\n\nDEFAULT_A..."
266,<python> IL_BACKEND = 'django_ses.SESBackend'\...
78,<python> r production\n# See https://docs.djan...
205,<python> 'https://shiba-prod-fi...


In [None]:

data = Dataset.from_pandas(pandas_df)
data

In [75]:
%%time

MAX_TOKENS = 128

output = {}
# texts to numeric vectors of MAX_TOKENS
def tokenize_function(examples, tokenizer=tokenizer):
    # Add start and end token to each comment
    examples = [ex + tokenizer.eos_token for ex in examples["text"]]
    # tokenizer created input_ids and attention_mask as output
    output = tokenizer(
        examples,
        add_special_tokens=True,  # Only adds pad not eos and bos
        max_length=MAX_TOKENS,
        truncation=True,
        pad_to_max_length=True,
    )
    # shift labels for next token prediction
    # set padding token labels to -100 which is ignored in loss computation
#     output["labels"] = [x[1:] for x in output["input_ids"]]
    output["labels"] = [
        [-100 if x == tokenizer.pad_token_id else x for x in y]
        for y in output["labels"]
    ]
    # truncate input ids and attention mask to account for label shift
#     output["input_ids"] = [x[:-1] for x in output["input_ids"]]
#     output["attention_mask"] = [x[:-1] for x in output["attention_mask"]]
    return output


data = data.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    load_from_cache_file=True,
)
print(data)


  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 348
})
CPU times: user 1.42 s, sys: 67.9 ms, total: 1.49 s
Wall time: 1.55 s


In [82]:
# data.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])
data = data.train_test_split(
    test_size=0.20, shuffle=True, seed=1, load_from_cache_file=True
)
print(data)

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 278
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 70
    })
})


In [76]:
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size


Embedding(50259, 768)

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 278
})

In [84]:

training_args = TrainingArguments(
    output_dir="./gpt2_autocoder", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=4, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=10,  # number of warmup steps for learning rate scheduler,
    logging_steps=10
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data['test'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [85]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 70
  Batch size = 32


{'eval_loss': 80.59626007080078,
 'eval_runtime': 43.5323,
 'eval_samples_per_second': 1.608,
 'eval_steps_per_second': 0.069}

In [86]:
trainer.train()

***** Running training *****
  Num examples = 278
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 36


Step,Training Loss
10,63.8585
20,44.7133
30,18.0309




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=36, training_loss=36.503550635443794, metrics={'train_runtime': 3443.87, 'train_samples_per_second': 0.323, 'train_steps_per_second': 0.01, 'total_flos': 72071691264000.0, 'train_loss': 36.503550635443794, 'epoch': 4.0})

In [87]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 70
  Batch size = 32


{'eval_loss': 7.404366493225098,
 'eval_runtime': 23.7056,
 'eval_samples_per_second': 2.953,
 'eval_steps_per_second': 0.127,
 'epoch': 4.0}

In [92]:
from transformers import pipeline
generator = pipeline(
    'text-generation', model=model, tokenizer=tokenizer,
    config={'max_length': 100}
)

In [97]:

for generated_text in generator("<python> See https", num_return_sequences=5):
    print(generated_text['generated_text'])
    

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<python> See https://en.wikipedia
The name "Shining" means toSee https://en.wikipedia / Shining /Shining/Shining /Shining /Shining /LingeringTheLingering /Ling
<python> See https://www.reddit.http://youtu.http://www.http://www.http://www.http://www.TheFreeSawTheWhiteWeep. / http/http. / http://http. /
<python> See https://www.youtube.Use https://www.youtube. // www.youtube. Get https://(not www.)Gethttps www. Get www. }
Get: www.
<python> See https://thecSee https://I_i.com.i.I.,I.I.,I.I.I.,I.I.,i.,I.I.II.II.III.II.II
<python> See https://www.youtube.YouSee https://www.youtube. YouSeeTheMuseum httpswww.facebook. TheMUSEBits httpswww.facebook. "GIFM" httpshttps httpsP1TheMUSE
