In [1]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from itertools import chain
import pandas as pd
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Our data is located in the /data directory. Let's see what we're working with.
!ls ./data

CVE-2020-29583.txt  xss.txt


In [3]:
# Load the file names into a list
data_path = Path("./data")
file_paths = [filename for filename in data_path.glob("*.txt")]

In [4]:
# Read all the file contents into a list
file_data = list()
for filename in file_paths:
    with open(filename, "r") as f:
        data = f.read()
    file_data.append(data)

In [5]:
print(file_data[0])
print('=====================')
print(file_data[1])

A Community-Developed List of Software & Hardware Weakness Types Cross-site scripting (XSS) vulnerabilities occur when:  There are three main kinds of XSS:  Once the malicious script is injected, the attacker can perform a variety of malicious activities. The attacker could transfer private information, such as cookies that may include session information, from the victim's machine to the attacker. The attacker could send malicious requests to a web site on behalf of the victim, which could be especially dangerous to the site if the victim has administrator privileges to manage that site. Phishing attacks could be used to emulate trusted web sites and trick the victim into entering a password, allowing the attacker to compromise the victim's account on that web site. Finally, the script could exploit a vulnerability in the web browser itself possibly taking over the victim's machine, sometimes referred to as "drive-by hacking."  In many cases, the attack can be launched without the vic

In [6]:
# Convert our list of text into a dataset using .from_dict()
dataset = Dataset.from_dict({"text": file_data})

In [7]:
# Preview the dataset
dataset["text"]

['A Community-Developed List of Software & Hardware Weakness Types Cross-site scripting (XSS) vulnerabilities occur when:  There are three main kinds of XSS:  Once the malicious script is injected, the attacker can perform a variety of malicious activities. The attacker could transfer private information, such as cookies that may include session information, from the victim\'s machine to the attacker. The attacker could send malicious requests to a web site on behalf of the victim, which could be especially dangerous to the site if the victim has administrator privileges to manage that site. Phishing attacks could be used to emulate trusted web sites and trick the victim into entering a password, allowing the attacker to compromise the victim\'s account on that web site. Finally, the script could exploit a vulnerability in the web browser itself possibly taking over the victim\'s machine, sometimes referred to as "drive-by hacking."  In many cases, the attack can be launched without th

In [8]:
# Load the tokenizer for GPT-2
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# The tokenizer does not have a pad token, so we'll specify one.
tokenizer.pad_token = tokenizer.eos_token

# Load the GPT-2 model
model = AutoModelForCausalLM.from_pretrained('gpt2')

tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 131kB/s]
config.json: 100%|██████████| 665/665 [00:00<00:00, 3.88MB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 12.8MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 18.5MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 15.1MB/s]
pytorch_model.bin: 100%|██████████| 548M/548M [00:02<00:00, 203MB/s]  
generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 504kB/s]


In [9]:
# Create a tokenization function to tokenize the dataset
def tokenize_function(examples):
    output = tokenizer(examples['text'])
    return output

# Run the tokenizer over our dataset using the .map method 
# NOTE: For large datasets, this can take a while
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# We want to remove our original dataset's column names from the tokenized dataset
tokenized_dataset = tokenized_dataset.remove_columns(dataset.column_names)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1734 > 1024). Running this sequence through the model will result in indexing errors
                                                 

In [37]:
print(tokenized_dataset)
for item in tokenized_dataset:
    print(item.keys())
#     print(item.values())
    for k in item.keys():
        print(f"k: {k}")
        print(f"item[k]: {item[k]}")
#         concatenated_examples[k] = list(chain(*examples[k]))


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2
})
dict_keys(['input_ids', 'attention_mask'])
k: input_ids
item[k]: [32, 8108, 12, 19246, 276, 7343, 286, 10442, 1222, 28715, 28788, 1108, 24897, 6372, 12, 15654, 36883, 357, 55, 5432, 8, 23805, 3051, 618, 25, 220, 1318, 389, 1115, 1388, 6982, 286, 1395, 5432, 25, 220, 4874, 262, 17412, 4226, 318, 25077, 11, 262, 15250, 460, 1620, 257, 4996, 286, 17412, 4568, 13, 383, 15250, 714, 4351, 2839, 1321, 11, 884, 355, 14746, 326, 743, 2291, 6246, 1321, 11, 422, 262, 3117, 338, 4572, 284, 262, 15250, 13, 383, 15250, 714, 3758, 17412, 7007, 284, 257, 3992, 2524, 319, 8378, 286, 262, 3117, 11, 543, 714, 307, 2592, 4923, 284, 262, 2524, 611, 262, 3117, 468, 18382, 18850, 284, 6687, 326, 2524, 13, 1380, 3929, 3434, 714, 307, 973, 284, 33836, 13467, 3992, 5043, 290, 6908, 262, 3117, 656, 8218, 257, 9206, 11, 5086, 262, 15250, 284, 13110, 262, 3117, 338, 1848, 319, 326, 3992, 2524, 13, 9461, 11, 262, 4226, 714, 14561, 257, 1513

In [48]:
# This function was lightly modified from the HuggingFace run_clm.py
# You can find the original function at https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py
# Create a preprocessing function to group our texts together in chunks of 1024
# examples is one row of tokenized_dataset, the keys of the row are ['input_ids', 'attention_mask'], both items are list
def group_texts(examples):
    # Specify our bock size -- 1024
    block_size = 1024
#     print(f"examples.keys(): {examples.keys()}")
    print(f"examples: {examples}")
    
    # Concatenate all the texts together for each example
    concatenated_examples = dict()
    for k in examples.keys():
        print(f"k: {k}")
        print(f"examples[k]: {examples[k]}")
        concatenated_examples[k] = list(chain(*examples[k]))
        
    # Compute the total length of all the text
    print(f"examples.keys(): {examples.keys()}")
    # "input_ids"
    print(f"list(examples.keys())[0]: {list(examples.keys())[0]}")
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    
    # We drop the small remainder of the block
    # If total_length < block_size, we return an empty dict.
    total_length = (total_length // block_size) * block_size
    
    # Split into chunks of 1024
    result = dict()
    # Loop over the keys and texts in the concatenated examples
    print(f"concatenated_examples.items(): {concatenated_examples.items()}")
    for k, t in concatenated_examples.items():
        print(f"---for k: {k}, t: {t} in concatenated_examples.items()")
        # Divide each text into chunks of 1024
        chunks = list()
        for i in range(0, total_length, block_size):
            chunks.append(t[i : i + block_size])
        result[k] = chunks
    # Set the "labels" equal to the "input_ids"
    result["labels"] = result["input_ids"].copy()
    
    return result

In [49]:
# Chunk our datasets using the group_texts function
# note that `batched=True`, it will batch  group_texts's row values into groups
# original tokenized_dataset's structure: item["input_ids"] is a one dimentional list. after batch, it's a list of 2 items.
dataset = tokenized_dataset.map(group_texts, batched=True)
print(dataset)

                                                 

examples: {'input_ids': [[32, 8108, 12, 19246, 276, 7343, 286, 10442, 1222, 28715, 28788, 1108, 24897, 6372, 12, 15654, 36883, 357, 55, 5432, 8, 23805, 3051, 618, 25, 220, 1318, 389, 1115, 1388, 6982, 286, 1395, 5432, 25, 220, 4874, 262, 17412, 4226, 318, 25077, 11, 262, 15250, 460, 1620, 257, 4996, 286, 17412, 4568, 13, 383, 15250, 714, 4351, 2839, 1321, 11, 884, 355, 14746, 326, 743, 2291, 6246, 1321, 11, 422, 262, 3117, 338, 4572, 284, 262, 15250, 13, 383, 15250, 714, 3758, 17412, 7007, 284, 257, 3992, 2524, 319, 8378, 286, 262, 3117, 11, 543, 714, 307, 2592, 4923, 284, 262, 2524, 611, 262, 3117, 468, 18382, 18850, 284, 6687, 326, 2524, 13, 1380, 3929, 3434, 714, 307, 973, 284, 33836, 13467, 3992, 5043, 290, 6908, 262, 3117, 656, 8218, 257, 9206, 11, 5086, 262, 15250, 284, 13110, 262, 3117, 338, 1848, 319, 326, 3992, 2524, 13, 9461, 11, 262, 4226, 714, 14561, 257, 15131, 287, 262, 3992, 6444, 2346, 5457, 2263, 625, 262, 3117, 338, 4572, 11, 3360, 6412, 284, 355, 366, 19472, 12, 1525



In [13]:
# Set up our data collator for training. Since our model is PyTorch, we need to specify return_tensors as "pt"
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

In [14]:
# Establish our training arguments
training_args = TrainingArguments(
    output_dir="finetune_gpt2",
    per_device_train_batch_size=1,
    save_strategy="no"
)

In [15]:
# Put everything into our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

In [16]:
# Run the trainer
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=3, training_loss=3.286179224650065, metrics={'train_runtime': 69.9297, 'train_samples_per_second': 0.086, 'train_steps_per_second': 0.043, 'total_flos': 3135504384000.0, 'train_loss': 3.286179224650065, 'epoch': 3.0})

In [17]:
# Specify an input string
input_string = "Cross-Site Scripting is a vulnerability that"

# Tokenize our input string
input_ids = tokenizer(input_string, return_tensors="pt").input_ids

# Generate model output_ids
outputs = model.generate(
    input_ids,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
)

# Decode the output tokens to text
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print our output!
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Cross-Site Scripting is a vulnerability that allows remote attackers to execute arbitrary code, such as by
