In [1]:
# Environment setup: install the libraries this notebook depends on.
# `%pip` (instead of `!pip`) targets the environment of the running kernel,
# and `-q` keeps the installer output from flooding the notebook.
%pip install -q datasets transformers
%pip install -q -U accelerate
%pip install -q langchain



In [2]:
# Remove any previous extraction, then unpack the corpus from data.zip.
# `-q` stops unzip from printing one line per extracted file.
!rm -rf dataset/ && unzip -q data.zip

Archive:  data.zip
   creating: dataset/processed/
  inflating: dataset/processed/13.txt  
  inflating: dataset/processed/104.txt  
  inflating: dataset/processed/34.txt  
  inflating: dataset/processed/169.txt  
  inflating: dataset/processed/109.txt  
  inflating: dataset/processed/91.txt  
  inflating: dataset/processed/111.txt  
  inflating: dataset/processed/70.txt  
  inflating: dataset/processed/120.txt  
  inflating: dataset/processed/51.txt  
  inflating: dataset/processed/9.txt  
  inflating: dataset/processed/106.txt  
  inflating: dataset/processed/168.txt  
  inflating: dataset/processed/72.txt  
  inflating: dataset/processed/161.txt  
  inflating: dataset/processed/146.txt  
  inflating: dataset/processed/73.txt  
  inflating: dataset/processed/61.txt  
  inflating: dataset/processed/174.txt  
  inflating: dataset/processed/133.txt  
  inflating: dataset/processed/25.txt  
  inflating: dataset/processed/105.txt  
  inflating: dataset/processed/16.txt  
  inflating: datas

In [3]:
import torch
import numpy as np
from datasets import load_dataset
from glob import glob
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
import matplotlib.pyplot as plt

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = "gpt2"
# Load the tokenizer and the causal language model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
from transformers import TrainingArguments, Trainer

In [6]:
# glob() returns matches in arbitrary order, so sort the paths: otherwise the
# files that land in train vs. test can differ from one run to the next.
text_files = sorted(glob('dataset/processed/*.txt'))
assert text_files, "no .txt files found in dataset/processed/ - was data.zip extracted?"

# The first 90% of the (sorted) files are used for training, the rest for evaluation.
split_point = int(0.9 * len(text_files))

ds = load_dataset(
    "text",
    data_files={
        "train": text_files[:split_point],
        "test": text_files[split_point:],
    },
    # Each file becomes a single example instead of being split further.
    sample_by="document"
)


Resolving data files:   0%|          | 0/156 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

In [7]:
# Display the loaded splits and their row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 156
    })
    test: Dataset({
        features: ['text'],
        num_rows: 18
    })
})

In [8]:
# Hyperparameters

CONTEXT_LEN = 512  # token length of each training chunk (used as max_length in tokenize)
BATCH_SIZE = 1  # per-device batch size for both training and evaluation
LR = 2e-4  # learning rate passed to TrainingArguments
EPOCHS = 3  # number of training epochs

In [9]:
def tokenize(element):
    """Turn a batch of documents into fixed-length chunks of token ids.

    The texts in ``element["text"]`` are tokenized with truncation at
    ``CONTEXT_LEN`` tokens and with overflowing tokens returned, so one
    document can yield several chunks. Only chunks whose reported length is
    exactly ``CONTEXT_LEN`` are kept; anything shorter is discarded.

    Returns:
        A dict with a single ``"input_ids"`` key holding the kept chunks.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=CONTEXT_LEN,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep full-size chunks only; shorter ones are dropped.
    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == CONTEXT_LEN
    ]
    return {"input_ids": full_chunks}

In [10]:
# Run `tokenize` over every split in batches. The original columns are removed
# so that only the "input_ids" column it returns is left.
tokenized_datasets = ds.map(
    function=tokenize,
    remove_columns=ds["train"].column_names,
    batched=True,
)

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

In [11]:
# Display how many fixed-length chunks each split produced.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3917
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 430
    })
})

In [12]:
# Use the end-of-sequence token as the padding token
# (presumably because this tokenizer defines no pad token of its own - confirm).
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal language modelling rather than masked-LM collation.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [13]:
# Training configuration: evaluation, logging and checkpointing all run every
# 500 steps; the learning rate follows a cosine schedule after 50 warmup steps.
args = TrainingArguments(
    output_dir="star-trek-tng-script-generator",
    overwrite_output_dir=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    num_train_epochs=EPOCHS,
    weight_decay=0.25,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    learning_rate=LR,
    save_steps=500,
    # Saving every 500 steps over the whole run would otherwise leave dozens of
    # full checkpoints on disk; keep only the two most recent ones.
    save_total_limit=2,
    fp16=False,
    push_to_hub=False,
)

# The Trainer ties together the model, the chunked datasets and the collator.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

In [14]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,3.1502,3.023273
1000,3.0538,2.972767
1500,2.9951,2.943702
2000,2.9891,2.912469
2500,2.9289,2.915861
3000,2.9091,2.900784
3500,2.8916,2.875232
4000,2.8122,2.888089
4500,2.5224,2.889636
5000,2.5284,2.866677


TrainOutput(global_step=11751, training_loss=2.531448085078198, metrics={'train_runtime': 2897.1651, 'train_samples_per_second': 4.056, 'train_steps_per_second': 4.056, 'total_flos': 3070442668032000.0, 'train_loss': 2.531448085078198, 'epoch': 3.0})

In [15]:
from transformers import pipeline

In [16]:
# Pick the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Text-generation pipeline wrapping the fine-tuned model and its tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [17]:
def generate(text, max_length=512):
    """Sample a continuation of ``text`` from the fine-tuned model.

    Args:
        text: Prompt string. It is encoded without adding special tokens.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 512, the value that used to be hard-coded here.

    Returns:
        The first generated sequence decoded back into a string.
    """
    # Move the prompt to the device the model is on rather than hard-coding
    # 'cuda', so generation also works on a CPU-only runtime.
    encoded_prompt = tokenizer.encode(
        text, add_special_tokens=False, return_tensors="pt"
    ).to(model.device)

    output_tokens = model.generate(
        input_ids=encoded_prompt,
        max_length=max_length,
        do_sample=True,  # sample instead of greedy decoding
        num_return_sequences=1,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )

    # Only one sequence is requested, so decode the first (and only) result.
    text_out = tokenizer.decode(output_tokens[0], clean_up_tokenization_spaces=True)

    return text_out


In [18]:
# Prompt: the opening lines of a scene, which the model is asked to continue.
txt = """
DATA: The ship has gone into warp, sir.
RIKER: Who gave the command?
DATA: Apparently no one. Helm and navigation controls are not functioning. Our speed is now warp seven-point-three and holding.
PICARD: Picard to Engineering. Mister La Forge, what's going on down there?
"""

In [19]:
# print() renders the newlines in the generated script instead of showing a string repr.
print(generate(txt))


DATA: The ship has gone into warp, sir.
RIKER: Who gave the command?
DATA: Apparently no one. Helm and navigation controls are not functioning. Our speed is now warp seven-point-three and holding.
PICARD: Picard to Engineering. Mister La Forge, what's going on down there?
GEORDI'S COM VOICE: I don't know... I've gone to check the power systems on the bridge, Captain... but we're losing too much power...
RIKER: Helm, stand by.
The ship goes into warp.
A beat, then WE SEE the IMAGE of an MALE STARFLEET FEMALE COM VOICE, a young, attractive woman in her mid-fifties. She wears a red dress -- blue, white, and black, with a gold "fem" badge, and carries a small WAGER.
The woman wears a blue-colored dress, carrying a bag.
The man does not look present.
Picard's in position, at his desk. He is studying the readouts. The door slides open. A small PANEL OPEN, revealing a CREWMAN. Picard turns on the alien -- a FEMALE CREWMEMBER.
PICARD: Mister Burke.
The stranger opens his hands and strokes his

In [20]:
from huggingface_hub import login

In [21]:
# Interactive Hugging Face Hub login (renders a widget); needed before pushing below.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
# Upload the trained model and training artifacts to the Hugging Face Hub.
trainer.push_to_hub()

events.out.tfevents.1706077411.2545872ab51a.1303.0:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

events.out.tfevents.1706078332.2545872ab51a.5361.0:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/progs2002/star-trek-tng-script-generator/commit/cac99d8029d55ab52ce974569e2e16a72400a3de', commit_message='End of training', commit_description='', oid='cac99d8029d55ab52ce974569e2e16a72400a3de', pr_url=None, pr_revision=None, pr_num=None)