In [None]:
!pip install datasets transformers
!pip install accelerate -U
!pip install langchain



In [None]:
# One-off setup step, left disabled.
# NOTE(review): presumably extracts the corpus that is later read from
# dataset/processed/*.txt — confirm the archive layout matches that path.
#!unzip processed.zip

In [None]:
import inspect
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import pipeline

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal language model from the same pretrained
# checkpoint so their vocabularies match.
checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# the 80/20 train/test split contains the same files on every run and machine.
text_files = sorted(glob('dataset/processed/*.txt'))

# Fail early with a clear message: with fewer than two files one of the splits
# would be empty and load_dataset would raise a far less obvious error.
if len(text_files) < 2:
    raise FileNotFoundError(
        "Expected at least two .txt files in dataset/processed/ "
        f"(found {len(text_files)}); extract the dataset first."
    )

# The first 80% of the files go to training, the remainder to testing.
split_point = int(0.8 * len(text_files))

# sample_by="document" makes every file a single row of the "text" column.
ds = load_dataset("text", data_files={
    "train": text_files[:split_point],
    "test": text_files[split_point:],
}, sample_by="document")


Resolving data files:   0%|          | 0/139 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

In [None]:
# Display the raw splits: one row per source file, with a single "text" column.
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 139
    })
    test: Dataset({
        features: ['text'],
        num_rows: 35
    })
})

In [None]:
# Number of tokens in every training example.
context_length = 256


def tokenize(element):
    """Split a batch of documents into fixed-length token chunks.

    Each text in ``element["text"]`` is tokenized and cut into windows of at
    most ``context_length`` tokens (overflowing tokens become additional
    windows). Only windows that are exactly ``context_length`` tokens long
    are kept; shorter ones are discarded.

    Args:
        element: A batch mapping with a ``"text"`` entry holding the raw
            documents.

    Returns:
        A dict with a single ``"input_ids"`` key mapping to the list of
        retained token-id windows.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep full windows only, so every example has the same length.
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

In [None]:
# Tokenize both splits in batches. The raw columns are dropped because
# tokenize() returns a different number of rows than it receives.
raw_columns = ds["train"].column_names
tokenized_datasets = ds.map(tokenize, batched=True, remove_columns=raw_columns)

In [None]:
# Display the tokenized splits: one row per fixed-length chunk of token ids.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 8278
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 2024
    })
})

In [None]:
# Sanity check: the first training chunk should contain context_length tokens.
len(tokenized_datasets['train'][0]['input_ids'])

256

In [None]:
# Reuse the end-of-sequence token as the padding token for batching.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects the causal (next-token) language-modeling objective
# rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Two keyword arguments used below were renamed in newer transformers releases:
#   TrainingArguments(evaluation_strategy=...) -> eval_strategy=...
#   Trainer(tokenizer=...)                     -> processing_class=...
# The install cell does not pin a version, so pick whichever spelling the
# installed release accepts instead of failing with a TypeError.
args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
)
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = (
    "processing_class" if "processing_class" in trainer_params else "tokenizer"
)

# Training configuration: evaluate and log every 200 steps, checkpoint every
# 500 steps, cosine learning-rate schedule with a short warm-up.
args = TrainingArguments(
    output_dir="star-trek-tng-script-generator",
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_steps=200,
    logging_steps=200,
    num_train_epochs=3,
    weight_decay=0.1,
    warmup_steps=20,
    lr_scheduler_type="cosine",
    learning_rate=1e-3,
    save_steps=500,
    fp16=False,
    push_to_hub=False,
    **{eval_strategy_kwarg: "steps"},
)

# Seed the shuffles with the training seed so the example order (and therefore
# the reported losses) can be reproduced across kernel restarts.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"].shuffle(seed=args.seed),
    eval_dataset=tokenized_datasets["test"].shuffle(seed=args.seed),
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Baseline: evaluate on the test split before any fine-tuning has happened.
trainer.evaluate()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 3.82625150680542,
 'eval_runtime': 31.2972,
 'eval_samples_per_second': 64.67,
 'eval_steps_per_second': 4.058}

In [None]:
# Fine-tune the model; training and validation loss are reported every 200 steps.
trainer.train()

Step,Training Loss,Validation Loss
200,3.1768,3.051923
400,2.9171,2.952019
600,2.7068,2.929824
800,2.5001,2.889929
1000,2.4656,2.839692
1200,2.1925,2.902951
1400,2.1326,2.884715


TrainOutput(global_step=1554, training_loss=2.537913436595077, metrics={'train_runtime': 1552.5233, 'train_samples_per_second': 15.996, 'train_steps_per_second': 1.001, 'total_flos': 1622260876050432.0, 'train_loss': 2.537913436595077, 'epoch': 3.0})

In [None]:
# Release unused cached GPU memory held by PyTorch before evaluating again.
torch.cuda.empty_cache()

# Loss on the test split after fine-tuning, for comparison with the
# pre-training baseline evaluated earlier.
eval_results = trainer.evaluate()

In [None]:
# Display the post-training evaluation metrics.
eval_results

{'eval_loss': 2.8818860054016113,
 'eval_runtime': 33.2774,
 'eval_samples_per_second': 60.822,
 'eval_steps_per_second': 3.816,
 'epoch': 3.0}

In [None]:
# Build a text-generation pipeline around the fine-tuned model.
# `pipeline` comes from transformers (see the notebook's import cell); without
# that import this cell fails with a NameError on a fresh kernel.
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, device=device
)

In [None]:
# Prompt for the generator, written in the script format the model emits:
# an upper-case speaker label followed by the line of dialogue.
# The speaker label was misspelled "PCIARD"; it is corrected here so the
# prompt names the intended character.
txt = """
PICARD:
Deanna, your breasts look so full today.
"""

In [None]:
# Generate a single continuation of the prompt and print its text.
generations = pipe(txt, num_return_sequences=1)
print(generations[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



PCIARD:
Deanna, your breasts look so full today.

Ogawa nods and steps over. Beverly stares after the
glass.

BEVERLY:
This is a lovely woman!
Her arms are


In [None]:
# Upload the fine-tuned model to the Hugging Face Hub.
# NOTE(review): no login cell is visible in this notebook, and the model-loading
# output reports that the HF_TOKEN secret is missing — this presumably needs
# authentication before it can succeed; confirm.
trainer.push_to_hub()