## Required Libraries

In [None]:
%pip install transformers datasets torch

Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp313-cp313-win_amd64.whl.metadata (1


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Dataset Preprocessing

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer


# --- Configuration -----------------------------------------------------------
# Location of the raw text corpus. Entries are separated by one blank line.
# Edit this path to point at your local copy of the dataset.
DATASET_PATH = r"C:\Users\Sam\Desktop\Instagram_Caption_GenAI_Project\Instagram_Caption_Generator_UsingGenAI\Instagram_Caption_Dataset.txt"
MIN_WORDS = 10     # keep only entries with MORE than this many words
TEST_FRACTION = 0.1
SPLIT_SEED = 42    # fixed so the train/test split is identical on every run

# --- Load and filter the corpus ----------------------------------------------
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    poems = [
        entry.strip()
        for entry in f.read().split("\n\n")
        if len(entry.strip().split()) > MIN_WORDS
    ]

# Fail early with a readable message instead of an opaque error from the split.
if not poems:
    raise ValueError(
        f"No entries with more than {MIN_WORDS} words were found in {DATASET_PATH!r}"
    )

# --- Build the train/test split ----------------------------------------------
dataset = Dataset.from_dict({"text": poems})
# Without a seed the shuffle differs on every run, so results are not
# comparable between runs.
dataset = dataset.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)

# --- Tokenizer ---------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Padding to a fixed length needs a pad token; fall back to EOS when the
# tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def _mask_padding(input_ids, attention_mask, eos_id):
    """Build causal-LM labels for one sequence with padding excluded from the loss.

    Positions where ``attention_mask`` is 0 get the label ``-100``, which the
    Hugging Face loss ignores. When the sequence is right-padded and the first
    padding token is the EOS token, that single position keeps its label so the
    model is still trained to emit EOS after the last real token.
    """
    labels = [tok if keep else -100 for tok, keep in zip(input_ids, attention_mask)]
    n_real = sum(attention_mask)
    # Right-padded means every real token precedes every padding token.
    right_padded = all(attention_mask[:n_real])
    if right_padded and n_real < len(input_ids) and input_ids[n_real] == eos_id:
        labels[n_real] = eos_id
    return labels


def tokenize_function(example):
    """Tokenize ``example["text"]`` and attach causal-LM ``labels``.

    Works for both a single example (``example["text"]`` is a string) and a
    batch (``example["text"]`` is a list of strings, as passed by
    ``Dataset.map(..., batched=True)``).

    Text is truncated / padded to exactly 128 tokens. ``labels`` mirror
    ``input_ids`` except that padding positions are set to ``-100``; otherwise
    the loss is dominated by trivially predictable padding tokens rather than
    the actual text.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    eos_id = tokenizer.eos_token_id

    is_batch = bool(input_ids) and isinstance(input_ids[0], (list, tuple))
    if is_batch:
        tokens["labels"] = [
            _mask_padding(ids, mask, eos_id)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        tokens["labels"] = _mask_padding(input_ids, attention_mask, eos_id)
    return tokens


# Tokenize every split of `dataset`; with batched=True the mapped function is
# called on batches of examples rather than one example at a time.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)



  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 338/338 [00:00<00:00, 8005.71 examples/s]
Map: 100%|██████████| 38/38 [00:00<00:00, 6004.05 examples/s]


In [None]:
%pip install hf_xet

Collecting hf_xet
  Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl.metadata (883 bytes)
Downloading hf_xet-1.1.5-cp37-abi3-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.7 MB ? eta -:--:--
   ------- -------------------------------- 0.5/2.7 MB 1.2 MB/s eta 0:00:02
   ----------- ---------------------------- 0.8/2.7 MB 1.2 MB/s eta 0:00:02
   --------------- ------------------------ 1.0/2.7 MB 1.2 MB/s eta 0:00:02
   --------------- ------------------------ 1.0/2.7 MB 1.2 MB/s eta 0:00:02
   --------------- ------------------------ 1.0/2.7 MB 1.2 MB/s eta 0:00:02
   ------------------- -------------------- 1.3/2.7 MB 857


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Fine-Tuning the GPT-2 Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Single source of truth for where checkpoints, the final model and the
# tokenizer are written.
OUTPUT_DIR = "./poetry-gpt2-finetuned"

model = AutoModelForCausalLM.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Training length is set by max_steps. A positive max_steps overrides
    # num_train_epochs, so the former `num_train_epochs=1` had no effect and
    # has been dropped to avoid suggesting a one-epoch run.
    max_steps=1500,
    per_device_train_batch_size=10,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=50,
    # fp16 mixed precision needs a CUDA device; enabling it unconditionally
    # raises on CPU-only machines.
    fp16=torch.cuda.is_available(),
    # Worker processes are unreliable inside a Windows notebook kernel and give
    # no benefit for a small, already-tokenized dataset, so load in-process.
    dataloader_num_workers=0,
    save_steps=1000,
    save_total_limit=1,
    logging_steps=100,
    logging_dir='./logs',
    # The string "none" disables experiment-tracker integrations. Python None
    # does not: it falls back to reporting to every installed integration.
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded later with from_pretrained().
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [None]:
# Upgrade Jupyter and ipywidgets in the kernel's environment.
# NOTE(review): the recorded run of this cell was interrupted before it
# finished; after a successful install, restart the kernel so the updated
# packages are picked up.
%pip install --upgrade jupyter ipywidgets

^C
Note: you may need to restart the kernel to use updated packages.
