<a href="https://colab.research.google.com/github/shubhamsinghal03/creative-writing-assistant/blob/main/training_inference_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade fsspec==2024.9.0
!pip install torch transformers datasets flask

Collecting fsspec==2024.9.0
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platfor

In [3]:
# This script handles the training and fine-tuning of the GPT-2 model.

from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import DataLoader
import torch

# Load dataset and tokenizer
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Split the dataset to use a smaller portion
# For example, use 10% of the training data
small_train_dataset = dataset["train"].train_test_split(test_size=0.9)["train"]

# Resize tokenizer vocabulary to include all tokens in the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"],
                     truncation=True,
                     padding="max_length",
                     max_length=128,
                     return_tensors='pt')

# Tokenize the smaller dataset
tokenized_dataset = small_train_dataset.map(
    tokenize_function, batched=True,
    remove_columns=["text"])

tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
train_loader = DataLoader(tokenized_dataset, batch_size=8, shuffle=True)

# Load model and optimizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = input_ids.clone()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

# Save the model
model.save_pretrained("models/creative_writing_gpt2")
tokenizer.save_pretrained("models/creative_writing_gpt2")

Map:   0%|          | 0/3671 [00:00<?, ? examples/s]

Epoch 1, Loss: 1.4090852644335081
Epoch 2, Loss: 1.2258222644543986
Epoch 3, Loss: 1.127451516671638


('models/creative_writing_gpt2/tokenizer_config.json',
 'models/creative_writing_gpt2/special_tokens_map.json',
 'models/creative_writing_gpt2/vocab.json',
 'models/creative_writing_gpt2/merges.txt',
 'models/creative_writing_gpt2/added_tokens.json')

In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import argparse

# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("models/creative_writing_gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("models/creative_writing_gpt2")

# Set the pad token to the EOS token to avoid warnings
tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Generate story
def generate_story(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape, device=device)  # Create an attention mask
    output = model.generate(input_ids,
                            max_length=200,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2,  # Prevent repetition of n-grams
                            early_stopping=True,  # Stop early if EOS token is generated
                            pad_token_id=tokenizer.eos_token_id  # Set pad token ID
                            )
    return tokenizer.decode(output[0], skip_special_tokens=True)

prompt = "story about an old man"
story = generate_story(prompt)
print(story)



story about an old man named John , who was born in 1843 , is a recurring theme in the series . John is the son of a wealthy merchant , and his father is an accomplished lawyer . He is also a devout Christian , but his family is not well off , so he is forced to work for a small business . His father dies when he refuses to pay his bills , which John takes advantage of by hiring a lawyer , John 's brother , to help him out . The lawyer is hired by a local lawyer named William , a former lawyer who is now a successful lawyer in his own right . William is able to get John to sign a contract , promising him a job with a large firm , while John promises to stay in school , work at a restaurant , or work as a nurse . 

