In [3]:
import re
from transformers import GPT2Tokenizer
from datasets import Dataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize NLTK tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and preprocess text
def preprocess_text(text):
    # Remove unnecessary characters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words]
    
    return ' '.join(tokens)

# Read the dataset
def read_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Split the content by genre
    pattern = r'<GENRE: (.*?)>\n(.*?)\nMoral: (.*?)\n'
    matches = re.findall(pattern, content, re.DOTALL)
    
    data = []
    for match in matches:
        genre, story, moral = match
        data.append({
            'genre': genre.strip(),
            'story': preprocess_text(story.strip()),
            'moral': preprocess_text(moral.strip())
        })
    
    return data

# Tokenize the dataset
def tokenize_data(data, tokenizer):
    tokenized_data = {'input_ids': [], 'attention_mask': []}
    for entry in data:
        input_text = f"<GENRE: {entry['genre']}>\n{entry['story']}\nMoral: {entry['moral']}\n"
        tokenized_input = tokenizer(input_text, padding='max_length', truncation=True, max_length=512)
        tokenized_data['input_ids'].append(tokenized_input['input_ids'])
        tokenized_data['attention_mask'].append(tokenized_input['attention_mask'])
    
    return tokenized_data

# Load and preprocess the dataset
file_path = 'stories.txt'
data = read_dataset(file_path)

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the data
tokenized_data = tokenize_data(data, tokenizer)

# Convert to Hugging Face Dataset format
dataset = Dataset.from_dict(tokenized_data)

# Save the processed dataset
dataset.save_to_disk('processed_stories_dataset')

print("Dataset prepared and saved to 'processed_stories_dataset'")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SHYNI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHYNI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SHYNI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SHYNI\AppData\Roaming\nltk_data...


Saving the dataset (0/1 shards):   0%|          | 0/74 [00:00<?, ? examples/s]

Dataset prepared and saved to 'processed_stories_dataset'


In [13]:
from transformers import GPT2Tokenizer

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Save the tokenizer with the new pad token
tokenizer.save_pretrained('gpt2_with_pad')


('gpt2_with_pad\\tokenizer_config.json',
 'gpt2_with_pad\\special_tokens_map.json',
 'gpt2_with_pad\\vocab.json',
 'gpt2_with_pad\\merges.txt',
 'gpt2_with_pad\\added_tokens.json')

In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from datasets import Dataset

# Load the modified tokenizer with padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2_with_pad')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load your preprocessed dataset
dataset = Dataset.load_from_disk('processed_stories_dataset')

# Convert dataset to PyTorch Dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        item = self.dataset[idx]
        item['input_ids'] = torch.tensor(item['input_ids'])
        item['attention_mask'] = torch.tensor(item['attention_mask'])
        return item

    def __len__(self):
        return len(self.dataset)

# Create custom dataset
train_dataset = CustomDataset(dataset)

# Fine-tuning parameters
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()


Step,Training Loss


TrainOutput(global_step=27, training_loss=1.3223454510724102, metrics={'train_runtime': 1618.3994, 'train_samples_per_second': 0.137, 'train_steps_per_second': 0.017, 'total_flos': 56439078912000.0, 'train_loss': 1.3223454510724102, 'epoch': 2.918918918918919})

In [16]:
# Save the model and tokenizer
model_save_path = './final_model'
model.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to ./final_model


In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_with_pad")

print("Model and tokenizer loaded successfully")

ImportError: 
GPT2LMHeadModel requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [23]:
def generate_story(prompt, max_length=150, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.9, repetition_penalty=1.2):
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id
    )
    
    stories = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return stories


prompt = "Once upon a time in a magical forest"
stories = generate_story(prompt, temperature=0.7, top_k=50, top_p=0.9)

for i, story in enumerate(stories):
    print(f"Story {i + 1}:\n{story}\n")

Story 1:
Once upon a time in a magical forest, the young girl named Kirito was able to find her way back home.




In [None]:

# Interactive storytelling function
def interactive_storytelling():
    print("Hello! What story do you want to hear today?")
    print("Type the number to choose:")
    print("1. Fairy Tale")
    print("2. Adventure")
    print("3. Fantasy")
    print("4. Sci-Fi")
    print("5. Mystery")
    print("6. Animal Tale")
    print("7. Fable")
    print("8. Mythology")
    print("9. Historical Fiction")
    print("10. Humor")
    print("11. Friendship")
    print("12. Superheroes")
    print("13. Sports")
    print("14. Holidays")
    print("15. Bedtime")
    
    user_input = input("> ").strip()

    genre_prompts = {
        "1": ("fairy tale", "a brave knight, a clever princess, or a talking animal"),
        "2": ("adventure", "a thrilling journey, a treasure hunt, or a daring expedition"),
        "3": ("fantasy", "elves, dragons, or wizards"),
        "4": ("sci-fi", "distant planets, encounter aliens, or dive into futuristic technology"),
        "5": ("mystery", "uncover a hidden treasure, solve a crime, or reveal a secret"),
        "6": ("animal tale", "a wise owl, a brave lion, or a mischievous monkey"),
        "7": ("fable", "wisdom, kindness, or perseverance"),
        "8": ("mythology", "Greek, Norse, or Egyptian"),
        "9": ("historical fiction", "ancient civilizations, medieval kingdoms, or the roaring twenties"),
        "10": ("humor", "puns, slapstick comedy, or witty banter"),
        "11": ("friendship", "loyalty, compassion, or teamwork"),
        "12": ("superheroes", "flight, super strength, or invisibility"),
        "13": ("sports", "soccer, basketball, or swimming"),
        "14": ("holidays", "Halloween, Christmas, or New Year's Eve"),
        "15": ("bedtime", "dreamlands, whispering forests, or starlit skies")
    }

    if user_input in genre_prompts:
        genre, options = genre_prompts[user_input]
        print(f"Wonderful! Do you want a story about {options}?")
        specific_choice = input("> ").lower()
        prompt = f"<GENRE: {genre}> Once upon a time, in a magical land, there was a {specific_choice} who"
    else:
        print("That's not a valid choice. Please type a number from 1 to 15.")
        return

    # Generate the beginning of the story
    story_parts = [prompt]
    story = generate_response(prompt, model, tokenizer)
    print(story)
    
    while True:
        print("\nWhat happens next?")
        print("Type the number to choose:")
        print("1. The character encounters a challenge.")
        print("2. The character makes a new friend.")
        print("3. The character discovers something amazing.")
        print("4. Summarize the story and finish.")
        user_input = input("> ").strip()

        if user_input == "1":
            prompt = f"The {specific_choice} faced a great challenge. It was..."
            story_parts.append(prompt)
        elif user_input == "2":
            prompt = f"The {specific_choice} made a new friend. This friend was..."
            story_parts.append(prompt)
        elif user_input == "3":
            prompt = f"The {specific_choice} discovered something amazing. It was..."
            story_parts.append(prompt)
        elif user_input == "4":
            print("Summarizing the story and finishing it.")
            story_parts.append(story)
            summary = " ".join(story_parts)
            print(f"\nHere is the summary of your story:\n\n{summary}")
            break
        else:
            print("Invalid choice. Please type 1, 2, 3, or 4.")
            continue

        # Generate the next part of the story
        story = generate_response(prompt, model, tokenizer)
        print(story)

if __name__ == "__main__":

    interactive_storytelling()


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)
     ---------------------------------------- 9.4/9.4 MB 4.4 MB/s eta 0:00:00
Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp39-none-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 4.7 MB/s eta 0:00:00
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.3-cp39-none-win_amd64.whl (287 kB)
     -------------------------------------- 287.9/287.9 kB 6.0 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2
  Using cached huggingface_hub-0.24.2-py3-none-any.whl (417 kB)
Collecting fsspec>=2023.5.0
  Using cached fsspec-2024.6.1-py3-none-any.whl (177 kB)
Installing collected packages: safetensors, fsspec, huggingface-hub, tokenizers, transformers
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2022.7.1
    Uninstalling fsspec-2022.7.1:
      Successfully uninstalled fsspec-2022.7.1
Successfully installed fsspec-2024.6.1 hug

In [4]:
!pip install pytorch

Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pytorch
  Building wheel for pytorch (setup.py): started
  Building wheel for pytorch (setup.py): finished with status 'error'
  Running setup.py clean for pytorch
Failed to build pytorch
Installing collected packages: pytorch
  Running setup.py install for pytorch: started
  Running setup.py install for pytorch: finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [6 lines of output]
  Traceback (most recent call last):
    File "<string>", line 2, in <module>
    File "<pip-setuptools-caller>", line 34, in <module>
    File "C:\Users\SHYNI\AppData\Local\Temp\pip-install-iye9hyye\pytorch_beb20baf11594b6d8c08d1a83ce82361\setup.py", line 15, in <module>
      raise Exception(message)
  Exception: You tried to install "pytorch". The package named for PyTorch is "torch"
  [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for pytorch
  error: subprocess-exited-with-error
  
  Running setup.py install for pytorch did not run successfully.
  exit code: 1
  
  [6 lines of output]
  Traceback (most recent call last):
    File "<string>", line 2, in <module>
    File "<pip-setuptools-caller>", line 34, in <module>
    File "C:\Users\SHYNI\AppData\Local\Te