## Download Dataset

In [1]:
#!/bin/bash
!curl -L -o /kaggle/working/tinystories-narrative-classification.zip https://www.kaggle.com/api/v1/datasets/download/thedevastator/tinystories-narrative-classification

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  575M  100  575M    0     0  38.6M      0  0:00:14  0:00:14 --:--:-- 41.9M


In [2]:
!unzip /kaggle/working/tinystories-narrative-classification.zip

Archive:  /kaggle/working/tinystories-narrative-classification.zip
  inflating: train.csv               
  inflating: validation.csv          


## Setting up environment

In [3]:
import pandas as pd
csv_data = pd.read_csv("train.csv")
csv_data.head()

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."


In [4]:
import csv

# Open the CSV file
with open('/kaggle/working/train.csv', "r") as file:
    csv_reader = csv.reader(file)
    
    # Iterate through the rows
    for i, row in enumerate(csv_reader):
        if i == 1:  # Index 1 corresponds to the second row (0-based indexing)
            print(row)
            break  # Exit after printing the second row

['One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.']


In [5]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
!rm /kaggle/working/output_story_dataset1.txt.gz
!rm /kaggle/working/output_story_dataset2.txt.gz
!rm /kaggle/working/story_dataset.txt.gz
!rm /kaggle/working/tinystories-narrative-classification.zip

rm: cannot remove '/kaggle/working/output_story_dataset1.txt.gz': No such file or directory
rm: cannot remove '/kaggle/working/output_story_dataset2.txt.gz': No such file or directory
rm: cannot remove '/kaggle/working/story_dataset.txt.gz': No such file or directory


## Create Dataset which can be understand to be trained by GPT2

In [9]:
import csv
import re
import spacy
import gzip
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from multiprocessing import Pool, cpu_count
import pandas as pd
import time

In [10]:
# Download NLTK resources
download('stopwords')
download('punkt')

# Load spaCy model for advanced NLP
nlp = spacy.load("en_core_web_sm")

# Define stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Function to extract keywords in order
def extract_ordered_terms(text):
    # Tokenize and clean the text
    tokens = word_tokenize(re.sub(r'[^\w\s]', '', text.lower()))
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Process text with spaCy
    doc = nlp(text)

    terms = []
    for token in doc:
        if token.text.lower() in filtered_tokens:
            if token.pos_ in {"VERB", "NOUN"}:  # Actions or objects
                terms.append(token.text)
        if token.ent_type_ in {"GPE", "LOC", "PERSON"}:  # Places or names
            terms.append(token.text)
        if token.text.lower() in {"happy", "sad", "angry", "excited", "scared", "love"}:  # Emotions
            terms.append(token.text)

    # Deduplicate while preserving order
    seen = set()
    ordered_terms = [term for term in terms if not (term in seen or seen.add(term))]
    return ordered_terms

# Function to process a single row
def process_row(row):
    story = row.strip().replace("\n", " ")
    if not story:
        return None  # Skip empty stories

    keywords = extract_ordered_terms(story)
    formatted_story = (
        f"<|startoftext|>Keywords: {', '.join(keywords)}\n"
        f"Story: {story}<|endoftext|>\n"
    )
    return formatted_story

# Function to process a batch of stories
def process_batch(batch):
    return [process_row(row) for row in batch if row]

# Function to process the dataset in chunks with parallel processing
def process_csv_in_chunks(input_file, output_file, chunksize=10000):
    num_cores = cpu_count()
    print(f"Using {num_cores} CPU cores for parallel processing.")

    # Calculate total rows for live status updates
    total_rows = sum(1 for _ in open(input_file)) - 1  # Subtract header row

    processed_count = 0
    start_time = time.time()

    with gzip.open(output_file, "wt") as output:
        for chunk_idx, chunk in enumerate(pd.read_csv(input_file, chunksize=chunksize)):
            stories = chunk['text'].dropna().tolist()  # Ensure no null values

            # Split stories into smaller batches for parallel processing
            batches = [stories[i:i + chunksize // num_cores] for i in range(0, len(stories), chunksize // num_cores)]

            with Pool(num_cores) as pool:
                results = pool.map(process_batch, batches)

            # Flatten the results and filter out None values
            flat_results = [item for sublist in results for item in sublist if item]

            # Write to output file
            output.writelines(flat_results)

            # Update progress
            processed_count += len(stories)
            elapsed_time = time.time() - start_time
            percentage_complete = (processed_count / total_rows) * 100
            print(f"Chunk {chunk_idx + 1}: Processed {processed_count}/{total_rows} stories ({percentage_complete:.2f}%) in {elapsed_time:.2f} seconds.")

In [12]:
# Main function to execute the process
def main():
    input_file = "/kaggle/working/train.csv"  # Update with your input file path
    output_file = "/kaggle/working/output_story_dataset1.txt.gz"  # Output as compressed file

    print("Starting dataset processing...")
    process_csv_in_chunks(input_file, output_file)
    print("Processing complete. Output saved to:", output_file)

In [None]:
if __name__ == "__main__":
    main()

## Create Dataset which can be understand to be trained by GPT2 2 - OLD Way

In [None]:
import csv
import re
import spacy
import gzip
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from multiprocessing import Pool, cpu_count
import pandas as pd
import time

In [None]:
# Download NLTK resources
download('stopwords')
download('punkt')

# Load spaCy model for advanced NLP
nlp = spacy.load("en_core_web_sm")

# Define stopwords
stop_words = set(stopwords.words("english"))

In [None]:
# Function to extract keywords in order
def extract_ordered_terms(text):
    # Tokenize and clean the text
    tokens = word_tokenize(re.sub(r'[^\w\s]', '', text.lower()))
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Process text with spaCy
    doc = nlp(text)

    terms = []
    for token in doc:
        if token.text.lower() in filtered_tokens:
            if token.pos_ in {"VERB", "NOUN"}:  # Actions or objects
                terms.append(token.text)
        if token.ent_type_ in {"GPE", "LOC", "PERSON"}:  # Places or names
            terms.append(token.text)
        if token.text.lower() in {"happy", "sad", "angry", "excited", "scared", "love"}:  # Emotions
            terms.append(token.text)

    # Deduplicate while preserving order
    seen = set()
    ordered_terms = [term for term in terms if not (term in seen or seen.add(term))]
    return ordered_terms

# Function to process a single row
def process_row(row):
    story = row.strip().replace("\n", " ")
    if not story:
        return None  # Skip empty stories

    keywords = extract_ordered_terms(story)
    formatted_story = (
        f"<|startoftext|>Keywords: {', '.join(keywords)}\n"
        f"Story: {story}<|endoftext|>\n"
    )
    return formatted_story

# Function to process a batch of stories
def process_batch(batch):
    return [process_row(row) for row in batch if row]

# Function to process the dataset in chunks with parallel processing
def process_csv_in_chunks(input_file, output_file, chunksize=10000):
    num_cores = cpu_count()
    print(f"Using {num_cores} CPU cores for parallel processing.")

    # Calculate total rows for live status updates
    total_rows = sum(1 for _ in open(input_file)) - 1  # Subtract header row

    processed_count = 0
    start_time = time.time()

    with gzip.open(output_file, "wt") as output:
        for chunk_idx, chunk in enumerate(pd.read_csv(input_file, chunksize=chunksize)):
            stories = chunk['text'].dropna().tolist()  # Ensure no null values

            # Split stories into smaller batches for parallel processing
            batches = [stories[i:i + chunksize // num_cores] for i in range(0, len(stories), chunksize // num_cores)]

            with Pool(num_cores) as pool:
                results = pool.map(process_batch, batches)

            # Flatten the results and filter out None values
            flat_results = [item for sublist in results for item in sublist if item]

            # Write to output file
            output.writelines(flat_results)

            # Update progress
            processed_count += len(stories)
            elapsed_time = time.time() - start_time
            percentage_complete = (processed_count / total_rows) * 100
            print(f"Chunk {chunk_idx + 1}: Processed {processed_count}/{total_rows} stories ({percentage_complete:.2f}%) in {elapsed_time:.2f} seconds.")

In [None]:
# Main function to execute the process
def main():
    input_file = "/kaggle/working/train.csv"  # Update with your input file path
    output_file = "/kaggle/working/output_story_dataset2.txt.gz"  # Output as compressed file

    print("Starting dataset processing...")
    process_csv_in_chunks(input_file, output_file)
    print("Processing complete. Output saved to:", output_file)

In [None]:
if __name__ == "__main__":
    main()

In [None]:
# import gzip
# import shutil

# def extract_gz(input_file, output_file):
#     """
#     Extracts a .gz file.

#     :param input_file: Path to the .gz file.
#     :param output_file: Path to save the extracted file.
#     """
#     with gzip.open(input_file, 'rb') as gz_file:
#         with open(output_file, 'wb') as out_file:
#             shutil.copyfileobj(gz_file, out_file)
#     print(f"Extracted {input_file} to {output_file}")

# # Example usage
# input_gz_file = "/kaggle/working/output_story_dataset.txt.gz"
# output_extracted_file = "/kaggle/working/output_story_dataset.txt"
# extract_gz(input_gz_file, output_extracted_file)


Extracted /kaggle/working/output_story_dataset.txt.gz to /kaggle/working/output_story_dataset.txt


## Try2

In [12]:
# import csv
# import re
# import spacy
# import gzip
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk import download
# from multiprocessing import Pool, cpu_count
# import pandas as pd
# import time
# from concurrent.futures import ThreadPoolExecutor
# import threading

In [None]:
# Download NLTK resources
download('stopwords')
download('punkt')

# Load spaCy model for advanced NLP
nlp = spacy.load("en_core_web_sm")

# Define stopwords
stop_words = set(stopwords.words("english"))

In [39]:
# Function to extract keywords in order
def extract_ordered_terms(text):
    # Tokenize and clean the text
    tokens = word_tokenize(re.sub(r'[^\w\s]', '', text.lower()))
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Process text with spaCy
    doc = nlp(text)

    terms = []
    for token in doc:
        if token.text.lower() in filtered_tokens:
            if token.pos_ in {"VERB", "NOUN"}:  # Actions or objects
                terms.append(token.text)
        if token.ent_type_ in {"GPE", "LOC", "PERSON"}:  # Places or names
            terms.append(token.text)
        if token.text.lower() in {"happy", "sad", "angry", "excited", "scared", "love"}:  # Emotions
            terms.append(token.text)

    # Deduplicate while preserving order
    seen = set()
    ordered_terms = [term for term in terms if not (term in seen or seen.add(term))]
    return ordered_terms

# Function to process a single row
def process_row(row):
    story = row.strip().replace("\n", " ")
    if not story:
        return None  # Skip empty stories

    keywords = extract_ordered_terms(story)
    formatted_story = f"<|keywords|>{', '.join(keywords)}<|story|>{story}<|endoftext|>"
    return formatted_story

# Function to process a batch of stories
def process_batch(batch):
    return [process_row(row) for row in batch if row]

# Function to process the dataset in chunks with parallel processing
def process_csv_in_chunks(input_file, output_file, chunksize=10000):
    num_cores = cpu_count()
    print(f"Using {num_cores} CPU cores for parallel processing.")

    # Calculate total rows for live status updates
    total_rows = sum(1 for _ in open(input_file)) - 1  # Subtract header row

    processed_count = 0
    start_time = time.time()

    with gzip.open(output_file, "wt") as output:
        for chunk_idx, chunk in enumerate(pd.read_csv(input_file, chunksize=chunksize)):
            stories = chunk['text'].dropna().tolist()  # Ensure no null values

            # Split stories into smaller batches for parallel processing
            batches = [stories[i:i + chunksize // num_cores] for i in range(0, len(stories), chunksize // num_cores)]

            with Pool(num_cores) as pool:
                results = pool.map(process_batch, batches)

            # Flatten the results and filter out None values
            flat_results = [item for sublist in results for item in sublist if item]

            # Write to output file
            output.writelines(flat_results)

            # Update progress
            processed_count += len(stories)
            elapsed_time = time.time() - start_time
            percentage_complete = (processed_count / total_rows) * 100
            print(f"Chunk {chunk_idx + 1}: Processed {processed_count}/{total_rows} stories ({percentage_complete:.2f}%) in {elapsed_time:.2f} seconds.")

In [40]:
# Main function to execute the process
def main():
    input_file = "/kaggle/working/train.csv"  # Update with your input file path
    output_file = "/kaggle/working/output_story_dataset.txt.gz"  # Output as compressed file

    print("Starting dataset processing...")
    process_csv_in_chunks(input_file, output_file)
    print("Processing complete. Output saved to:", output_file)

Processed 2800 / 2119719 rows


In [45]:
# if __name__ == "__main__":
#     main()
print("Helo world")

Helo world


In [47]:
# from transformers import GPT2Tokenizer

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.add_special_tokens({"additional_special_tokens": ["<|keywords|>", "<|story|>", "<|endoftext|>"]})

# # Format each row with tokens
# formatted_data = []
# for _, row in csv_data.iterrows():
#     formatted_row = f"<|keywords|>{row['keywords']}<|story|>{row['text']}<|endoftext|>"
#     formatted_data.append(formatted_row)

# # Save to a text file
# with open("formatted_stories.txt", "w") as f:
#     f.write("\n".join(formatted_data))

In [48]:
# import pandas as pd
# import logging
# from concurrent.futures import ThreadPoolExecutor
# from threading import Lock

# lock = Lock()

# def process_batch(batch, batch_idx, file_name, total_rows):
#     keywords_batch = [extract_ordered_terms(text) for text in batch]
#     batch_df = pd.DataFrame(zip(keywords_batch, batch), columns=["keywords", "text"])
    
#     with lock:
#         batch_df.to_csv(file_name, mode='a', header=(batch_idx == 0), index=False)
#         processed_rows = (batch_idx + 1) * len(batch)
#         print(f"Processed {min(processed_rows, total_rows)} / {total_rows} rows", flush=True )

# def main():
#     # Load dataset
#     csv_data = pd.read_csv("train.csv")
#     stories = csv_data["text"].tolist()
#     total_rows = len(stories)
#     batch_size = 100
#     file_name = "keyword_story_pairs_final.csv"

#     # Clear file before writing
#     open(file_name, 'w').close()

#     with ThreadPoolExecutor() as executor:
#         futures = []
#         for i in range(0, total_rows, batch_size):
#             batch = stories[i:i+batch_size]
#             futures.append(executor.submit(process_batch, batch, i // batch_size, file_name, total_rows))
        
#         for future in futures:
#             future.result()

#     logging.info("Processing completed!")

In [50]:
# if __name__ == "__main__":
#     main()
print("Hello world")

Hello world
Processed 73100 / 2119719 rows
Processed 68500 / 2119719 rows
Processed 68600 / 2119719 rows


**# Model Training

In [12]:
from datasets import Dataset, DatasetDict
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import re
import json
import os

# Custom Dataset Class
class StoryDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []

        # Read and parse the dataset
        with open(file_path, 'r', encoding='utf-8') as f:
            data = f.read()
        
        # Extract all keyword-story pairs
        pattern = re.compile(r'<\|keywords\|>(.*?)<\|story\|>(.*?)<\|endoftext\|>', re.DOTALL)
        matches = pattern.findall(data)

        i = 0
        for keywords, story in matches:
            # Format the input text with special tokens
            if(i<5):
                formatted_text = f"<|keywords|> {keywords.strip()} <|story|> {story.strip()} <|endoftext|>"
                self.examples.append(formatted_text)
                i = i+1
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        text = self.examples[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten()  # Same as input_ids for language modeling
        }

# Initialize Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add Special Tokens
special_tokens = {
    'additional_special_tokens': ['<|keywords|>', '<|story|>', '<|endoftext|>']
}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

# Load the tokenized dataset
tokenized_dataset_10k = StoryDataset('/kaggle/input/storydataset2/formatted_text.txt', tokenizer)

# Save the tokenized dataset to disk (ensure that saving works)
os.makedirs("/kaggle/working/tokenized_dataset_10k", exist_ok=True)

# Save as JSON
with open("/kaggle/working/tokenized_dataset_10k/dataset.json", 'w') as f:
    json.dump(tokenized_dataset_10k.examples, f)

# Load the saved dataset
with open("/kaggle/working/tokenized_dataset_10k/dataset.json", 'r') as f:
    data = json.load(f)

# Convert back to a Huggingface Dataset format
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"text": data}),
    "test": Dataset.from_dict({"text": data}),
})

# Tokenize dataset for proper input formatting
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Apply the tokenizer
tokenized_dataset = dataset_dict.map(tokenize_function, batched=True)

# Split into training and validation
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    remove_unused_columns=False  # Avoid removing unused columns
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

# Train the Model
trainer.train()

# Save the Model
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")


KeyboardInterrupt: 

In [1]:
import re

def preprocess_data(file_path):
    # Read and parse the dataset
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    # Extract all keyword-story pairs
    pattern = re.compile(r'<\|keywords\|>(.*?)<\|story\|>(.*?)<\|endoftext\|>', re.DOTALL)
    matches = pattern.findall(data)

    data_pairs = []
    
    i = 0
    for keywords, story in matches:
        # Format the input text with special tokens
        if(i<5):
            formatted_text = f"<|keywords|> {keywords.strip()} <|story|> {story.strip()} <|endoftext|>"
            data_pairs.append(formatted_text)
            i = i+1
    return data_pairs



    
    # # Read the text file
    # with open(file_path, 'r') as file:
    #     dataset = file.readlines()

    # # Pattern to extract the data
    # pattern = r"<\|keywords\|>(.*?)<\|story\|>(.*?)<\|endoftext\|>"
    
    # data_pairs = []

    # i=0
    # # Extract keywords and story from each entry
    # for entry in dataset:
    #     match = re.search(pattern, entry)
    #     if match:
    #         keywords = match.group(1).strip()
    #         story = match.group(2).strip()
    #         # Format it for GPT-2
    #         if(i<5):
    #             data_pairs.append(f"<|keywords|> {keywords} <|story|> {story}")
    #             i= i+1
    
    # return data_pairs


In [2]:
from transformers import GPT2Tokenizer
from datasets import Dataset

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Add a padding token (GPT-2 does not have one by default)
tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding

# Load and preprocess the dataset from the file
data_pairs = preprocess_data('/kaggle/input/storydataset2/formatted_text.txt')  # Replace with your file path

# Convert it to Hugging Face Dataset format
train_dataset = Dataset.from_dict({'text': data_pairs})

# Tokenize the dataset
# def tokenize_function(examples):
#     return tokenizer(examples['text'], padding="max_length", truncation=True)
def tokenize_function(examples):
    return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)


train_dataset = train_dataset.map(tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [3]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",  # Directory where the model will be saved
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [52]:
# /kaggle/working/keyword_story_pairs_43.csv

Processed 68700 / 2119719 rows


In [51]:
# import pandas as pd
# import spacy

# # Load the CSV
# csv_data = pd.read_csv("/kaggle/working/keyword_story_pairs_6.csv")

Processed 73200 / 2119719 rows


In [53]:
# from transformers import GPT2Tokenizer

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.add_special_tokens({"additional_special_tokens": ["<|keywords|>", "<|story|>", "<|endoftext|>"]})

# # Format each row with tokens
# formatted_data = []
# for _, row in csv_data.iterrows():
#     formatted_row = f"<|keywords|>{row['keywords']}<|story|>{row['text']}<|endoftext|>"
#     formatted_data.append(formatted_row)

# # Save to a text file
# with open("formatted_stories.txt", "w") as f:
#     f.write("\n".join(formatted_data))

Processed 68800 / 2119719 rows


In [1]:
# !gunzip /kaggle/working/output_story_dataset.txt.gz

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import re


In [3]:
# Custom Dataset Class
class StoryDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []

        # Read and parse the dataset
        with open(file_path, 'r', encoding='utf-8') as f:
            data = f.read()
        
        # Extract all keyword-story pairs
        pattern = re.compile(r'<\|keywords\|>(.*?)<\|story\|>(.*?)<\|endoftext\|>', re.DOTALL)
        matches = pattern.findall(data)
        
        for keywords, story in matches:
            # Format the input text with special tokens
            formatted_text = f"<|keywords|> {keywords.strip()} <|story|> {story.strip()} <|endoftext|>"
            self.examples.append(formatted_text)
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        text = self.examples[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten()
        }

In [6]:
# Initialize Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add Special Tokens
special_tokens = {
    'additional_special_tokens': ['<|keywords|>', '<|story|>', '<|endoftext|>']
}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [8]:
from datasets import load_dataset, Dataset, load_from_disk
# from datasets import load_from_disk
tokenized_dataset_10k  = StoryDataset('/kaggle/input/storydataset2/formatted_text.txt', tokenizer)

# Save the tokenized dataset to disk
tokenized_dataset_10k.save_to_disk("/kaggle/working/tokenized_dataset_10k")

from datasets import load_dataset, Dataset, load_from_disk
# from datasets import load_from_disk
tokenized_dataset_10k = load_from_disk("/kaggle/working/tokenized_dataset_10k")
# Split into training and validation
tokenized_dataset = tokenized_dataset_10k.train_test_split(test_size=0.1)

# # Split into training and validation
# tokenized_dataset = tokenized_dataset_10k.train_test_split(test_size=0.1)

# 4. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True
)

# 5. Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

# 6. Train the Model
trainer.train()

# 7. Save the Model
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

AttributeError: 'StoryDataset' object has no attribute 'save_to_disk'

In [10]:
# Prepare Dataset
dataset = StoryDataset('/kaggle/input/storydataset2/formatted_text.txt', tokenizer)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=500,
    logging_steps=10,  # Log more frequently
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_dir='./logs',  # Explicitly define logging directory
    report_to="all",  # Ensures logging is enabled
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

In [11]:
# Train the Model
trainer.train()

<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [None]:
# Save the Model
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

In [None]:
# Generate Story from Keywords
def generate_story(keywords, max_length=100):
    prompt = f"<|keywords|> {keywords} <|story|>"
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        early_stopping=True
    )
    
    story = tokenizer.decode(output[0], skip_special_tokens=False)
    # Extract the generated story (remove the prompt and end token)
    story = story.split('<|story|>')[1].split('<|endoftext|>')[0].strip()
    return story

# Example Usage
keywords = "day, girl, playing, ball"
generated_story = generate_story(keywords)
print(generated_story)

In [2]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load the formatted dataset
dataset = load_dataset("text", data_files="/kaggle/input/storydataset2/formatted_text.txt")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Initialize the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Update for new tokens

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=3e-5,
    save_steps=500,
    logging_steps=100,
    fp16=True,  # Enable if using a GPU
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Train
trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [12]:
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
# from datasets import load_dataset
# import pandas as pd

# # 1. Load and prepare the dataset
# csv_data = pd.read_csv("/kaggle/working/keyword_story_pairs_6.csv")

# # 2. Add special tokens to the tokenizer (including pad_token)
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.add_special_tokens({
#     "pad_token": "<|pad|>",
#     "additional_special_tokens": ["<|keywords|>", "<|story|>", "<|endoftext|>"]
# })

# # 3. Resize model embeddings
# model = GPT2LMHeadModel.from_pretrained("gpt2")
# model.resize_token_embeddings(len(tokenizer))

# # 4. Format the dataset with special tokens
# formatted_data = []
# for story in csv_data["text"]:
#     # Simulate keywords (replace with your keyword extraction logic)
#     keywords = "magic, adventure, forest"  # Example placeholder
#     formatted_data.append(f"<|keywords|>{keywords}<|story|>{story}<|endoftext|>")

# # 5. Tokenize the dataset
# dataset = load_dataset("text", data_files={"train": formatted_data})



In [13]:
# def generate_story(keywords, max_length=300):
#     prompt = f"<|keywords|>{keywords}<|story|>"
#     input_ids = tokenizer.encode(prompt, return_tensors="pt")
    
#     output = model.generate(
#         input_ids,
#         max_length=max_length,
#         do_sample=True,
#         temperature=0.9,
#         top_p=0.92,
#         repetition_penalty=1.2,
#         pad_token_id=tokenizer.eos_token_id,
#         eos_token_id=tokenizer.eos_token_id
#     )
    
#     # Decode and clean the output
#     full_text = tokenizer.decode(output[0], skip_special_tokens=False)
#     story = full_text.split("<|story|>")[1].replace("<|endoftext|>", "").strip()
#     return story

In [14]:
# # Example usage
# keywords = "dragon, castle, magic, sword"
# print(generate_story(keywords))

## #######################

In [18]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# 1. Load the CSV
csv_data = pd.read_csv("/kaggle/working/keyword_story_pairs_6.csv")

# 2. Format the data with special tokens and save to a text file
output_file = "/kaggle/working/formatted_stories.txt"
with open(output_file, "w") as f:
    for story in csv_data["text"]:
        # Simulate keywords (replace with your logic)
        keywords = "magic, adventure, forest"  # Example placeholder
        formatted_line = f"<|keywords|>{keywords}<|story|>{story}<|endoftext|>\n"
        f.write(formatted_line)

In [19]:
from datasets import load_dataset

# 3. Load the dataset from the saved file
dataset = load_dataset("text", data_files={"train": output_file})

Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
# 4. Initialize tokenizer with pad_token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({
    "pad_token": "<|pad|>",
    "additional_special_tokens": ["<|keywords|>", "<|story|>", "<|endoftext|>"]
})

# 5. Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/623 [00:00<?, ? examples/s]

In [21]:
# 6. Train the model (same as before)
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [24]:
# Initialize the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Update for new tokens

# Training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=3,
#     per_device_train_batch_size=4,
#     learning_rate=3e-5,
#     save_steps=500,
#     logging_steps=100,
#     fp16=True,  # Enable if using a GPU
# )

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=3e-5,
    save_steps=500,
    logging_steps=10,  # Log more frequently
    # evaluation_strategy="steps",
    # eval_steps=50,  # Optional, if you have a validation dataset
    fp16=True,  
    # report_to="none",  # Prevents reporting to external services
)


# Trainer-
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# Train
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-24-12a3e036867d>", line 38, in <cell line: 38>
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2164, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2429, in _inner_training_loop
    self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer_callback.py", line 469, in on_train_begin
    return self.call_event("on_train_begin", args, state, control)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer_callback.py", line 519, in call_event
    result = getattr(callback, event)(
  File "/usr/local/lib/python3.10/dist-packages/transformers/integra

TypeError: object of type 'NoneType' has no len()

In [None]:
def generate_story(keywords, max_length=300):
    prompt = f"<|keywords|>{keywords}<|story|>"
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    
    output = model.generate(
        input_ids,
        max_length=max_length,
        do_sample=True,
        temperature=0.9,
        top_p=0.92,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    
    # Decode and clean the output
    full_text = tokenizer.decode(output[0], skip_special_tokens=False)
    story = full_text.split("<|story|>")[1].replace("<|endoftext|>", "").strip()
    return story

# Example usage
keywords = "dragon, castle, magic, sword"
print(generate_story(keywords))