## Download Dataset

In [None]:
#!/bin/bash
# NOTE: the shebang above is only a comment inside a notebook cell; the `!` prefix is what hands the line to a shell.
# Download the dataset archive through the Kaggle API: -L follows redirects, -o sets the local file name.
!curl -L -o /kaggle/working/tinystories-narrative-classification.zip https://www.kaggle.com/api/v1/datasets/download/thedevastator/tinystories-narrative-classification

In [None]:
!unzip /kaggle/working/tinystories-narrative-classification.zip

## Setting up environment

In [None]:
import pandas as pd
# Load train.csv (path is relative to the current working directory) into a DataFrame
csv_data = pd.read_csv("train.csv")
# Last expression of the cell: display the first rows as a quick sanity check
csv_data.head()

In [None]:
import csv

# Open the CSV file.
# newline="" is what the csv module requires so that line breaks inside quoted fields
# are parsed by csv.reader itself; the explicit encoding avoids depending on the locale.
with open('/kaggle/working/train.csv', "r", newline="", encoding="utf-8") as file:
    csv_reader = csv.reader(file)

    # Iterate through the records until the second one (the first after the header row)
    for i, row in enumerate(csv_reader):
        if i == 1:  # Index 1 corresponds to the second record (0-based indexing)
            print(row)
            break  # Exit after printing the second record

In [None]:
pip install nltk

In [None]:
import nltk
# Fetch the 'punkt_tab' resource through NLTK's downloader
# (presumably the tokenizer data word_tokenize needs on recent NLTK releases — confirm for the installed version)
nltk.download('punkt_tab')

In [None]:
!python -m spacy download en

In [None]:
!rm /kaggle/working/output_story_dataset1.txt.gz
!rm /kaggle/working/output_story_dataset2.txt.gz
!rm /kaggle/working/story_dataset.txt.gz
!rm /kaggle/working/tinystories-narrative-classification.zip

## Create a dataset in a format GPT-2 can be trained on

In [None]:
import csv
import re
import spacy
import gzip
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from multiprocessing import Pool, cpu_count
import pandas as pd
import time

In [None]:
# Download NLTK resources: the stop-word lists read below and the 'punkt' data
# (presumably tokenizer data for word_tokenize — confirm which resource the installed NLTK version needs)
download('stopwords')
download('punkt')

# Load spaCy model for advanced NLP; extract_ordered_terms reads part-of-speech tags
# and entity types from the documents it produces
nlp = spacy.load("en_core_web_sm")

# Define stopwords as a set of the English stop-word list for fast membership checks
stop_words = set(stopwords.words("english"))

In [None]:
# Function to extract keywords in order
def extract_ordered_terms(text):
    """Return the keywords of ``text`` in order of first appearance.

    A spaCy token is kept when at least one of the following holds:

    * its lower-cased text is a non-stop-word longer than two characters
      (judged on the punctuation-stripped, lower-cased text) and spaCy tags
      it as a verb or noun;
    * spaCy labels it as part of a GPE, LOC or PERSON entity;
    * its lower-cased text is one of a fixed list of emotion words.

    Args:
        text: The story text to analyse.

    Returns:
        A list of token texts in their original casing, with duplicates
        removed (case-sensitively) while preserving first-occurrence order.
    """
    # Constant lookup tables, built once per call instead of once per token.
    content_pos = {"VERB", "NOUN"}  # Actions or objects
    entity_labels = {"GPE", "LOC", "PERSON"}  # Places or names
    emotion_words = {"happy", "sad", "angry", "excited", "scared", "love"}

    # Tokenize and clean the text. A set gives constant-time membership tests;
    # the previous list made the loop below quadratic in the story length.
    cleaned_text = re.sub(r'[^\w\s]', '', text.lower())
    candidate_words = {
        word
        for word in word_tokenize(cleaned_text)
        if word not in stop_words and len(word) > 2
    }

    # Process the raw (unstripped) text with spaCy so tagging sees real sentences.
    terms = []
    for token in nlp(text):
        lowered = token.text.lower()
        is_content_word = lowered in candidate_words and token.pos_ in content_pos
        if is_content_word or token.ent_type_ in entity_labels or lowered in emotion_words:
            terms.append(token.text)

    # Deduplicate while preserving order (dicts keep insertion order).
    return list(dict.fromkeys(terms))

# Function to process a single row
def process_row(row):
    """Turn one raw story into a GPT-2 training record.

    The story is stripped of surrounding whitespace and its newlines are
    replaced by spaces. Returns ``None`` when nothing is left; otherwise a
    two-line string holding the extracted keywords and the story, wrapped in
    ``<|startoftext|>`` / ``<|endoftext|>`` markers.
    """
    flattened = row.strip().replace("\n", " ")
    if flattened == "":
        # Skip empty stories
        return None

    keyword_line = ", ".join(extract_ordered_terms(flattened))
    header = f"<|startoftext|>Keywords: {keyword_line}\n"
    body = f"Story: {flattened}<|endoftext|>\n"
    return header + body

# Function to process a batch of stories
def process_batch(batch):
    """Apply ``process_row`` to every truthy entry of ``batch``.

    Falsy entries (empty strings, ``None``) are dropped before processing.
    The returned list may still contain ``None`` for stories that turn out to
    be blank after stripping.
    """
    formatted = []
    for entry in batch:
        if not entry:
            continue
        formatted.append(process_row(entry))
    return formatted

def _count_stories(input_file, chunksize):
    """Count the non-null ``text`` values in ``input_file`` by parsing it as CSV."""
    return sum(
        int(chunk["text"].notna().sum())
        for chunk in pd.read_csv(input_file, usecols=["text"], chunksize=chunksize)
    )


# Function to process the dataset in chunks with parallel processing
def process_csv_in_chunks(input_file, output_file, chunksize=10000):
    """Convert the ``text`` column of a CSV file into a gzipped training file.

    The CSV is read ``chunksize`` rows at a time. Each chunk's non-null stories
    are split into batches, formatted in parallel by ``process_batch`` on a
    pool of worker processes, and appended to ``output_file``. A progress line
    is printed after every chunk.

    Args:
        input_file: Path of the CSV file; it must have a ``text`` column.
        output_file: Path of the gzip-compressed text file to (over)write.
        chunksize: Number of CSV rows loaded per iteration.
    """
    num_cores = cpu_count()
    print(f"Using {num_cores} CPU cores for parallel processing.")

    # Calculate total stories for live status updates. Counting parsed CSV
    # records (rather than physical lines) keeps the percentage right when
    # stories contain line breaks, and leaves no file handle open.
    total_rows = _count_stories(input_file, chunksize)

    # Never let the batch size reach 0 (chunksize < num_cores), which would
    # make range() below raise ValueError.
    batch_size = max(1, chunksize // num_cores)

    processed_count = 0
    start_time = time.time()

    # Explicit encoding so the output does not depend on the locale.
    with gzip.open(output_file, "wt", encoding="utf-8") as output:
        for chunk_idx, chunk in enumerate(pd.read_csv(input_file, chunksize=chunksize)):
            stories = chunk['text'].dropna().tolist()  # Ensure no null values

            # Split stories into smaller batches for parallel processing
            batches = [stories[i:i + batch_size] for i in range(0, len(stories), batch_size)]

            # A fresh worker pool is created for each chunk
            with Pool(num_cores) as pool:
                results = pool.map(process_batch, batches)

            # Flatten the results and filter out None values
            flat_results = [item for sublist in results for item in sublist if item]

            # Write to output file
            output.writelines(flat_results)

            # Update progress (guarding against a file without any stories)
            processed_count += len(stories)
            elapsed_time = time.time() - start_time
            percentage_complete = (processed_count / total_rows) * 100 if total_rows else 100.0
            print(f"Chunk {chunk_idx + 1}: Processed {processed_count}/{total_rows} stories ({percentage_complete:.2f}%) in {elapsed_time:.2f} seconds.")

In [None]:
# Main function to execute the process
def main():
    """Run the full conversion from the training CSV to the gzipped story file."""
    source_csv, target_archive = (
        "/kaggle/working/train.csv",
        "/kaggle/working/output_story_dataset2.txt.gz",
    )

    print("Starting dataset processing...")
    process_csv_in_chunks(source_csv, target_archive)
    print("Processing complete. Output saved to:", target_archive)

In [None]:
# Entry point: run the pipeline only when this code executes as the top-level module
if __name__ == "__main__":
    main()