In [None]:
%pip install tensorflow numpy requests bs4

import os
import requests
import time
from bs4 import BeautifulSoup
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense

In [10]:
# Seed pages to scrape (sites that host children's stories)
story_urls = [
    "https://www.storyberries.com/",
    "http://www.magickeys.com/books/",
    "https://www.mainlesson.com/",
    "https://www.storynory.com/",
    "https://www.worldoftales.com/",
    "https://www.freekidsbooks.org/",
    "https://www.fairytales.biz/",
    "https://americanliterature.com/childrens-stories",
    "https://www.kidsworldfun.com/story-contest/",
    "https://www.shortkidstories.com/",
]

# Folder that receives one text file per scraped page.
# exist_ok=True keeps the cell re-runnable when the folder is already there.
data_dir = "scraped_data"
os.makedirs(data_dir, exist_ok=True)

# Function to extract text content from a webpage
def scrape_story(url, file_index):
    """Download ``url`` and save its longer paragraphs as a text file.

    The text of every ``<p>`` element longer than 30 characters is joined
    with newlines and written (UTF-8) to ``story_<file_index>.txt`` inside
    the module-level ``data_dir``. Request failures are reported with
    ``print`` instead of being raised, so one bad site does not stop a batch.

    Args:
        url: Address of the page to fetch.
        file_index: Number used to build the output file name.

    Returns:
        The path of the file that was written, or ``None`` when the request
        failed or the page contained no paragraph longer than 30 characters.
    """
    # Some servers reject the default python-requests User-Agent (for example
    # with "406 Not Acceptable"), so send an explicit one. This is a common
    # mitigation, not a guarantee that every site will accept the request.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; story-scraper/1.0)"}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Extracting text content - Adjust selectors based on website structure.
    # get_text() is evaluated once per paragraph and reused for the filter.
    paragraph_texts = (para.get_text() for para in soup.find_all("p"))
    story_text = "\n".join(text for text in paragraph_texts if len(text) > 30)

    if not story_text:
        # Nothing worth saving on this page (only short or no paragraphs).
        return None

    # Save story to a text file
    file_path = os.path.join(data_dir, f"story_{file_index}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(story_text)
    print(f"Saved: {file_path}")
    return file_path

# Scrape every site in turn; the file index is the site's position in the list
for i, url in enumerate(story_urls):
    scrape_story(url, i)
    time.sleep(2)  # To avoid getting blocked

# Report the directory that was actually written to (data_dir) rather than a
# hard-coded name that can drift out of sync with the configuration.
print(f"Scraping completed. Stories saved in '{data_dir}' directory.")

Saved: scraped_data\story_0.txt
Saved: scraped_data\story_1.txt
Saved: scraped_data\story_2.txt
Saved: scraped_data\story_3.txt
Saved: scraped_data\story_4.txt
Saved: scraped_data\story_5.txt
Error scraping https://www.fairytales.biz/: 406 Client Error: Not Acceptable for url: https://www.fairytales.biz/
Saved: scraped_data\story_7.txt
Saved: scraped_data\story_8.txt
Saved: scraped_data\story_9.txt
Scraping completed. Stories saved in 'scraped_stories' directory.


In [11]:
# Read all files and combine text
def load_data(directory):
    """Concatenate the contents of every regular file in ``directory``.

    Files are read as UTF-8 in sorted name order so the combined text is the
    same on every run. Entries that are not regular files (for example a
    ``.ipynb_checkpoints`` folder) are skipped instead of crashing ``open``.

    Args:
        directory: Path of the folder holding the text files.

    Returns:
        A single string in which each file's content is followed by a
        newline; an empty string when the folder contains no files.
    """
    chunks = []
    # os.listdir() returns names in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue  # nested directories cannot be read as text
        with open(path, "r", encoding="utf-8") as file:
            chunks.append(file.read())
    # Join once at the end instead of growing a string inside the loop.
    return "".join(f"{chunk}\n" for chunk in chunks)

# Merge every scraped story file into one corpus string and report its size
text = load_data(directory=data_dir)
print(f"Total characters in text: {len(text)}")

Total characters in text: 33523


In [12]:
# Build the vocabulary from the whole corpus.
# One extra slot is added because the Keras Tokenizer numbers words from 1
# and leaves index 0 unassigned (it is the value used for padding below).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Each '.'-delimited chunk contributes all of its prefixes of length >= 2,
# so the last token of every prefix can serve as the word to predict.
input_sequences = []
for sentence in text.split('.'):
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

# Left-pad every prefix to the length of the longest one
max_seq_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')

# Features are all tokens but the last; the label is the last token, one-hot encoded
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

X.shape, y.shape


((5054, 179), (5054, 1894))

In [None]:
# Define the LSTM Model
# The input length is declared with an explicit Input layer because the
# Embedding `input_length` argument is deprecated in recent Keras releases.
# Each sample is a row of max_seq_length-1 integer token ids (see X above).
model = Sequential([
    tf.keras.Input(shape=(max_seq_length - 1,), dtype="int32"),
    Embedding(total_words, 50),
    LSTM(100, return_sequences=True),  # pass the full sequence to the next LSTM
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(total_words, activation='softmax')  # one probability per vocabulary index
])

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels in y
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 153ms/step - accuracy: 0.0220 - loss: 7.2044
Epoch 2/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 102ms/step - accuracy: 0.0305 - loss: 6.6705
Epoch 3/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 120ms/step - accuracy: 0.0322 - loss: 6.4381
Epoch 4/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 283ms/step - accuracy: 0.0344 - loss: 6.2356
Epoch 5/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 237ms/step - accuracy: 0.0414 - loss: 6.0248
Epoch 6/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 106ms/step - accuracy: 0.0421 - loss: 5.8614
Epoch 7/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 107ms/step - accuracy: 0.0480 - loss: 5.6693
Epoch 8/100
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 114ms/step - accuracy: 0.0519 - loss: 5.4465
Epoch 9/

In [None]:
# Text generation function
def generate_text(seed_text, next_words=10):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    On every step the current text is tokenized with the module-level
    ``tokenizer``, left-padded to ``max_seq_length - 1`` and fed to ``model``;
    the highest-scoring vocabulary index is mapped back to its word and
    appended after a single space.

    Args:
        seed_text: Starting phrase.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words. Generation stops early
        when the predicted index has no word (such as the padding index 0),
        because the unchanged input would yield the same prediction again.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length-1, padding='pre')
        # verbose=0 avoids printing one progress bar per generated word.
        probabilities = model.predict(token_list, verbose=0)
        # A batch of one row is predicted; reduce it to a plain Python int.
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # index_word is the tokenizer's reverse mapping of word_index, which
        # replaces a linear scan over the whole vocabulary with one lookup.
        word = tokenizer.index_word.get(predicted_index)
        if word is None:
            break
        seed_text += " " + word
    return seed_text

# Continue a seed phrase with 50 model-predicted words and show the result
seed_phrase = "A boy named Ivan   "
print(generate_text(seed_phrase, next_words=50))