In [3]:
from openai import OpenAI
from dotenv import load_dotenv
import json
import requests

In [4]:
def load_prompt(file):
    """
    Read a prompt file and return its full text.

    Args:
        file (str): Path to the prompt file.

    Returns:
        str: The entire contents of the file.
    """
    # Decode explicitly as UTF-8 so the prompt text does not depend on the
    # platform's default locale encoding.
    with open(file, 'r', encoding='utf-8') as f:
        return f.read()

In [5]:
def get_text_snapshot(web_url, timeout=30):
    """
    Fetch a text snapshot of the webpage using r.jina.ai.

    Args:
        web_url (str): The URL of the webpage to process.
        timeout (float): Seconds to wait for the reader service before giving
            up. Without a timeout a stalled connection would block the
            notebook indefinitely.

    Returns:
        str: The cleaned text content from the webpage, or an error message
        starting with "Error fetching text snapshot:" if the request failed
        or timed out.
    """
    # The reader endpoint takes the target URL appended to its own base URL.
    api_url = f"https://r.jina.ai/{web_url}"

    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported the
        # same way as any other request failure.
        return f"Error fetching text snapshot: {e}"

    return response.text

In [6]:
# Read a local .env file first so its variables are in the environment
# before the client below is constructed.
load_dotenv()
# Shared OpenAI client used by every completion call in this notebook;
# it is created with no explicit arguments.
client = OpenAI()

def find_available_content(url):
    """
    Ask the model which content categories a page offers.

    The system prompt is read from './2024-11-19-content_finder.md' and the
    user message is the text snapshot of `url`.

    Args:
        url (str): The URL of the page to inspect.

    Returns:
        The raw chat-completion object returned by the client.
    """
    # Output contract: a single object holding a free-form "categories" mapping.
    categories_schema = {
        "type": "object",
        "properties": {
            "categories": {
                "description": "The categories of contents to be collected, in a dictionary format. The key is the category name, and the value is the url to contents.",
                "type": "object"
            }
        },
        "additionalProperties": False
    }

    # Read the prompt before fetching the page, so a missing prompt file
    # fails before any network request is made.
    instructions = load_prompt('./2024-11-19-content_finder.md')
    page_text = get_text_snapshot(url)

    return client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": [{"type": "text", "text": instructions}]},
            {"role": "user", "content": [{"type": "text", "text": page_text}]},
        ],
        temperature=0.0,
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "categories_dict", "schema": categories_schema},
        },
    )

# Run category discovery on the site's landing page and keep the completion
# for the parsing cell that follows.
completion_category = find_available_content("https://www.freechildrenstories.com/")
# Trailing expression: display the raw JSON string returned by the model.
completion_category.choices[0].message.content

'{"categories":{"Popular Stories for Kids":"https://www.freechildrenstories.com/","The Robot Bedtime Book":"https://www.freechildrenstories.com/the-robot-bedtime-book-1","The Journey of the Noble Gnarble":"https://www.freechildrenstories.com/the-journey-of-the-noble-gnarble"}}'

In [7]:
# Decode the model's JSON answer; fall back to an empty dict when the
# "categories" key is absent.
categories = json.loads(completion_category.choices[0].message.content).get('categories', {})
# Trailing expression: display the category name -> URL mapping.
categories

{'Popular Stories for Kids': 'https://www.freechildrenstories.com/',
 'The Robot Bedtime Book': 'https://www.freechildrenstories.com/the-robot-bedtime-book-1',
 'The Journey of the Noble Gnarble': 'https://www.freechildrenstories.com/the-journey-of-the-noble-gnarble'}

In [7]:
def GPT_boolean(url, query):
    """
    Ask the model a yes/no question about the page behind `url`.

    The page's text snapshot and the query are sent as two user messages, and
    the model is asked for a JSON object with a boolean "decision" field.

    Args:
        url (str): The URL to evaluate.
        query (str): The question or statement to check against the page.

    Returns:
        bool: The model's decision. Any failure (API error, malformed JSON,
        missing or non-boolean decision) is printed and reported as False.
    """
    system_prompt = "Some times AI returns incorrect urls, You are a helpful assistant that evaluates that checks the webpage snap shot and determine if there actually stories (with title and urls) in that page. Return 'True' if the content is relevant to the query and 'False' otherwise. return false if page not found or story not found."

    # Output contract: one boolean field named "decision".
    decision_schema = {
        "type": "object",
        "properties": {
            "decision": {
                "description": "Whether or not the URL is related to the query.",
                "type": "boolean"
            }
        },
        "additionalProperties": False
    }

    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": get_text_snapshot(url)},
                {"role": "user", "content": query},
            ],
            temperature=0.0,
            response_format={
                "type": "json_schema",
                "json_schema": {"name": "GPT_boolean", "schema": decision_schema},
            },
        )

        verdict = json.loads(completion.choices[0].message.content).get("decision")

        # Only a real boolean counts as an answer; anything else is treated
        # as a failure and handled by the except branch.
        if not isinstance(verdict, bool):
            raise ValueError("Unexpected response from GPT model.")
        return verdict

    except Exception as e:
        print(f"Error occurred: {e}")
        return False


# Try GPT_boolean on a single listing page; the cell's value is the returned boolean.
GPT_boolean('https://storiestogrowby.org/fairy-tales-for-kids/', "this website contains > 10 stories (with title and urls)")

True

In [8]:
# Validate every discovered category page and drop the ones the model rejects.
# The loop walks a snapshot of the items, so entries can be removed from
# `categories` while iterating.
for category, category_url in list(categories.items()):
    if not GPT_boolean(category_url, "Does this website contain > 10 stories (with title and urls)?"):
        print(f"The category '{category}' does not contain stories for kids.")
        del categories[category]
        continue
    print(f"The category '{category}' contains stories for kids.")

# Trailing expression: display the categories that survived the check.
categories

The category 'Bedtime Stories' contains stories for kids.
The category 'Fairy Tales' contains stories for kids.
The category 'Folktales' contains stories for kids.
The category 'Holiday Stories' contains stories for kids.
The category 'Moral Stories' contains stories for kids.


{'Bedtime Stories': 'https://storiestogrowby.org/bedtime-stories-for-kids/',
 'Fairy Tales': 'https://storiestogrowby.org/fairy-tales-for-kids/',
 'Folktales': 'https://storiestogrowby.org/world-tales-for-kids',
 'Holiday Stories': 'https://storiestogrowby.org/free-holiday-stories/',
 'Moral Stories': 'https://storiestogrowby.org/classroom-challenge-topics-for-teachers/'}

In [9]:
def get_titles_and_urls(url):
    """
    Ask the model to list the story titles and links found on a page.

    The system prompt is read from './2024-11-19-get_title_and_url.md' and the
    user message is the text snapshot of `url`.

    Args:
        url (str): The URL of the listing page.

    Returns:
        The raw chat-completion object returned by the client. The requested
        JSON has a "titles_and_urls" mapping and a "next_page" string.
    """
    # Output contract: a title -> URL mapping plus a pagination link.
    listing_schema = {
        "type": "object",
        "properties": {
            "titles_and_urls": {
                "description": "A dictionary where keys are content titles and values are the corresponding URLs.",
                "type": "object",
                # Every value in the mapping must be a string (the URL).
                "additionalProperties": {
                    "type": "string",
                    "description": "The URL corresponding to the title."
                }
            },
            "next_page": {
                "description": "The URL of the next page to fetch titles and URLs from. Empty means no more pages.",
                "type": "string"
            }
        },
        # Both keys must be present and no others are allowed.
        "required": ["titles_and_urls", "next_page"],
        "additionalProperties": False
    }

    # Read the prompt before fetching the page, so a missing prompt file
    # fails before any network request is made.
    instructions = load_prompt('./2024-11-19-get_title_and_url.md')
    page_text = get_text_snapshot(url)

    return client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": [{"type": "text", "text": instructions}]},
            {"role": "user", "content": [{"type": "text", "text": page_text}]},
        ],
        temperature=0.0,
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "titles_dict", "schema": listing_schema},
        },
    )


## test on one category
#completion_titles = get_titles_and_urls(categories['Moral Stories'])
#completion_titles.choices[0].message.content

In [10]:
#titles_and_urls = json.loads(completion_titles.choices[0].message.content).get('titles_and_urls', {})
#titles_and_urls

In [11]:
# List each remaining category followed by its URL, one value per line.
for category, category_url in categories.items():
    print(category, category_url, sep="\n")

Bedtime Stories
https://storiestogrowby.org/bedtime-stories-for-kids/
Fairy Tales
https://storiestogrowby.org/fairy-tales-for-kids/
Folktales
https://storiestogrowby.org/world-tales-for-kids
Holiday Stories
https://storiestogrowby.org/free-holiday-stories/
Moral Stories
https://storiestogrowby.org/classroom-challenge-topics-for-teachers/


In [12]:
# Flat index of every story found: title -> (story URL, category name).
titles_and_urls = {}

for category, category_url in categories.items():
    completion_titles = get_titles_and_urls(category_url)
    parsed_answer = json.loads(completion_titles.choices[0].message.content)
    titles_and_urls_json = parsed_answer.get('titles_and_urls', {})
    print(f"Category: {category}")

    for title, url in titles_and_urls_json.items():
        # The index is keyed by title alone, so a title listed under several
        # categories keeps only the entry from the last category processed.
        # NOTE(review): confirm that last-wins is the intended behaviour.
        entry = (url, category)
        titles_and_urls[title] = entry
        print(f"{title}: {entry}")

Category: Bedtime Stories
Cinderella (Classic): ('https://storiestogrowby.org/story/cinderella-fairy-tale-english-story-for-kids/', 'Bedtime Stories')
Hansel and Gretel: ('https://storiestogrowby.org/story/hansel-and-gretel-bedtime-stories-for-kids/', 'Bedtime Stories')
Rumpelstiltskin: ('https://storiestogrowby.org/story/early-reader-rumpelstiltskin-fairy-tale-english-stories-kids/', 'Bedtime Stories')
Beauty and the Beast: ('https://storiestogrowby.org/story/beauty-the-beast/', 'Bedtime Stories')
Pinocchio: ('https://storiestogrowby.org/story/pinocchio-fairy-tale-story-english-kids/', 'Bedtime Stories')
The Ugly Duckling: ('https://storiestogrowby.org/story/the-ugly-duckling-story-a-fairy-tale-story-for-kids/', 'Bedtime Stories')
Sleeping Beauty: ('https://storiestogrowby.org/story/sleeping-beauty-fairy-tale-story-bedtime-stories-for-kids/', 'Bedtime Stories')
Rapunzel: ('https://storiestogrowby.org/story/early-reader-rapunzel-fairy-tale-story-kids/', 'Bedtime Stories')
The Little Me

In [13]:
titles_and_urls

{'Cinderella (Classic)': ('https://storiestogrowby.org/story/cinderella-fairy-tale-english-story-for-kids/',
  'Fairy Tales'),
 'Hansel and Gretel': ('https://storiestogrowby.org/story/hansel-and-gretel-bedtime-stories-for-kids/',
  'Bedtime Stories'),
 'Rumpelstiltskin': ('https://storiestogrowby.org/story/early-reader-rumpelstiltskin-fairy-tale-english-stories-kids/',
  'Bedtime Stories'),
 'Beauty and the Beast': ('https://storiestogrowby.org/story/beauty-the-beast/',
  'Fairy Tales'),
 'Pinocchio': ('https://storiestogrowby.org/story/pinocchio-fairy-tale-story-english-kids/',
  'Bedtime Stories'),
 'The Ugly Duckling': ('https://storiestogrowby.org/story/the-ugly-duckling-story-a-fairy-tale-story-for-kids/',
  'Bedtime Stories'),
 'Sleeping Beauty': ('https://storiestogrowby.org/story/sleeping-beauty-fairy-tale-story-bedtime-stories-for-kids/',
  'Fairy Tales'),
 'Rapunzel': ('https://storiestogrowby.org/story/early-reader-rapunzel-fairy-tale-story-kids/',
  'Fairy Tales'),
 'The L

In [14]:
# One-off trial of story extraction on a single title.
# Values in `titles_and_urls` are (url, category) tuples, so `story_url` is a
# tuple and its first element is the actual URL.
story_url = titles_and_urls['The Girl and the Puma']

# The request sends the extraction prompt as the system message and the
# page's text snapshot as the user message.
completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": load_prompt('./2024-11-19-get_content.md')
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        # Index 0 of the (url, category) tuple is the story URL.
                        "text": get_text_snapshot(story_url[0])
                    }
                ]
            }
        ],
        temperature=0.0,
        # Requested output: {"content": {title, author, content}, "next_page": str}.
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "story_dict",
                "schema": {
                    "type": "object",
                    "properties": {
                        "content": {
                            "description": "A dictionary containing the extracted story information.",
                            "type": "object",
                            "properties": {
                                "title": {
                                    "description": "The title of the story.",
                                    "type": "string"
                                },
                                "author": {
                                    "description": "The author of the story, if available. Empty string if unknown.",
                                    "type": "string"
                                },
                                "content": {
                                    "description": "The main content of the story.",
                                    "type": "string"
                                }
                            },
                            "required": ["title", "author", "content"],
                            "additionalProperties": False  # No extra fields allowed
                        },
                        "next_page": {
                            "description": "The URL of the next page to fetch more content from. Empty string means no more pages.",
                            "type": "string"
                        }
                    },
                    "required": ["content", "next_page"],  # Ensure these keys are always present
                    "additionalProperties": False  # Disallow extra fields in the top-level object
                }
            }
}
)


In [15]:
# Decode the model's JSON answer and display the resulting dict.
json.loads(completion.choices[0].message.content)

{'content': {'title': 'The Girl and the Puma',
  'author': '',
  'content': "HEAR A FAMOUS LEGEND FROM ARGENTINA, about a powerful friendship between a girl and a puma.\n\nFive hundred years ago when the Spanish entered South America, Native American tribes often fought back against the invaders. One way tribes could put pressure on the Spanish was to surround their settlements. This is what happened in the early 1500's when Maldonado, a Spanish girl, was 15 years old.\n\nNative Americans of the Querandí tribe had encircled the Spanish settlement where Maldonado lived. Before long, the food supply of the Spanish settlers was depleted. The people faced starvation. They begged their captain to allow them to take their chances and leave the settlement in search of food - but this the captain would not allow.\n\nFamished, Maldonado escaped the settlement and fled into the jungle. As night fell, she heard with alarm the calls of wild animals. Where could she safely sleep for the night? But 

In [16]:
def get_content(url):
    """
    Ask the model to extract a story (title, author, text) from a page.

    The system prompt is read from './2024-11-19-get_content.md' and the user
    message is the text snapshot of `url`.

    Args:
        url (str): The URL of the story page.

    Returns:
        The raw chat-completion object returned by the client. The requested
        JSON has a "content" object (with "title", "author" and "content"
        strings) and a "next_page" string.
    """
    story_schema = {
        "type": "object",
        "properties": {
            "content": {
                "description": "A dictionary containing the extracted story information.",
                "type": "object",
                "properties": {
                    "title": {
                        "description": "The title of the story.",
                        "type": "string"
                    },
                    "author": {
                        "description": "The author of the story, if available. Empty string if unknown.",
                        "type": "string"
                    },
                    "content": {
                        "description": "The main content of the story.",
                        "type": "string"
                    }
                },
                "required": ["title", "author", "content"],
                "additionalProperties": False  # No extra fields allowed
            },
            "next_page": {
                "description": "The URL of the next page to fetch more content from. Empty string means no more pages.",
                "type": "string"
            }
        },
        "required": ["content", "next_page"],  # Ensure these keys are always present
        "additionalProperties": False  # Disallow extra fields in the top-level object
    }

    # Read the prompt before fetching the page, so a missing prompt file
    # fails before any network request is made.
    instructions = load_prompt('./2024-11-19-get_content.md')
    page_text = get_text_snapshot(url)

    return client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": [{"type": "text", "text": instructions}]},
            {"role": "user", "content": [{"type": "text", "text": page_text}]},
        ],
        temperature=0.0,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "story_dict",
                # Without strict mode the schema is only best-effort guidance.
                # Every object here lists all of its properties as required
                # and forbids extras, which is what strict mode needs.
                "strict": True,
                "schema": story_schema,
            },
        },
    )


In [17]:
# Extract every indexed story; this issues one model call per title.
stories = {}
for title, (url, category) in titles_and_urls.items():
    print(f"Title: {title}")
    print(f"Category: {category}")
    print(f"URL: {url}")
    print("")
    completion = get_content(url)

    # Parse the answer once and read both fields from the same object.
    parsed = json.loads(completion.choices[0].message.content)
    content = parsed.get('content', {})
    next_page = parsed.get('next_page', "")  # should be empty

    # Only the first page is fetched, so report a non-empty next_page
    # instead of silently keeping a partial story.
    if next_page:
        print(f"Warning: '{title}' reports a next page that was not fetched: {next_page}")

    # Keep every record a dict with the same three keys. A bare string here
    # would be repeated across all columns when the DataFrame is built.
    # NOTE(review): the saved stories.csv preview shows rows with identical
    # text in every column, presumably from this case; confirm.
    if not isinstance(content, dict):
        content = {'title': title, 'author': "", 'content': str(content)}

    stories[title] = content

Title: Cinderella (Classic)
Category: Fairy Tales
URL: https://storiestogrowby.org/story/cinderella-fairy-tale-english-story-for-kids/

Title: Hansel and Gretel
Category: Bedtime Stories
URL: https://storiestogrowby.org/story/hansel-and-gretel-bedtime-stories-for-kids/

Title: Rumpelstiltskin
Category: Bedtime Stories
URL: https://storiestogrowby.org/story/early-reader-rumpelstiltskin-fairy-tale-english-stories-kids/



KeyboardInterrupt: 

In [20]:
import pandas as pd
# `stories` maps each title to its record, so the frame is built with titles
# as columns and then transposed to get one row per story.
df = pd.DataFrame(stories).T
# Save the table; the title index is written as the first, unnamed column.
df.to_csv('stories.csv')

Unnamed: 0,title,author,content
Cinderella (Classic),Cinderella (Classic),Elaine Lindy,"Once upon a time, a girl named Cinderella live..."
Hansel and Gretel,Hansel and Gretel,,Once upon a time a brother and sister named Ha...


: 

In [1]:
import pandas as pd

# Reload the saved table, using the first CSV column as the index.
df = pd.read_csv('stories.csv', index_col=0)
# Trailing expression: display the reloaded frame.
df

Unnamed: 0,title,author,content
Izzy’s Rainbow,"Once upon a time, there was a little girl name...","Once upon a time, there was a little girl name...","Once upon a time, there was a little girl name..."
When the Button Tree Flowers,When the Button Tree Flowers,Jade Maitre,"Once upon a time, there was a little boy named..."
The House on Chicken Legs,The following story is a Dark Retelling Fairy ...,The following story is a Dark Retelling Fairy ...,The following story is a Dark Retelling Fairy ...
The Beast-Skinned Demon and the Blood Flowers,The following story is a Dark Retelling Fairy ...,The following story is a Dark Retelling Fairy ...,The following story is a Dark Retelling Fairy ...
Luna Altoona and the Toothache Potion,Luna Altoona and the Toothache Potion,Prarthana Gururaj,"Once upon a time, in a small town, there lived..."
The Dark Is Friendly,Bedtime Story for Kids by Jade Maitre\n\nIllus...,Bedtime Story for Kids by Jade Maitre\n\nIllus...,Bedtime Story for Kids by Jade Maitre\n\nIllus...
An Old Lady Who Swallowed A Fly,The classic nursery rhyme... where one silly s...,The classic nursery rhyme... where one silly s...,The classic nursery rhyme... where one silly s...
Halloween Around the World,Halloween is celebrated in various ways around...,Halloween is celebrated in various ways around...,Halloween is celebrated in various ways around...
Doodlebugs,"Meet the Doodlebugs—a group of quirky, adventu...","Meet the Doodlebugs—a group of quirky, adventu...","Meet the Doodlebugs—a group of quirky, adventu..."
Snow White and the Seven Dwarfs,"Once upon a time in the middle of winter, when...","Once upon a time in the middle of winter, when...","Once upon a time in the middle of winter, when..."
