In [2]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape and process each review URL
def scrape_and_process_url(title, url):
    # Full URL
    full_url = base_url + url

    # Determine if it's a movie or TV show
    if "/movie-reviews/" in url:
        content_type = "Movie"
    elif "/tv-reviews/" in url:
        content_type = "TV Show"
    else:
        content_type = "Unknown"

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title
    movie_title = title

    # Extract the rating age from the <span class="rating__age">
    rating_age_element = soup.select_one('span.rating__age')
    rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

    # Extract the review content
    review_content_div = soup.select_one("#review-view-content-grid > div.row")

    # Find all child divs with the class 'content-grid-item content-grid-item--shadow'
    child_divs = review_content_div.find_all('div', class_='content-grid-item content-grid-item--shadow')

    # Extract the 'data-text' attribute, rating, and label from each child div
    reviews = []
    for idx, child_div in enumerate(child_divs, 1):
        data_text = child_div.get('data-text', 'No content found')
        data_text_cleaned = data_text.replace('<p>', '').replace('</p>', '')

        # Find the rating information within the same div
        rating_icons = child_div.select('span.rating__score i.active')
        rating = len(rating_icons) - 1  # Adjust rating by subtracting 1

        # Find the label within the same div
        label_span = child_div.select_one('span.rating__label')
        label = label_span.get_text(strip=True) if label_span else 'No label found'

        reviews.append({
            'content': data_text_cleaned,
            'rating': rating,
            'label': label
        })

    return {
        'title': movie_title,
        'content_type': content_type,
        'rating_age': rating_age,
        'reviews': reviews
    }

# Function to scrape URLs from the list pages
def scrape_list_page(list_url):
    response = requests.get(list_url)
    response.raise_for_status()
    
    # Parse the list page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all titles and URLs on the list page
    titles_and_urls = []
    for item in soup.select('div.list-item'):
        title = item.select_one('h2').get_text(strip=True)
        url = item.select_one('a')['href']
        titles_and_urls.append((title, url))
    
    return titles_and_urls

# Function to scrape URLs from the search results page
def scrape_search_results(search_url):
    response = requests.get(search_url)
    response.raise_for_status()
    
    # Parse the search results page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all titles and URLs on the search result page
    titles_and_urls = []
    for result in soup.select('div.search-result-item'):
        title = result.select_one('a.search-result-item__title').get_text(strip=True)
        url = result.select_one('a.search-result-item__title')['href']
        titles_and_urls.append((title, url))
    
    return titles_and_urls

# URLs of the lists you want to scrape
list_urls = [
    "https://www.commonsensemedia.org/best-of-lists/best-education-kids-animation-tv-shows",
    "https://www.commonsensemedia.org/lists/offbeat-animated-movies"
]

# Search result URL
search_url = "https://www.commonsensemedia.org/search/animated%20shows"

# Process each list URL
all_reviews = []

# Scrape from list URLs
for list_url in list_urls:
    titles_and_urls = scrape_list_page(list_url)
    
    for title, url in titles_and_urls:
        print(f"Processing: {title}")
        review_data = scrape_and_process_url(title, url)
        all_reviews.append(review_data)

# Scrape from search results URL
titles_and_urls = scrape_search_results(search_url)

for title, url in titles_and_urls:
    print(f"Processing: {title}")
    review_data = scrape_and_process_url(title, url)
    all_reviews.append(review_data)

# Save all reviews to a JSON file
with open('all_reviews_content.json', 'w') as file:
    json.dump(all_reviews, file, indent=4)

# Optionally, print the results
for review_data in all_reviews:
    print(f"Title: {review_data['title']}")
    print(f"Content Type: {review_data['content_type']}")
    print(f"Rating Age: {review_data['rating_age']}\n")
    
    for idx, review in enumerate(review_data['reviews'], 1):
        print(f"Review {idx}:")
        print(f"Label: {review['label']}")
        print(f"Content: {review['content']}")
        print(f"Rating: {review['rating']} out of 5\n")
    print("\n")


In [3]:
import requests
from bs4 import BeautifulSoup

# Test URL (replace with an actual review URL)
test_url = "https://www.commonsensemedia.org/tv-reviews/bluey"  # Example URL

# Fetch the content of the page
response = requests.get(test_url)
response.raise_for_status()  # Check if the request was successful

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Print the full HTML content to check the structure
print(soup.prettify())


<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="Positive family, social themes in Aussie pup's adventures. Read Common Sense Media's Bluey review, age rating, and parents guide." name="description"/>
  <meta content="index, follow" name="robots"/>
  <link href="https://www.commonsensemedia.org/tv-reviews/bluey" rel="canonical"/>
  <link href="https://www.commonsensemedia.org/sites/default/files/styles/social_share_image/public/screenshots/csm-tv/bluey-tv-scene-1.jpg" rel="image_src"/>
  <link href="/themes/custom/common_sense/images/favicons/favicon-16x16.png" rel="icon" sizes="16x16"/>
  <link href="/themes/custom/common_sense/images/favicons/favicon-32x32.png" rel="icon" sizes="32x32"/>
  <link href="/themes/custom/common_sense/images/favicons/favicon-96x96.png" rel="icon" sizes="96x96"/>
  <link href="/themes/custom/common_sense/images/favicons/favicon-192x192.png" rel="icon" sizes="192x192"/>
  <link href

In [6]:
import requests
from bs4 import BeautifulSoup

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape and process each review URL
def scrape_and_process_url(url):
    # Full URL
    full_url = base_url + url

    # Determine if it's a movie or TV show
    if "/movie-reviews/" in url:
        content_type = "Movie"
    elif "/tv-reviews/" in url:
        content_type = "TV Show"
    else:
        content_type = "Unknown"

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title using the new selector for <h3> tag
    title_element = soup.select_one('h3.review-title a.link--title')
    movie_title = title_element.get_text(strip=True) if title_element else 'No title found'

    # Extract the rating age using the updated selector
    rating_age_element = soup.select_one('span.rating__age')
    rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

    # Extract the review content
    review_content_div = soup.select_one("#review-view-content-grid > div.row")

    # Find all child divs with the class 'content-grid-item content-grid-item--shadow'
    child_divs = review_content_div.find_all('div', class_='content-grid-item content-grid-item--shadow')

    # Extract the 'data-text' attribute, rating, and label from each child div
    reviews = []
    for idx, child_div in enumerate(child_divs, 1):
        data_text = child_div.get('data-text', 'No content found')
        data_text_cleaned = data_text.replace('<p>', '').replace('</p>', '')

        # Find the rating information within the same div
        rating_icons = child_div.select('span.rating__score i.active')
        rating = len(rating_icons) - 1  # Adjust rating by subtracting 1

        # Find the label within the same div
        label_span = child_div.select_one('span.rating__label')
        label = label_span.get_text(strip=True) if label_span else 'No label found'

        reviews.append({
            'content': data_text_cleaned,
            'rating': rating,
            'label': label
        })

    return {
        'title': movie_title,
        'content_type': content_type,
        'rating_age': rating_age,
        'reviews': reviews
    }

# Example of how to run this function
test_url = "/tv-reviews/bluey"  # Replace this with the actual URL slug
review_data = scrape_and_process_url(test_url)

# Output the extracted data for inspection
print(f"Title: {review_data['title']}")
print(f"Content Type: {review_data['content_type']}")
print(f"Rating Age: {review_data['rating_age']}")
for idx, review in enumerate(review_data['reviews'], 1):
    print(f"Review {idx}:")
    print(f"Label: {review['label']}")
    print(f"Content: {review['content']}")
    print(f"Rating: {review['rating']} out of 5\n")


Title: The Berenstain Bears
Content Type: TV Show
Rating Age: age 4+


In [7]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape and process each review URL
def scrape_and_process_url(url):
    # Full URL
    full_url = base_url + url
    print(f"Scraping URL: {full_url}")  # Debugging: Print each URL being scraped

    # Determine if it's a movie or TV show
    if "/movie-reviews/" in url:
        content_type = "Movie"
    elif "/tv-reviews/" in url:
        content_type = "TV Show"
    else:
        content_type = "Unknown"

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title using the new selector for <h3> tag
    title_element = soup.select_one('h3.review-title a.link--title')
    movie_title = title_element.get_text(strip=True) if title_element else 'No title found'

    # Extract the rating age using the updated selector
    rating_age_element = soup.select_one('span.rating__age')
    rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

    print(f"Title: {movie_title}, Rating Age: {rating_age}")  # Debugging: Print title and rating age

    # Extract the review content (if available)
    review_content_div = soup.select_one("#review-view-content-grid > div.row")
    reviews = []
    if review_content_div:
        child_divs = review_content_div.find_all('div', class_='content-grid-item content-grid-item--shadow')
        for idx, child_div in enumerate(child_divs, 1):
            data_text = child_div.get('data-text', 'No content found')
            data_text_cleaned = data_text.replace('<p>', '').replace('</p>', '')

            # Find the rating information within the same div
            rating_icons = child_div.select('span.rating__score i.active')
            rating = len(rating_icons) - 1  # Adjust rating by subtracting 1

            # Find the label within the same div
            label_span = child_div.select_one('span.rating__label')
            label = label_span.get_text(strip=True) if label_span else 'No label found'

            reviews.append({
                'content': data_text_cleaned,
                'rating': rating,
                'label': label
            })
    else:
        reviews.append({
            'content': 'No reviews found',
            'rating': 'N/A',
            'label': 'N/A'
        })

    return {
        'title': movie_title,
        'content_type': content_type,
        'rating_age': rating_age,
        'reviews': reviews
    }

# Function to scrape URLs from the list pages
def scrape_list_page(list_url):
    print(f"Scraping list page: {list_url}")  # Debugging: Print the list page being scraped
    response = requests.get(list_url)
    response.raise_for_status()
    
    # Parse the list page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all titles and URLs on the list page
    titles_and_urls = []
    for item in soup.select('div.list-item'):
        title = item.select_one('h2').get_text(strip=True)
        url = item.select_one('a')['href']
        titles_and_urls.append((title, url))
        print(f"Found URL: {url} for Title: {title}")  # Debugging: Print each found title and URL
    
    return titles_and_urls

# Function to scrape URLs from the search results page
def scrape_search_results(search_url):
    print(f"Scraping search results page: {search_url}")  # Debugging: Print the search results page being scraped
    response = requests.get(search_url)
    response.raise_for_status()
    
    # Parse the search results page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all titles and URLs on the search result page
    titles_and_urls = []
    for result in soup.select('div.search-result-item'):
        title = result.select_one('a.search-result-item__title').get_text(strip=True)
        url = result.select_one('a.search-result-item__title')['href']
        titles_and_urls.append((title, url))
        print(f"Found URL: {url} for Title: {title}")  # Debugging: Print each found title and URL
    
    return titles_and_urls

# URLs of the lists you want to scrape
list_urls = [
    "https://www.commonsensemedia.org/best-of-lists/best-education-kids-animation-tv-shows",
    "https://www.commonsensemedia.org/lists/offbeat-animated-movies"
]

# Search result URL
search_url = "https://www.commonsensemedia.org/search/animated%20shows"

# Process each list URL and search URL
all_reviews = []

# Scrape from list URLs
for list_url in list_urls:
    titles_and_urls = scrape_list_page(list_url)
    
    for title, url in titles_and_urls:
        print(f"Processing: {title} - {url}")  # Debugging: Print each title being processed
        review_data = scrape_and_process_url(url)
        all_reviews.append(review_data)

# Scrape from search results URL
titles_and_urls = scrape_search_results(search_url)

for title, url in titles_and_urls:
    print(f"Processing: {title} - {url}")  # Debugging: Print each title being processed
    review_data = scrape_and_process_url(url)
    all_reviews.append(review_data)

# Save all reviews to a JSON file
with open('all_reviews_content.json', 'w') as file:
    json.dump(all_reviews, file, indent=4)

# Optionally, print the results for each review
for review_data in all_reviews:
    print(f"Title: {review_data['title']}")
    print(f"Content Type: {review_data['content_type']}")
    print(f"Rating Age: {review_data['rating_age']}")
    for idx, review in enumerate(review_data['reviews'], 1):
        print(f"Review {idx}:")
        print(f"Label: {review['label']}")
        print(f"Content: {review['content']}")
        print(f"Rating: {review['rating']} out of 5\n")


Scraping list page: https://www.commonsensemedia.org/best-of-lists/best-education-kids-animation-tv-shows
Scraping list page: https://www.commonsensemedia.org/lists/offbeat-animated-movies
Scraping search results page: https://www.commonsensemedia.org/search/animated%20shows


In [8]:
import requests
from bs4 import BeautifulSoup

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape and process each review URL
def scrape_and_process_url(url):
    # Full URL
    full_url = base_url + url
    print(f"Scraping URL: {full_url}")  # Debugging: Print each URL being scraped

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title from <h3 class="review-title">
    title_element = soup.select_one('h3.review-title a.link--title')
    movie_title = title_element.get_text(strip=True) if title_element else 'No title found'

    # Extract the age rating from <span class="rating__age">
    rating_age_element = soup.select_one('span.rating__age')
    rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

    # Extract the star rating from <span class="rating__score"> (count active stars)
    rating_stars_element = soup.select('span.rating__score i.icon-star-solid.active')
    star_rating = len(rating_stars_element) if rating_stars_element else 'No star rating found'

    print(f"Title: {movie_title}, Age Rating: {rating_age}, Star Rating: {star_rating}")  # Debugging

    # Return the scraped data
    return {
        'title': movie_title,
        'rating_age': rating_age,
        'star_rating': star_rating
    }

# Example URL
test_url = "/movie-reviews/i-saw-the-tv-glow"

# Scrape and print the data for the test URL
review_data = scrape_and_process_url(test_url)
print(review_data)


Scraping URL: https://www.commonsensemedia.org/movie-reviews/i-saw-the-tv-glow
Title: We're All Going to the World's Fair, Age Rating: age 13+, Star Rating: 32
{'title': "We're All Going to the World's Fair", 'rating_age': 'age 13+', 'star_rating': 32}


In [9]:
import requests
from bs4 import BeautifulSoup

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape titles and age ratings from the search results page
def scrape_titles_and_ratings(search_url):
    # Full URL
    full_url = base_url + search_url
    print(f"Scraping search results page: {full_url}")  # Debugging: Print the search URL being scraped

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all blocks that contain titles and age ratings
    results = soup.select('div.review-info.col')
    
    titles_and_ratings = []
    
    # Loop through all the results and extract title and age rating
    for result in results:
        # Extract the title
        title_element = result.select_one('h3.review-title a.link--title')
        movie_title = title_element.get_text(strip=True) if title_element else 'No title found'

        # Extract the age rating
        rating_age_element = result.select_one('span.rating__age')
        rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

        # Append the extracted data to the list
        titles_and_ratings.append({
            'title': movie_title,
            'rating_age': rating_age
        })

    # Return the list of titles and age ratings
    return titles_and_ratings

# URL of the search results page
search_url = "/search/animated%20tv%20shows"

# Scrape and print the titles and age ratings
titles_and_ratings = scrape_titles_and_ratings(search_url)

# Print each extracted title and age rating
for item in titles_and_ratings:
    print(f"Title: {item['title']}, Age Rating: {item['rating_age']}")


Scraping search results page: https://www.commonsensemedia.org/search/animated%20tv%20shows
Title: I Saw the TV Glow, Age Rating: age 13+
Title: Quiet on Set: The Dark Side of Kids TV, Age Rating: age 15+
Title: Zombies: The Re-Animated Series, Age Rating: age 8+
Title: Ark: The Animated Series, Age Rating: age 14+
Title: The 8 Show, Age Rating: age 15+
Title: The Magic Prank Show with Justin Willman, Age Rating: age 12+
Title: Jerrod Carmichael Reality Show, Age Rating: age 15+
Title: MLB The Show 24, Age Rating: age 8+
Title: Sail Me Away Home: Show Me a Sign, Book 3, Age Rating: age 8+
Title: Set Me Free: Show Me a Sign, Book 2, Age Rating: age 8+
Title: Show Me a Sign, Book 1, Age Rating: age 8+
Title: The Vince Staples Show, Age Rating: age 15+
Title: The Addams Family (Animated TV Series), Age Rating: age 7+
Title: Rabbids Invasion: The Interactive TV Show, Age Rating: age 8+
Title: Catfish: The TV Show, Age Rating: age 14+
Title: Spider-Man (1981 TV Show), Age Rating: age 7+


In [11]:
import requests
from bs4 import BeautifulSoup

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape links to TV show lists from the search results page
def scrape_list_links(search_url):
    # Full URL
    full_url = base_url + search_url
    print(f"Scraping search results page: {full_url}")

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the teaser blocks that contain lists of TV shows
    list_links = []

    # Select the correct blocks containing the links
    results = soup.select('div.site-search-teaser h3.site-search-teaser-title a.link--title')

    # Loop through and extract links to lists
    for result in results:
        title = result.get_text(strip=True)
        url = result['href']
        list_links.append({
            'title': title,
            'url': base_url + url  # Construct the full URL
        })

    return list_links

# URL of the search results page
search_url = "/search/animated+tv+shows"

# Scrape and print the links to lists
list_links = scrape_list_links(search_url)

# Print each link to a list of TV shows
for item in list_links:
    print(f"List Title: {item['title']}, URL: {item['url']}")


Scraping search results page: https://www.commonsensemedia.org/search/animated+tv+shows
List Title: Best Tween TV Shows, URL: https://www.commonsensemedia.org/lists/best-tween-tv-shows
List Title: Educational TV Shows for Kids, URL: https://www.commonsensemedia.org/lists/educational-tv-shows-for-kids
List Title: Family TV Shows to Watch Together, URL: https://www.commonsensemedia.org/lists/family-tv-shows-to-watch-together
List Title: Marvel Cinematic Universe (MCU) Movies and TV Shows in Order, URL: https://www.commonsensemedia.org/lists/marvel-cinematic-universe-mcu-movies-and-tv-shows-in-order


In [12]:
import requests
from bs4 import BeautifulSoup

# Base URL pattern for paginated search results
base_url = "https://www.commonsensemedia.org/search/category/tv/page/{}/sort/date-desc/animated+tv+shows"

# Function to scrape titles and age ratings from a single page
def scrape_page(page_number):
    # Format the URL for the current page
    url = base_url.format(page_number)
    print(f"Scraping page: {url}")

    # Fetch the content of the page
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all blocks that contain titles and age ratings
    results = soup.select('div.review-rating')

    titles_and_ratings = []

    # Loop through each result to extract the title and age rating
    for result in results:
        # Extract the title
        title_element = result.find_previous('h3', class_='review-title').find('a', class_='link--title')
        title = title_element.get_text(strip=True) if title_element else 'No title found'

        # Extract the age rating
        age_rating_element = result.select_one('span.rating__age')
        age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'

        # Append the result
        titles_and_ratings.append({
            'title': title,
            'age_rating': age_rating
        })

    return titles_and_ratings

# Function to scrape multiple pages, starting from page 1
def scrape_all_pages():
    page_number = 1
    all_titles_and_ratings = []

    # Continue scraping until no results are found
    while True:
        titles_and_ratings = scrape_page(page_number)

        # If no results are found, break the loop
        if not titles_and_ratings:
            print(f"No more results found on page {page_number}. Stopping.")
            break

        # Add the results to the full list
        all_titles_and_ratings.extend(titles_and_ratings)
        page_number += 1

    return all_titles_and_ratings

# Scrape all pages and get the titles and age ratings
all_titles_and_ratings = scrape_all_pages()

# Print the results
for item in all_titles_and_ratings:
    print(f"Title: {item['title']}, Age Rating: {item['age_rating']}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/page/1/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/2/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/3/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/4/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/5/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/6/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/7/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/8/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/9/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsens

In [13]:
import requests
from bs4 import BeautifulSoup

# Base URL pattern for paginated search results
base_url = "https://www.commonsensemedia.org/search/category/tv/page/{}/sort/date-desc/animated+tv+shows"

# Function to scrape titles and age ratings from a single page
def scrape_page(page_number):
    # Format the URL for the current page
    url = base_url.format(page_number)
    print(f"Scraping page: {url}")

    # Fetch the content of the page
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all blocks that contain titles and age ratings
    results = soup.select('div.site-search-teaser')  # Select each review teaser block

    titles_and_ratings = []

    # Loop through each result to extract the title and age rating
    for result in results:
        # Extract the title from <h3 class="review-title">
        title_element = result.select_one('h3.review-title a.link--title')
        title = title_element.get_text(strip=True) if title_element else 'No title found'

        # Extract the age rating from <span class="rating__age">
        age_rating_element = result.select_one('span.rating__age')
        age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'

        # Append the result
        titles_and_ratings.append({
            'title': title,
            'age_rating': age_rating
        })

    return titles_and_ratings

# Function to scrape multiple pages, starting from page 1
def scrape_all_pages():
    page_number = 1
    all_titles_and_ratings = []

    # Continue scraping until no results are found
    while True:
        titles_and_ratings = scrape_page(page_number)

        # If no results are found, break the loop
        if not titles_and_ratings:
            print(f"No more results found on page {page_number}. Stopping.")
            break

        # Add the results to the full list
        all_titles_and_ratings.extend(titles_and_ratings)
        page_number += 1

    return all_titles_and_ratings

# Scrape all pages and get the titles and age ratings
all_titles_and_ratings = scrape_all_pages()

# Print the results
for item in all_titles_and_ratings:
    print(f"Title: {item['title']}, Age Rating: {item['age_rating']}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/page/1/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/2/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/3/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/4/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/5/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/6/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/7/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/8/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/9/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsens

In [14]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL pattern for paginated search results
base_url = "https://www.commonsensemedia.org/search/category/tv/page/{}/sort/date-desc/animated+tv+shows"

# Function to scrape titles and age ratings from a single page
def scrape_page(page_number):
    # Format the URL for the current page
    url = base_url.format(page_number)
    print(f"Scraping page: {url}")

    # Fetch the content of the page
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all blocks that contain titles and age ratings
    results = soup.select('div.site-search-teaser')  # Select each review teaser block

    titles_and_ratings = []

    # Loop through each result to extract the title and age rating
    for result in results:
        # Extract the title from <h3 class="review-title">
        title_element = result.select_one('h3.review-title a.link--title')
        title = title_element.get_text(strip=True) if title_element else 'No title found'

        # Extract the age rating from <span class="rating__age">
        age_rating_element = result.select_one('span.rating__age')
        age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'

        # Append the result
        titles_and_ratings.append({
            'title': title,
            'age_rating': age_rating
        })

    return titles_and_ratings

# Function to scrape multiple pages, starting from page 1
def scrape_all_pages():
    page_number = 1
    all_titles_and_ratings = []

    # Continue scraping until no results are found
    while True:
        titles_and_ratings = scrape_page(page_number)

        # If no results are found, break the loop
        if not titles_and_ratings:
            print(f"No more results found on page {page_number}. Stopping.")
            break

        # Add the results to the full list
        all_titles_and_ratings.extend(titles_and_ratings)
        page_number += 1

    return all_titles_and_ratings

# Scrape all pages and get the titles and age ratings
all_titles_and_ratings = scrape_all_pages()

# Save the results to a JSON file
output_file = "titles_and_age_ratings.json"
with open(output_file, 'w') as file:
    json.dump(all_titles_and_ratings, file, indent=4)

print(f"Data saved to {output_file}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/page/1/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/2/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/3/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/4/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/5/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/6/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/7/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/8/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsensemedia.org/search/category/tv/page/9/sort/date-desc/animated+tv+shows
Scraping page: https://www.commonsens

In [15]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL pattern for paginated search results
# This URL pattern follows the format you provided, with the page number being dynamic
base_url = "https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/{}/sort/date-desc"

# Function to scrape titles and age ratings from a single page
def scrape_page(page_number):
    # Format the URL for the current page
    url = base_url.format(page_number)
    print(f"Scraping page: {url}")

    # Fetch the content of the page
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all blocks that contain titles and age ratings
    results = soup.select('div.site-search-teaser')  # Select each review teaser block

    titles_and_ratings = []

    # Loop through each result to extract the title and age rating
    for result in results:
        # Extract the title from <h3 class="review-title">
        title_element = result.select_one('h3.review-title a.link--title')
        title = title_element.get_text(strip=True) if title_element else 'No title found'

        # Extract the age rating from <span class="rating__age">
        age_rating_element = result.select_one('span.rating__age')
        age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'

        # Append the result
        titles_and_ratings.append({
            'title': title,
            'age_rating': age_rating
        })

    return titles_and_ratings

# Function to scrape multiple pages, starting from page 1
def scrape_all_pages():
    page_number = 1
    all_titles_and_ratings = []

    # Continue scraping until no results are found
    while True:
        titles_and_ratings = scrape_page(page_number)

        # If no results are found, break the loop
        if not titles_and_ratings:
            print(f"No more results found on page {page_number}. Stopping.")
            break

        # Add the results to the full list
        all_titles_and_ratings.extend(titles_and_ratings)
        page_number += 1

    return all_titles_and_ratings

# Scrape all pages and get the titles and age ratings
all_titles_and_ratings = scrape_all_pages()

# Save the results to a JSON file
output_file = "kids_animation_titles_and_age_ratings.json"
with open(output_file, 'w') as file:
    json.dump(all_titles_and_ratings, file, indent=4)

print(f"Data saved to {output_file}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/1/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/2/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/3/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/4/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/5/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/6/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/7/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/8/sort/date-desc
Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/

In [16]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL pattern for paginated search results
base_url = "https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/{}/sort/date-desc"

# Function to scrape titles and links from a single page
def scrape_titles_and_links(page_number):
    url = base_url.format(page_number)
    print(f"Scraping page: {url}")
    
    response = requests.get(url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.select('div.site-search-teaser')  # Select each review teaser block

    titles_and_links = []

    # Loop through each result to extract title and link
    for result in results:
        title_element = result.select_one('h3.review-title a.link--title')
        if title_element:
            title = title_element.get_text(strip=True)
            link = "https://www.commonsensemedia.org" + title_element['href']  # Full URL
            titles_and_links.append({
                'title': title,
                'link': link
            })

    return titles_and_links

# Function to scrape the detailed review information from the TV show page
def scrape_review_details(review_url):
    print(f"Scraping review page: {review_url}")
    
    response = requests.get(review_url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the release year
    year_element = soup.select_one('div.review-product-summary span')
    release_year = year_element.get_text(strip=True) if year_element else 'No release year found'
    
    # Extract the age rating
    age_rating_element = soup.select_one('div.review-rating span.rating__age')
    age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'
    
    # Extract the details for violence, sex, language, etc.
    details = []
    rating_criteria = soup.select('div.content-grid-item')  # Select each content grid item
    
    for item in rating_criteria:
        label_element = item.select_one('span.rating__label')
        rating_element = item.select_one('span.rating__score i.active')
        description_element = item.select_one('div.content-grid-content p')

        if label_element and rating_element and description_element:
            label = label_element.get_text(strip=True)
            rating = len(item.select('span.rating__score i.active'))  # Number of active dots (stars)
            description = description_element.get_text(strip=True)

            details.append({
                'label': label,
                'rating': rating,
                'description': description
            })

    return {
        'release_year': release_year,
        'age_rating': age_rating,
        'details': details
    }

# Function to scrape multiple pages
def scrape_all_pages(last_page=5):
    page_number = 1
    all_reviews = []

    while page_number <= last_page:
        titles_and_links = scrape_titles_and_links(page_number)
        
        if not titles_and_links:
            print(f"No more results found on page {page_number}. Stopping.")
            break
        
        # For each title and link, follow the link and scrape the review details
        for item in titles_and_links:
            review_details = scrape_review_details(item['link'])
            review_data = {
                'title': item['title'],
                'link': item['link'],
                'release_year': review_details['release_year'],
                'age_rating': review_details['age_rating'],
                'details': review_details['details']
            }
            all_reviews.append(review_data)
        
        page_number += 1

    return all_reviews

# Scrape all pages and get the review details
all_reviews_data = scrape_all_pages(last_page=5)  # You can adjust the last_page number

# Save the results to a JSON file
output_file = "detailed_reviews.json"
with open(output_file, 'w') as file:
    json.dump(all_reviews_data, file, indent=4)

print(f"Data saved to {output_file}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/1/sort/date-desc
Scraping review page: https://www.commonsensemedia.org/tv-reviews/barbie-and-stacie-to-the-rescue
Scraping review page: https://www.commonsensemedia.org/tv-reviews/future-chicken
Scraping review page: https://www.commonsensemedia.org/tv-reviews/zuhu-ka-zalzala
Scraping review page: https://www.commonsensemedia.org/tv-reviews/pokemon-horizons
Scraping review page: https://www.commonsensemedia.org/tv-reviews/hot-wheels-lets-race
Scraping review page: https://www.commonsensemedia.org/tv-reviews/megamind-rules
Scraping review page: https://www.commonsensemedia.org/tv-reviews/iwaju
Scraping review page: https://www.commonsensemedia.org/tv-reviews/snoopy-presents-welcome-home-franklin
Scraping review page: https://www.commonsensemedia.org/tv-reviews/caillou-0
Scraping review page: https://www.commonsensemedia.org/tv-reviews/beas-block
Scraping review page: https://www.commonsensem

In [17]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL pattern for paginated search results
base_url = "https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/{}/sort/date-desc"

# Function to scrape titles and links from a single page
def scrape_titles_and_links(page_number):
    url = base_url.format(page_number)
    print(f"Scraping page: {url}")
    
    response = requests.get(url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.select('div.site-search-teaser')  # Select each review teaser block

    titles_and_links = []

    # Loop through each result to extract title and link
    for result in results:
        title_element = result.select_one('h3.review-title a.link--title')
        if title_element:
            title = title_element.get_text(strip=True)
            link = "https://www.commonsensemedia.org" + title_element['href']  # Full URL
            titles_and_links.append({
                'title': title,
                'link': link
            })

    return titles_and_links

# Function to scrape the detailed review information from the TV show page
def scrape_review_details(review_url):
    print(f"Scraping review page: {review_url}")
    
    response = requests.get(review_url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the release year
    year_element = soup.select_one('div.review-product-summary span')
    release_year = year_element.get_text(strip=True) if year_element else 'No release year found'
    
    # Extract the age rating
    age_rating_element = soup.select_one('div.review-rating span.rating__age')
    age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'
    
    # Extract the details for violence, sex, language, etc.
    details = []
    rating_criteria = soup.select('div.content-grid-item')  # Select each content grid item
    
    for item in rating_criteria:
        # Extract the label (e.g., Violence & Scariness)
        label_element = item.select_one('span.rating__label')
        label = label_element.get_text(strip=True) if label_element else 'No label found'

        # Extract the number of active circles (rating)
        rating = len(item.select('span.rating__score i.active'))

        # Extract the description of the rating
        description_element = item.select_one('div.content-grid-content')
        description = description_element.get_text(strip=True) if description_element else 'No description found'

        # Add each rating detail (label, rating, description)
        details.append({
            'label': label,
            'rating': rating,
            'description': description
        })

    return {
        'release_year': release_year,
        'age_rating': age_rating,
        'details': details
    }

# Function to scrape multiple pages
def scrape_all_pages(last_page=5):
    page_number = 1
    all_reviews = []

    while page_number <= last_page:
        titles_and_links = scrape_titles_and_links(page_number)
        
        if not titles_and_links:
            print(f"No more results found on page {page_number}. Stopping.")
            break
        
        # For each title and link, follow the link and scrape the review details
        for item in titles_and_links:
            review_details = scrape_review_details(item['link'])
            review_data = {
                'title': item['title'],
                'link': item['link'],
                'release_year': review_details['release_year'],
                'age_rating': review_details['age_rating'],
                'details': review_details['details']
            }
            all_reviews.append(review_data)
        
        page_number += 1

    return all_reviews

# Scrape all pages and get the review details
all_reviews_data = scrape_all_pages(last_page=5)  # You can adjust the last_page number

# Save the results to a JSON file
output_file = "detailed_reviews_with_ratings.json"
with open(output_file, 'w') as file:
    json.dump(all_reviews_data, file, indent=4)

print(f"Data saved to {output_file}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/1/sort/date-desc
Scraping review page: https://www.commonsensemedia.org/tv-reviews/barbie-and-stacie-to-the-rescue
Scraping review page: https://www.commonsensemedia.org/tv-reviews/future-chicken
Scraping review page: https://www.commonsensemedia.org/tv-reviews/zuhu-ka-zalzala
Scraping review page: https://www.commonsensemedia.org/tv-reviews/pokemon-horizons
Scraping review page: https://www.commonsensemedia.org/tv-reviews/hot-wheels-lets-race
Scraping review page: https://www.commonsensemedia.org/tv-reviews/megamind-rules
Scraping review page: https://www.commonsensemedia.org/tv-reviews/iwaju
Scraping review page: https://www.commonsensemedia.org/tv-reviews/snoopy-presents-welcome-home-franklin
Scraping review page: https://www.commonsensemedia.org/tv-reviews/caillou-0
Scraping review page: https://www.commonsensemedia.org/tv-reviews/beas-block
Scraping review page: https://www.commonsensem

In [19]:
import requests
from bs4 import BeautifulSoup
import json

# Base URL pattern for paginated search results
base_url = "https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/{}/sort/date-desc"

# List of labels we are interested in extracting
rating_labels = [
    "Violence & Scariness", "Language", "Sex, Romance & Nudity", "Products & Purchases",
    "Drinking, Drugs & Smoking", "Positive Role Models", "Positive Messages", 
    "Educational Value", "Diverse Representations"
]

# Function to scrape titles and links from a single page
def scrape_titles_and_links(page_number):
    url = base_url.format(page_number)
    print(f"Scraping page: {url}")
    
    response = requests.get(url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.select('div.site-search-teaser')  # Select each review teaser block

    titles_and_links = []

    # Loop through each result to extract title and link
    for result in results:
        title_element = result.select_one('h3.review-title a.link--title')
        if title_element:
            title = title_element.get_text(strip=True)
            link = "https://www.commonsensemedia.org" + title_element['href']  # Full URL
            titles_and_links.append({
                'title': title,
                'link': link
            })

    return titles_and_links

# Function to scrape the detailed review information from the TV show page
def scrape_review_details(review_url):
    print(f"Scraping review page: {review_url}")
    
    response = requests.get(review_url)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the release year
    year_element = soup.select_one('div.review-product-summary span')
    release_year = year_element.get_text(strip=True) if year_element else 'No release year found'
    
    # Extract the age rating
    age_rating_element = soup.select_one('div.review-rating span.rating__age')
    age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'
    
    # Extract the details for all the rating criteria (e.g., Violence & Scariness, Language, etc.)
    details = []
    
    # Locate all buttons containing the rating information
    buttons = soup.select('button.rating--interactive')

    for button in buttons:
        # Extract the label (e.g., "Violence & Scariness")
        label_element = button.select_one('span.rating__label')
        if label_element:
            label_text = label_element.get_text(strip=True)
            # If the label is one of the ones we care about
            if label_text in rating_labels:
                # Count the number of active dots (rating), subtract 1 if needed
                active_dots = max(len(button.select('span.rating__score i.active')) - 1, 0)
                
                # Extract the description
                description_element = button.select_one('span.rating__teaser p')
                description = description_element.get_text(strip=True) if description_element else 'No description found'
                
                # Append the result
                details.append({
                    'label': label_text,
                    'rating': active_dots,
                    'description': description
                })
    
    return {
        'release_year': release_year,
        'age_rating': age_rating,
        'details': details
    }

# Function to scrape multiple pages
def scrape_all_pages(last_page=5):
    page_number = 1
    all_reviews = []

    while page_number <= last_page:
        titles_and_links = scrape_titles_and_links(page_number)
        
        if not titles_and_links:
            print(f"No more results found on page {page_number}. Stopping.")
            break
        
        # For each title and link, follow the link and scrape the review details
        for item in titles_and_links:
            review_details = scrape_review_details(item['link'])
            review_data = {
                'title': item['title'],
                'link': item['link'],
                'release_year': review_details['release_year'],
                'age_rating': review_details['age_rating'],
                'details': review_details['details']
            }
            all_reviews.append(review_data)
        
        page_number += 1

    return all_reviews

# Scrape all pages and get the review details
all_reviews_data = scrape_all_pages(last_page=5)  # You can adjust the last_page number

# Save the results to a JSON file
output_file = "detailed_reviews_with_ratings.json"
with open(output_file, 'w') as file:
    json.dump(all_reviews_data, file, indent=4)

print(f"Data saved to {output_file}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/1/sort/date-desc
Scraping review page: https://www.commonsensemedia.org/tv-reviews/barbie-and-stacie-to-the-rescue
Scraping review page: https://www.commonsensemedia.org/tv-reviews/future-chicken
Scraping review page: https://www.commonsensemedia.org/tv-reviews/zuhu-ka-zalzala
Scraping review page: https://www.commonsensemedia.org/tv-reviews/pokemon-horizons
Scraping review page: https://www.commonsensemedia.org/tv-reviews/hot-wheels-lets-race
Scraping review page: https://www.commonsensemedia.org/tv-reviews/megamind-rules
Scraping review page: https://www.commonsensemedia.org/tv-reviews/iwaju
Scraping review page: https://www.commonsensemedia.org/tv-reviews/snoopy-presents-welcome-home-franklin
Scraping review page: https://www.commonsensemedia.org/tv-reviews/caillou-0
Scraping review page: https://www.commonsensemedia.org/tv-reviews/beas-block
Scraping review page: https://www.commonsensem