In [4]:
import requests
from bs4 import BeautifulSoup
import json

# Search-results URL template; the `{}` placeholder is filled with the page number
base_url = "https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/{}/sort/date-desc"

# Rating-criterion labels that are kept when a review page is parsed;
# any criterion whose label is not listed here is ignored.
rating_labels = [
    "Violence & Scariness",
    "Language",
    "Sex, Romance & Nudity",
    "Products & Purchases",
    "Drinking, Drugs & Smoking",
    "Positive Role Models",
    "Positive Messages",
    "Educational Value",
    "Diverse Representations",
]

# Function to scrape titles and links from a single page
def scrape_titles_and_links(page_number, timeout=30):
    """Fetch one search-results page and collect review titles and URLs.

    Args:
        page_number: Value substituted into the ``{}`` placeholder of the
            module-level ``base_url`` template.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled connection would block the whole scrape indefinitely.

    Returns:
        list[dict]: One ``{'title': ..., 'link': ...}`` dict per teaser block
        that contains a title anchor with an ``href``, in document order.
        Empty when no teaser matches.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Local import so this definition does not depend on the import cell.
    from urllib.parse import urljoin

    url = base_url.format(page_number)
    print(f"Scraping page: {url}")

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    titles_and_links = []

    # Each search hit is rendered as a teaser block
    for result in soup.select('div.site-search-teaser'):
        title_element = result.select_one('h3.review-title a.link--title')
        if title_element is None:
            continue

        # An anchor without an href cannot be followed; skip it instead of
        # raising KeyError and aborting the whole page.
        href = title_element.get('href')
        if not href:
            continue

        titles_and_links.append({
            'title': title_element.get_text(strip=True),
            # urljoin handles site-relative, slash-less and already-absolute
            # hrefs; plain concatenation only worked for the first form.
            'link': urljoin("https://www.commonsensemedia.org", href),
        })

    return titles_and_links

# Function to scrape the detailed review information from the TV show page
def scrape_review_details(review_url, timeout=30):
    """Download a single review page and extract its summary fields.

    Args:
        review_url: Absolute URL of the review page.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled connection would block the whole scrape indefinitely.

    Returns:
        dict with three keys:
            'release_year': text of the last ``<span>`` inside
                ``div.review-product-summary`` (presumably the year; the
                markup is not verified here), or 'No release year found'.
            'age_rating': text of ``div.review-rating span.rating__age``,
                or 'No age rating found'.
            'details': list of ``{'label', 'rating', 'description'}`` dicts,
                one per rating button whose label is in ``rating_labels``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    print(f"Scraping review page: {review_url}")

    response = requests.get(review_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Release year: last <span> of the product summary, when there is one
    release_year = 'No release year found'
    product_summary = soup.select_one('div.review-product-summary')
    if product_summary:
        year_elements = product_summary.find_all('span')
        if year_elements:
            release_year = year_elements[-1].get_text(strip=True)

    # Age rating
    age_rating_element = soup.select_one('div.review-rating span.rating__age')
    age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'

    # Per-criterion ratings (e.g., Violence & Scariness, Language, etc.)
    details = []
    for button in soup.select('button.rating--interactive'):
        label_element = button.select_one('span.rating__label')
        if label_element is None:
            continue

        label_text = label_element.get_text(strip=True)
        if label_text not in rating_labels:
            continue  # a criterion we are not collecting

        # Rating = number of active dots minus one, never below zero.
        # NOTE(review): the reason for the "- 1" offset is not documented;
        # presumably the markup carries one extra active element. Confirm
        # against the live page.
        active_dots = max(len(button.select('span.rating__score i.active')) - 1, 0)

        description_element = button.select_one('span.rating__teaser p')
        description = description_element.get_text(strip=True) if description_element else 'No description found'

        details.append({
            'label': label_text,
            'rating': active_dots,
            'description': description
        })

    return {
        'release_year': release_year,
        'age_rating': age_rating,
        'details': details
    }

# Function to scrape multiple pages
def scrape_all_pages(last_page=50):
    """Walk the paginated search results and gather details for every review.

    Pages are requested starting at 1 for as long as the page number is
    <= ``last_page``. The walk ends early at the first page that yields no
    titles.

    Returns:
        list[dict]: One record per review with the keys 'title', 'link',
        'release_year', 'age_rating' and 'details'.
    """
    collected = []
    current_page = 1

    while current_page <= last_page:
        page_items = scrape_titles_and_links(current_page)

        # An empty page is treated as the end of the result set
        if not page_items:
            print(f"No more results found on page {current_page}. Stopping.")
            break

        # Visit every review linked from this page and merge its details
        for entry in page_items:
            info = scrape_review_details(entry['link'])
            record = {'title': entry['title'], 'link': entry['link']}
            for field in ('release_year', 'age_rating', 'details'):
                record[field] = info[field]
            collected.append(record)

        current_page += 1

    return collected

# Run the scraper; change last_page to control how many result pages are visited
all_reviews_data = scrape_all_pages(last_page=5)

# Write everything that was collected to disk as indented JSON
output_file = "detailed_reviews_with_ratings.json"
with open(output_file, 'w') as file:
    file.write(json.dumps(all_reviews_data, indent=4))

print(f"Data saved to {output_file}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/1/sort/date-desc
Scraping review page: https://www.commonsensemedia.org/tv-reviews/barbie-and-stacie-to-the-rescue
Scraping review page: https://www.commonsensemedia.org/tv-reviews/future-chicken
Scraping review page: https://www.commonsensemedia.org/tv-reviews/zuhu-ka-zalzala
Scraping review page: https://www.commonsensemedia.org/tv-reviews/pokemon-horizons
Scraping review page: https://www.commonsensemedia.org/tv-reviews/hot-wheels-lets-race
Scraping review page: https://www.commonsensemedia.org/tv-reviews/megamind-rules
Scraping review page: https://www.commonsensemedia.org/tv-reviews/iwaju
Scraping review page: https://www.commonsensemedia.org/tv-reviews/snoopy-presents-welcome-home-franklin
Scraping review page: https://www.commonsensemedia.org/tv-reviews/caillou-0
Scraping review page: https://www.commonsensemedia.org/tv-reviews/beas-block
Scraping review page: https://www.commonsensem

In [5]:
import requests
from bs4 import BeautifulSoup
import json

# Search-results URL template; the `{}` placeholder is filled with the page number
base_url = "https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/{}/sort/date-desc"

# Rating-criterion labels that are kept when a review page is parsed;
# any criterion whose label is not listed here is ignored.
rating_labels = [
    "Violence & Scariness",
    "Language",
    "Sex, Romance & Nudity",
    "Products & Purchases",
    "Drinking, Drugs & Smoking",
    "Positive Role Models",
    "Positive Messages",
    "Educational Value",
    "Diverse Representations",
]

# Function to scrape titles and links from a single page
def scrape_titles_and_links(page_number, timeout=30):
    """Fetch one search-results page; collect review links and detect a next page.

    Args:
        page_number: Value substituted into the ``{}`` placeholder of the
            module-level ``base_url`` template.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled connection would block the whole scrape indefinitely.

    Returns:
        tuple: ``(titles_and_links, has_next_page)`` where
            titles_and_links is a list of ``{'title': ..., 'link': ...}``
                dicts, one per teaser block that contains a title anchor
                with an ``href``, in document order;
            has_next_page is True when the page contains an
                ``li.pagination__next`` element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Local import so this definition does not depend on the import cell.
    from urllib.parse import urljoin

    url = base_url.format(page_number)
    print(f"Scraping page: {url}")

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    titles_and_links = []

    # Each search hit is rendered as a teaser block
    for result in soup.select('div.site-search-teaser'):
        title_element = result.select_one('h3.review-title a.link--title')
        if title_element is None:
            continue

        # An anchor without an href cannot be followed; skip it instead of
        # raising KeyError and aborting the whole page.
        href = title_element.get('href')
        if not href:
            continue

        titles_and_links.append({
            'title': title_element.get_text(strip=True),
            # urljoin handles site-relative, slash-less and already-absolute
            # hrefs; plain concatenation only worked for the first form.
            'link': urljoin("https://www.commonsensemedia.org", href),
        })

    # The pager's "next" item is used as the signal that another page exists.
    # NOTE(review): this assumes the element is absent on the last page
    # rather than rendered in a disabled state. Confirm against the site.
    has_next_page = soup.select_one('li.pagination__next') is not None

    return titles_and_links, has_next_page

# Function to scrape the detailed review information from the TV show page
def scrape_review_details(review_url, timeout=30):
    """Download a single review page and extract its summary fields.

    Args:
        review_url: Absolute URL of the review page.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled connection would block the whole scrape indefinitely.

    Returns:
        dict with three keys:
            'release_year': text of the last ``<span>`` inside
                ``div.review-product-summary`` (presumably the year; the
                markup is not verified here), or 'No release year found'.
            'age_rating': text of ``div.review-rating span.rating__age``,
                or 'No age rating found'.
            'details': list of ``{'label', 'rating', 'description'}`` dicts,
                one per rating button whose label is in ``rating_labels``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    print(f"Scraping review page: {review_url}")

    response = requests.get(review_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Release year: last <span> of the product summary, when there is one
    release_year = 'No release year found'
    product_summary = soup.select_one('div.review-product-summary')
    if product_summary:
        year_elements = product_summary.find_all('span')
        if year_elements:
            release_year = year_elements[-1].get_text(strip=True)

    # Age rating
    age_rating_element = soup.select_one('div.review-rating span.rating__age')
    age_rating = age_rating_element.get_text(strip=True) if age_rating_element else 'No age rating found'

    # Per-criterion ratings (e.g., Violence & Scariness, Language, etc.)
    details = []
    for button in soup.select('button.rating--interactive'):
        label_element = button.select_one('span.rating__label')
        if label_element is None:
            continue

        label_text = label_element.get_text(strip=True)
        if label_text not in rating_labels:
            continue  # a criterion we are not collecting

        # Rating = number of active dots minus one, never below zero.
        # NOTE(review): the reason for the "- 1" offset is not documented;
        # presumably the markup carries one extra active element. Confirm
        # against the live page.
        active_dots = max(len(button.select('span.rating__score i.active')) - 1, 0)

        description_element = button.select_one('span.rating__teaser p')
        description = description_element.get_text(strip=True) if description_element else 'No description found'

        details.append({
            'label': label_text,
            'rating': active_dots,
            'description': description
        })

    return {
        'release_year': release_year,
        'age_rating': age_rating,
        'details': details
    }

# Function to scrape multiple pages
def scrape_all_pages():
    """Follow the search pagination to its end and gather every review.

    Starting at page 1, each results page is scraped and every review it
    links to is visited. The walk ends when a page yields no titles or
    when the page just processed reported that no next page exists.

    Returns:
        list[dict]: One record per review with the keys 'title', 'link',
        'release_year', 'age_rating' and 'details'.
    """
    collected = []
    current_page = 1

    while True:
        page_items, more_pages = scrape_titles_and_links(current_page)

        # An empty page is treated as the end of the result set
        if not page_items:
            print(f"No more results found on page {current_page}. Stopping.")
            break

        # Visit every review linked from this page and merge its details
        for entry in page_items:
            info = scrape_review_details(entry['link'])
            record = {'title': entry['title'], 'link': entry['link']}
            for field in ('release_year', 'age_rating', 'details'):
                record[field] = info[field]
            collected.append(record)

        current_page += 1

        # Stop once the page just handled had no "next" link
        if not more_pages:
            break

    return collected

# Run the scraper across every result page the site exposes
all_reviews_data = scrape_all_pages()

# Write everything that was collected to disk as indented JSON
output_file = "detailed_reviews_with_ratings.json"
with open(output_file, 'w') as file:
    file.write(json.dumps(all_reviews_data, indent=4))

print(f"Data saved to {output_file}")


Scraping page: https://www.commonsensemedia.org/search/category/tv/genre/kids-animation-52/page/1/sort/date-desc
Scraping review page: https://www.commonsensemedia.org/tv-reviews/barbie-and-stacie-to-the-rescue
Scraping review page: https://www.commonsensemedia.org/tv-reviews/future-chicken
Scraping review page: https://www.commonsensemedia.org/tv-reviews/zuhu-ka-zalzala
Scraping review page: https://www.commonsensemedia.org/tv-reviews/pokemon-horizons
Scraping review page: https://www.commonsensemedia.org/tv-reviews/hot-wheels-lets-race
Scraping review page: https://www.commonsensemedia.org/tv-reviews/megamind-rules
Scraping review page: https://www.commonsensemedia.org/tv-reviews/iwaju
Scraping review page: https://www.commonsensemedia.org/tv-reviews/snoopy-presents-welcome-home-franklin
Scraping review page: https://www.commonsensemedia.org/tv-reviews/caillou-0
Scraping review page: https://www.commonsensemedia.org/tv-reviews/beas-block
Scraping review page: https://www.commonsensem