In [None]:
%pip install requests beautifulsoup4

In [None]:
### SCRAPE RATINGS

In [22]:
# Import the necessary libraries
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = "https://www.commonsensemedia.org/movie-reviews/bad-boys-ride-or-die"

# Fetch the content of the page
response = requests.get(url)
response.raise_for_status()  # Check if the request was successful

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the movie title
title_element = soup.select_one("#content > div > div.one-column.row > div > div > div:nth-child(1) > div > div.row > div.col-8 > div > h1")
movie_title = title_element.get_text(strip=True) if title_element else 'No title found'

# Extract the rating age
rating_age_element = soup.select_one("#rating-a3770ea7-2bac-45f7-8736-b532ca04300a > span")
rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

# Use the JavaScript path provided to find the specific content
review_content_div = soup.select_one("#review-view-content-grid > div.row")

# Find all child divs with the class 'content-grid-item content-grid-item--shadow'
child_divs = review_content_div.find_all('div', class_='content-grid-item content-grid-item--shadow')

# Extract the 'data-text' attribute, rating, and label from each child div
reviews = []
for idx, child_div in enumerate(child_divs, 1):
    data_text = child_div.get('data-text', 'No content found')
    data_text_cleaned = data_text.replace('<p>', '').replace('</p>', '')
    
    # Find the rating information within the same div
    rating_icons = child_div.select('span.rating__score i.active')
    rating = len(rating_icons) - 1  # Adjust rating by subtracting 1
    
    # Find the label within the same div
    label_span = child_div.select_one('span.rating__label')
    label = label_span.get_text(strip=True) if label_span else 'No label found'
    
    reviews.append({'data_text': data_text_cleaned, 'rating': rating, 'label': label})

# Print the movie title and rating age
print(f"Movie Title: {movie_title}")
print(f"Rating Age: {rating_age}\n")

# Print the extracted reviews
for idx, review in enumerate(reviews, 1):
    print(f"Review {idx}:")
    print(f"Label: {review['label']}")
    print(f"Content: {review['data_text']}")
    print(f"Rating: {review['rating']} out of 5\n")

# Optionally, save the extracted reviews to a file
with open('review_content.txt', 'w') as file:
    file.write(f"Movie Title: {movie_title}\n")
    file.write(f"Rating Age: {rating_age}\n\n")
    for idx, review in enumerate(reviews, 1):
        file.write(f"Review {idx}:\n")
        file.write(f"Label: {review['label']}\n")
        file.write(f"Content: {review['data_text']}\n")
        file.write(f"Rating: {review['rating']} out of 5\n\n")


Movie Title: Bad Boys: Ride or Die
Rating Age: age 16+

Review 1:
Label: Positive Messages
Content: Like the other films in the series, the theme of family (and bonds of friendship) is present underneath the over-the-top action. Characters are encouraged to form better relationships with loved ones, and the main characters both have a strong sense of duty and devotion to their superiors -- even though they blow everything up and face no consequences, they're trying to do the right thing. Anxiety is treated lightly/dismissively.

Rating: 2 out of 5

Review 2:
Label: Positive Role Models
Content: The main characters are likable, but they're also unapologetically violent and crude and never face any consequences. Even their family-like bond now seems stretched thin. One character struggles with the misconception that he's "cursed" and brings violence down upon his loved ones; he begins to suffer panic attacks, which isn't taken/discussed seriously and is solved with a simple slap on the f

In [None]:
# SCRAPE LINKS

In [2]:
import requests
from bs4 import BeautifulSoup

def extract_links(url, filter_prefixes):
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a', class_='link--title')
    filtered_links = [{'title': link.get_text(strip=True), 'url': link['href']} for link in links if any(link['href'].startswith(prefix) for prefix in filter_prefixes)]
    return filtered_links

# URLs of the pages to scrape
movie_url = "https://www.commonsensemedia.org/search/animated%20tv%20shows"
# tv_url = "https://www.commonsensemedia.org/lists/offbeat-animated-movies"

# Filter prefixes
filter_prefixes = ['/movie-reviews', '/tv-reviews']

# Extract links from both URLs
movie_links = extract_links(movie_url, filter_prefixes)
tv_links = extract_links(tv_url, filter_prefixes)

# Combine the lists
all_links = movie_links + tv_links

# Print the list of titles and URLs
print("List of movie and TV review links:")
for link in all_links:
    print(f"Title: {link['title']}, URL: {link['url']}")

# Optionally, save the list of links to a file
with open('review_links.txt', 'w') as file:
    for link in all_links:
        file.write(f"Title: {link['title']}, URL: {link['url']}\n")


List of movie and TV review links:
Title: I Saw the TV Glow, URL: /movie-reviews/i-saw-the-tv-glow
Title: Quiet on Set: The Dark Side of Kids TV, URL: /tv-reviews/quiet-on-set-the-dark-side-of-kids-tv
Title: Zombies: The Re-Animated Series, URL: /tv-reviews/zombies-the-re-animated-series
Title: Ark: The Animated Series, URL: /tv-reviews/ark-the-animated-series
Title: The 8 Show, URL: /tv-reviews/the-8-show
Title: The Magic Prank Show with Justin Willman, URL: /tv-reviews/the-magic-prank-show-with-justin-willman
Title: Jerrod Carmichael Reality Show, URL: /tv-reviews/jerrod-carmichael-reality-show
Title: The Vince Staples Show, URL: /tv-reviews/the-vince-staples-show
Title: The Addams Family (Animated TV Series), URL: /tv-reviews/the-addams-family-animated-tv-series
Title: Catfish: The TV Show, URL: /tv-reviews/catfish-the-tv-show
Title: Spider-Man (1981 TV Show), URL: /tv-reviews/spider-man-1981-tv-show
Title: Ape Star, URL: /movie-reviews/ape-star
Title: Ernest & Celestine, URL: /movi

In [None]:
## SCRAPE RATINGS FOR A WHOLE TXT LIST

In [29]:
# Import the necessary libraries
import requests
from bs4 import BeautifulSoup

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape and process each URL
def scrape_and_process_url(title, url):
    # Full URL
    full_url = base_url + url

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the movie title
    movie_title = title

    # Extract the rating age
    rating_age_element = soup.select_one("#rating-a3770ea7-2bac-45f7-8736-b532ca04300a > span")
    rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

    # Use the JavaScript path provided to find the specific content
    review_content_div = soup.select_one("#review-view-content-grid > div.row")

    # Find all child divs with the class 'content-grid-item content-grid-item--shadow'
    child_divs = review_content_div.find_all('div', class_='content-grid-item content-grid-item--shadow')

    # Extract the 'data-text' attribute, rating, and label from each child div
    reviews = []
    for idx, child_div in enumerate(child_divs, 1):
        data_text = child_div.get('data-text', 'No content found')
        data_text_cleaned = data_text.replace('<p>', '').replace('</p>', '')

        # Find the rating information within the same div
        rating_icons = child_div.select('span.rating__score i.active')
        rating = len(rating_icons) - 1  # Adjust rating by subtracting 1

        # Find the label within the same div
        label_span = child_div.select_one('span.rating__label')
        label = label_span.get_text(strip=True) if label_span else 'No label found'

        reviews.append({'data_text': data_text_cleaned, 'rating': rating, 'label': label})

    return {
        'movie_title': movie_title,
        'rating_age': rating_age,
        'reviews': reviews
    }

# Read URLs from review_links.txt
with open('review_links.txt', 'r') as file:
    lines = file.readlines()

# Process each URL and store results
all_reviews = []

for line in lines:
    line = line.strip()  # Remove any leading/trailing whitespace
    if line:
        title_part, url_part = line.split(', URL: ')
        title = title_part.replace('Title: ', '').strip()
        url = url_part.strip()
        review_data = scrape_and_process_url(title, url)
        all_reviews.append(review_data)

# Print the results
for review_data in all_reviews:
    print(f"Title: {review_data['movie_title']}")
    print(f"Rating Age: {review_data['rating_age']}\n")
    
    for idx, review in enumerate(review_data['reviews'], 1):
        print(f"Review {idx}:")
        print(f"Label: {review['label']}")
        print(f"Content: {review['data_text']}")
        print(f"Rating: {review['rating']} out of 5\n")
    print("\n")

# Optionally, save all reviews to a file
with open('all_reviews_content.txt', 'w') as file:
    for review_data in all_reviews:
        file.write(f"Title: {review_data['movie_title']}\n")
        file.write(f"Rating Age: {review_data['rating_age']}\n\n")
        for idx, review in enumerate(review_data['reviews'], 1):
            file.write(f"Review {idx}:\n")
            file.write(f"Label: {review['label']}\n")
            file.write(f"Content: {review['data_text']}\n")
            file.write(f"Rating: {review['rating']} out of 5\n\n")


Title: Power of the Dream
Rating Age: No rating age found

Review 1:
Label: Positive Messages
Content: Women deserve equal rights and equal pay. Black lives matter as much as any other lives. Individual acts can have collective impact.

Rating: 4 out of 5

Review 2:
Label: Positive Role Models
Content: Professional women basketball players stand up for what they believe in, despite a risk to their own livelihoods. They talk about a sisterhood and the importance of using their platform to advocate for social justice issues of importance to their communities, particularly "Black and Brown women."

Rating: 4 out of 5

Review 3:
Label: Diverse Representations
Content: The WNBA started in the 1990s and is comprised predominantly of Black women. Many of the players are gay, leading women interviewed to say they see parallels between struggles around sexuality, gender, race, and class. They decide to use their platform for good and dedicate their season to social justice, engaging in activism

In [None]:
### FINAL SCRAPPING

In [6]:
# Import the necessary libraries
import requests
from bs4 import BeautifulSoup
import json

# Base URL of the website
base_url = "https://www.commonsensemedia.org"

# Function to scrape and process each URL
def scrape_and_process_url(title, url):
    # Full URL
    full_url = base_url + url

    # Determine if it's a movie or TV show
    if "/movie-reviews/" in url:
        content_type = "Movie"
    elif "/tv-reviews/" in url:
        content_type = "TV Show"
    else:
        content_type = "Unknown"

    # Fetch the content of the page
    response = requests.get(full_url)
    response.raise_for_status()  # Check if the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the movie title
    movie_title = title

    # Extract the rating age using the provided CSS selector
    rating_age_element = soup.select_one("#content > div > div.one-column.row > div > div > div:nth-child(1) > div > div.row > div.col-8 > div > div.review-rating")
    rating_age = rating_age_element.get_text(strip=True) if rating_age_element else 'No rating age found'

    # Use the JavaScript path provided to find the specific content
    review_content_div = soup.select_one("#review-view-content-grid > div.row")

    # Find all child divs with the class 'content-grid-item content-grid-item--shadow'
    child_divs = review_content_div.find_all('div', class_='content-grid-item content-grid-item--shadow')

    # Extract the 'data-text' attribute, rating, and label from each child div
    reviews = []
    for idx, child_div in enumerate(child_divs, 1):
        data_text = child_div.get('data-text', 'No content found')
        data_text_cleaned = data_text.replace('<p>', '').replace('</p>', '')

        # Find the rating information within the same div
        rating_icons = child_div.select('span.rating__score i.active')
        rating = len(rating_icons) - 1  # Adjust rating by subtracting 1

        # Find the label within the same div
        label_span = child_div.select_one('span.rating__label')
        label = label_span.get_text(strip=True) if label_span else 'No label found'

        reviews.append({
            'content': data_text_cleaned,
            'rating': rating,
            'label': label
        })

    return {
        'title': movie_title,
        'content_type': content_type,
        'rating_age': rating_age,
        'reviews': reviews
    }

# Read URLs from review_links.txt
with open('review_links.txt', 'r') as file:
    lines = file.readlines()

# Process each URL and store results
all_reviews = []

for line in lines:
    line = line.strip()  # Remove any leading/trailing whitespace
    if line:
        title_part, url_part = line.split(', URL: ')
        title = title_part.replace('Title: ', '').strip()
        url = url_part.strip()
        review_data = scrape_and_process_url(title, url)
        all_reviews.append(review_data)

# Save all reviews to a JSON file
with open('all_reviews_content.json', 'w') as file:
    json.dump(all_reviews, file, indent=4)

# Optionally, print the results
for review_data in all_reviews:
    print(f"Title: {review_data['title']}")
    print(f"Content Type: {review_data['content_type']}")
    print(f"Rating Age: {review_data['rating_age']}\n")
    
    for idx, review in enumerate(review_data['reviews'], 1):
        print(f"Review {idx}:")
        print(f"Label: {review['label']}")
        print(f"Content: {review['content']}")
        print(f"Rating: {review['rating']} out of 5\n")
    print("\n")


Title: I Saw the TV Glow
Content Type: Movie
Rating Age: No rating age found



Title: Quiet on Set: The Dark Side of Kids TV
Content Type: TV Show
Rating Age: No rating age found



Title: Zombies: The Re-Animated Series
Content Type: TV Show
Rating Age: No rating age found



Title: Ark: The Animated Series
Content Type: TV Show
Rating Age: No rating age found



Title: The 8 Show
Content Type: TV Show
Rating Age: No rating age found



Title: The Magic Prank Show with Justin Willman
Content Type: TV Show
Rating Age: No rating age found



Title: Jerrod Carmichael Reality Show
Content Type: TV Show
Rating Age: No rating age found



Title: The Vince Staples Show
Content Type: TV Show
Rating Age: No rating age found



Title: The Addams Family (Animated TV Series)
Content Type: TV Show
Rating Age: No rating age found



Title: Catfish: The TV Show
Content Type: TV Show
Rating Age: No rating age found



Title: Spider-Man (1981 TV Show)
Content Type: TV Show
Rating Age: No rating age f

In [None]:
##### CURRRENT JSON DATA 

In [5]:
import json
import pandas as pd

# Load the JSON data from the file
with open('all_reviews_content.json', 'r') as file:
    all_reviews = json.load(file)

# Create a list to store the data for the table
table_data = []

# Iterate through the JSON data and extract the necessary information
for review_data in all_reviews:
    name = review_data['title']
    content_type = review_data['content_type']
    rating_age = review_data['rating_age']
    table_data.append({'Name': name, 'Type': content_type, 'Age Classification': rating_age})

# Create a DataFrame from the table data
df = pd.DataFrame(table_data)

# Sort the DataFrame by 'Type' and 'Age Classification'
df_sorted = df.sort_values(by=['Type', 'Age Classification'])

# Display the sorted DataFrame
ionally, save the sorted DataFrame to a CSV file
df_sorted.to_csv('sorted_reviews_table.csv', index=False)


SyntaxError: invalid syntax (1504396567.py, line 25)

In [4]:
import json
import pandas as pd
import re

# Load JSON data from the file
with open('all_reviews_content.json', 'r') as file:
    data = json.load(file)

# Process the data to create a structured DataFrame
processed_data = []
for entry in data:
    type_ = entry['content_type']
    age_classification = re.sub(r'\D', '', entry['rating_age'])  # Remove non-numeric characters
    processed_data.append({
        'Title': entry['title'],
        'Type': type_,
        'Age Classification': int(age_classification)  # Convert to integer for proper sorting
    })

# Create a DataFrame from the processed data
df = pd.DataFrame(processed_data)

# Count the number of movies and TV shows per age classification
counts = df.groupby(['Type', 'Age Classification']).size().reset_index(name='Count')

# Sort by Type and Age Classification
sorted_counts = counts.sort_values(by=['Type', 'Age Classification'])

# Add a column for the sum of counts per type
sorted_counts['Total Count per Type'] = sorted_counts.groupby('Type')['Count'].transform('sum')

# Display the final table
print(sorted_counts)

# Optionally, save the final table to a CSV file
sorted_counts.to_csv('sorted_counts.csv', index=False)


KeyboardInterrupt: 