In [2]:
# Install the scraping dependency into the kernel's own environment; the version is pinned for reproducibility.
%pip install -q scrapy==2.11.2

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.10.0-py3-none-any.whl.metadata (20 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl.metadata (11 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl.metadata (5.7 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting zope.interface>=5.1.0 (from scrapy)
  Downloading zope.interface-7.1.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class MovieSpider(scrapy.Spider):
    """Follow movie links from a Rotten Tomatoes browse page and emit one item per movie.

    ``parse`` collects the tile links on the start page and schedules a request
    for each; ``parse_movie_details`` turns each movie page into a dict item.
    Items are exported to ``movies.json``.

    NOTE(review): the CSS selectors assume a specific page markup -- confirm
    they still match the live site.
    """

    name = "movie_spider"
    start_urls = ["https://www.rottentomatoes.com/browse/movies_at_home/sort:popular"]
    custom_settings = {
        # FEEDS replaces the deprecated FEED_URI / FEED_FORMAT pair.
        'FEEDS': {
            'movies.json': {
                'format': 'json',
                # Appending a second JSON array on re-run would make the file
                # invalid JSON, so start from a fresh file each run.
                'overwrite': True,
            },
        },
    }
    # Number of detail pages turned into items so far.
    count = 0
    # Upper bound on the number of items this spider emits.
    max_count = 500000

    def parse(self, response):
        """Schedule a detail request for every movie tile link on the page.

        Stops scheduling once ``count`` has reached ``max_count``. ``count``
        only grows in ``parse_movie_details``, so this is a best-effort cutoff.
        """
        movie_links = response.css("a.js-tile-link::attr(href)").getall()
        for link in movie_links:
            if self.count >= self.max_count:
                break
            url = response.urljoin(link)
            yield scrapy.Request(url, callback=self.parse_movie_details)

    def parse_movie_details(self, response):
        """Extract one movie item from a detail page.

        Fields whose selector matches nothing are emitted as ``None`` (or an
        empty list for multi-valued fields) instead of raising, so a markup
        mismatch on one field does not drop the whole item.
        """
        if self.count >= self.max_count:
            return
        self.count += 1

        # .get() returns None when nothing matches; guard before stripping.
        synopsis = response.css("div.movie_synopsis::text").get()
        # Evaluate the score selector once; the first match is stored as the
        # user score and the second as the critic score.
        scores = response.css("span.mop-ratings-wrap__percentage::text").getall()

        yield {
            'url': response.url,
            'title': response.css("h1.mop-ratings-wrap__title::text").get(),
            'description': synopsis.strip() if synopsis is not None else None,
            'genre': response.css("div.meta-value.genre a::text").getall(),
            'duration': response.css("time::text").get(),
            'director': response.css("a[slot=director]::text").getall(),
            'actors': response.css("a[slot=cast]::text").getall(),
            'user_score': scores[0] if len(scores) > 0 else None,
            'critic_score': scores[1] if len(scores) > 1 else None,
        }

class CriticReviewSpider(scrapy.Spider):
    """Follow critic-review links from the latest-reviews page and emit one item per review page.

    ``parse`` collects every link whose href contains ``/critic-reviews/`` and
    schedules a request for it; ``parse_review_details`` turns each response
    into a dict item. Items are exported to ``critic_reviews.json``.

    NOTE(review): the CSS selectors assume a specific page markup -- confirm
    they still match the live site.
    """

    name = "critic_review_spider"
    start_urls = ["https://www.rottentomatoes.com/critics/latest_reviews"]
    custom_settings = {
        # FEEDS replaces the deprecated FEED_URI / FEED_FORMAT pair.
        'FEEDS': {
            'critic_reviews.json': {
                'format': 'json',
                # Appending a second JSON array on re-run would make the file
                # invalid JSON, so start from a fresh file each run.
                'overwrite': True,
            },
        },
    }
    # Number of review pages turned into items so far.
    count = 0
    # Upper bound on the number of items this spider emits.
    max_count = 500000

    def parse(self, response):
        """Schedule a detail request for every critic-review link on the page.

        Stops scheduling once ``count`` has reached ``max_count``. ``count``
        only grows in ``parse_review_details``, so this is a best-effort cutoff.
        """
        review_links = response.css("a[href*='/critic-reviews/']::attr(href)").getall()
        for link in review_links:
            if self.count >= self.max_count:
                break
            url = response.urljoin(link)
            yield scrapy.Request(url, callback=self.parse_review_details)

    def parse_review_details(self, response):
        """Extract one review item from a review page.

        Each field is the first text node matched by its selector, or ``None``
        when the selector matches nothing.
        """
        if self.count >= self.max_count:
            return
        self.count += 1
        yield {
            'url': response.url,
            'critic_name': response.css("h1.critic-name::text").get(),
            'date': response.css("time::text").get(),
            'rating': response.css("div.critic-score span::text").get(),
            'review_content': response.css("div.the_review::text").get(),
        }

if __name__ == "__main__":
    # Queue both spiders on one crawler process, then start it.
    process = CrawlerProcess(get_project_settings())
    for spider_cls in (MovieSpider, CriticReviewSpider):
        process.crawl(spider_cls)
    process.start()


ModuleNotFoundError: No module named 'scrapy'