<a href="https://colab.research.google.com/github/slapazromero/Ejercicio_Scrapy/blob/main/Scrapy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scrapy

In [2]:
!scrapy startproject proyecto_imdb

New Scrapy project 'proyecto_imdb', using template directory '/usr/local/lib/python3.8/dist-packages/scrapy/templates/project', created in:
    /content/proyecto_imdb

You can start your first spider with:
    cd proyecto_imdb
    scrapy genspider example example.com


In [3]:
!cd /content/proyecto_imdb/proyecto_imdb && scrapy genspider imdb https://www.imdb.com

Created spider 'imdb' using template 'basic' in module:
  proyecto_imdb.spiders.imdb


In [4]:
%%writefile /content/proyecto_imdb/proyecto_imdb/spiders/imdb.py
import scrapy
from proyecto_imdb.items import ProyectoImdbItem

class ImdbSpider(scrapy.Spider):
  name = 'imdb'
  allowed_domains = ['www.imdb.com']
  start_urls = ['https://www.imdb.com/search/title/?title_type=movie&genres=comedy&ref_=adv_prv']
  custom_settings = {'FEED_FORMAT':'json','FEED_URI':'IMDB.json'}
  count = 0

  def parse(self, response):
    for index, href in zip(response.css("h3.lister-item-header span:nth-child(1)::text").getall(), response.css("h3.lister-item-header a::attr(href)").getall()):
      yield response.follow(url=href, callback=self.parse_movie, meta = {'index': index})

    if self.count < 15 and response.css('.nav a.next-page::attr(href)').get() is not None:
      self.count += 1
      yield response.follow(url=response.css('.nav a.next-page::attr(href)').get(), callback=self.parse)
      
  def parse_movie(self, response):  
    item = ProyectoImdbItem()
    item['position'] = response.meta.get('index')
    item['title'] = response.xpath('//h1[@data-testid="hero-title-block__title"]/text()').get()
    item['directors'] = response.css('.sc-bfec09a1-8 > li:nth-child(1) >  div > ul > li > a::text').getall()
    item['writers'] = response.css('.sc-bfec09a1-8 > li:nth-child(2) >  div > ul > li > a::text').getall()
    item['stars'] = response.css('.sc-bfec09a1-1::text').getall()
    item['user_reviews'] = response.css('.sc-3ff39621-0 > li:nth-child(1) .score::text').get()
    item['critic_reviews'] = response.css(".sc-3ff39621-0 > li:nth-child(2) .score::text").get()
    item['metascore'] = response.css(".sc-3ff39621-0 > li:nth-child(3) .score::text").get()
    return item


Overwriting /content/proyecto_imdb/proyecto_imdb/spiders/imdb.py


In [5]:
%%writefile /content/proyecto_imdb/proyecto_imdb/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class ProyectoImdbItem(scrapy.Item):
  # define the fields for your item here like:
  position = scrapy.Field()
  title = scrapy.Field()
  directors = scrapy.Field()
  writers = scrapy.Field()
  stars = scrapy.Field()
  user_reviews = scrapy.Field()
  critic_reviews = scrapy.Field()
  metascore = scrapy.Field()

Overwriting /content/proyecto_imdb/proyecto_imdb/items.py


In [6]:
%%writefile /content/proyecto_imdb/proyecto_imdb/settings.py
# Scrapy settings for proyecto_imdb project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'proyecto_imdb'

SPIDER_MODULES = ['proyecto_imdb.spiders']
NEWSPIDER_MODULE = 'proyecto_imdb.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
   'Accept-Language': 'es',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'proyecto_imdb.middlewares.ProyectoImdbSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'proyecto_imdb.middlewares.ProyectoImdbDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'proyecto_imdb.pipelines.ProyectoImdbPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'


Overwriting /content/proyecto_imdb/proyecto_imdb/settings.py


In [8]:
!cd /content/proyecto_imdb/proyecto_imdb && scrapy crawl imdb

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
           'Robbie Gee',
           'Nicholas Woodeson',
           'Alex Jordan',
           'Stewart Gilchrist',
           'Jim Broadbent'],
 'title': 'Paddington 2',
 'user_reviews': '296',
 'writers': ['Paul King', 'Simon Farnaby', 'Michael Bond']}
2022-12-01 17:17:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.imdb.com/title/tt0805647/?ref_=adv_li_tt> (referer: https://www.imdb.com/search/title/?title_type=movie&genres=comedy&start=551&ref_=adv_nxt)
2022-12-01 17:17:14 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.imdb.com/title/tt13055780/?ref_=adv_li_tt>
{'critic_reviews': '22',
 'directors': ['Shaun Paul Piccinino'],
 'metascore': None,
 'position': '552.',
 'stars': ['Lauren Swickard',
           'Josh Swickard',
           'Ali Afshar',
           'David Del Rio',
           'Katelyn Epperly',
           'Amanda Detmer',
           'Natalia Mann',
           'Gunnar Anderso