### Naver Movie
- 기본 설정(`ROBOTSTXT_OBEY = True`)에서는 robots.txt 규칙 때문에 크롤링이 되지 않습니다. (6번 항목에서 설정을 변경합니다.)

#### 1. 프로젝트 생성

In [3]:
# Generate the Scrapy project skeleton named "naver_movie" in the current directory.
!python3 -m scrapy startproject naver_movie

New Scrapy project 'naver_movie', using template directory '/home/ubuntu/.local/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/crawling/naver_movie

You can start your first spider with:
    cd naver_movie
    scrapy genspider example example.com


#### 2. Items 설정
- https://movie.naver.com/movie/running/current.nhn
- 제목, 관객수, 평점

In [None]:
# !cat naver_movie/naver_movie/items.py

In [4]:
%%writefile naver_movie/naver_movie/items.py
import scrapy

class NaverMovieItem(scrapy.Item):
    """Fields scraped from a single Naver Movie detail page."""

    title = scrapy.Field()  # movie title
    count = scrapy.Field()  # audience count, kept as the raw page text (e.g. '22,093명')
    star = scrapy.Field()  # rating, kept as text (e.g. '10.0')

Overwriting naver_movie/naver_movie/items.py


#### 3. xpath 확인

In [5]:
import requests
import scrapy
from scrapy.http import TextResponse

In [6]:
# Download the "now showing" listing page and wrap it in a Scrapy TextResponse
# so the XPath expressions can be tried out interactively.
listing_url = "https://movie.naver.com/movie/running/current.nhn"
req = requests.get(listing_url)
response = TextResponse(url=req.url, body=req.text, encoding="utf-8")

In [7]:
# Collect the detail-page href of every movie on the listing page,
# then show how many were found together with the first one.
links = response.xpath(
    '//*[@id="content"]/div[1]/div[1]/div[3]/ul/li/dl/dt/a/@href'
).getall()
len(links), links[0]

(121, '/movie/bi/mi/basic.naver?code=200894')

In [8]:
# The hrefs are site-relative, so resolve the first one against the page URL.
link = response.urljoin(links[0])
link

'https://movie.naver.com/movie/bi/mi/basic.naver?code=200894'

In [10]:
# Fetch one movie's detail page; `response` now refers to the detail page
# instead of the listing page.
req = requests.get(link)
response = TextResponse(url=req.url, body=req.text, encoding="utf-8")

In [19]:
# Try out the three detail-page XPaths: title, audience count and rating.
title = response.xpath(
    '//*[@id="content"]/div[1]/div[2]/div[1]/h3/a[1]/text()'
).getall()[0]
count = response.xpath(
    '//*[@id="content"]/div[1]/div[2]/div[1]/dl/dd[5]/div/p[2]/text()'
).getall()[0]
# The rating is split over several <em> nodes, so concatenate their text.
star = "".join(
    response.xpath('//*[@id="actualPointPersentBasic"]/div/em/text()').getall()
)
title, count, star

('극장판 포켓몬스터: 정글의 아이, 코코', '22,093명', '10.0')

#### 4. spider 작성

In [21]:
%%writefile naver_movie/naver_movie/spiders/spider.py
import scrapy
from naver_movie.items import NaverMovieItem


class MovieSpider(scrapy.Spider):
    """Crawl the Naver Movie "now showing" list and scrape each movie's detail page.

    ``parse`` collects the detail-page links from the listing page, and
    ``parse_page_contents`` turns each detail page into a ``NaverMovieItem``
    holding the title, audience count and rating.
    """

    name = "NaverMovie"
    # Scrapy reads ``allowed_domains`` (not ``allow_domain``) and expects bare
    # domain names without a scheme; the previous attribute was silently ignored.
    allowed_domains = ["movie.naver.com"]
    start_urls = ["https://movie.naver.com/movie/running/current.nhn"]

    def parse(self, response):
        """Yield one request per movie linked from the listing page."""
        links = response.xpath('//*[@id="content"]/div[1]/div[1]/div[3]/ul/li/dl/dt/a/@href').getall()
        for link in links:
            # The hrefs are site-relative; resolve them against the listing URL.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_page_contents)

    def parse_page_contents(self, response):
        """Build a ``NaverMovieItem`` from a movie detail page.

        Pages without a title are skipped with a warning. A missing audience
        count falls back to "0명".
        """
        title = response.xpath('//*[@id="content"]/div[1]/div[2]/div[1]/h3/a[1]/text()').get()
        if title is None:
            # Previously this surfaced as an unhandled IndexError in the callback.
            self.logger.warning("No title found on %s; skipping page", response.url)
            return

        item = NaverMovieItem()
        item["title"] = title
        # ``get(default=...)`` replaces the bare ``except:`` that hid every error.
        item["count"] = response.xpath(
            '//*[@id="content"]/div[1]/div[2]/div[1]/dl/dd[5]/div/p[2]/text()'
        ).get(default="0명")
        # The rating is split over several <em> nodes; join their text.
        item["star"] = "".join(
            response.xpath('//*[@id="actualPointPersentBasic"]/div/em/text()').getall()
        )
        yield item

Writing naver_movie/naver_movie/spiders/spider.py


#### 5. Scrapy 실행

In [26]:
%%writefile ./naver_movie/run.sh
#!/usr/bin/env bash
# Run the NaverMovie spider and export the scraped items to naver_movie.csv.

# Stop immediately if any step fails (e.g. the cd below), instead of crawling
# from the wrong directory.
set -eu

# Work from the directory that holds this script (the Scrapy project root) so
# the script behaves the same regardless of where it is launched from.
cd "$(dirname "$0")"

# -O overwrites the output file; -o would append to an existing CSV, so every
# re-run would accumulate duplicate rows.
python3 -m scrapy crawl NaverMovie -O naver_movie.csv

Overwriting ./naver_movie/run.sh


In [27]:
# Make the run script executable so it can be invoked directly.
!chmod +x ./naver_movie/run.sh

In [28]:
# Run the crawl; the output below is Scrapy's log.
!./naver_movie/run.sh

2021-09-18 19:56:03 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: naver_movie)
2021-09-18 19:56:03 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 17.9.0, Python 3.6.9 (default, Jan 26 2021, 15:33:00) - [GCC 8.4.0], pyOpenSSL 17.5.0 (OpenSSL 1.1.1  11 Sep 2018), cryptography 2.1.4, Platform Linux-5.4.0-1054-aws-x86_64-with-Ubuntu-18.04-bionic
2021-09-18 19:56:03 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-09-18 19:56:03 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'naver_movie',
 'NEWSPIDER_MODULE': 'naver_movie.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['naver_movie.spiders']}
2021-09-18 19:56:03 [scrapy.extensions.telnet] INFO: Telnet Password: 33b73a3c0933bf4f
2021-09-18 19:56:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.Mem

In [31]:
import pandas as pd

In [32]:
pd.read_csv('./naver_movie/naver_movie.csv')

# Expected to fail with EmptyDataError: the crawl above ran with ROBOTSTXT_OBEY = True,
# so no items were scraped and the CSV file is empty.

EmptyDataError: No columns to parse from file

#### 6. settings.py 파일 변경
- Forbidden by robots.txt

In [35]:
# Print lines 16-20 of settings.py (the first 20 lines, then the last 5 of those).
!head -n 20 naver_movie/naver_movie/settings.py | tail -n 5

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'naver_movie (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True


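`sed -i 's/A/B/' 파일`: 파일에서 A와 일치하는 문자열을 찾아 B로 바꾸고, `-i` 옵션으로 그 결과를 파일에 바로 저장합니다. 여기서는 `ROBOTSTXT_OBEY = True`를 `ROBOTSTXT_OBEY = False`로 바꿉니다. 매우 자주 쓰이는 shell command입니다.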

In [36]:
# Rewrite settings.py in place (-i), replacing "ROBOTSTXT_OBEY = True" with
# "ROBOTSTXT_OBEY = False" so the spider no longer obeys robots.txt.
# NOTE(review): this turns off robots.txt compliance — confirm that crawling
# this site is permitted before running.
!sed -i 's/ROBOTSTXT_OBEY = True/ROBOTSTXT_OBEY = False/' naver_movie/naver_movie/settings.py