In [190]:
db=MySQLdb.connect(host='localhost', # connect는 함수 인자가 4개
                   user='root',
                   passwd='0',
                   db='naver_movie') 
db.set_character_set('utf8')

In [191]:
db

<_mysql.connection open to 'localhost' at 8101808>

In [192]:
cur=db.cursor()

In [193]:
query = """
        CREATE TABLE IF NOT EXISTS naver_movie
            (code    INTEGER      NOT NULL      PRIMARY KEY,
             title   VARCHAR(255) NOT NULL,
             story   TEXT,
             open_date  DATE,
             created_at    DATETIME DEFAULT CURRENT_TIMESTAMP)
             ENGINE INNODB;
        """ 
cur.execute(query)

  # Remove the CWD from sys.path while we load stuff.


0

In [194]:
query = """
        CREATE TABLE IF NOT EXISTS movie_comment(
            writer  VARCHAR(100),
            content TEXT       NOT NULL,
            score    INTEGER    NOT NULL,
            write_date DATETIME,
            movie_code INTEGER,         
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (writer, write_date),
            FOREIGN KEY(movie_code)
            REFERENCES naver_movie(code) on delete cascade)
            ENGINE INNODB;
        """
cur.execute(query)

  


0

## 영화 크롤링

In [195]:
from bs4 import BeautifulSoup
import MySQLdb
import pandas as pd
import requests


In [196]:
def get_movie_list(start_page=1, end_page=1): # default 값이 1
    from datetime import date
    url="https://movie.naver.com/movie/sdb/rank/rmovie.nhn"
    date=date.today().strftime("%Y%m%d")
    params={
        'sel':'pnt',
        'date':date
    }
    movie_list=[]
    for i in range(start_page, end_page+1):
        params['page'] =i
        resp=requests.get(url, params)
        soup=BeautifulSoup(resp.content, 'html.parser')
        list_ranking= soup.find('table', class_="list_ranking").find_all('tr')
        for movie_tr in list_ranking:
            try:
                code=movie_tr.find('a')['href'].split('=')[-1]
                title=movie_tr.find('a').text

                movie_list.append({
                    'code':code,
                    'title':title
                })
            except Exception:
                pass
    return movie_list

In [197]:
def get_movie_info(movie): 
    code=movie.get('code') # 딕셔너리 형태도 가능
    title=movie.get('title')
    base_url="https://movie.naver.com/movie/bi/mi/basic.nhn"
    params={
        'code':code
    }
    resp=requests.get(base_url, params)
    soup=BeautifulSoup(resp.content)
    open_date_tag=soup.find('dl',class_='info_spec').find('dd').findAll('span')[-1]
#     open_date=open_date_tag.text
    open_date=""
    for date in open_date_tag.find_all('a'):
        open_date+= date.text
    open_date='-'.join(open_date.strip().split('.'))
    
    story=soup.find('div',class_='story_area')
    movie['story']=story
    movie['open_date']=open_date
    
    return movie

In [198]:
movie_list=get_movie_list()
movie_list[0]

{'code': '171539', 'title': '그린 북'}

In [199]:
# query = """
#         CREATE TABLE IF NOT EXISTS naver_movie
#             (code    INTEGER      NOT NULL      PRIMARY KEY,
#              url     VARCHAR(255) NOT NULL,
#              created_at    DATETIME DEFAULT CURRENT_TIMESTAMP,
#              title   VARCHAR(255) NOT NULL,
#              story   TEXT,
#              opened  DATETIME)
#              ENGINE INNODB;
#         """ 
def insert_movie(movie):
    code=movie.get('code')
    title=movie.get('title')
    story=movie.get('story')
    
    story=str(story)
    open_date=movie.get('open_date')
   
    story=db.escape_string(story)
    query="""
    INSERT INTO naver_movie(code,title,story,open_date)
    VALUES('{0}','{1}',%s, '{2}')
    """.format(code, title, open_date)
    
    cur.execute(query,[story])
    db.commit()

In [200]:
from MySQLdb import IntegrityError
movie_list=get_movie_list()
for movie in movie_list:
    movie=get_movie_info(movie)
    try:
        insert_movie(movie)
    except IntegrityError:
        pass
    except Exception:
        pass

In [201]:
def get_comment_list(code, last_page=1):
    url='https://movie.naver.com/https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn'
    params={
        'code':code,
        'type':'after',
        'order':'newest'
    }
    result_list=[]
    for page in range(1, last_page+1):
        params['page']=page
        
        resp=requests.get(url, params)
        soup=BeautifulSoup(resp.content)
        repl_list=soup.find('div', class_='score_result').find_all('li')
        
        for repl in repl_list:
            score=repl.find(class_='star_score').text.strip()
            content=repl.find('p')
            nameNdate=content.find_next_sibling('dl').findAll('em')
            content=content.text
            nickname=nameNdate[0]
            date=nameNdate[1]  
            result_list.append({
                'write_name':nickname,
                'write_date':date.text,
                'content':content,
                'score': score,
                'code': code
            })
    return result_list

In [202]:
get_comment_list('156464')

[{'write_name': <em>
  <a href="#" onclick="javascript:showPointListByNid(15815091, 'after');parent.clickcr(this, 'ara.uid', '', '', event); return false;" target="_top">
  <span>신영(youn****)</span>
  </a>
  </em>,
  'write_date': '2019.06.16 18:48',
  'content': '관람객난 이미 퀸을 알고 있었다  ',
  'score': '10',
  'code': '156464'},
 {'write_name': <em>
  <a href="#" onclick="javascript:showPointListByNid(15810155, 'after');parent.clickcr(this, 'ara.uid', '', '', event); return false;" target="_top">
  <span>태바기(nogo****)</span>
  </a>
  </em>,
  'write_date': '2019.06.15 00:02',
  'content': '진짜 공감이다 이거!!!!  ',
  'score': '10',
  'code': '156464'},
 {'write_name': <em>
  <a href="#" onclick="javascript:showPointListByNid(15806789, 'after');parent.clickcr(this, 'ara.uid', '', '', event); return false;" target="_top">
  <span>소(aksg****)</span>
  </a>
  </em>,
  'write_date': '2019.06.13 23:50',
  'content': '관람객와 보다가 울었음..너무 슬펐음 ..진짜루다  ',
  'score': '10',
  'code': '156464'},
 {'write_name': <e

In [203]:
def insert_comment(comment):
    score=comment.get('score')
    writer=comment.get('writer')
    write_date=comment.get('write_date')
    write_date='-'.join(write_date.split(','))
    
    content=comment.get('content')
    movie_code=comment.get('code')
    
    query="""
    INSERT INTO movie_comment(score, writer,write_date,content, movie_code)
    VALUES('{}', '{}', '{}', '{}','{}')
    """.format(score,writer, write_date, content, movie_code)
    
    cur.execute(query)
    db.commit()

In [205]:
for movie in movie_list:
    try:
        comment_list=get_comment_list(movie['code'])
        for comment in comment_list:
            insert_comment(comment)
    except Exception:
        pass

### wony's code

In [None]:
from datetime import date
url="https://movie.naver.com/movie/sdb/rank/rmovie.nhn"
date=date.today().strftime("%Y%m%d")
params={
    'sel':'pnt',
    'date':date
}
resp=requests.get(url, params)
soup=BeautifulSoup(resp.content, 'html.parser')


all_list= soup.find('table', class_="list_ranking").find_all('div', class_="tit5")
div_list=[]
for div in all_list:
    movie_dict={}
    a_tag=div.find('a')
    movie_dict['title']=a_tag['title']
    link=a_tag['href']
    movie_dict['code']=link.split('=')[-1]
    div_list.append(movie_dict)
    ind_resp=requests.get("https://movie.naver.com"+link)
    
from pprint import pprint
pprint(div_list)