In [None]:
#### crawl_ratings.py
# ctrl F 단어블록 ctrl c ctrl v 
# https://github.com/e9t/nsmc/blob/master/code/crawl_ratings.py

In [1]:
#! /usr/bin/python3
# -*- coding: utf-8 -*-


from collections import defaultdict
from glob import glob
import os
import re
import time

from lxml import html
import numpy as np
import pandas as pd
import requests

import utils
import json


BASEURL     = 'http://movie.naver.com/movie/point/af/list.nhn'
RATINGURL   = BASEURL + '?&page=%s' # 평점
MOVIEURL    = BASEURL + '?st=mcode&target=after&sword=%s&page=%s' # %s 129406 미쓰 와이프 
                       # ?st=mcode&sword=181287&target=after
DATADIR     = 'data/ratings'
INDEXFILE   = 'index.txt' # 찾고자 하는 영화이름
TMPFILE     = 'data/ratings_all.txt'
RATINGSFILE = 'data/ratings.txt'
SEED        = 1234
SLEEP       = 600
NDOCS       = 200000


extract_nums = lambda s: re.search('\d+', s).group(0)
sanitize_str = lambda s: s.strip()


Bad key "axes_unicode_minus" on line 3 in
/home/jovyan/.config/matplotlib/matplotlibrc.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


In [13]:
def parse_item(item): #item은 tr 1ea를 이루고 있는 list
#     for idx, it in enumerate(item):
#         print(f'item 출력 {idx} : {it}') # td elements 3ea 
    #import pdb; pdb.set_trace()
    try:
        return {
                'review_id': item.xpath('./td[@class="ac num"]/text()')[0],
                #'rating': item.xpath('./td[@class="title"]/text()')[0],    # 여자코딩 
                'rating': item.xpath('./td[@class="title"]/div/em/text()')[0],
                'movie_id': extract_nums(item.xpath('./td[@class="title"]/a/@href')[0]),
                'movie_name': sanitize_str(' '.join(item.xpath('./td[@class="title"]/a/text()'))), # xpath는list => 1줄 str => 공백제건
                'review': sanitize_str(' '.join(item.xpath('./td[@class="title"]/text()'))), # xpath는list => 1줄 str => 공백제건
                'author': item.xpath('./td[@class="num"]/a/text()')[0],
                'date': item.xpath('./td[@class="num"]/text()')[0]
        }
    except (IndexError, AttributeError) as e:
        print(e, item.xpath('.//text()'))
        # ['\n\t\t', '16711283', '\n\t\t\t', '\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t', '미쓰 와이프', '\n\t\t\t\n\t\t\t\n\t\t\t', '\n\t\t\t\t', '별점 - 총 10점 중', '8', '\n\t\t\t', '\n\t\t\t', '9살 딸이랑 울면서 봤어요 역시 눈물의 포인트는 달랐지만 재밌게봤어요^^ \n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t', '신고', '\n\t\t\t\n\t\t\t\n\t\t\t', '\n\t\t', 'youn****', '20.02.14', '\n\t']
        return None
    except (AssertionError) as e:
        print(e, 'Sleep for %s' % SLEEP)
        time.sleep(SLEEP)
    except Exception as e:
        print(e, '음 여기까진 생각을 못했는데...')

def crawl_rating_page(url):
    resp = requests.get(url)
    root = html.fromstring(resp.text)
    
    # 한 page당 tr 10ea인 list정보 => items
    items = root.xpath('//body//table[@class="list_netizen"]//tr')[1:]
    #print(items)
    
    # 최대 페이지 넘버 찾기
    npages = max(map(int, ([0] + root.xpath('//div[@class="paging"]//a/span/text()'))))
    #print(npages)
    
    #return items[0]  # items[0] => tr 1ea(감상평 1ea) 이루고 있는 list
    return list(filter(None, [parse_item(item) for item in items])), npages    
    
# 테스트 =============> #return items[0] 활성화시키면 됨
#url='http://movie.naver.com/movie/point/af/list.nhn'+\
    '?st=mcode&target=after&sword=%s&page=%s' % (129406, 1)
#item=crawl_rating_page(url)
#parse_item(item)

In [14]:
def crawl_movie(movie_id):
    review_data = []
    negative_cnt = 0
    positive_cnt = 0
    for page_num in range(50):  
        url = MOVIEURL % (movie_id, page_num + 1) # %s 인자에 값 주기 ==> %s 129406 (미쓰 와이프) %s 1
        #print(url) # http://movie.naver.com/movie/point/af/list.nhn?st=mcode&target=after&sword=129406&page=1
        page_items, npages = crawl_rating_page(url) # page_items 최대 10ea [{},,,  ] 리턴됨
        #print(f'npages:{npages}') # page 10지나면 다른 npages가 나옴
        
        # 감상평중 긍정평가 count
        positive_cnt +=len([ int(page_item['rating']) for page_item in page_items if int(page_item['rating']) > 8 ])
        # 감상평중 부정평가 count
        negative_cnt +=len([ int(page_item['rating']) for page_item in page_items if int(page_item['rating']) < 5 ])
        review_data.extend(page_items) # append object 추가 [1,2,3,[4,5]], extend element 추가 [1,2,3,4,5]
        #print(review_data)
        
        if len(review_data)==0:
            return  review_data, negative_cnt, positive_cnt
        if page_num >= npages - 1:
            break
    # 테스트
    #return review_data, positive_cnt, negative_cnt

    # 감상평 있으면 JSON 저장
    if review_data:
        #utils.write_json(items, '%s/%s.json' % (DATADIR, movie_id))
        with open( '%s/%s.json' % (DATADIR, movie_id), 'w', encoding='UTF-8-sig') as json_file: # DATADIR => data/ratings #encoding='UTF-8-sig'없어도 한글깨짐 없음
            json_file.write(json.dumps(review_data, ensure_ascii=False)) #한글깨짐관련 
        return review_data, positive_cnt, negative_cnt
        # data/ratings 디렉토리에 대한 permission denied => sudo chmod 777 ratings
    else:
        return []
    
#테스트
#movie_id='187324'
#review_data, positive_cnt, negative_cnt=crawl_movie(movie_id)
# print(f'positive_cnt:{positive_cnt}')
# print(f'negative_cnt:{negative_cnt}') # 부정평가 감상평이 극히 적다
# print(f'len(review_data[0:10] : {len(review_data[0:10])}')
#print(review_data[0:1])


[{'review_id': '16718346', 'rating': '10', 'movie_id': '187324', 'movie_name': '극장판 원피스 스탬피드 신고', 'review': '', 'author': 'rbwj****', 'date': '20.02.16'}]


In [4]:
def get_index(filename): # INDEXFILE ='index.txt'
    if os.path.exists(filename):
        # movie_id, total = map(int, utils.read_txt(filename).split('\n')[0].split(','))
        with open(filename, 'r') as f:
            movie_id, total, negative_cnt_t = map(int,f.read().split('\n')[0].split(','))
        
        # read() 전체, readline() 한줄 str # readlines() 전체 list # split() list
    else:
        movie_id, total, negative_cnt_t = 190325, 0, 0 # 129406 미쓰 와이프 # 174294 성혜의 나라 # 190325 공룡왕 디노
    print(movie_id, total,negative_cnt_t)
    return [movie_id, total, negative_cnt_t]  # [129406,0,0]

In [5]:
def put_index(movie_id, total, filename): #  filename => INDEXFILE ='index.txt'
    #utils.write_txt('%s,%s' % (movie_id, total), filename)
    with open(filename, 'w') as f:
        f.write('%s,%s,%s' % (movie_id, total, negative_cnt_t))
        
# 테스트        
# movie_id=129406
# total = 100
# put_index(movie_id, total, INDEXFILE) #  filename => INDEXFILE ='index.txt'

In [15]:
def read_json(filename):
    with open(filename, 'r', encoding='UTF-8-sig') as json_file:
        readjson = json.load(json_file) # loads
    return readjson
# 테스트
# filename = 'data/ratings/186644.json'
# read_json(filename)

[{'review_id': '16078107',
  'rating': '4',
  'movie_id': '186644',
  'movie_name': '데스 키스 신고',
  'review': '우와!!  찰스브론슨 인줄 알았다!',
  'author': 'soul****',
  'date': '19.08.23'}]

In [18]:
       
def merge_ratings():

    def balance_classes(df, ndocs_per_class): # ndocs_per_class 100000
        df_pos = df[df['label']==1][:int(ndocs_per_class)]
        df_neg = df[df['label']==0][:int(ndocs_per_class)]
        return df_pos.append(df_neg)


    sub_space = lambda s: re.sub('\s+', ' ', s)
    write_row = lambda l, f: f.write('\t'.join(l) + '\n') # l literable

    filenames = glob('%s/*' % DATADIR) # DATADIR     = 'data/ratings'
    with open(TMPFILE, 'w') as f: # TMPFILE     = 'data/ratings_all.txt'
        write_row('id document label'.split(), f)
        for filename in filenames:
            readjson = read_json(filename)
            for review in readjson:
                rating = int(review['rating'])
                if rating > 8:      # positive 9 10
                    write_row([review['review_id'], sub_space(review['review']), '1'], f)
                elif rating < 5:    # negative 1 2 3 4
                    write_row([review['review_id'], sub_space(review['review']), '0'], f)
                else:               # neutral
                    pass
    print('Ratings merged to %s' % TMPFILE)

    
    df = pd.read_csv(TMPFILE, sep='\t', quoting=3) # TMPFILE     = 'data/ratings_all.txt' 
    # if delimiter=None: delimiter=sep,     quoting = 3은 큰따옴표를 무시하도록 한다. header=0 파일의 첫 번째 줄에 열 이름이 있음을 나타내며
    df = df.fillna('')
    # 무작위 표본 추출 긍정평가 100000, 부정평가 100000
    np.random.seed(SEED)
    df = df.iloc[np.random.permutation(len(df))]
    df = balance_classes(df, NDOCS/2) # NDOCS       = 200000
    
    # df -> csv
    df.to_csv(RATINGSFILE, sep='\t', index=False)  # RATINGSFILE = 'data/ratings.txt'
    print('Ratings written to %s' % RATINGSFILE)
    
if __name__=='__main__':
    movie_id, total, negative_cnt_t = get_index(INDEXFILE)
    while negative_cnt_t < 4250 and movie_id > 0: # movie_id 129406 미쓰 와이프 # 174294
        review_data, positive_cnt, negative_cnt=crawl_movie(movie_id)
        total += len(review_data)
        negative_cnt_t += negative_cnt
        put_index(movie_id, total, INDEXFILE)
        print(f'{MOVIEURL % (movie_id, 1)} {len(review_data)} {total} 부정합계:{negative_cnt_t}')
        #MOVIEURL    = BASEURL + '?st=mcode&target=after&sword=%s&page=%s' # %s 129406 미쓰 와이프 
        movie_id -= 1
    merge_ratings()

185933 19322 4511
Ratings merged to data/ratings_all.txt
Ratings written to data/ratings.txt


In [19]:
!pwd

/home/jovyan/work/ml-definitive-guide/8장


In [None]:
#### partition (train, test)

In [21]:
#! /usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np; np.random.seed(1234)
import pandas as pd


ntrain = 150000

data = pd.read_csv('./data/ratings.txt', sep='\t', quoting=3)
data = pd.DataFrame(np.random.permutation(data))
trn, tst = data[:ntrain], data[ntrain:]

header = 'id document label'.split()
trn.to_csv('./data/ratings_train.txt', sep='\t', index=False, header=header)
tst.to_csv('./data/ratings_test.txt', sep='\t', index=False, header=header)