In [1]:
import re
import time
import pandas as pd
from collections import defaultdict
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request


In [2]:


class DoubanMovieTop():
    """Scraper for the Douban movie Top 250 chart.

    Workflow: ``get_info()`` walks the ten list pages, opens every movie's
    detail page and stores one flat row per movie in ``self.data``;
    ``dump_data()`` turns those rows into ``self.df`` and writes a CSV file.

    Attributes:
        top_urls: the ten list-page URLs (``start=0, 25, ..., 225``).
        data: ``defaultdict(list)`` mapping movie title -> row values, in the
            same order as ``columns``.
        columns: column names of the resulting DataFrame.
        df: the DataFrame built by ``dump_data`` (``None`` until then).
    """

    def __init__(self):
        self.top_urls = [
            'https://movie.douban.com/top250?start={0}&filter='.format(x*25) for x in range(10)]
        self.data = defaultdict(list)
        self.columns = ['title', 'link', 'score', 'score_cnt', 'top_no', 'director', 'writers', 'actors', 'types',
                        'edit_location', 'language', 'dates', 'play_location', 'length', 'rating_per', 'betters',
                        'had_seen', 'want_see', 'tags', 'short_review', 'review', 'ask', 'discussion']
        self.df = None

    def url_open(self, url):
        """Build a ``Request`` for ``url`` carrying a browser-like User-Agent.

        Despite the name nothing is opened here; the returned request object
        is what ``get_bsobj`` passes to ``urlopen``.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
        }
        return Request(url=url, headers=headers)

    def get_bsobj(self, url, timeout=30):
        """Download ``url`` and return it parsed with the ``lxml`` parser.

        Args:
            url: page address.
            timeout: seconds before the request is abandoned, so that one
                stalled connection cannot hang the whole crawl.

        Raises:
            OSError: (``URLError`` / ``HTTPError`` / timeout) on network failure.
        """
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(self.url_open(url), timeout=timeout) as response:
            html = response.read()
        return BeautifulSoup(html, 'lxml')

    def get_info(self, delay=1):
        """Crawl every list page and every linked detail page into ``self.data``.

        Args:
            delay: seconds to sleep after each detail page.

        A list page that cannot be fetched or has no ``ol.grid_view`` is
        reported and skipped. A detail page that cannot be fetched still
        yields a row: the four list-page values followed by ``None`` padding,
        so every row matches ``self.columns``.
        """
        n_detail_cols = len(self.columns) - 4
        for url in self.top_urls:
            try:
                bsobj = self.get_bsobj(url)
            except OSError as e:
                print('skipping list page {0}: {1!r}'.format(url, e))
                continue

            main = bsobj.find('ol', {'class': 'grid_view'})
            if main is None:
                print('skipping list page {0}: no ol.grid_view found'.format(url))
                continue

            # Title and link
            title_objs = main.find_all('div', {'class': 'hd'})
            titles = [i.find('span').text for i in title_objs]
            links = [i.find('a')['href'] for i in title_objs]

            # Score and number of ratings
            score_objs = main.find_all('div', {'class': 'star'})
            scores = [i.find('span', {'class': 'rating_num'}).text
                      for i in score_objs]
            score_cnts = [i.find_all('span')[-1].text for i in score_objs]

            for title, link, score, score_cnt in zip(titles, links, scores, score_cnts):
                try:
                    more_data = self.get_more_info(self.get_bsobj(link))
                except OSError as e:
                    print('detail page {0} failed: {1!r}'.format(link, e))
                    more_data = [None] * n_detail_cols
                # Assign rather than extend: calling get_info() again must
                # replace the row, not append a second copy of every field.
                self.data[title] = [title, link, score, score_cnt] + more_data
                print(self.data[title])
                print(len(self.data))
                time.sleep(delay)

    @staticmethod
    def _try(extract, default=None, label=''):
        """Run ``extract()``; on a missing element report it and return ``default``.

        Only the errors produced by absent tags, attributes or list items are
        caught, so genuine programming errors still surface.
        """
        try:
            return extract()
        except (AttributeError, IndexError, KeyError, TypeError) as e:
            print('{0}: {1!r}'.format(label, e))
            return default

    @staticmethod
    def _pick_writers(info):
        """Return the names in the first ``span.attrs`` that is not director/cast.

        Director links carry ``rel='v:directedBy'`` and cast links
        ``rel='v:starring'``; the writers' links carry neither. Selecting by
        that property instead of by position avoids returning the cast when a
        movie has no writers listed.
        """
        credited = {'v:directedBy', 'v:starring'}
        for span in info.find_all('span', {'class': 'attrs'}):
            anchors = span.find_all('a')
            if not anchors:
                continue
            rels = set()
            for anchor in anchors:
                rel = anchor.get('rel') or []
                # bs4 reports ``rel`` as a list; tolerate a plain string too.
                rels.update([rel] if isinstance(rel, str) else rel)
            if not rels & credited:
                return [anchor.text for anchor in anchors]
        return []

    @staticmethod
    def _info_field(text, label):
        """Return the rest of the line following ``'<label>: '`` or ``None``."""
        match = re.search(re.escape(label) + ': (.*)', text)
        return match.group(1) if match else None

    def get_more_info(self, bsobj):
        """Extract the detail-page fields of one movie.

        Args:
            bsobj: parsed detail page.

        Returns:
            A 19-item list ordered like ``self.columns[4:]``. A field whose
            markup is absent becomes ``None`` (scalar fields) or ``[]``
            (list fields) instead of raising.
        """
        # Chart rank: second piece of the text after splitting on '.'
        top_no = self._try(
            lambda: bsobj.find('span', {'class': 'top250-no'}).text.split('.')[1],
            None, 'top_no')

        # The "info" box holds most of the remaining fields
        main = bsobj.find('div', {'id': 'info'})
        info_text = main.text if main is not None else ''

        # Directors
        director = self._try(
            lambda: [i.text for i in main.find_all('a', {'rel': 'v:directedBy'})],
            [], 'director')

        # Writers
        writers = self._try(lambda: self._pick_writers(main), [], 'writers')

        # Cast
        actors = self._try(
            lambda: [i.text for i in main.find_all('a', {'rel': 'v:starring'})],
            [], 'actors')

        # Genres
        types = self._try(
            lambda: [i.text for i in main.find_all('span', {'property': 'v:genre'})],
            [], 'types')

        # Production region and language: read the rest of the labelled line,
        # independent of which label happens to come next.
        edit_location = self._info_field(info_text, '地区')
        language = self._info_field(info_text, '语言')

        # Release year / release region, one entry per release-date span
        date_obj = self._try(
            lambda: main.find_all('span', {'property': 'v:initialReleaseDate'}),
            [], 'dates')
        dates, play_location = [], []
        for span in date_obj:
            parts = span.text.split('(')
            dates.append(parts[0][:4])
            # A date without "(region)" keeps an empty placeholder so both
            # lists stay aligned.
            play_location.append(parts[1][:-1] if len(parts) > 1 else '')

        # Runtime
        length = self._try(
            lambda: main.find('span', {'property': 'v:runtime'})['content'],
            None, 'length')

        # Share of 5-star down to 1-star ratings
        rating_per = [i.text for i in bsobj.find_all('span', {'class': 'rating_per'})]

        # "Better than" comparisons
        betters = self._try(
            lambda: [i.text for i in
                     bsobj.find('div', {'class': 'rating_betterthan'}).find_all('a')],
            [], 'betters')

        # Seen / want-to-see counters (last three characters are dropped)
        watch_obj = bsobj.find('div', {'class': 'subject-others-interests-ft'})
        had_seen = self._try(lambda: watch_obj.find('a').text[:-3], None, 'had_seen')
        want_see = self._try(
            lambda: watch_obj.find_all('a')[-1].text[:-3], None, 'want_see')

        # Tags
        tags = self._try(
            lambda: [i.text for i in
                     bsobj.find('div', {'class': 'tags-body'}).find_all('a')],
            [], 'tags')

        # Number of short comments
        short_obj = bsobj.find('div', {'id': 'comments-section'})
        short_review = self._try(
            lambda: short_obj.find('div').find(
                'span', {'class': 'pl'}).find('a').text.split(' ')[1],
            None, 'short_review')

        # Number of reviews
        review = self._try(
            lambda: bsobj.find('a', {'href': 'reviews'}).text.split(' ')[1],
            None, 'review')

        # Number of questions
        ask_obj = bsobj.find('div', {'id': 'askmatrix'})
        ask = self._try(
            lambda: ask_obj.find('h2').find('a').text.strip()[2:-1], None, 'ask')

        # Number of discussions
        discussion = self._try(
            lambda: bsobj.find('p', {'class': 'pl', 'align': 'right'}).find('a')
            .text.strip().split('（')[1][2:-2],
            None, 'discussion')

        more_data = [top_no, director, writers, actors, types, edit_location, language, dates, play_location,
                     length, rating_per, betters, had_seen, want_see, tags, short_review, review, ask, discussion]

        return more_data

    def dump_data(self, path='douban_top250.csv'):
        """Cache the scraped rows locally.

        Builds ``self.df`` from the rows in ``self.data`` (one row per movie,
        columns ``self.columns``) and writes it to ``path`` without the index.

        Args:
            path: destination CSV file.
        """
        rows = list(self.data.values())
        self.df = pd.DataFrame(rows, columns=self.columns)
        self.df.to_csv(path, index=False)



In [3]:
# Driver cell: run the full scrape and cache the result.
if __name__ == '__main__':
    # `douban` stays bound at module level after this cell has run.
    douban = DoubanMovieTop()
    # Crawl the list pages and every movie's detail page (network-bound;
    # the crawl sleeps between detail-page requests).
    douban.get_info()
    # Build the DataFrame and write it to 'douban_top250.csv'.
    douban.dump_data()



['肖申克的救赎', 'https://movie.douban.com/subject/1292052/', '9.7', '1677591人评价', '1', ['弗兰克·德拉邦特'], ['弗兰克·德拉邦特', '斯蒂芬·金'], ['蒂姆·罗宾斯', '摩根·弗里曼', '鲍勃·冈顿', '威廉姆·赛德勒', '克兰西·布朗', '吉尔·贝罗斯', '马克·罗斯顿', '詹姆斯·惠特摩', '杰弗里·德曼', '拉里·布兰登伯格', '尼尔·吉恩托利', '布赖恩·利比', '大卫·普罗瓦尔', '约瑟夫·劳格诺', '祖德·塞克利拉', '保罗·麦克兰尼', '芮妮·布莱恩', '阿方索·弗里曼', 'V·J·福斯特', '弗兰克·梅德拉诺', '马克·迈尔斯', '尼尔·萨默斯', '耐德·巴拉米', '布赖恩·戴拉特', '唐·麦克马纳斯'], ['剧情', '犯罪'], '美国', '英语', ['1994', '1994'], ['多伦多电影节', '美国'], '142', ['84.7%', '13.7%', '1.4%', '0.1%', '0.1%'], ['99% 剧情片', '99% 犯罪片'], '2277515', '229141', ['经典', '励志', '信念', '自由', '人性', '人生', '美国', '剧情'], '310103', '8424', '85', '1220']
1
['霸王别姬', 'https://movie.douban.com/subject/1291546/', '9.6', '1240628人评价', '2', ['陈凯歌'], ['芦苇', '李碧华'], ['张国荣', '张丰毅', '巩俐', '葛优', '英达', '蒋雯丽', '吴大维', '吕齐', '雷汉', '尹治', '马明威', '费振翔', '智一桐', '李春', '赵海龙', '李丹', '童弟', '沈慧芬', '黄斐'], ['剧情', '爱情', '同性'], '中国大陆 / 中国香港', '汉语普通话', ['1993', '1993'], ['中国香港', '中国大陆'], '171', ['81.6%', '16.0%', '2.1%', '0.1%', '0.1%'], ['99% 同性片', '