In [1]:
import os 
import time 
import requests
import pandas       
from bs4 import BeautifulSoup     
from selenium import webdriver                                                                                                                                                      
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [13]:
# Start a Chrome session; this module-level `driver` is the browser object
# that the scraping functions in this notebook call `driver.get(...)` on.
driver = webdriver.Chrome()
loginURL = "https://accounts.douban.com/passport/login"
# Open the Douban login page in that browser.
driver.get(loginURL)
# Pause for 5 seconds before the cell finishes.
# NOTE(review): presumably gives the page time to load so the user can log in
# by hand — confirm; nothing in this cell performs the login itself.
time.sleep(5)

In [3]:
def findElitePostPageNumber(groupID):
    """Return the number of pages in a group's elite-post listing.

    Loads the first elite listing page with the module-level ``driver`` and
    reads the ``data-total-page`` attribute of the ``span.thispage`` element.

    Args:
        groupID: Group identifier; converted with ``str`` to build the URL.

    Returns:
        int: The value of ``data-total-page``, or 1 when the page has no
        ``span.thispage`` element or that element lacks the attribute.

    Raises:
        ValueError: If ``data-total-page`` is present but not an integer.
    """
    startURL = "https://www.douban.com/group/" + str(groupID) + "/discussion?start=0&type=elite"
    driver.get(startURL)
    soup = BeautifulSoup(driver.page_source, features='lxml')

    currentPage = soup.find("span", {"class": "thispage"})
    # soup.find returns None when nothing matches; previously this crashed
    # with AttributeError on `.attrs`.
    # NOTE(review): presumably the paginator is omitted when the listing fits
    # on a single page, which is why 1 is used as the fallback — confirm.
    if currentPage is None:
        return 1

    totalPages = currentPage.attrs.get("data-total-page")
    if totalPages is None:
        return 1
    return int(totalPages)

In [4]:
def findElitePostURLs(groupID):
    """Collect the URL and a folder-safe title for every elite post of a group.

    Walks every page of the elite listing (page count comes from
    ``findElitePostPageNumber``) with the module-level ``driver`` and reads
    the first ``<a>`` inside each ``td.title`` cell.

    Args:
        groupID: Group identifier; converted with ``str`` to build the URLs.

    Returns:
        pandas.DataFrame: Columns ``postURL`` and ``postTitle``, de-duplicated
        on ``postURL`` with a fresh 0..n-1 index. ``postTitle`` has spaces
        replaced by ``-`` and ``/`` removed. The frame is empty (but still has
        both columns) when no posts are found.
    """
    columns = ["postURL", "postTitle"]
    postsPerPage = 25  # the listing's `start` query parameter advances by 25 per page
    elitePosts = []
    baseURL = "https://www.douban.com/group/" + str(groupID) + "/discussion?start="
    pageNumber = findElitePostPageNumber(groupID)

    for start in range(0, pageNumber * postsPerPage, postsPerPage):
        driver.get(baseURL + str(start) + "&type=elite")
        soup = BeautifulSoup(driver.page_source, features='lxml')
        for cell in soup.find_all("td", {"class": "title"}):
            anchor = cell.find("a")
            # Skip cells without a usable link instead of raising
            # AttributeError / KeyError and losing the whole crawl.
            if anchor is None or "href" not in anchor.attrs:
                continue
            # Fall back to the visible link text when the `title` attribute
            # is absent.
            rawTitle = anchor.attrs.get("title", anchor.text.strip())
            elitePosts.append({
                "postURL": anchor.attrs["href"],
                "postTitle": rawTitle.replace(' ', '-').replace('/', ''),
            })

    # Passing `columns` keeps both columns on an empty result; without it
    # drop_duplicates("postURL") raises KeyError when nothing was scraped.
    df = pandas.DataFrame(elitePosts, columns=columns)
    df = df.drop_duplicates("postURL")
    df = df.reset_index(drop=True)
    return df

In [5]:
# Root directory under which a "Group<groupID>" folder is created per group.
basePath = '/Users/sw/Desktop/DoubanSpider'


def createOutputDir(groupID):
    """Create the output folder tree for a group's elite posts.

    Creates ``<basePath>/Group<groupID>`` and, inside it, one folder per
    ``postTitle`` returned by ``findElitePostURLs(groupID)``.

    Folders that already exist are left in place, so the function can be
    re-run and tolerates two posts sharing the same title; missing parent
    directories (including ``basePath``) are created as needed.

    Args:
        groupID: Group identifier; converted with ``str`` for the folder name.

    Returns:
        None
    """
    groupPath = os.path.join(basePath, "Group" + str(groupID))
    # os.mkdir raised FileExistsError on any re-run and FileNotFoundError
    # when basePath was missing; makedirs(exist_ok=True) handles both.
    os.makedirs(groupPath, exist_ok=True)
    groupURLs = findElitePostURLs(groupID)
    for postTitle in groupURLs["postTitle"]:
        os.makedirs(os.path.join(groupPath, postTitle), exist_ok=True)

In [6]:
def postImageGifDownloader(groupID, postURL, postTitle):
    """Download every image found in a post into that post's output folder.

    Loads ``postURL`` with the module-level ``driver`` and looks at the
    ``<img>`` inside each ``div.image-wrapper``. An image carrying a
    ``data-original-url`` attribute is fetched from that URL and saved as
    ``<postTitle>-<i>.gif``; otherwise its ``src`` is fetched and saved as
    ``<postTitle>-<i>.jpg``. ``i`` is the position of the wrapper on the page.

    Wrappers without a usable ``<img>`` and downloads that fail (network
    error, timeout, HTTP error status) are skipped rather than aborting the
    remaining images.

    Args:
        groupID: Group identifier used in the output path.
        postURL: URL of the post to scan.
        postTitle: Folder-safe post title; names the folder and the files.

    Returns:
        None

    Raises:
        OSError: If the image file cannot be written (for example, the post
            folder does not exist).
    """
    postPath = os.path.join(basePath, "Group" + str(groupID), postTitle)
    driver.get(postURL)
    soup = BeautifulSoup(driver.page_source, features='lxml')

    for i, wrapper in enumerate(soup.find_all("div", {"class": "image-wrapper"})):
        img = wrapper.find("img")
        if img is None:
            continue

        # Explicit attribute checks replace the former bare `except:`, which
        # also swallowed unrelated errors.
        if "data-original-url" in img.attrs:
            imageLink = img.attrs["data-original-url"]
            imageName = '{}-{}.gif'.format(postTitle, i)
        elif "src" in img.attrs:
            imageLink = img.attrs["src"]
            imageName = '{}-{}.jpg'.format(postTitle, i)
        else:
            continue

        # Fetch before opening the file so a failed request does not leave an
        # empty file behind, and check the status so an error page is not
        # saved as an image.
        try:
            response = requests.get(imageLink, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            print('Skipping', imageName, error)
            continue

        with open(os.path.join(postPath, imageName), "wb") as f:
            f.write(response.content)
        print('Writing', imageName)

In [7]:
def savePostInsideLinks(groupID, postURL, postTitle):
    """Save the ``a.link`` anchors found in a post as ``link.csv`` and ``link.txt``.

    Loads ``postURL`` with the module-level ``driver``, collects the text and
    ``href`` of every ``<a class="link">``, and writes them into the post's
    folder under ``basePath``. Nothing is written when the page has no such
    anchors.

    Args:
        groupID: Group identifier used in the output path.
        postURL: URL of the post to scan.
        postTitle: Folder-safe post title naming the output folder.

    Returns:
        None
    """
    outputPrefix = basePath + "/Group" + str(groupID) + "/" + postTitle + "/" + "link"

    driver.get(postURL)
    pageSoup = BeautifulSoup(driver.page_source, features='lxml')
    anchors = pageSoup.find_all("a", {"class": "link"})

    # check whether the post contains any links at all
    if not anchors:
        return

    linkRows = [
        {"linkTitle": anchor.text, "linkAddress": anchor.attrs["href"]}
        for anchor in anchors
    ]
    linkTable = pandas.DataFrame(linkRows)

    # csv format (row index is written as the first column)
    linkTable.to_csv(outputPrefix + ".csv", encoding="utf-8")
    # txt format (tab-separated, no row index)
    linkTable.to_csv(outputPrefix + ".txt", sep='\t', index=False, encoding="utf-8")
          

In [8]:
def savePostContentText(groupID, postURL, postTitle):
    """Save the text of a post's paragraphs to ``正文.txt`` in the post folder.

    Loads ``postURL`` with the module-level ``driver``, selects ``<p>`` tags
    with ``find_all("p", class_=False, id=False, alignment="")``, strips
    newlines from each paragraph's text and writes one paragraph per row as
    a tab-separated file without the row index.

    The output location is derived from the module-level ``basePath`` so
    this function writes to the same tree as the other savers, instead of
    keeping its own hard-coded copy of the root directory.

    Args:
        groupID: Group identifier used in the output path.
        postURL: URL of the post to scan.
        postTitle: Folder-safe post title naming the output folder.

    Returns:
        None
    """
    driver.get(postURL)
    soup = BeautifulSoup(driver.page_source, features='lxml')
    # NOTE(review): this filter appears intended to keep only plain <p> tags
    # (no class / id) — confirm what the `alignment=""` criterion is for.
    contentTexts = soup.find_all("p", class_=False, id=False, alignment="")
    paragraphs = [paragraph.text.replace("\n", "") for paragraph in contentTexts]
    result = pandas.DataFrame(paragraphs)

    postPath = os.path.join(basePath, "Group" + str(groupID), postTitle)
    result.to_csv(os.path.join(postPath, "正文" + ".txt"), sep='\t', index=False, encoding="utf-8")

In [9]:
def savePostEntireScreenshot(postURL, postTitle):
    """Save a screenshot of a post's ``article`` element as ``<postTitle>.png``.

    Starts a separate headless Chrome, loads ``postURL``, resizes the window
    to the document's full scroll width/height so the element is rendered in
    one piece, and screenshots the first element with class ``article``.
    The headless browser is always shut down afterwards, even on failure.

    The file name is relative (``postTitle + ".png"``), so the image is
    written to the current working directory.

    Args:
        postURL: URL of the post to capture.
        postTitle: Used as the screenshot file name (without extension).

    Returns:
        None
    """
    options = webdriver.ChromeOptions()
    options.headless = True
    # NOTE(review): the positional driver path and `options.headless` are
    # legacy Selenium call forms kept as in the original — confirm they match
    # the installed Selenium version.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    try:
        driver.implicitly_wait(100)

        postScreenShotName = postTitle + ".png"

        driver.get(postURL)

        def scrollSize(dimension):
            # Reads document.body.parentNode.scrollWidth / scrollHeight.
            return driver.execute_script('return document.body.parentNode.scroll' + dimension)

        driver.set_window_size(scrollSize('Width'), scrollSize('Height'))
        # "class name" is the locator-strategy string behind By.CLASS_NAME;
        # find_element(by, value) exists in Selenium 3 and 4, whereas
        # find_element_by_class_name was removed in Selenium 4.3.
        driver.find_element("class name", 'article').screenshot(postScreenShotName)
        print('Writing', postScreenShotName)
    finally:
        # Without this every call left a headless Chrome process running.
        driver.quit()

In [1]:
def elitePostDownloader(groupID):
    """Download images, text, links and a screenshot for every elite post.

    Scrapes the elite listing once with ``findElitePostURLs`` and, for each
    post, makes sure its folder ``<basePath>/Group<groupID>/<postTitle>``
    exists before running the four savers. Creating the folders from the
    same listing that is iterated avoids scraping the listing a second time
    and guarantees every processed post has a folder; existing folders are
    reused, so the function can be re-run.

    Args:
        groupID: Group identifier whose elite posts are downloaded.

    Returns:
        None
    """
    elitePostInfos = findElitePostURLs(groupID)

    groupPath = os.path.join(basePath, "Group" + str(groupID))
    os.makedirs(groupPath, exist_ok=True)

    for postURL, postTitle in zip(elitePostInfos["postURL"], elitePostInfos["postTitle"]):
        os.makedirs(os.path.join(groupPath, postTitle), exist_ok=True)
        # 1. images / GIFs
        postImageGifDownloader(groupID, postURL, postTitle)
        # 2. body text
        savePostContentText(groupID, postURL, postTitle)
        # 3. links inside the body
        savePostInsideLinks(groupID, postURL, postTitle)
        # 4. screenshot of the whole post
        savePostEntireScreenshot(postURL, postTitle)
    