In [None]:
import random
from time import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

In [None]:
# Upper bound, in seconds, on how long an element lookup keeps retrying.
# This is not a page-load timeout; the original author observed that opening a
# page already seems to wait until it has fully loaded.
IMPLICIT_WAIT_SECONDS = 1

driver = webdriver.Chrome()
driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)

In [None]:
# Load the listing index file and keep its '网页' column, whose values are the
# detail-page URLs visited by the scraping loop.
listing_index = pd.read_parquet('贝壳北京二手房房源条目信息.parquet')
urls = listing_index['网页']
urls

In [None]:
details = []      # one dict per successfully scraped listing
failed_urls = []  # listings skipped because an expected page element was missing or stale

# Shuffled copy of the URLs, kept for spot-checking that the scraper generalises.
# Series.sample replaces random.shuffle, whose item swaps go through label-based
# Series indexing and therefore only work when the index is exactly 0..n-1.
random_urls = urls.sample(frac=1).reset_index(drop=True)
# A single known listing, kept for quick manual tests.
selected_urls = ['https://bj.ke.com/ershoufang/101111550757.html']

# Seconds to wait for the visit-feedback list to change after clicking "next page".
PAGE_TURN_TIMEOUT = 10

# NOTE(review): several lookups pass dotted, multi-class names to By.CLASS_NAME
# (e.g. 'tags.clear'). This presumably relies on Selenium turning CLASS_NAME into
# a CSS selector - confirm against the installed Selenium version.


def _label_value_pairs(items):
    """Yield (label, value) for each item whose label is the text of a child <span>."""
    for item in items:
        label = item.find_element(By.XPATH, 'span').text
        # Strip only the first occurrence (the label itself); an unbounded
        # replace() would also delete the label wherever it recurs in the value.
        yield label, item.text.replace(label, '', 1)


def _scrape_required_info(driver, record):
    """Section 3: total price, unit price, community, district and listing agent."""
    def text_at(xpath):
        return driver.find_element(By.XPATH, xpath).text

    total_price = text_at('/html/body/div[1]/div[4]/div[1]/div[2]/div[2]/div/span[1]')
    total_price_unit = text_at('/html/body/div[1]/div[4]/div[1]/div[2]/div[2]/div/span[2]/span')
    record['总价'] = ' '.join([total_price, total_price_unit])

    unit_price = text_at('/html/body/div[1]/div[4]/div[1]/div[2]/div[2]/div/div[1]/div[1]/span')
    unit_price_unit = text_at('/html/body/div[1]/div[4]/div[1]/div[2]/div[2]/div/div[1]/div[1]/i')
    record['均价'] = ' '.join([unit_price, unit_price_unit])

    record['小区'] = text_at('/html/body/div[1]/div[4]/div[1]/div[2]/div[4]/div[1]/a[1]')

    area = driver.find_element(By.XPATH, '/html/body/div[1]/div[4]/div[1]/div[2]/div[4]/div[2]/span[2]')
    record['区域'] = ' '.join([link.text for link in area.find_elements(By.XPATH, 'a')])

    record['维护人'] = text_at('/html/body/div[1]/div[4]/div[1]/div[2]/div[5]/div[2]/div/div[1]/div[2]/div[1]/a')


def _scrape_basic_info(main, record):
    """Section 4: the two label/value lists inside the 'introduction' block."""
    for introduction in main.find_elements(By.ID, 'introduction'):
        for list_xpath in ('div/div/div[1]/div[2]/ul/li', 'div/div/div[2]/div[2]/ul/li'):
            for label, value in _label_value_pairs(introduction.find_elements(By.XPATH, list_xpath)):
                record['基本信息_' + label] = value


def _scrape_features(main, record):
    """Section 5: listing highlights - tag groups (lists) and free-text attributes."""
    for feature in main.find_elements(By.CLASS_NAME, 'introContent.showbasemore'):
        for tag in feature.find_elements(By.CLASS_NAME, 'tags.clear'):
            key = '房源特色_' + tag.find_element(By.CLASS_NAME, 'name').text
            record[key] = [link.text for link in tag.find_elements(By.XPATH, 'div[2]/a')]
        for attr in feature.find_elements(By.CLASS_NAME, 'baseattribute.clear'):
            key = '房源特色_' + attr.find_element(By.XPATH, 'div[1]').text
            record[key] = attr.find_element(By.XPATH, 'div[2]').text


def _scrape_owner_pitch(main, record):
    """Section 6: the owner's own recommendation, stored as a {heading: text} dict."""
    for owner in main.find_elements(By.XPATH, 'div/div[@id="yezhuSell"]'):
        recommend = {}
        for paragraph in owner.find_element(By.CLASS_NAME, 'txt').find_elements(By.XPATH, 'div'):
            heading = paragraph.find_element(By.XPATH, 'b').text
            recommend[heading] = paragraph.find_element(By.XPATH, 'span').text
        record['房主自荐_'] = recommend


def _scrape_layout(main, record):
    """Section 7: per-room layout rows, stored as a list of row texts."""
    for layout in main.find_elements(By.ID, 'layout'):
        record['户型分间_'] = [room.text for room in layout.find_elements(By.CLASS_NAME, 'row')]


def _scrape_community_card(main, record):
    """Section 8: community summary card - detail link plus label/value rows."""
    for card in main.find_elements(By.CLASS_NAME, 'xiaoquCard'):
        record['小区简介_详情网页'] = card.find_element(By.CLASS_NAME, 'fr').get_attribute('href')
        info_rows = card.find_element(By.CLASS_NAME, 'xiaoqu_main.fl').find_elements(By.XPATH, 'div')
        for row in info_rows:
            key = '小区简介_' + row.find_element(By.XPATH, 'label').text
            record[key] = row.find_element(By.XPATH, 'span').text


def _scrape_down_payment(main, record):
    """Section 9: down-payment calculator - input parameters and computed results."""
    for calculator in main.find_elements(By.ID, 'calculator'):
        top = calculator.find_element(By.CLASS_NAME, 'item-top')
        params = [dl.text.replace('\n', ' ') for dl in top.find_elements(By.XPATH, 'dl')]
        params = [text for text in params if text]  # drop empty strings picked up while scraping

        # './/' searches below `top` only; a leading '//' would ignore the
        # context element and match the first such node in the whole document.
        price = top.find_element(By.XPATH, './/*[@name="price"]').get_attribute('value')
        evaluation = top.find_element(By.XPATH, './/*[@name="evaluation"]').get_attribute('value')

        # The values live in <input> elements, so they are missing from .text;
        # splice them into the row text. Guarded so that an unexpectedly short
        # parameter list is stored as-is instead of raising IndexError.
        if len(params) > 1:
            params[1] = params[1].replace(' ', f' {price} ')
        if len(params) > 2:
            params[2] = params[2].replace(' ', f' {evaluation} ')
        record['参考首付_参数'] = params

        result_box = calculator.find_element(By.CLASS_NAME, 'result-text')
        record['参考首付_结果'] = [
            result.text.replace('\n', ' ') for result in result_box.find_elements(By.XPATH, 'div')
        ]


def _scrape_visit_feedback(driver, main, record):
    """Section 10: agent feedback from showings, walking through every pager page.

    Feedback is stored as {agent name: comment}, so an agent who appears more
    than once keeps only the comment read last.
    """
    agent_locator = (By.CLASS_NAME, 'itemAgentName.LOGCLICK.CLICKDATA')
    # './/' keeps the search inside `main` (a leading '//' scans the whole document).
    for visit in main.find_elements(By.XPATH, './/*[@class="daikan_content"]'):
        feedbacks = {}
        pager_text = visit.find_element(By.CLASS_NAME, 'daikanPager.clear').text
        page_count = int(pager_text.split(' ')[0].split('/')[-1])

        for page in range(1, page_count + 1):
            # Items are looked up from `main`, not `visit`: per the original
            # author, `visit` is re-rendered on paging and its reference goes
            # stale, while `main` survives.
            first_agent = None
            for feedback in main.find_elements(By.CLASS_NAME, 'daikan_item_content.fr.clear'):
                agent_name = feedback.find_element(*agent_locator).text
                feedbacks[agent_name] = feedback.find_element(By.CLASS_NAME, 'des').text
                if first_agent is None:
                    first_agent = agent_name

            if page == page_count or first_agent is None:
                # Last page reached, or nothing on this page to detect a refresh with.
                break

            # Click through JavaScript to avoid the occasional
            # ElementClickInterceptedException raised by a regular click().
            button = main.find_element(By.ID, 'nextPageComment')
            driver.execute_script('arguments[0].click();', button)

            # The list container never disappears, so a refresh is detected by
            # the first agent's name no longer being the one just read.
            try:
                WebDriverWait(main, PAGE_TURN_TIMEOUT).until_not(
                    EC.text_to_be_present_in_element(agent_locator, first_agent)
                )
            except TimeoutException:
                # Slow refresh, or the next page starts with the same agent.
                # Carry on: re-reading an unchanged page only rewrites the same
                # dict keys, which is better than aborting the whole crawl.
                tqdm.write(f'feedback page {page + 1} did not visibly refresh: {driver.current_url}')

        record['经纪人带看反馈'] = feedbacks


def scrape_listing(driver, url):
    """Open one listing detail page and collect its fields into a dict.

    Args:
        driver: the Selenium WebDriver used to load and query the page.
        url: detail-page URL; the listing id is the part between the last '/'
            and the last '.'.

    Returns:
        dict of scraped fields, or None when the page has no title element,
        which the scraper treats as an expired listing.

    Raises:
        NoSuchElementException: a required element (section 3 or the main
            container) is absent from the page.
        StaleElementReferenceException: the page re-rendered mid-read.
    """
    driver.get(url)
    record = {}

    # 1. Listing id and URL.
    record['编号'] = url[url.rfind('/') + 1:url.rfind('.')]
    record['网页'] = url

    # 2. Title; its absence doubles as the "listing expired" check.
    title = driver.find_elements(By.XPATH, '/html/body/div[1]/div[2]/div[2]/div/div/div[1]/h1')
    if not title:
        return None
    record['标题'] = title[0].text

    # 3. Required information.
    _scrape_required_info(driver, record)

    # Main content column; every optional section below is searched from it.
    # Each section iterates over find_elements(), so a missing section is
    # skipped quietly instead of raising.
    main = driver.find_element(By.XPATH, '/html/body/div[1]/div[5]/div[1]')
    _scrape_basic_info(main, record)
    _scrape_features(main, record)
    _scrape_owner_pitch(main, record)
    _scrape_layout(main, record)
    _scrape_community_card(main, record)
    _scrape_down_payment(main, record)
    # Paging through feedback re-renders part of the page, so it goes last.
    _scrape_visit_feedback(driver, main, record)
    return record


for url in tqdm(urls):
    try:
        record = scrape_listing(driver, url)
    except (NoSuchElementException, StaleElementReferenceException) as exc:
        # One malformed page must not cost the whole crawl: note it and move on.
        failed_urls.append(url)
        tqdm.write(f'skipped {url}: {type(exc).__name__}')
        continue
    if record is not None:
        details.append(record)

In [None]:
# Turn the list of per-listing dicts into a table (one row per listing, one
# column per distinct key) and show which columns were collected.
df = pd.DataFrame(data=details)
df.columns

In [None]:
#df.iloc[:,0:10]

In [None]:
#df.iloc[:,10:20]

In [None]:
#df.iloc[:,20:30]

In [None]:
#df.iloc[:,30:40]

In [None]:
#df.iloc[:,40:]

In [None]:
# Persist the scraped listing details for later analysis.
DETAILS_PARQUET = '贝壳北京二手房房源详细信息.parquet'
df.to_parquet(DETAILS_PARQUET)

In [None]:
#df.to_csv('贝壳北京二手房房源详细信息.csv')