In [24]:
import numpy as np
import time
import pandas as pd
import requests
from lxml import etree
from tqdm import tqdm_notebook as tqdm
import warnings
import time
import random
import os
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

In [2]:
# Utilities

def list_url(page):
    return 'https://sz.lianjia.com/ershoufang/pg{}/'.format(str(page))

def detail_url(idx):
    return 'https://sz.lianjia.com/ershoufang/{}.html'.format(str(idx))

def random_sleep(max_sleep=3, min_sleep=0.2):
    time.sleep(min_sleep + random.random() * max_sleep)
    
def make_header():
    user_agent = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    ]
    return {'user-agent': random.choice(user_agent)}

def get_page(url):
    header = make_header()
    r = requests.get(url=url, verify=False, headers=header)
    if r.status_code == 200:
        return etree.HTML(r.text)
    else:
        print(r.status_code)

In [3]:
def parse_main(page: etree._Element):
    df = pd.DataFrame()
    
    df['id'] = page.xpath("//div[@class='info clear']//div[@class='title']/a/@data-housecode")
    df['title'] = page.xpath("//div[@class='info clear']//div[@class='title']/a/text()")
    df['addr_1'] = page.xpath("//div[@class='info clear']//div[@class='positionInfo']//a[1]/text()")
    df['addr_2'] = page.xpath("//div[@class='info clear']//div[@class='positionInfo']//a[2]/text()")

    floor = []
    area = []
    direction = []
    deco = []
    height = []
    age = []
    typ = []
    for i in page.xpath("//div[@class='info clear']//div[@class='houseInfo']/text()"):
        text = i.split('|')
        text = [t.strip() for t in text]
        floor.append(text[0])
        area.append(text[1])
        direction.append(text[2])
        deco.append(text[3])
        height.append(text[4])
        age.append(text[5])
        typ.append(text[6])
    
    df['floor_plan'] = floor
    df['area'] = area
    df['direction'] = direction
    df['decoration'] = deco
    df['height'] = height
    df['age'] = age
    df['type'] = typ
    del floor, area, direction, deco, height, age, typ

    watch = []
    list_time = []
    for i in page.xpath("//div[@class='info clear']//div[@class='followInfo']/text()"):
        text = i.split('/')
        text = [t.strip() for t in text]
        watch.append(text[0])
        list_time.append(text[1])

    df['watch'] = watch
    df['list_time'] = list_time
    
    del watch, list_time

    df['total_price'] = page.xpath("//div[@class='info clear']//div[@class='totalPrice']/span/text()")
    df['unit_price'] = page.xpath("//div[@class='info clear']//div[@class='unitPrice']/@data-price")

    tags = []
    for i in page.xpath("//div[@class='info clear']//div[@class='tag']"):
        tags.append(','.join(i.xpath('./span/text()')))
        
    df['tags'] = tags
    
    return df

In [None]:
if 'save' not in os.listdir():
    os.mkdir('save')

sleep = 10
num_its = 100
error_counts = 0
for i in tqdm(range(1, num_its)):
    while error_counts <=100:
        try:
            page = get_page(list_url(i))
            parse_main(page).to_pickle('save/{0}.pkl'.format(str(i)), protocol = -1)
            random_sleep(sleep)
        except:
            error_counts += 1
            pass

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))

In [18]:
def load_result(path):
    
#     def parse_list_time(x):
#         try:
#             if x[1] == '天':
#                 return np.int(x[0])
#             elif x[2] == '月':
#                 return np.int(x[0]) * 30
#             else:
#                 return 365
#         except:
#             return None
        
    res = []
    for i in os.listdir('save'):
        res.append(pd.read_pickle('save/{0}'.format(str(i))))
    df = pd.concat(res, axis=0).reset_index(drop=True)
#     df['area'] = df['area'].apply(lambda x: x[:x.index('平')]).astype(float)
#     df['age'] = df['age'].apply(lambda x: x[:x.index('年')]).astype(int)
#     df['watch'] = df['watch'].apply(lambda x: x[:x.index('人')]).astype(int)
#     df['list_time'] = df['watch'].apply(parse_list_time)
    
    
    return df

In [162]:
page3 = get_page(detail_url('105103180360'))

In [63]:
page2 = get_page(detail_url('105102698194'))

In [74]:
page = get_page(detail_url('105102437021'))

In [246]:
def retrieve(lis):
    try:
        return lis[0]
    except:
        return None

def parse_detail(page: etree._Element):
    
    def parse_tag(dic, tag):
        if 'tag is_near_subway ' in tag:
            dic['near_subway'] = 1
        else:
            dic['near_subway'] = 0

        if 'tag good CLICKDATA VIEWDATA' in tag:
            dic['good'] = 1
        else:
            dic['good'] = 0

        if 'tag vr ' in tag:
            dic['vr'] = 1
        else:
            dic['vr'] = 0

        if 'tag five ' in tag:
            dic['greater_than_2'] = 1
        else:
            dic['greater_than_2'] = 0

        if 'tag is_see_free ' in tag:
            dic['see_anytime'] = 1
        else:
            dic['see_anytime'] = 0

        if 'tag taxfree ' in tag:
            dic['greater_than_5'] = 1
        else:
            dic['greater_than_5'] = 0
    
    
    dic = {}
    dic['id'] = retrieve(page.xpath('//div[@class="houseRecord"]/span[2]/text()'))
    dic['title'] = retrieve(page.xpath('//h1[@class="main"]/@title'))
    dic['watch'] = retrieve(page.xpath("//span[@id='favCount']/text()"))
    dic['total_price'] = retrieve(page.xpath('//div[@class="price "]/span[1]/text()'))
    dic['unit_price'] = retrieve(page.xpath('//div[@class="unitPrice"]/span/text()'))
    dic['community'] = retrieve(page.xpath('//div[@class="communityName"]/a[1]/text()'))
    dic['community_id'] = retrieve(page.xpath('//div[@class="communityName"]/a[1]/@href')).split('/')[-2]
    dic['district'] = retrieve(page.xpath('//div[@class="areaName"]/span[2]/a[1]/text()'))
    dic['addr'] = retrieve(page.xpath('//div[@class="areaName"]/span[2]/a[2]/text()'))
    dic['addr_desc'] = retrieve(page.xpath('//div[@class="areaName"]/a/text()'))
    dic['visit_time'] = retrieve(page.xpath('//div[@class="visitTime"]/span[2]/text()'))
    dic['floor_place'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[1]/text()'))
    dic['floor'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[2]/text()'))
    dic['area'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[3]/text()'))
    dic['floor_structure'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[4]/text()'))
    dic['area_inside'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[5]/text()'))
    dic['building_type'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[6]/text()'))
    dic['facing'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[7]/text()'))
    dic['building_structure'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[8]/text()'))
    dic['decoration'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[9]/text()'))
    dic['stair_to_rooms'] = dic['area_inside'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[10]/text()'))
    dic['elevator'] = dic['area_inside'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[11]/text()'))
    dic['property_duration'] = retrieve(page.xpath('//div[@class="base"]/div[2]/ul/li[12]/text()'))
    dic['list_time'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[1]/span[2]/text()'))
    dic['property_type'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[2]/span[2]/text()'))
    dic['last_transaction'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[3]/span[2]/text()'))
    dic['property_usage'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[4]/span[2]/text()'))
    dic['owned_duration'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[5]/span[2]/text()'))
    dic['property_owner_type'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[6]/span[2]/text()'))
    dic['mortgage'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[7]/span[2]/@title'))
    dic['certificate_photo'] = retrieve(page.xpath('//div[@class="transaction"]/div[2]/ul/li[8]/span[2]/text()'))
    
    tag = page.xpath("//div[@class='tags clear']/div[2]/a/@class")
    parse_tag(dic, tag)
    
    desc = retrieve(page.xpath("//div[@class='baseattribute clear']/div[text() = '周边配套']/following-sibling::*/text()"))
    if desc is not None:
        dic['surrounding_desc'] = desc.strip()
    else:
        dic['surrounding_desc'] = ''
        
    desc = retrieve(page.xpath("//div[@class='baseattribute clear']/div[text() = '小区介绍']/following-sibling::*/text()"))
    if desc is not None:
        dic['community_desc'] = desc.strip()
    else:
        dic['community_desc'] = ''
        
    desc = retrieve(page.xpath("//div[@class='baseattribute clear']/div[text() = '户型介绍']/following-sibling::*/text()"))
    if desc is not None:
        dic['floor_desc'] = desc.strip()
    else:
        dic['floor_desc'] = ''
        
    desc = retrieve(page.xpath("//div[@class='baseattribute clear']/div[text() = '核心卖点']/following-sibling::*/text()"))
    if desc is not None:
        dic['selling_desc'] = desc.strip()
    else:
        dic['selling_desc'] = ''
        
    desc = retrieve(page.xpath("//div[@class='baseattribute clear']/div[text() = '交通出行']/following-sibling::*/text()"))
    if desc is not None:
        dic['commute_desc'] = desc.strip()
    else:
        dic['commute_desc'] = ''
        
    floor_detail = []
    for i in page2.xpath("//div[@id='infoList']/div"):
        floor_detail.append('/'.join(i.xpath('./div/text()')))
    dic['floor_detail'] = ','.join(floor_detail)
    
    return dic

In [249]:
parse_detail(page2)

{'id': '105102698194',
 'title': '此房满五唯一，红本在手，税费少',
 'watch': '3',
 'total_price': '280',
 'unit_price': '63855',
 'community': '合正锦湖魅力城',
 'community_id': '2411048483183',
 'district': '罗湖区',
 'addr': '洪湖',
 'addr_desc': '近7号线洪湖站',
 'visit_time': '有租户需要预约',
 'floor_place': '1室1厅1厨1卫',
 'floor': '中楼层 (共31层)',
 'area': '43.85㎡',
 'floor_structure': '平层',
 'area_inside': '有',
 'building_type': '塔楼',
 'facing': '东南',
 'building_structure': '钢混结构',
 'decoration': '精装',
 'stair_to_rooms': '三梯十二户',
 'elevator': '有',
 'property_duration': '70年',
 'list_time': '2019-07-17',
 'property_type': '商品房',
 'last_transaction': '2011-01-19',
 'property_usage': '普通住宅',
 'owned_duration': '满五年',
 'property_owner_type': '非共有',
 'mortgage': '有抵押 60万元',
 'certificate_photo': '已上传房本照片',
 'near_subway': 1,
 'good': 0,
 'vr': 1,
 'greater_than_2': 0,
 'see_anytime': 0,
 'greater_than_5': 1,
 'surrounding_desc': '配套很齐全，自身的物业就已经很完善了，楼下就是华润超市，健身房，美食街，发廊，年代酒吧，各种小吃，还有儿童乐园，老年人活动点。小区正面是儿童公园，侧面是洪湖公园，斜面是人民公园，属于双地铁三公园小区

In [207]:
retrieve(page2.xpath("//div[@class='baseattribute clear']/div[text() = '周边配套']/following-sibling::*/text()"))

'\n                    \n配套很齐全，自身的物业就已经很完善了，楼下就是华润超市，健身房，美食街，发廊，年代酒吧，各种小吃，还有儿童乐园，老年人活动点。小区正面是儿童公园，侧面是洪湖公园，斜面是人民公园，属于双地铁三公园小区。出行方便，环境优美。\n                  '

In [245]:
page2.xpath("//span[@id='favCount']/text()")

['3']

In [165]:
def parse_tag(dic, tag):
    if 'tag is_near_subway ' in tag:
        dic['near_subway'] = 1
    else:
        dic['near_subway'] = 0
        
    if 'tag good CLICKDATA VIEWDATA' in tag:
        dic['good'] = 1
    else:
        dic['good'] = 0
        
    if 'tag vr ' in tag:
        dic['vr'] = 1
    else:
        dic['vr'] = 0
    
    if 'tag five ' in tag:
        dic['greater_than_2'] = 1
    else:
        dic['greater_than_2'] = 0
        
    if 'tag is_see_free ' in tag:
        dic['see_anytime'] = 1
    else:
        dic['see_anytime'] = 0
    
    if 'tag tax_free ' in tag:
        dic['greater_than_5'] = 1
    else:
        dic['greater_than_5'] = 0
    
        
        
    tag_dict = {
        'tag is_near_subway ': '近地铁',
        'tag good CLICKDATA VIEWDATA': ''
    }