In [None]:
import json
import os
import random
from urllib import request
from urllib.error import HTTPError
from urllib.parse import quote

from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

抓取网页信息，BeautifulSoup用法参考：https://www.crummy.com/software/BeautifulSoup/bs4/doc/#attrs

In [None]:
# Districts
def get_district(url):
    """Return the href of every district link found on a listing page.

    Downloads ``url``, locates the ``div`` whose ``data-role`` attribute is
    ``ershoufang`` and collects the ``href`` of every ``<a>`` inside the
    first ``div`` nested in it.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        list: ``href`` values in document order.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures.
        AttributeError: if the expected ``div`` structure is not on the page.
    """
    # Close the HTTP response as soon as the body is read instead of
    # leaving the socket to the garbage collector.
    with request.urlopen(url) as resp:
        html = resp.read()
    soup = BeautifulSoup(html, "lxml")
    poi = soup.find("div", attrs={"data-role": "ershoufang"})
    return [a["href"] for a in poi.find("div").find_all("a")]

In [None]:
# Sub-districts
def get_subdistrict(url):
    """Return the href of every sub-district link found on a district page.

    Downloads ``url``, locates the ``div`` whose ``data-role`` attribute is
    ``ershoufang`` and collects the ``href`` of every ``<a>`` inside the
    second ``div`` nested in it.

    Args:
        url: Absolute URL of the district page.

    Returns:
        list: ``href`` values in document order.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures.
        AttributeError / IndexError: if the expected ``div`` structure is
            not on the page.
    """
    # Close the HTTP response as soon as the body is read instead of
    # leaving the socket to the garbage collector.
    with request.urlopen(url) as resp:
        html = resp.read()
    soup = BeautifulSoup(html, "lxml")
    poi = soup.find("div", attrs={"data-role": "ershoufang"})
    return [a["href"] for a in poi.find_all("div")[1].find_all("a")]

In [None]:
# Number of result pages inside a sub-district
def get_totalpage(url):
    """Return the number of result pages advertised by a listing page.

    The page count is read from the ``totalPage`` field of the JSON stored
    in the ``page-data`` attribute of the ``div.house-lst-page-box`` element.

    Args:
        url: Absolute URL of the listing page.

    Returns:
        The ``totalPage`` value, or ``0`` when the page has no pagination box.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures.
    """
    # Close the HTTP response as soon as the body is read.
    with request.urlopen(url) as resp:
        html = resp.read()
    soup = BeautifulSoup(html, "lxml")
    poi = soup.find("div", class_="house-lst-page-box")
    if poi is None:
        return 0
    return json.loads(poi['page-data'])['totalPage']

In [None]:
# Overview of the communities (xiaoqu) listed on one result page
def get_xiaoqu(url):
    """Scrape one listing page into a list of community rows.

    Each ``<li>`` under ``ul.listContent`` becomes a 5-element list:
    ``[link href, title text, first positionInfo link text,
    second positionInfo link text, totalPrice span text]``.

    Args:
        url: Absolute URL of one listing page.

    Returns:
        list[list]: one row per ``<li>``; an empty list when the page has
        no ``ul.listContent`` element.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures.
    """
    # Close the HTTP response as soon as the body is read.
    with request.urlopen(url) as resp:
        html = resp.read()
    soup = BeautifulSoup(html, "lxml")
    poi = soup.find("ul", class_="listContent")
    if poi is None:
        # No listing container on this page: report "no rows" instead of
        # failing with AttributeError, which callers do not catch.
        return []
    xiaoqu = []
    for item in poi.find_all('li'):
        position = item.find("div", class_="positionInfo")
        xiaoqu.append([
            item.a['href'],
            item.find("div", class_="info").find("div", class_="title").a.string,
            position.a.string,
            position.find_all("a")[1].string,
            item.find("div", class_="totalPrice").span.string,
        ])
    return xiaoqu

In [None]:
# Detailed information for a single community
def get_xiaoqudetail(url):
    """Scrape the detail page of one community.

    Args:
        url: Absolute URL of the community detail page.

    Returns:
        ``None`` when the page has no ``div.xiaoquInfo`` block; otherwise a
        dict with the keys ``address``, ``year``, ``style``, ``fee``,
        ``maintenance``, ``developer``, ``buildings`` and ``houses``.
        ``address`` is the text of ``div.detailDesc``; the other values are
        the text of the ``span.xiaoquInfoContent`` of the 1st..7th
        ``div.xiaoquInfoItem`` in that order. A value is ``None`` when the
        corresponding element is absent.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures.
    """
    # Close the HTTP response as soon as the body is read.
    with request.urlopen(url) as resp:
        html = resp.read()
    soup = BeautifulSoup(html, "lxml")
    poi = soup.find("div", class_="xiaoquInfo")
    if poi is None:
        return None

    desc = soup.find("div", class_="detailDesc")
    detail = {'address': desc.string if desc is not None else None}

    # Query the item list once; the fields are positional.
    items = poi.find_all("div", class_="xiaoquInfoItem")
    fields = ('year', 'style', 'fee', 'maintenance',
              'developer', 'buildings', 'houses')
    for idx, key in enumerate(fields):
        # Pages with fewer items yield None instead of an IndexError that
        # would abort the caller's crawl loop.
        content = None
        if idx < len(items):
            content = items[idx].find("span", class_="xiaoquInfoContent")
        detail[key] = content.string if content is not None else None
    return detail

上海各区

In [None]:
# Root URL of the Lianjia Shanghai site; relative paths scraped from pages are appended to it.
url_base = 'https://sh.lianjia.com'

In [None]:
%%time
# Fetch the district links from the community ("xiaoqu") landing page.
url = url_base + '/xiaoqu'
district = get_district(url)

去除上海周边

In [None]:
# Drop the "Shanghai surroundings" pseudo-district. dict.fromkeys removes
# duplicates while keeping the order in which the links were scraped, so
# the result (and everything derived from it) is the same on every run.
district = [d for d in dict.fromkeys(district) if d != '/xiaoqu/shanghaizhoubian/']

In [None]:
# Sanity check: number of districts kept, and one sample link.
print(len(district))
district[0]

板块

In [None]:
%%time
# Collect the sub-district links of every district.
#   district2subdistrict: district href -> list of its sub-district hrefs
#   subdistrict:          flat list of all sub-district hrefs (may contain duplicates)
district2subdistrict = {}
subdistrict = []
for d in district:
    url = url_base + d
    try:
        result = get_subdistrict(url)
    except Exception as err:
        # Best-effort crawl: report the failing page together with the
        # reason and move on. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop the loop.
        print('{}: {!r}'.format(url, err))
        continue
    district2subdistrict[d] = result
    subdistrict += result

去重

In [None]:
# De-duplicate while keeping first-seen order, so the crawl order and the
# sample shown below are reproducible between runs.
unique_subdistrict = list(dict.fromkeys(subdistrict))
print('{} unique out of {}'.format(len(unique_subdistrict), len(subdistrict)))
subdistrict = unique_subdistrict
subdistrict[0]

小区

In [None]:
# Accumulator for the community rows returned by get_xiaoqu().
xiaoqu = []

In [None]:
# Walk every result page of every sub-district and append its rows to `xiaoqu`.
pbar = tqdm(subdistrict)
for s in pbar:
    url = url_base + s
    try:
        totalpage = get_totalpage(url)
    except HTTPError:
        # A failing sub-district page must not abort the whole crawl;
        # report it the same way failing result pages are reported.
        print('HTTPError: {}'.format(url))
        continue
    pbar.set_description("{} pages:{}".format(url, totalpage))
    for i in range(1, totalpage + 1):
        # Result pages are addressed as <sub-district url>pg<N>, N starting at 1.
        url_page = url + 'pg' + str(i)
        try:
            xiaoqu += get_xiaoqu(url_page)
        except HTTPError:
            print('HTTPError: {}'.format(url_page))
pbar.close()

In [None]:
# Sanity check: number of rows collected, and the last row.
print(len(xiaoqu))
xiaoqu[-1]

数据清理

In [None]:
# Build the table and attach column names (same order as the rows produced by get_xiaoqu)
df = pd.DataFrame(xiaoqu, columns=['link','title','district','subdistrict','price'])

In [None]:
# Strip the ASCII parentheses from the community title so it can be used as
# a geocoding query. The vectorised .str accessor leaves missing titles as
# NaN instead of raising AttributeError on them.
df['name'] = (
    df.title
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [None]:
# Rows whose price is the "暂无" placeholder get 0 so that the whole
# column can be cast to an integer dtype.
no_price = df.price == '暂无'
df.loc[no_price, 'price'] = 0
df['price'] = df['price'].astype(int)

In [None]:
# Top 3 communities by unit price
df.sort_values('price', ascending=False).head(3)

In [None]:
# Express the unit price in units of 10,000 (rounded to one decimal) for display on the map
df['price_show'] = df.price.apply(lambda x: round(x/10000, 1))

查询经纬度，参考：http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-geocoding

In [None]:
# URL template of the Baidu geocoding endpoint; the three placeholders are
# filled with the app key, the city and the address to look up.
BAIDU_API = "http://api.map.baidu.com/geocoder/v2/?output=json&ak={}&city={}&address={}"

def get_coord(ak, city, address):
    """Geocode ``address`` within ``city`` through the Baidu web API.

    Args:
        ak: Baidu app key.
        city: City name used to scope the lookup.
        address: Free-text address or place name.

    Returns:
        dict: always contains ``status`` (copied from the response). When
        ``status`` is ``0`` it also contains ``lat``, ``lng``,
        ``confidence`` and ``precise`` taken from the response's ``result``.

    Raises:
        Whatever ``urlopen`` raises for network/HTTP failures, and
        ``ValueError`` if the response body is not valid JSON.
    """
    # Percent-encode every value on its own. Encoding the assembled URL
    # with '&' and '=' marked as safe would let an address containing
    # '&', '=' or '#' change the structure of the query string.
    encoded = [quote(str(part), safe='') for part in (ak, city, address)]
    url = BAIDU_API.format(*encoded)
    # Close the HTTP response as soon as the body is parsed.
    with request.urlopen(url) as resp:
        j = json.loads(resp.read())
    result = {'status': j['status']}
    if result['status'] == 0:  # ok
        payload = j['result']
        result['lat'] = payload['location']['lat']
        result['lng'] = payload['location']['lng']
        result['confidence'] = payload['confidence']
        result['precise'] = payload['precise']
    return result

此处需要百度的appKey，因为每天有限额，建议自备多个appKey，同时记录超额的appKey，不重复使用

In [None]:
# Baidu app keys. Supply your own through the BAIDU_AKS environment
# variable (comma-separated) so that credentials stay out of the notebook.
# NOTE(review): the fallback key below is committed in plain text; rotate it
# and remove the literal once every user sets BAIDU_AKS.
_LEGACY_BAIDU_AKS = ["ZUONbpqGBsYGXNIYHicvbAbM"]
BAIDU_AKS = [
    key.strip()
    for key in os.environ.get("BAIDU_AKS", "").split(",")
    if key.strip()
] or _LEGACY_BAIDU_AKS
# Keys that have already reported a quota error; they are not used again.
aks_outQuota = []
# Keys still usable: everything configured minus the exhausted ones.
aks = list(set(BAIDU_AKS) - set(aks_outQuota))
# City passed to every geocoding request.
city = '上海'

通过小区名查询经纬度

In [None]:
# Geocode every community by its (parenthesis-free) name.
# The table has no 'lat' column before the first run, so create it here;
# otherwise this cell only works with state left over from an earlier session.
if 'lat' not in df.columns:
    df['lat'] = np.nan
# Rows still to be geocoded: latitude missing, or the 0 placeholder that
# the original selection looked for.
pending = df[df.lat.isna() | (df.lat == 0)].index
pbar = tqdm(pending)
for i in pbar:
    if len(aks)==0:
        print('当日配额用尽，停止抓取')
        break
    ak = random.choice(aks)
    address = df.loc[i, 'name']
    pbar.set_description(address)
    result = get_coord(ak, city, address)
    df.loc[i, 'status'] = result['status']
    if result['status']==0:  # ok
        df.loc[i, 'lat'] = result['lat']
        df.loc[i, 'lng'] = result['lng']
        df.loc[i, 'confidence'] = result['confidence']
        df.loc[i, 'precise'] = result['precise']
    elif str(result['status'])[0]=='3':  # quota error
        # Retire the key that hit its quota and recompute the usable set.
        aks_outQuota.append(ak)
        aks = list(set(BAIDU_AKS) - set(aks_outQuota))
pbar.close()

拿一个小区出来看看，经纬度是否正常

In [None]:
# Rows from the same sub-district as the first record that have a positive latitude.
df_test = df[(df.subdistrict==df.subdistrict[0]) & (df.lat>0)]

左上角的点明显离群，经纬度需要校正，用地址查询应该更准确，但先要抓取小区详细信息

In [None]:
# Scatter of longitude (x) against latitude (y) for the sample sub-district.
plt.scatter(df_test.lng, df_test.lat)

小区详细信息

In [None]:
# Visit the detail page of every community and copy its fields into the table.
DETAIL_FIELDS = ('address', 'year', 'style', 'fee',
                 'maintenance', 'developer', 'buildings', 'houses')
pbar = tqdm(df.index)
for i in pbar:
    url = df.loc[i, 'link']
    pbar.set_description(url)
    df.loc[i, 'visited'] = 1
    try:
        detail = get_xiaoqudetail(url)
    except HTTPError:
        # Report the page that actually failed (the handler used to print
        # `url_page`, a stale variable left over from the listing crawl).
        print('HTTPError: {}'.format(url))
        detail = None
    if detail is not None:
        for key in DETAIL_FIELDS:
            df.loc[i, key] = detail[key]
    # Crawling leaks memory (cause unknown, HTTPS suspected) and may hang
    # the machine, so checkpoint the table periodically. The checkpoint no
    # longer depends on whether this particular page could be parsed.
    if i%1000==0: df.to_csv('xiaoqu.csv', index=False)
pbar.close()

同一小区地址可以有多个，只选取第一个用于查询经纬度

In [None]:
def _first_address(raw):
    """Return the first address listed in a scraped address string.

    Takes the text between the first and second ')' (or the whole string
    when it contains no ')'), normalises full-width commas to ASCII commas
    and keeps what precedes the first comma.
    """
    parts = raw.split(')')
    # Without a ')' there is no prefix to drop; use the whole text instead
    # of failing with IndexError.
    body = parts[1] if len(parts) > 1 else parts[0]
    return body.replace('，', ',').split(',')[0]

# Only rows that have an address are transformed; the others stay NaN.
df['address2'] = df.loc[~df.address.isna(), 'address'].apply(_first_address)

保存结果

In [None]:
# Write the current table to xiaoqu.csv without the index column.
df.to_csv('xiaoqu.csv', index=False)

通过地址查询经纬度

In [None]:
# Geocode by street address. Only rows that have an address and no
# address-based latitude yet are queried, so an interrupted run resumes
# where it stopped without a hand-edited row offset.
if 'lat2' not in df.columns:
    df['lat2'] = np.nan
pending = df[(~df.address2.isna()) & df.lat2.isna()].index
pbar = tqdm(pending)
for i in pbar:
    if len(aks)==0:
        # Every key has hit its quota; picking a key from an empty list
        # would raise, so stop cleanly.
        print('当日配额用尽，停止抓取')
        break
    ak = random.choice(aks)
    address = df.loc[i, 'address2']
    pbar.set_description(address)
    result = get_coord(ak, city, address)
    df.loc[i, 'status2'] = result['status']
    if result['status']==0:  # ok
        df.loc[i, 'lat2'] = result['lat']
        df.loc[i, 'lng2'] = result['lng']
        df.loc[i, 'confidence2'] = result['confidence']
        df.loc[i, 'precise2'] = result['precise']
    elif str(result['status'])[0]=='3':  # quota error
        # Retire the key that hit its quota and recompute the usable set.
        aks_outQuota.append(ak)
        aks = list(set(BAIDU_AKS) - set(aks_outQuota))
pbar.close()

小区经纬度为空的，直接用地址经纬度替换

In [None]:
# Rows with no name-based latitude but a positive address-based one
# take over the address-based coordinate pair.
needs_fill = df.lat.isna() & (df.lat2 > 0)
index_replace = df.index[needs_fill]
for target_col, source_col in (('lat', 'lat2'), ('lng', 'lng2')):
    df.loc[index_replace, target_col] = df.loc[index_replace, source_col]

逐个板块，计算每个小区的经纬度与板块中位数之差的绝对值，再除以板块的四分位距（IQR，即第75与第25百分位数之差），用这个比值来确定离群值的分割点

In [None]:
# Distinct sub-district names present in the table, and how many there are.
subdistrict = df.subdistrict.unique()
len(subdistrict)

In [None]:
def _deviation(values, center, spread):
    """Absolute distance of each value from `center`, in units of `spread`, rounded to 1 decimal."""
    return values.apply(lambda v: round(abs(v - center)/spread, 1))

# Per sub-district: distance of every coordinate from the sub-district
# median, divided by the inter-quartile range of the name-based coordinates.
for s in subdistrict:
    df_test = df.loc[(df.subdistrict==s) & (df.lat>0), ['lat','lng','lat2','lng2']]
    if len(df_test) < 10:
        # Too few samples for meaningful quartiles; leave the ratios unset.
        continue
    lat_m = df_test.lat.median()
    lat_q = np.percentile(df_test.lat, [25, 75])
    lat_qd = lat_q[1] - lat_q[0]
    lng_m = df_test.lng.median()
    lng_q = np.percentile(df_test.lng, [25, 75])
    lng_qd = lng_q[1] - lng_q[0]
    # The address-based columns are measured against the same median and
    # IQR as the name-based ones.
    for col, center, spread in (('lat', lat_m, lat_qd),
                                ('lng', lng_m, lng_qd),
                                ('lat2', lat_m, lat_qd),
                                ('lng2', lng_m, lng_qd)):
        df.loc[df_test.index, col + '_dev'] = _deviation(df_test[col], center, spread)

有些比值上千，一般比值大于2就认为是离群值

In [None]:
# Histogram of the latitude deviation ratio, log-scaled y axis.
df['lat_dev'].plot(kind='hist', logy=True)

缩小范围，观察比值<100的分布

In [None]:
# Same histogram restricted to ratios below 100.
df.loc[df.lat_dev<100, 'lat_dev'].plot(kind='hist', logy=True)

比值<10的分布，基本取到2，就能包含9成的样本

In [None]:
# Same histogram restricted to ratios below 10.
df.loc[df.lat_dev<10, 'lat_dev'].plot(kind='hist', logy=True)

但比值取多少才能排除真正的离群值？对于市中心边界规整、小区集中的板块，偏离度小，阈值取小了，很多正常样本会被误判为离群；而对于郊区小区分布稀疏的板块，阈值取大了，离群的样本会被漏判为正常。宁缺毋滥，因此选择观察郊区的板块来决定分割阈值

In [None]:
# Rows of the '崇明其它' sub-district that have a positive latitude.
df_test = df[(df.subdistrict=='崇明其它') & (df.lat>0)]

In [None]:
# Candidate outliers at a threshold of 2: either deviation ratio exceeds 2.
outlier = df_test[(df_test.lat_dev>2) | (df_test.lng_dev>2)]

右下方的6个点都不是崇明，比值取2会漏掉1个离群值

In [None]:
# All points of the sub-district, with the candidate outliers drawn over them in red.
plt.scatter(df_test.lng, df_test.lat)
plt.scatter(outlier.lng, outlier.lat, c='r')

In [None]:
# Candidate outliers at a threshold of 1.8: either deviation ratio exceeds 1.8.
outlier = df_test[(df_test.lat_dev>1.8) | (df_test.lng_dev>1.8)]

比值取1.8正确判断所有非崇明的小区，因此阈值选择1.8

In [None]:
# All points of the sub-district, with the candidate outliers drawn over them in red.
plt.scatter(df_test.lng, df_test.lat)
plt.scatter(outlier.lng, outlier.lat, c='r')

逐个板块，筛选经纬度离群值（与中位数的偏差大于1.8倍四分位距）：若地址经纬度不离群，则用地址经纬度替换；若地址经纬度也离群，则标记该小区经纬度不可信

In [None]:
# Distinct sub-district names present in the table.
subdistrict = df.subdistrict.unique()

In [None]:
# Every row that has a latitude deviation ratio starts out as trusted.
df.loc[df.lat_dev>=0, 'coord_trusted'] = True
# Deviation ratio above which a coordinate counts as an outlier.
threshold = 1.8

# Index of the rows whose name-based coordinate is an outlier on either axis.
is_outlier = (df.lat_dev > threshold) | (df.lng_dev > threshold)
outlier = df[is_outlier].index

for i in outlier:
    address_in_range = (df.loc[i, 'lat2_dev'] <= threshold
                        and df.loc[i, 'lng2_dev'] <= threshold)
    if not address_in_range:
        # The address-based coordinate is an outlier too (or missing):
        # flag the row as untrusted and keep its values untouched.
        df.loc[i, 'coord_trusted'] = False
        continue
    # The address-based coordinate is within range: use it instead.
    df.loc[i, 'lat'] = df.loc[i, 'lat2']
    df.loc[i, 'lng'] = df.loc[i, 'lng2']

保存结果

In [None]:
# Write the current table to xiaoqu.csv without the index column.
df.to_csv('xiaoqu.csv', index=False)

导出json，以便在地图上显示

In [None]:
# Records exported for the map: trusted coordinates and a positive price only.
export_columns = ['lng','lat','price_show','title','district','subdistrict','link']
exportable = df[df.coord_trusted & (df.price>0)]
jf = exportable[export_columns].to_dict(orient='records')
len(jf)

In [None]:
def save_json(j, filename):
    """Serialise ``j`` as JSON into the file ``filename``.

    Args:
        j: Any object accepted by ``json.dump``.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError: if ``j`` contains values ``json.dump`` cannot serialise.
    """
    # The context manager closes the file even when json.dump raises.
    with open(filename, 'w') as fp:
        json.dump(j, fp)

In [None]:
# Write the exported records to xiaoqu.json.
save_json(jf, 'xiaoqu.json')