本项目抓取武汉链家在售二手房信息并保存，随后进行可视化分析：使用requests和BeautifulSoup库抓取并解析页面，使用pandas整理得到的数据，后期将使用matplotlib和seaborn进行可视化分析并总结。

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
import re
import sys

In [2]:
# Browser-style User-Agent header passed to requests.get() by get_page().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    ),
}

# One row per district: (URL slug used in the listing URL, Chinese display name).
# Keeping both values side by side guarantees the two derived lists stay aligned.
_DISTRICTS = [
    ('jiangan', '江岸'),
    ('jianghan', '江汉'),
    ('qiaokou', '硚口'),
    ('dongxihu', '东西湖'),
    ('wuchang', '武昌'),
    ('qingshan', '青山'),
    ('hongshan', '洪山'),
    ('hanyang', '汉阳'),
    ('donghugaoxin', '东湖高新'),
    ('jiangxia', '江夏'),
    ('caidian', '蔡甸'),
    ('huangbei', '黄陂'),
    ('xinzhou', '新洲'),
    ('zhuankoukaifaqu', '沌口开发区'),
]
area_list = [slug for slug, _ in _DISTRICTS]
area_cnlist = [cn_name for _, cn_name in _DISTRICTS]

In [15]:
# Fetch a page (download only; parsing is done by parse_page)
def get_page(url, num):
    """Download one page and return its decoded text.

    Args:
        url: Full URL of the page to fetch.
        num: Page number of the request. Currently unused by the function
            body; kept so existing callers keep working.

    Returns:
        The response body as text, decoded with the encoding that requests
        detects from the content, or None when the request fails (network
        error, timeout, or a non-2xx HTTP status).
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are treated as "no page"; a bare except
        # would also swallow KeyboardInterrupt and genuine programming errors.
        return None
    r.encoding = r.apparent_encoding
    return r.text

#解析页面
def _field(parts, index, default='none'):
    """Return parts[index], or default when the listing has too few fields."""
    return parts[index] if index < len(parts) else default


def parse_page(content, area_cn):
    """Extract the listings of one result page into a DataFrame.

    Args:
        content: HTML text of a listing page.
        area_cn: District name stored in the 'Region' column of every row.

    Returns:
        A pandas DataFrame with the columns Renovation, Garden, Layout, Size,
        Direction, Elevator, Year, Distric, Floor, Price, house_unit_price and
        Region. Fields that a listing does not provide are filled with the
        string 'none'.

    Raises:
        ValueError: raised by pandas when the page yields a different number
            of houseInfo, positionInfo, totalPrice and unitPrice elements.
    """
    soup = BeautifulSoup(content, 'lxml')

    # Select and split each info block once instead of once per column.
    house_parts = [node.get_text().strip().split('|')
                   for node in soup.select('div[class="houseInfo"]')]
    position_parts = [node.get_text().strip().split('-')
                      for node in soup.select('div[class="positionInfo"]')]

    # The construction year is the first 4-digit run in the text before the
    # first '-' of positionInfo.
    year_pattern = re.compile(r'[0-9]{4}')
    years_list = []
    for parts in position_parts:
        found = year_pattern.search(parts[0])
        years_list.append('none' if found is None else found.group())

    # Collect the columns in a dict; insertion order defines column order.
    item = {}
    item['Renovation'] = [_field(parts, 4) for parts in house_parts]  # renovation state
    item['Garden'] = [parts[0] for parts in house_parts]              # first houseInfo field
    item['Layout'] = [_field(parts, 1) for parts in house_parts]      # room layout
    item['Size'] = [_field(parts, 2) for parts in house_parts]        # floor area
    item['Direction'] = [_field(parts, 3) for parts in house_parts]   # orientation
    # The elevator flag is only taken when houseInfo has exactly six fields.
    item['Elevator'] = [parts[-1] if len(parts) == 6 else 'none'
                        for parts in house_parts]
    item['Year'] = years_list                                         # construction year
    # 'Distric' (sic) is kept because it is the column name written to the CSV.
    item['Distric'] = [_field(parts, 1) for parts in position_parts]  # text after the first '-'
    item['Floor'] = [parts[0][0:3] for parts in position_parts]       # first 3 chars: floor level
    item['Price'] = [node.get_text().strip()
                     for node in soup.select('div[class="totalPrice"] span')]  # total price
    item['house_unit_price'] = [node.get_text().strip()
                                for node in soup.select('div[class="unitPrice"] span')]  # unit price
    item['Region'] = area_cn
    return pd.DataFrame(item)
    
                
               
    

In [16]:
def view_bar(num, total):
    """Redraw a 100-character text progress bar on the current stdout line.

    Args:
        num: Number of steps completed so far.
        total: Total number of steps. A value of zero or less is shown as 0%.

    The bar width is derived from the completed percentage, so it is correct
    for any total, not only total == 100. The percentage is clamped to the
    range 0-100.
    """
    if total > 0:
        # Integer arithmetic avoids float truncation (int(29 / 100 * 100) == 28).
        rate_num = int(num * 100 // total)
    else:
        rate_num = 0
    rate_num = max(0, min(100, rate_num))
    # The leading '\r' returns to the start of the line so the bar overwrites itself.
    r = '\r[%s%s]%d%%' % ("=" * rate_num, " " * (100 - rate_num), rate_num)
    sys.stdout.write(r)
    sys.stdout.flush()

In [21]:
def main():
    """Crawl every district and save the accumulated listings to a CSV file.

    For each (slug, Chinese name) pair in area_list / area_cnlist, pages 1-100
    of the district's listing URL are fetched with get_page and parsed with
    parse_page, pausing 2 seconds per page. After each district the listings
    collected so far are written to 'wuhanlianjia.csv' (GBK encoded, without
    the index column), so a later failure does not lose finished districts.

    Pages that cannot be downloaded or parsed are skipped instead of aborting
    the run.
    """
    filename = 'wuhanlianjia.csv'
    house_data = []
    for area, area_cn in zip(area_list, area_cnlist):
        url = "https://wh.lianjia.com/ershoufang/%s" % area
        for page_ing in range(1, 101):
            url_ing = url + "/pg" + str(page_ing) + '/'
            getpage = get_page(url_ing, page_ing)
            # get_page returns None on failure; the truthiness test covers both
            # None and an empty body (len(None) would raise TypeError).
            if getpage:
                try:
                    house_info = parse_page(getpage, area_cn)
                except (IndexError, ValueError) as err:
                    # One malformed page must not abort a crawl of many pages.
                    sys.stderr.write('\nskipped %s: %s\n' % (url_ing, err))
                else:
                    house_data.append(house_info)
            time.sleep(2)
            view_bar(page_ing, 100)
        # pd.concat raises ValueError on an empty list, so only save when at
        # least one page has been parsed.
        if house_data:
            data = pd.concat(house_data, ignore_index=True)
            data.to_csv(filename, encoding='gbk', index=False)
        print('完毕')
    
        
         

In [None]:
# Run the full crawl: every district in area_list, pages 1-100 each, with a
# 2-second pause per page; results are written to wuhanlianjia.csv.
main()

In [13]:
# Load the crawl results and preview the first rows.
# main() writes the file GBK-encoded, so it must be read back as GBK; relying
# on the platform default encoding fails or garbles text on non-GBK systems.
# The handle is kept in `f` because a later cell closes it with f.close().
f = open('wuhanlianjia.csv', 'r', encoding='gbk')
wuhanlianjia_df = pd.read_csv(f)
wuhanlianjia_df.head()

Unnamed: 0,Renovation,Garden,Layout,Size,Direction,Elevator,Year,Distric,Floor,Price,house_unit_price
0,精装,星海虹城一期,4室2厅,144.33平米,东 南 西,有电梯,2006,楚河汉街,低楼层,440.0,单价30486元/平米
1,其他,保利中央公馆,3室2厅,116.83平米,南 北,有电梯,2011,新南湖,中楼层,265.0,单价22683元/平米
2,其他,金色雅园金源苑,2室2厅,102.74平米,南,无电梯,2002,长港路,中楼层,175.0,单价17034元/平米
3,简装,常青花园十一小区,2室2厅,91.57平米,南,有电梯,2008,常青花园,低楼层,198.0,单价21623元/平米
4,精装,保利海上五月花一期,3室2厅,86.8平米,南,有电梯,none,庙山,高楼层,140.0,单价16130元/平米


In [14]:
# Close the CSV file handle that was opened to load wuhanlianjia_df.
f.close()

通过浏览csv文件发现部分数据特征存在误差，后续将进一步对数据进行清洗整理，并对整理好的数据进行可视化展示，分析二手房房价与地区、年份、楼层等特征之间的关系，并对武汉各区二手房的在售数量进行总结。