## 链家数据分析
根据爬取的链家网数据，做一点数据分析。

### 0.载入数据

In [None]:
import pandas as pd
lj_data = pd.read_csv('./data/LJdata.csv')

In [None]:
lj_data.columns

In [None]:
# Replace the CSV header with English column names. The assignment is positional,
# so it relies on the file having exactly these 13 columns in this order.
lj_data.columns = ['district', 'address', 'title', 'house_type', 'area', 'price', 'floor', 'build_time', 'direction', 'update_time', 'view_num', 'extra_info', 'link']

### 查看数据的形状和信息
hint:都还记得info/describe/shape的用法吧，用起来！

In [None]:
lj_data.shape

In [None]:
lj_data.info()

In [None]:
lj_data.describe(include='all')

### 找到最近更新信息的20套房子

In [None]:
lj_data.head(1)

In [None]:
lj_data.sort_values(by='update_time', ascending=False).head(20)

In [None]:
lj_data.loc[lj_data['update_time']=='2017.07.27',:].shape

In [None]:
lj_data['update_time'].unique()

In [None]:
lj_data.loc[lj_data['update_time']=='2017.07.27',:]

### 平均看房人数

In [None]:
lj_data['view_num'].median()

In [None]:
%matplotlib inline
lj_data['view_num'].value_counts().plot(kind='bar')

In [None]:
lj_data['view_num'].value_counts()

### 房龄最小的20套房子的平均看房人数、平均面积...

In [None]:
lj_data.head(5)

In [None]:
import numpy as np

def get_house_build_year(x):
    """Parse the construction year from the first four characters of ``x``.

    Args:
        x: One ``build_time`` cell. Presumably a string that starts with a
            four-digit year (assumed from how it is sliced).

    Returns:
        The year as an ``int``, or ``np.nan`` when ``x`` cannot be sliced or
        its first four characters do not form an integer.
    """
    try:
        return int(x[:4])
    except (TypeError, ValueError):
        # TypeError: x is not sliceable (e.g. NaN or None).
        # ValueError: the leading characters are not a number.
        # np.nan is used because the upper-case alias np.NaN was removed in NumPy 2.0.
        return np.nan

# Earlier variant without a fallback for unparsable values, superseded by the helper-based line below:
#lj_data.loc[:,'house_age'] = 2018-lj_data['build_time'].apply(lambda x:x[:4]).astype(int)
# House age in years, measured against the hard-coded reference year 2018.
# Rows for which get_house_build_year returns NaN keep NaN as their age.
lj_data.loc[:,'house_age'] = 2018-lj_data['build_time'].apply(get_house_build_year)

In [None]:
lj_data.head(1)

In [None]:
# Numeric area: drop the last two characters of 'area' (presumably a unit
# suffix; confirm against the raw data) and cast the remainder to float.
lj_data.loc[:,'house_area'] = lj_data['area'].apply(lambda x:x[:-2]).astype(float)

In [None]:
lj_data.head()

In [None]:
lj_data.info()

In [None]:
lj_data.nsmallest(columns='house_age', n=20)[['view_num','house_area']].agg('mean')

### 房子价格的分布(平均，方差，中位数)

In [None]:
lj_data['price'].describe()

### 最受欢迎的朝向(平均看房人数)

In [None]:
# Rank directions by the mean number of viewings per listing, as the section
# heading asks. A plain sum would favour directions that simply have more listings.
popular_direction = lj_data.groupby('direction')[['view_num']].agg('mean')

In [None]:
popular_direction.nlargest(columns='view_num', n=1)

### 房型数量分布

In [None]:
house_type_dis = lj_data.groupby('house_type').size()

In [None]:
%matplotlib inline
house_type_dis.plot(kind='pie')

### 最受欢迎的房型

In [None]:
tmp = lj_data.groupby('house_type').agg({'view_num':'sum'})

In [None]:
tmp.reset_index(inplace=True)

In [None]:
tmp[tmp['view_num']==tmp['view_num'].max()]

### 房子的平均租房价格(按平米算)

In [None]:
lj_data.loc[:,'price_per_m2'] = lj_data['price']/lj_data['house_area']

In [None]:
lj_data['price_per_m2'].mean()

### 最受关注的小区

In [None]:
lj_data.head()

In [None]:
lj_data[['address','view_num']].groupby('address').sum().nlargest(columns='view_num', n=1)

### 出租房源最多的小区

In [None]:
lj_data['address'].value_counts().head(1)

### 出租房源最多的地区(大家从详情页可以对标到 海淀/西城/东城/朝阳 不同城区...)

In [None]:
lj_data.head()

### 集中供暖和非集中供暖的有多少家，平均价格是多少

In [None]:
lj_data.loc[:,'center_heating'] = lj_data['extra_info'].apply(lambda x: '集中供暖' in x)

In [None]:
lj_data['center_heating'].value_counts()

In [None]:
lj_data[['center_heating', 'price', 'price_per_m2']].groupby('center_heating').agg('mean')

### 哪个城区的租房价格最贵，平均价格是多少

### 不同房型的平均/最大/最小面积

In [None]:
lj_data[['house_type','house_area']].groupby('house_type').agg(['mean','max','min'])

### 哪个地铁口附近的房子最多

In [None]:
lj_data.head()

In [None]:
import re

In [None]:
def find_sub_station(x):
    """Extract the subway station name from an ``extra_info`` string.

    Looks for text shaped like ``距离<digits>号线<station>站<digits>米`` and
    returns the ``<station>站`` part (the second capture group).

    Args:
        x: Free-text extra information for one listing.

    Returns:
        The station name as a ``str``, or ``np.nan`` when ``x`` is not a
        string or does not contain such a phrase.
    """
    if not isinstance(x, str):
        # Missing cells (NaN/None) cannot be searched; report them as missing.
        return np.nan
    # Raw string: a non-raw '\d' is an invalid string escape that newer Python
    # versions warn about.
    match = re.search(r'距离(\d+号线)(.*?站)(\d+?米)', x)
    # np.nan is used because the upper-case alias np.NaN was removed in NumPy 2.0.
    return match.group(2) if match else np.nan

In [None]:
lj_data.loc[:,'sub_station'] = lj_data['extra_info'].apply(find_sub_station)

In [None]:
lj_data.head()

In [None]:
lj_data['sub_station'].value_counts()

### 地铁附近的房子平均价格 比 非地铁的高多少

In [None]:
def has_sub_station(x):
    """Return 1 when ``x`` contains the substring ``距离``, otherwise 0."""
    return int('距离' in x)

lj_data.loc[:,'has_sub_station'] = lj_data['extra_info'].apply(has_sub_station)

In [None]:
lj_data.head()

In [None]:
lj_data[['has_sub_station', 'price', 'price_per_m2']].groupby('has_sub_station').agg('mean')

### 地铁附近的房源离地铁平均距离

In [None]:
def get_subway_distance(x):
    """Extract the distance to the subway station from an ``extra_info`` string.

    Looks for text shaped like ``距离<digits>号线<station>站<digits>米`` and
    returns the digits in front of ``米`` (the third capture group).

    Args:
        x: Free-text extra information for one listing.

    Returns:
        The distance digits as a ``str`` (the caller converts to float), or
        ``np.nan`` when ``x`` is not a string or does not contain such a phrase.
    """
    if not isinstance(x, str):
        # Missing cells (NaN/None) cannot be searched; report them as missing.
        return np.nan
    # Raw string: a non-raw '\d' is an invalid string escape that newer Python
    # versions warn about.
    match = re.search(r'距离(\d+号线)(.*?站)(\d+?)米', x)
    # np.nan is used because the upper-case alias np.NaN was removed in NumPy 2.0.
    return match.group(3) if match else np.nan

In [None]:
lj_data.loc[:,'distance'] = lj_data['extra_info'].apply(get_subway_distance).astype(float)

In [None]:
lj_data.head()

In [None]:
lj_data['distance'].mean()

### 最多的在租楼层

In [None]:
lj_data.head()

In [None]:
def get_floor(x):
    """Classify a ``floor`` description as low, mid or high.

    Returns ``'低楼层'`` or ``'中楼层'`` when that label occurs in ``x``
    (checked in that order); every other value is reported as ``'高楼层'``.
    """
    for label in ('低楼层', '中楼层'):
        if label in x:
            return label
    # Fallback: anything that is neither low nor mid is treated as high.
    return '高楼层'
    
lj_data.loc[:,'house_floor'] = lj_data['floor'].apply(get_floor)

In [None]:
lj_data['house_floor'].value_counts()

### 不同地区的房龄分布

### 直接看房的房子比例

In [None]:
def get_info(x):
    """Return 1 when ``x`` mentions ``随时看房``, otherwise 0."""
    if '随时看房' in x:
        return 1
    return 0

lj_data.loc[:,'convenient'] = lj_data['extra_info'].apply(get_info)

In [None]:
lj_data.head()

In [None]:
lj_data['convenient'].value_counts()

### 有电梯和无电梯的房子数量

In [None]:
def get_elev(x):
    """Extract the number ``N`` from a ``共N层`` phrase in a ``floor`` string.

    Args:
        x: One ``floor`` cell.

    Returns:
        ``N`` as an ``int``, or ``np.nan`` when ``x`` is not a string or
        contains no ``共<digits>层`` phrase.
    """
    if not isinstance(x, str):
        # Missing cells (NaN/None) cannot be searched; report them as missing.
        return np.nan
    # Raw string: a non-raw '\d' is an invalid string escape that newer Python
    # versions warn about.
    match = re.search(r'共(\d+)层', x)
    # np.nan is used because the upper-case alias np.NaN was removed in NumPy 2.0.
    return int(match.group(1)) if match else np.nan

lj_data.loc[:,'elev'] = lj_data['floor'].apply(get_elev)

In [None]:
lj_data.head()

In [None]:
# Flag a listing as having an elevator when 'elev' is at least 8.
# The threshold of 8 is this notebook's heuristic (presumably a proxy for
# building height; confirm). NaN compares False, so rows with a missing
# 'elev' value are labelled as having no elevator.
lj_data.loc[:,'has_elev'] = lj_data['elev'].apply(lambda x:x>=8)

In [None]:
lj_data['has_elev'].value_counts()

### 有电梯和无电梯房子情况(面积/价格/朝向/带看人数)分布

In [None]:
lj_data[['has_elev', 'house_area', 'price', 'price_per_m2', 'direction', 'view_num']].groupby('has_elev').describe(include='all')

### 附加信息里主要有哪几类信息，覆盖多少百分比的房子

In [None]:
# TODO: split extra_info into separate columns in a sensible way (identify the
# categories of information it contains); the share of listings covered by
# each category can then be computed.