In [36]:
import pandas as pd
import matplotlib.pyplot as plt


## 一、 导入数据

In [37]:
# Load the listings CSV (path is relative to the notebook's working directory).
df = pd.read_csv("./listings.csv")

## 二、 缺失值填充

### 1. 查看缺失值
下面输出中计数不为 0 的字段，即存在缺失值。

In [38]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

id                                    0
name                                  1
host_id                               0
host_name                             0
neighbourhood_group               28452
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       11158
reviews_per_month                 11158
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [39]:
# Show the dtype of every column.
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group               float64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

### 2. 对于数值类型，处理缺失值最简单的方法就是用中位数来填充缺失值

In [40]:
# Fill missing reviews_per_month values with the column median.
df["reviews_per_month"] = df["reviews_per_month"].fillna(
    df["reviews_per_month"].median()
)


### 3. 对于分类类型，常见的处理方法是用最常见的类别做填充；这里改用占位值填充（`last_review` 用 '1900-01-01'，`neighbourhood_group` 用 0）


In [41]:
# Missing last_review entries get the placeholder string '1900-01-01'.
df["last_review"] = df["last_review"].fillna('1900-01-01')

In [42]:
# Missing neighbourhood_group entries are replaced with 0.
df["neighbourhood_group"] = df["neighbourhood_group"].fillna(0)

## 三、 空值替换

In [46]:
# Swap empty-string entries in availability_365 for 0.
# NOTE(review): the dtypes output lists this column as int64, so there may be
# no empty strings to replace -- confirm whether this step is needed.
df["availability_365"] = df["availability_365"].replace('', 0)

## 四、 数值映射

In [43]:
# Encode the room_type labels as integer codes; any label that is not a key of
# the mapping becomes NaN.
df["room_type"] = df["room_type"].map(
    {
        'Entire home/apt': 0,
        'Private room': 1,
        'Shared room': 2,
    }
)


In [44]:
# Preview the first rows of the encoded room_type column.
df["room_type"].head()

0    0
1    1
2    0
3    0
4    0
Name: room_type, dtype: int64

## 五、OneHot编码


In [45]:
# One-hot encode room_type with get_dummies; the new columns carry the
# "RoomType" prefix (RoomType_0, RoomType_1, ...).
# dtype=int keeps the indicators as 0/1 integers; newer pandas versions would
# otherwise return boolean columns.
roomTypeDf = pd.get_dummies(df.room_type, prefix='RoomType', dtype=int)

# Drop indicator columns left over from an earlier run of this cell first, so
# re-running it does not append duplicate RoomType_* columns.
df = pd.concat(
    [df.drop(columns=roomTypeDf.columns, errors="ignore"), roomTypeDf],
    axis=1,
)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,RoomType_0,RoomType_1,RoomType_2
0,44054,Modern and Comfortable Living in CBD,192875,East Apartments,0.0,朝阳区 / Chaoyang,39.89503,116.45163,0,792,1,89,2019-03-04,0.85,9,341,1,0,0
1,100213,The Great Wall Box Deluxe Suite A团园长城小院东院套房,527062,Joe,0.0,密云县 / Miyun,40.68434,117.17231,1,1201,1,2,2017-10-08,0.1,4,0,0,1,0
2,128496,Heart of Beijing: House with View 2,467520,Cindy,0.0,东城区,39.93213,116.422,0,389,3,259,2019-02-05,2.7,1,93,1,0,0
3,161902,cozy studio in center of Beijing,707535,Robert,0.0,东城区,39.93357,116.43577,0,376,1,26,2016-12-03,0.28,5,290,1,0,0
4,162144,"nice studio near subway, sleep 4",707535,Robert,0.0,朝阳区 / Chaoyang,39.93668,116.43798,0,537,1,37,2018-08-01,0.4,5,352,1,0,0


## 六、相关性矩阵

In [48]:
# Build the correlation matrix from numeric/bool columns only. pandas >= 2.0 no
# longer drops string columns (name, host_name, ...) automatically, so a bare
# df.corr() raises on them; selecting the columns explicitly works on old and
# new pandas alike.
corrDf = df.select_dtypes(include=["number", "bool"]).corr()

In [49]:
# Correlation coefficient of every feature with price, listed from highest to
# lowest (ascending=False sorts in descending order).
corrDf["price"].sort_values(ascending=False)

price                             1.000000
latitude                          0.126437
RoomType_0                        0.101187
availability_365                  0.026933
host_id                           0.019251
longitude                         0.009472
minimum_nights                    0.008419
id                               -0.005895
calculated_host_listings_count   -0.015742
number_of_reviews                -0.044383
reviews_per_month                -0.045277
RoomType_2                       -0.047822
RoomType_1                       -0.080837
room_type                        -0.100707
neighbourhood_group                    NaN
Name: price, dtype: float64