In [2]:
import pandas as pd
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print('train all:', train_data.shape)
print('test all:', test_data.shape)

train all: (1460, 81)
test all: (1459, 80)


In [3]:
print(train_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
#print(train_data.describe)

In [5]:
missing_over_20 = []
for i in range(train_data.shape[1]):
    col_name = train_data.columns[i]
    num = train_data[col_name].count()
    rate = num/train_data.shape[0]*100  # 统计非空的数据所占比例
    if rate < 80:
        missing_over_20.append(col_name)
        print(col_name, rate, '%')  # 输出缺失数据大于20%的列名

Alley 6.232876712328768 %
FireplaceQu 52.73972602739726 %
PoolQC 0.4794520547945206 %
Fence 19.246575342465754 %
MiscFeature 3.6986301369863015 %


In [6]:
# 在原始数据上删除缺失的列
train_data.drop(missing_over_20, axis=1, inplace=True)
test_data.drop(missing_over_20, axis=1, inplace=True)

In [7]:
print(train_data.shape)

(1460, 76)


In [9]:
y = train_data.SalePrice
train_x = train_data.drop(['SalePrice'], axis=1)
test_x = test_data
print(train_x.shape)
print(test_x.shape)

(1460, 75)
(1459, 75)


In [10]:
# 编码object数据
one_hot_encoded_train_x = pd.get_dummies(train_x)
# 是否会造成测试数据和训练数据对同一数据的编码不同
one_hot_encoded_test_x = pd.get_dummies(test_x)  
# 使训练和测试数据的列名相同
final_train, final_test = one_hot_encoded_train_x.align(
    one_hot_encoded_test_x, join='left', axis=1)  

In [11]:
# 填补数据
from sklearn.preprocessing import Imputer
im = Imputer()
final_train_x = im.fit_transform(final_train)
# transform 后就没有了Id。下面是题目要求的预测数据
final_test_x = im.transform(final_test) 

In [12]:
from sklearn.model_selection import train_test_split
train_x_s, test_x_s, train_y_s, test_y_s = \
    train_test_split(final_train_x, y, random_state=0)

In [13]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(train_x_s, train_y_s)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [29]:
from sklearn.metrics import mean_squared_log_error
import numpy as np
# kaggle 标准的测量准则 rmsle  先取log，然后取差的平方，然后取平均值，然后开平方
s = 0
predict_y = rfr.predict(test_x_s)
test_y = np.array(test_y_s)
print(np.sqrt(mean_squared_log_error(predict_y,test_y)))

0.14628567521324273


In [15]:
predict_y = rfr.predict(final_test_x)

In [16]:
my_submission = pd.DataFrame({'Id': test_data.Id, 
                              'SalePrice': predict_y})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)