In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("Desktop/dataset/house_price/train.csv")
test = pd.read_csv("Desktop/dataset/house_price/test.csv")

In [7]:
# 将训练集和测试集拼接起来，同时进行特征处理变换
full = pd.concat([train,test], ignore_index=True)
full.shape

(2919, 81)

### 1. 初步建模(primitive modelization)
    我们先粗略将数据清洗一遍，用模型拟合得到初步的房价预测结果。

In [8]:
# 观察缺失值特征分布的dtypes
na_count = full.isnull().sum()
na_chart = pd.DataFrame([na_count, full.dtypes]).T
na_chart.columns = ["null_nums", "dtypes"]
na_chart = na_chart[na_chart['null_nums']!=0]
na_chart.sort_values(by='null_nums', ascending=False)

Unnamed: 0,null_nums,dtypes
PoolQC,2909,object
MiscFeature,2814,object
Alley,2721,object
Fence,2348,object
SalePrice,1459,float64
FireplaceQu,1420,object
LotFrontage,486,float64
GarageFinish,159,object
GarageQual,159,object
GarageYrBlt,159,float64


In [9]:
# 根据不同dtype类型，填补缺失值  
cols1 = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols1:
    full[col].fillna(0, inplace=True)
    
cols2 = ["LotFrontage", "GarageYrBlt"]
for col in cols2:
    full[col].fillna(full[col].median(), inplace=True)

cols3 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols3:
    full[col].fillna(full[col].mode()[0], inplace=True)

cols4 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols4:
    full[col].fillna("None", inplace=True)

In [10]:
# 只有test["SalePrice"]有空缺值，填缺完毕
full.isnull().sum()[full.isnull().sum()!=0]

SalePrice    1459
dtype: int64

In [11]:
# 独热编码(One-hot Encoding)，将Object对象转换为0-1矩阵
full2 = pd.get_dummies(full.drop(["Id","SalePrice"], axis=1))
full2.shape

(2919, 302)

In [12]:
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, KernelPCA

In [13]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR

In [14]:
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model,X,y,scoring="neg_mean_squared_error",cv=3))
    return rmse

In [15]:
models = [LinearRegression(),Ridge(),Lasso(alpha=0.0005,max_iter=10000),RandomForestRegressor(),SGDRegressor(),
          GradientBoostingRegressor(),SVR(),LinearSVR(),ExtraTreesRegressor()]
names = ["LR", "Ridge", "Lasso", "RF", "SGD", "GBR", "SVR", "LinSVR", "Extra"]

In [16]:
# pca_pro: 是否pca降维处理；pca_n: 特征保留数量
def scale_fit_print(data, pca_pro=False, pca_n=300):
    scaler = preprocessing.RobustScaler()
    n_train = train.shape[0]
    X = data[:n_train]
    X_test = data[n_train:]
    y = train.SalePrice
    
    X_scaled = scaler.fit(X).transform(X)
    y_log = np.log(y)
    X_test_scaled = scaler.transform(X_test)
    
    if(pca_pro==True):
        pca = PCA(n_components=pca_n)
        X_scaled = pca.fit_transform(X_scaled)
        X_test_scaled = pca.transform(X_test_scaled)
    
    for name, model in zip(names, models):
        score = rmse_cv(model, X_scaled, y_log)
        print("{}: {:.6f}".format(name, score.mean()))

In [17]:
# 初步拟合结果看，线性回归模型表现还可以
scale_fit_print(full2)

LR: 0.151452
Ridge: 0.142953
Lasso: 0.137313
RF: 0.155166
SGD: 2193169787914716.750000
GBR: 0.128823
SVR: 0.217860
LinSVR: 0.245905
Extra: 0.155335


In [81]:
#pd.set_option('max_rows', 20)
#pd.set_option('max_columns', 200)

### 2. 特征工程(feature engineering)

- **2.1 分箱处理(binning)**
    
    我们知道在分类问题中，离散化特征可以减小异常数据的干扰，而在回归模型中，离散化的变量经过独热编码(one-hot encoding),每个变量有单独权重，相当于引入非线性。

In [18]:
full3 = full.copy()

In [20]:
# 将离散变量-年份(dtype = int64)转换为dtype-category
cols1 = ["YearBuilt", "YearRemodAdd", "GarageYrBlt"]
for i in cols1:
    full3[i] = pd.cut(full3[i], 5, labels=[i+"_1", i+"_2", i+"_3", i+"_4", i+"_5"])

In [21]:
# 选择出dtype = int or float 的columns, 然后将列变量较少的字符化，然后get_dummies()
full_number = full[full.dtypes[full.dtypes!="object"].index]
a = pd.Series()
for i in full_number.columns:
    a[i] = pd.Series(full_number[i].value_counts().index).count()

num_cols = a[a<=6].index
full3[num_cols] = full3[num_cols].astype(str)

In [22]:
# 连续数据（列变量大于120）平均分为10箱
cols3 = a[a>120].index
for i in cols3:
    full3[i] = pd.cut(full3[i], 10)

In [24]:
full_dummy = pd.get_dummies(full3.drop(["Id", "SalePrice"],axis=1))
full_dummy.shape

(2919, 476)

In [25]:
# 相比于原始处理，分箱后Ridge模型拟合效果有些微提高
# 线性回归模型变差（维度灾难？）
scale_fit_print(full_dummy)

LR: 1232004.234848
Ridge: 0.137529
Lasso: 0.128423
RF: 0.164019
SGD: 1878481452156363.250000
GBR: 0.140707
SVR: 0.167453
LinSVR: 0.254827
Extra: 0.177683


In [31]:
# PCA降维对线性模型有避免维数灾难的效果
scale_fit_print(full_dummy, pca_pro=True, pca_n=200)

LR: 0.140608
Ridge: 0.139064
Lasso: 0.132894
RF: 0.186373
SGD: 28286084106538.781250
GBR: 0.163302
SVR: 0.165750
LinSVR: 0.725235
Extra: 0.178011


- **2.2 特征组合(feature combination)**
    
    比如A, B两个特征，线性回归a*A+b*B, 而特征组合加入A^2, B^2, A*B，可以提供非线性。

In [32]:
full4 = full.copy()

In [35]:
# 通过相关系数矩阵找出与SalePrice相关性较高的特征，进行之后的特征组合
corr = train.corr()["SalePrice"].sort_values(ascending=False)
corr[1:14]

OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
MasVnrArea      0.477493
Fireplaces      0.466929
Name: SalePrice, dtype: float64

In [37]:
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)
adding = poly.fit_transform(full4[corr[1:14].index])

matrix_add = pd.DataFrame(adding, columns=poly.get_feature_names())
matrix_add.shape

(2919, 92)

In [39]:
# 将增加的92个特征与原矩阵连接
full4 = pd.concat([full4, matrix_add], axis=1)
full4.shape

(2919, 265)

In [40]:
full_dummy = pd.get_dummies(full4.drop(["Id", "SalePrice"],axis=1))
full_dummy.shape

(2919, 486)

In [41]:
# Lasso模型表现最好
scale_fit_print(full_dummy)

LR: 0.148936
Ridge: 0.128785
Lasso: 0.119454
RF: 0.152205
SGD: 492856126237165.312500
GBR: 0.130944
SVR: 0.208317
LinSVR: 0.379873
Extra: 0.147833


In [45]:
# PCA未起到优化作用
scale_fit_print(full_dummy, pca_pro=True, pca_n=400)

LR: 0.148904
Ridge: 0.128785
Lasso: 0.124062
RF: 0.202709
SGD: 819967425067572.125000
GBR: 0.166980
SVR: 0.211705
LinSVR: 0.679242
Extra: 0.191268


**总结与问题：**
1. 特征组合对拟合结果非常重要
2. PCA降维对线性回归模型有较大提升效果
3. SGDRegressor（随机梯度下降回归模型）与参考文章中差别较大，可以继续探究
4. dtyep=object的特征也可以进行特征组合，如何筛选相关度高的object特征？LabelEncoder()+相关系数矩阵可能是个办法。

参考文章: [kaggle竞赛-房价预测](https://www.cnblogs.com/massquantity/p/8640991.html)
感谢博主！