In [375]:
###########使用的库###########

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold,cross_val_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.kernel_ridge import KernelRidge




In [376]:
###########使用到的函数栏###########

def load_train_data():
    train_data=pd.read_csv("./train.csv")
    return train_data

def load_test_data():
    test_data=pd.read_csv("./test.csv")
    return test_data

def category2num1(series):
    mapping1={"Ex":0,"Gd":1,"TA":2,"Fa":3,"Po":4}
    return series.map(mapping1)

def category2num2(series):
    mapping2={"Ex":0,"Gd":1,"TA":2,"Fa":3,"Po":4,"None":5}
    return series.map(mapping2)

def category2num3(series):
    mapping3={"Gd":0,"Av":1,"Mn":2,"No":3,"None":4}
    return series.map(mapping3)

def category2num4(series):
    mapping4={"GLQ":0,"ALQ":1,"BLQ":2,"Rec":3,"LwQ":4,"Unf":5,"None":6}
    return series.map(mapping4)

def category2num5(series):
    mapping5={"Fin":0,"RFn":1,"Unf":2,"None":3}
    return series.map(mapping5)

def standard(series):
    series=(series-series.mean())/series.std()
    return series

def valuation(prediction,labels):
    res=np.sqrt(mean_squared_error(prediction,labels))
    return res

###########使用到的函数栏###########

In [377]:
###########获取训练数据和预测数据和训练目标###########
train_data=load_train_data()
test_data=load_test_data()

#获取目标值和特征,**这个等去除完异常值再进行**
#train_labels=np.log(train_data['SalePrice'])
#train_data=train_data.drop(['SalePrice',"Id"],axis=1)
#test_data=test_data.drop(["Id"],axis=1)

#test_labels=np.log(test_data["SalePrice"])
#test_data=test_data.drop(["SalePrice"],axis=1)

###########获取训练数据和预测数据和训练目标###########


In [378]:
###########根据缺失值的情况，决定删除掉哪些列###########

print(len(train_data))

#要是有异常值要先replace转化为na
#为了能够看清楚哪些列的空值所以增加一个筛选出有空值的列再看哪个控制大一些
train_data_full_sum=train_data.isna().sum()
print(train_data_full_sum[train_data_full_sum>0])
test_data_full_sum=test_data.isna().sum()
test_data_full_sum[test_data_full_sum>0]

#删除掉空值占50%以上的列，因为这些列信息少，说明应该没用
train_data=train_data.drop(["Alley","MasVnrType","FireplaceQu","PoolQC","Fence","MiscFeature"],axis=1)
test_data=test_data.drop(["Alley","MasVnrType","FireplaceQu","PoolQC","Fence","MiscFeature"],axis=1)
print(train_data.columns)

train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)



1460
LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'Ce

In [379]:
###########对数值列有异常值的列进行删除###########

#选择只为数值的数据类型，因为类别变量是自己指定的，不会有偏差很大的异常值出现
#numeric_cols=train_data.select_dtypes(include=[np.number]).columns

#自己根据列的实际意义来决定什么列要进行异常值的剔除
#只参照一些不包含零值的，即每个房子正常都会有的来判断什么房子比较特殊便不予考虑
selected_columns=["LotFrontage","LotArea","TotalBsmtSF","GrLivArea","GarageArea"]

for col in selected_columns:
    Q1=train_data[col].quantile(0.25)
    Q3=train_data[col].quantile(0.75)
    IQR=Q3-Q1
    #观察数据后，发现数据较为离散，因为数值型中有不少0以及一些大数据并存，所以我们主要限制太大的数据，而放宽对小数据的限制，因为一些用户可能没有这方面的需求
    #所以默认为0
    low_bound=Q1-20*IQR
    high_bound=Q3+20*IQR
    train_data=train_data[((train_data[col]>=low_bound) & (train_data[col]<=high_bound))]

print(len(train_data))

train_labels=np.log(train_data['SalePrice'])
train_data=train_data.drop(['SalePrice',"Id"],axis=1)
test_data=test_data.drop(["Id"],axis=1)


1200


In [380]:
###########缺失值的处理###########

#第一种找到相关联的特征的进行分组用groupby，然后再取中位数
train_data["LotFrontage"]=train_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x : x.fillna(x.median()))
test_data["LotFrontage"]=test_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x : x.fillna(x.median()))

#第二种直接填充众数，因为缺失值较少，填充众数比填充中位数鲁棒性要强
train_data["Electrical"]=train_data["Electrical"].fillna(train_data["Electrical"].mode()[0])

#第三种填充none和0
# 地板类型和面积，这两者是一致的
#train_data['MasVnrType'] = train_data['MasVnrType'].fillna('None')
train_data['MasVnrArea'] = train_data['MasVnrArea'].fillna(0)
#test_data['MasVnrType'] = test_data['MasVnrType'].fillna('None')
test_data['MasVnrArea'] = test_data['MasVnrArea'].fillna(0)
# NA用None填充，表示没有地下室
for col in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    train_data[col] = train_data[col].fillna('None')
    test_data[col] = test_data[col].fillna('None')
    #print(test_data[col])
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']:
    test_data[col] = test_data[col].fillna(0)
# NA用None填充，表示没有车库
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    train_data[col] = train_data[col].fillna('None')
    test_data[col] = test_data[col].fillna('None')
train_data['GarageYrBlt'] = train_data['GarageYrBlt'].fillna(0)
test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(0)
# GarageCars、GarageArea
for col in ['GarageCars','GarageArea']:
    test_data[col] = test_data[col].fillna(0)
# MSZoning、utilities、KitchenQual、Functional、SaleType
for col in ['MSZoning','Utilities','KitchenQual','Functional','SaleType']:
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])
# Exterior1st、Exterior2nd
test_data['Exterior1st'] = test_data['Exterior1st'].fillna(test_data['Exterior1st'].mode()[0])
test_data['Exterior2nd'] = test_data['Exterior2nd'].fillna(test_data['Exterior1st'].mode()[0])


#train_isnull2 = train_data.isnull().sum()
# print(type(train_isnull))    Series
#print(train_isnull2[train_isnull2 > 0])
#test_isnull2 = test_data.isnull().sum()
# print(type(train_isnull))    Series
#print(test_isnull2[test_isnull2 > 0])
print(test_data[test_data["BsmtFinType1"]=="NA"])


Empty DataFrame
Columns: [MSSubClass, MSZoning, LotFrontage, LotArea, Street, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrArea, ExterQual, ExterCond, Foundation, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, Heating, HeatingQC, CentralAir, Electrical, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, KitchenQual, TotRmsAbvGrd, Functional, Fireplaces, GarageType, GarageYrBlt, GarageFinish, GarageCars, GarageArea, GarageQual, GarageCond, PavedDrive, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal, MoSold, YrSold, SaleType, SaleCondition]
Index: []

[0 rows x 73 columns]


In [381]:
###########数据编码###########

train_data_number=train_data.shape[0]
#test_data_number=test_data.shape[0]

all_data=pd.concat((train_data,test_data)).reset_index(drop=True)
#print(len(train_data))
#print(train_data["BsmtFinType1"])

all_data['MSSubClass'] = all_data['MSSubClass'].astype(str)
all_data['ExterQual'] = category2num1(all_data['ExterQual'])
all_data['ExterCond'] = category2num1(all_data['ExterCond'])
all_data['BsmtQual'] = category2num2(all_data['BsmtQual'])
all_data['BsmtCond'] = category2num2(all_data['BsmtCond'])
all_data['BsmtExposure'] = category2num3(all_data['BsmtExposure'])
all_data['BsmtFinType1'] = category2num4(all_data['BsmtFinType1'])
all_data['BsmtFinType2'] = category2num4(all_data['BsmtFinType2'])
all_data['HeatingQC'] = category2num1(all_data['HeatingQC'])
all_data['KitchenQual'] = category2num2(all_data['KitchenQual'])
all_data['GarageFinish'] = category2num5(all_data['GarageFinish'])
all_data['GarageQual'] = category2num2(all_data['GarageQual'])
all_data['GarageCond'] = category2num2(all_data['GarageCond'])

#print(all_data["MasVnrType"])

#采用labelEncoder的，因为类别之间有优劣之分
for col in ['OverallQual','OverallCond','YearBuilt','YearRemodAdd', 'ExterQual','ExterCond', 'BsmtQual', 'BsmtCond',
            'BsmtExposure','BsmtFinType1', 'BsmtFinType2','HeatingQC','CentralAir','BsmtFullBath','BsmtHalfBath',
            'FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','KitchenQual','TotRmsAbvGrd' ,'Fireplaces',
            'GarageYrBlt','GarageFinish','GarageCars','MiscVal','MoSold','YrSold']:
            le=LabelEncoder()
            #print(all_data[col])
            le.fit(all_data[col])
            #print(all_data[col])
            is_null=all_data[col].isna().sum()
            if is_null>0:
                print(col)
            all_data[col]=le.transform(all_data[col])

for col in ['MSSubClass', 'MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope',
            'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st',
            'Exterior2nd','Foundation','Heating','Electrical','Functional','GarageType','PavedDrive',
            'SaleType','SaleCondition']:
    all_cols=pd.get_dummies(all_data[col],prefix=col)
    all_data=pd.concat([all_data,all_cols],axis=1)
    all_data=all_data.drop(col,axis=1)

for col in ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF',
             '2ndFlrSF','LowQualFinSF','GrLivArea','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch',
             'ScreenPorch','PoolArea']:
     all_data[col] = standard(all_data[col])


train_data=all_data[:train_data_number]
test_data=all_data[train_data_number:]

#对测试集编码
# test_data['MSSubClass'] = test_data['MSSubClass'].astype(str)
# test_data['ExterQual'] = category2num1(test_data['ExterQual'])
# test_data['ExterCond'] = category2num1(test_data['ExterCond'])
# test_data['BsmtQual'] = category2num2(test_data['BsmtQual'])
# test_data['BsmtCond'] = category2num2(test_data['BsmtCond'])
# test_data['BsmtExposure'] = category2num3(test_data['BsmtExposure'])
# test_data['BsmtFinType1'] = category2num4(test_data['BsmtFinType1'])
# test_data['BsmtFinType2'] = category2num4(test_data['BsmtFinType2'])
# test_data['HeatingQC'] = category2num1(test_data['HeatingQC'])
# test_data['KitchenQual'] = category2num2(test_data['KitchenQual'])
# test_data['GarageFinish'] = category2num5(test_data['GarageFinish'])
# test_data['GarageQual'] = category2num2(test_data['GarageQual'])
# test_data['GarageCond'] = category2num2(test_data['GarageCond'])

#print(test_data["KitchenQual"])

#采用labelEncoder的，因为类别之间有优劣之分
# for col in ['OverallQual','OverallCond','YearBuilt','YearRemodAdd', 'ExterQual','ExterCond', 'BsmtQual', 'BsmtCond',
#             'BsmtExposure','BsmtFinType1', 'BsmtFinType2','HeatingQC','CentralAir','BsmtFullBath','BsmtHalfBath',
#             'FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','KitchenQual','TotRmsAbvGrd' ,'Fireplaces',
#             'GarageYrBlt','GarageFinish','GarageCars','MiscVal','MoSold','YrSold']:
#             le=LabelEncoder()
#             #print(test_data[col])
#             le.fit(test_data[col])
#             #print(test_data[col])
#             is_null=test_data[col].isna().sum()
#             if is_null>0:
#                 print(col)
#             test_data[col]=le.transform(test_data[col])

# for col in ['MSSubClass', 'MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope',
#             'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st',
#             'Exterior2nd','Foundation','Heating','Electrical','Functional','GarageType','PavedDrive',
#             'SaleType','SaleCondition']:
#     all_cols=pd.get_dummies(test_data[col],prefix=col)
#     test_data=pd.concat([test_data,all_cols],axis=1)
#     test_data=test_data.drop(col,axis=1)
            

In [382]:
###########模型建立与交叉验证看效果调整参数###########

n_folds=5
def rmsle_cv(model):
    kf=KFold(n_folds,shuffle=True,random_state=42)
    nmse=cross_val_score(model,train_data.values,train_labels.values,cv=kf,scoring='neg_mean_squared_error')
    return (nmse)

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, 
                              nthread = -1)

score_xgb=rmsle_cv(model_xgb)
score_xgb=np.sqrt(-score_xgb)
print("Xgboost score {:.4f} {:.4f}".format(score_xgb.mean(),score_xgb.std()))

model_lgb=lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

score_lgb=rmsle_cv(model_lgb)
score_lgb=np.sqrt(-score_lgb)
print("LightGBM score {:.4f} {:.4f}".format(score_lgb.mean(),score_lgb.std()))

KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
score_krr = rmsle_cv(KRR)
score_krr=np.sqrt(-score_krr)
print("KRR score: {:.4f} ({:.4f})\n".format(score_krr.mean(), score_krr.std()))



Xgboost score 0.1314 0.0038
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001796 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1358
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 181
[LightGBM] [Info] Start training from score 12.014093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003475 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 181
[LightGBM] [Info] Start training from score 12.014931
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1356
[LightGBM] [Info] Number of 

In [383]:
###########模型学习并在训练集上测试###########

model_xgb.fit(train_data.values, train_labels)
model_xgb_prec = model_xgb.predict(train_data.values)
print(valuation(model_xgb_prec, train_labels))

model_lgb.fit(train_data.values, train_labels)
model_lgb_prec = model_lgb.predict(train_data.values)
print(valuation(model_lgb_prec, train_labels))

KRR.fit(train_data.values, train_labels)
KRR_prec = KRR.predict(train_data.values)
print(valuation(KRR_prec, train_labels))

0.0885412661021795
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002459 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1404
[LightGBM] [Info] Number of data points in the train set: 1200, number of used features: 190
[LightGBM] [Info] Start training from score 12.015138
0.07419266672868606
0.08209340416009005


In [384]:
########### debug找出train_data和test_data哪些列不一样 ###########

unique_train_data=set(train_data.columns)-set(test_data.columns)
print(unique_train_data)
unique_test_data=set(test_data.columns)-set(train_data.columns)
print(unique_test_data)

set()
set()


In [385]:
###########模型bagging###########

model_xgb_res = model_xgb.predict(test_data.values)
model_lgb_res = model_lgb.predict(test_data.values)
KRR_res = KRR.predict(test_data.values)

final_res = 0.3 * np.expm1(model_xgb_res) + 0.5 * np.expm1(model_lgb_res) + 0.2 * np.expm1(KRR_res)

submission = pd.read_csv("sample_submission.csv")
submission['SalePrice'] = final_res
submission.to_csv('submission_2.csv', index=None)


