In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (14, 11)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')
from sklearn.cross_validation import train_test_split
from sklearn import metrics

In [4]:
df = pd.read_csv('houses.csv')

In [5]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [7]:
# Заполним пропуски и перекодируем категориальные переменные

In [8]:
for x in df.columns:
    if df[x].dtype == 'object':
        df[x] = df[x].fillna('NA')
    elif df[x].dtype == 'int64':
        df[x] = df[x].fillna(0)
    elif df[x].dtype == 'float64':
        df[x] = df[x].fillna(0)

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
le = LabelEncoder()

In [11]:
le.fit_transform(np.array(df['SaleType'].values))

array([8, 8, 8, ..., 8, 8, 8])

In [26]:
col = [] 
col_scale = []
for x in df.columns:
    if df[x].dtype == 'object':
        col.append(x)
    if df[x].dtype != 'object' and x != 'SalePrice':
        col_scale.append(x)
        df[x] = le.fit_transform(df[x])

In [27]:
y = df.SalePrice
X = df.drop('SalePrice', axis=1)

In [28]:
col = X.columns

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
scaler = StandardScaler()
scaler.fit(X)
X_scale = pd.DataFrame(scaler.transform(X[col_scale]), columns = col_scale)

In [31]:
X = pd.concat([X[col], X_scale], axis = 1)

In [32]:
X.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0,5,3,37,327,1,1,3,3,0,...,-0.255029,-0.062051,0.063305,0.457447,-0.185975,-0.162018,-1.599111,0.138777,0.313867,0.208502
1,1,0,3,52,498,1,1,3,3,0,...,-0.255029,-0.062051,0.063305,0.457447,-0.185975,-0.162018,-0.48911,-0.614439,0.313867,0.208502
2,2,5,3,40,702,1,1,0,3,0,...,-0.255029,-0.062051,0.063305,0.457447,-0.185975,-0.162018,0.990891,0.138777,0.313867,0.208502
3,3,6,3,32,489,1,1,0,3,0,...,-0.255029,-0.062051,0.063305,0.457447,-0.185975,-0.162018,-1.599111,-1.367655,0.313867,-3.426284
4,4,5,3,56,925,1,1,0,3,0,...,-0.255029,-0.062051,0.063305,0.457447,-0.185975,-0.162018,2.100892,0.138777,0.313867,0.208502


In [33]:
num_folds = 10
seed = 7

In [34]:
kfold = KFold(n_splits=num_folds, random_state=seed)

In [35]:
rfr = RandomForestRegressor()
results = cross_val_score(rfr, X, y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0
))

Accuracy: 88.189% (2.212%)


In [36]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.1, random_state=7)

In [37]:
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [38]:
predict = rfr.predict(X_valid)

In [39]:
metrics.r2_score(y_valid, predict)

0.9069753703949233

In [40]:
metrics.explained_variance_score(y_valid, predict)

0.9071810651940775

In [41]:
# Выведем важность признаков

In [42]:
i = 0
for x in X.columns:
    print(x, '', '{:2f}'.format(rfr.feature_importances_[i]))
    i+=1

Id  0.001594
MSSubClass  0.000333
MSZoning  0.003619
LotFrontage  0.001405
LotArea  0.005272
Street  0.000000
Alley  0.000207
LotShape  0.002226
LandContour  0.000204
Utilities  0.000000
LotConfig  0.000277
LandSlope  0.000094
Neighborhood  0.003888
Condition1  0.000170
Condition2  0.000000
BldgType  0.000043
HouseStyle  0.000434
OverallQual  0.174628
OverallCond  0.002000
YearBuilt  0.010708
YearRemodAdd  0.005325
RoofStyle  0.000108
RoofMatl  0.000052
Exterior1st  0.000612
Exterior2nd  0.000474
MasVnrType  0.000269
MasVnrArea  0.002113
ExterQual  0.000842
ExterCond  0.000129
Foundation  0.000364
BsmtQual  0.001274
BsmtCond  0.000018
BsmtExposure  0.000625
BsmtFinType1  0.001964
BsmtFinSF1  0.011450
BsmtFinType2  0.000102
BsmtFinSF2  0.000312
BsmtUnfSF  0.002760
TotalBsmtSF  0.028998
Heating  0.000149
HeatingQC  0.001311
CentralAir  0.000378
Electrical  0.000021
1stFlrSF  0.002471
2ndFlrSF  0.002352
LowQualFinSF  0.000201
GrLivArea  0.089366
BsmtFullBath  0.000551
BsmtHalfBath  0.0004

# реализуем Блендинг моделей
<img src="img/stacking.png" style="max-width: 680px; display: inline" />

In [43]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

In [44]:
X_train_st, X_test, y_train_st, y_test = train_test_split(X_train, y_train, test_size=.2, random_state=7)

In [45]:
# выбираем модели
knn3 = KNeighborsRegressor(n_neighbors=3)
knn10 = KNeighborsRegressor(n_neighbors=10)
rg1 = Ridge(alpha=1.1)
las = Lasso()
rfr = RandomForestRegressor()
metamodel = Ridge()

In [46]:
def model_fit_predict(model, X_train, y_train, X_test, y_test):
    model.fit(X_train,y_train)
    prediction = model.predict(X_test)
    r2_sc = metrics.r2_score(y_test, prediction)
    return prediction, r2_sc

In [47]:
list_of_models = [knn3, knn10, rg1, las, rfr]

In [48]:
pred = []
r2_sc = []
for mod in list_of_models:
    a,b = model_fit_predict(mod, X_train_st, y_train_st, X_test, y_test)
    pred.append(a)
    r2_sc.append(b)

In [49]:
r2_sc

[0.684141672993386,
 0.7061434571472094,
 0.9054883792917177,
 0.9138294324929833,
 0.8751282895545535]

In [50]:
X_pred = pd.DataFrame(np.array(pred).T)

In [51]:
X_pred.shape

(263, 5)

In [52]:
metamodel.fit(X_pred, y_test)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [53]:
pred = []
r2_sc = []
for mod in list_of_models:
    a,b = model_fit_predict(mod, X_train, y_train, X_valid, y_valid)
    pred.append(a)
    r2_sc.append(b)

In [54]:
# метрика r2 для различных моделей

In [55]:
r2_sc

[0.7355665189634497,
 0.7714566210372095,
 0.9258802677881688,
 0.9265388127857308,
 0.9111840012021021]

In [56]:
X_pred = pd.DataFrame(np.array(pred).T)

In [57]:
y_pred = metamodel.predict(X_pred)

In [58]:
# метрика r2 для ансамбля моделей

In [59]:
metrics.r2_score(y_valid, y_pred)

0.9331575877519733

In [None]:
## Стекинг

Используем идею K-Fold проверки. Выборку разбиваем на фолды, затем последовательно перебираем фолды и обучаем базовые алгоритмы на всех фолдах, кроме одного, а на оставшемся получаем ответы базовых алгоритмов и трактуем их как значения соответствующих признаков на этом фолде. Для получения метапризнаков объектов тестовой выборки базовые алгоритмы обучают на всей обучающей выборке и берут их ответы на тестовой.

<img src="img/stacking-2b.png" style="max-width: 680px; display: inline" />

In [60]:
def get_meta_features(model, X_train, y_train, X_test, stack_cv):
    meta_train = np.zeros_like(y_train, dtype=float)
    meta_test = np.zeros_like(y_test, dtype=float)
    
    for i, (train_ind, test_ind) in enumerate(stack_cv.split(X_train, y_train)):
        
        model.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
        meta_train[test_ind] = model.predict(X_train.iloc[test_ind])
        meta_test += model.predict(X_test)
        
    r2_sc1 = metrics.r2_score(y_test, meta_test / stack_cv.n_splits)
    
    return meta_train, meta_test / stack_cv.n_splits, r2_sc1

In [61]:
from sklearn.model_selection import StratifiedKFold

stack_cv = StratifiedKFold(n_splits=10, random_state=555)

In [62]:
stack_cv.split(X_train, y_train)

<generator object _BaseKFold.split at 0x7f9ad7a2e9e8>

In [63]:
meta_train = []
meta_test = []
col_names = []
r2_sc_stack = []

In [64]:
for mod in list_of_models:
    meta_tr, meta_te, r2_score = get_meta_features(mod, X_train, y_train, X_test, stack_cv)
    meta_train.append(meta_tr)
    meta_test.append(meta_te)
    r2_sc_stack.append(r2_score)
    col_names.append(str(mod)+'_pred')

In [65]:
X_meta_train = pd.DataFrame(np.stack(meta_train, axis=1), columns=col_names)
X_meta_test = pd.DataFrame(np.stack(meta_test, axis=1), columns=col_names)

In [66]:
#Выведем качество моделей 
r2_sc_stack

[0.84288447139127,
 0.7605870627803599,
 0.9272889430133864,
 0.9268369673593643,
 0.9770831759250732]

In [67]:
X_meta_train.shape

(1314, 5)

In [68]:
X_meta_test.shape

(263, 5)

In [69]:
metamodel.fit(X_meta_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [70]:
y_pred_meta_valid = metamodel.predict(X_meta_test)

In [71]:
# Финальная метрика качества метамодели
metrics.r2_score(y_test, y_pred_meta_valid)

0.9527830815966771