In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (14, 11)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')
from sklearn.cross_validation import train_test_split
from sklearn import metrics

In [2]:
df = pd.read_csv('houses.csv')

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [5]:
# Заполним пропуски и перекодируем категориальные переменные

In [6]:
for x in df.columns:
    if df[x].dtype == 'object':
        df[x] = df[x].fillna('NA')
    elif df[x].dtype == 'int64':
        df[x] = df[x].fillna(0)
    elif df[x].dtype == 'float64':
        df[x] = df[x].fillna(0)

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()

In [9]:
le.fit_transform(np.array(df['SaleType'].values))

array([8, 8, 8, ..., 8, 8, 8])

In [10]:
col = [] 
for x in df.columns:
    if df[x].dtype == 'object':
        col.append(x)
        df[x] = le.fit_transform(df[x])

In [11]:
y = df.SalePrice
X = df.drop('SalePrice', axis=1)

In [12]:
col = X.columns

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
scaler = StandardScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X), columns = col)

In [15]:
num_folds = 10
seed = 7

In [16]:
kfold = KFold(n_splits=num_folds, random_state=seed)

In [17]:
rfr = RandomForestRegressor()
results = cross_val_score(rfr, X, y, cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0
))

Accuracy: 84.599% (3.858%)


In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.1, random_state=7)

In [19]:
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [20]:
predict = rfr.predict(X_valid)

In [21]:
metrics.r2_score(y_valid, predict)

0.9119027565578994

In [22]:
metrics.explained_variance_score(y_valid, predict)

0.9121734612850716

In [23]:
# Выведем важность признаков

In [24]:
i = 0
for x in X.columns:
    print(x, '', '{:2f}'.format(rfr.feature_importances_[i]))
    i+=1

Id  0.002312
MSSubClass  0.003373
MSZoning  0.001491
LotFrontage  0.002661
LotArea  0.014279
Street  0.000000
Alley  0.000534
LotShape  0.000674
LandContour  0.000571
Utilities  0.000001
LotConfig  0.001190
LandSlope  0.002262
Neighborhood  0.004771
Condition1  0.001144
Condition2  0.000021
BldgType  0.000823
HouseStyle  0.000536
OverallQual  0.532566
OverallCond  0.002553
YearBuilt  0.009790
YearRemodAdd  0.006265
RoofStyle  0.000639
RoofMatl  0.000165
Exterior1st  0.001142
Exterior2nd  0.000950
MasVnrType  0.004057
MasVnrArea  0.017176
ExterQual  0.002054
ExterCond  0.000457
Foundation  0.000424
BsmtQual  0.001158
BsmtCond  0.000423
BsmtExposure  0.002317
BsmtFinType1  0.001161
BsmtFinSF1  0.043037
BsmtFinType2  0.000303
BsmtFinSF2  0.000121
BsmtUnfSF  0.003080
TotalBsmtSF  0.044245
Heating  0.000004
HeatingQC  0.000362
CentralAir  0.001553
Electrical  0.000121
1stFlrSF  0.034959
2ndFlrSF  0.040736
LowQualFinSF  0.000065
GrLivArea  0.130592
BsmtFullBath  0.000718
BsmtHalfBath  0.0001

# реализуем Блендинг моделей
<img src="img/stacking.png" style="max-width: 680px; display: inline" />

In [25]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

In [26]:
X_train_st, X_test, y_train_st, y_test = train_test_split(X_train, y_train, test_size=.2, random_state=7)

In [27]:
# выбираем модели
knn3 = KNeighborsRegressor(n_neighbors=3)
knn10 = KNeighborsRegressor(n_neighbors=10)
rg1 = Ridge(alpha=1.1)
las = Lasso()
rfr = RandomForestRegressor()
metamodel = Ridge()

In [28]:
def model_fit_predict(model, X_train, y_train, X_test, y_test):
    model.fit(X_train,y_train)
    prediction = model.predict(X_test)
    r2_sc = metrics.r2_score(y_test, prediction)
    return prediction, r2_sc

In [29]:
list_of_models = [knn3, knn10, rg1, las, rfr]

In [30]:
pred = []
r2_sc = []
for mod in list_of_models:
    a,b = model_fit_predict(mod, X_train_st, y_train_st, X_test, y_test)
    pred.append(a)
    r2_sc.append(b)

In [31]:
r2_sc

[0.7695030091034127,
 0.8287934067269855,
 0.8290911617095896,
 0.8282886869445494,
 0.8457194527311629]

In [32]:
X_pred = pd.DataFrame(np.array(pred).T)

In [33]:
X_pred.shape

(263, 5)

In [34]:
metamodel.fit(X_pred, y_test)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [35]:
pred = []
r2_sc = []
for mod in list_of_models:
    a,b = model_fit_predict(mod, X_train, y_train, X_valid, y_valid)
    pred.append(a)
    r2_sc.append(b)

In [36]:
# метрика r2 для различных моделей

In [37]:
r2_sc

[0.7805382632724003,
 0.780082704158134,
 0.8766054790751481,
 0.876272905919814,
 0.898571192331457]

In [38]:
X_pred = pd.DataFrame(np.array(pred).T)

In [39]:
y_pred = metamodel.predict(X_pred)

In [40]:
# метрика r2 для ансамбля моделей

In [41]:
metrics.r2_score(y_valid, y_pred)

0.911666551041856

## Стекинг

Используем идею K-Fold проверки. Выборку разбиваем на фолды, затем последовательно перебираем фолды и обучаем базовые алгоритмы на всех фолдах, кроме одного, а на оставшемся получаем ответы базовых алгоритмов и трактуем их как значения соответствующих признаков на этом фолде. Для получения метапризнаков объектов тестовой выборки базовые алгоритмы обучают на всей обучающей выборке и берут их ответы на тестовой.

<img src="img/stacking-2b.png" style="max-width: 680px; display: inline" />

In [42]:
def get_meta_features(model, X_train, y_train, X_test, stack_cv):
    meta_train = np.zeros_like(y_train, dtype=float)
    meta_test = np.zeros_like(y_test, dtype=float)
    
    for i, (train_ind, test_ind) in enumerate(stack_cv.split(X_train, y_train)):
        
        model.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
        meta_train[test_ind] = model.predict(X_train.iloc[test_ind])
        meta_test += model.predict(X_test)
        
    r2_sc1 = metrics.r2_score(y_test, meta_test / stack_cv.n_splits)
    
    return meta_train, meta_test / stack_cv.n_splits, r2_sc1

In [43]:
from sklearn.model_selection import StratifiedKFold

stack_cv = StratifiedKFold(n_splits=10, random_state=555)

In [44]:
stack_cv.split(X_train, y_train)

<generator object _BaseKFold.split at 0x7f5563cb7eb8>

In [45]:
meta_train = []
meta_test = []
col_names = []
r2_sc_stack = []

In [46]:
for mod in list_of_models:
    meta_tr, meta_te, r2_score = get_meta_features(mod, X_train, y_train, X_test, stack_cv)
    meta_train.append(meta_tr)
    meta_test.append(meta_te)
    r2_sc_stack.append(r2_score)
    col_names.append(str(mod)+'_pred')

In [47]:
X_meta_train = pd.DataFrame(np.stack(meta_train, axis=1), columns=col_names)
X_meta_test = pd.DataFrame(np.stack(meta_test, axis=1), columns=col_names)

In [48]:
#Выведем качество моделей 
r2_sc_stack

[0.8849534931780351,
 0.8522500844797891,
 0.8799158813354754,
 0.8797963603613566,
 0.973530134280788]

In [49]:
X_meta_train.shape

(1314, 5)

In [50]:
X_meta_test.shape

(263, 5)

In [51]:
metamodel.fit(X_meta_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [52]:
y_pred_meta_valid = metamodel.predict(X_meta_test)

In [54]:
# Финальная метрика качества метамодели
metrics.r2_score(y_test, y_pred_meta_valid)

0.9303059739080353