In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_validation import cross_val_score
%matplotlib inline

#Boston House Prices

###Analyzing dataset

In [2]:
link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
col=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','PRICE']
dataset=pd.read_fwf(link,names=col)
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null int64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
PRICE      506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB


We notice that we have continuous data in almost all our columns (including target) and we do not have null values. 

Next we will split the data for our stacking algorithm.

In [4]:
#Data
scaler = MinMaxScaler()
X=dataset.ix[:,0:-1]
X = scaler.fit_transform(X)

#Target
y=dataset.ix[:,-1]

#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

###Stacking Diagram

For our stacking algorithm, we will collect predictions from Ridge Regression, Random Forest and Gradient Boosting on Stack 0. The parameters from Stack 0 will be optimized using Linear Regression in Stack 1. All 4 Regressors were picked based on the fact that we are dealing with continuous data.

<img src="Flowchart.png">

###Stack 0

We will use Ridge Regression, Random Forest Classifier and Gradient Boosting for our base stack. 

The reasons are:
- Ridge Regression will deal with multicolearity
- Random Forest will have a high accuracy 
- Gradient Boost will be resistant to overfit

In [26]:
def stack0(data,target):
    
    classifiers = [Ridge(),
                  RandomForestRegressor(),
                  GradientBoostingRegressor()]

    columns=['Ridge',
             'Random_Forest',
             'Gradient_Boost']

    stack0 = {}
    
    clfs = {}

    data_train, data_test, target_train, target_test = train_test_split(X, y, test_size=0.33, random_state=42)
    stack0['PRICE'] = target_test
    
    for clf,col in zip(classifiers,columns):  
        clf.fit(data_train,target_train)
        pred = clf.predict(data_test)
        stack0[col]=pred
        clfs[col]=clf

    stack0_df = pd.DataFrame(stack0)

    return stack0_df, clfs

In [27]:
train0 = stack0(X_train, y_train)
train0[0].head()

Unnamed: 0,Gradient_Boost,PRICE,Random_Forest,Ridge
173,22.770963,23.6,23.32,27.948651
274,31.914874,32.4,33.1,35.372382
491,15.831544,13.6,15.02,17.692246
72,24.011423,22.8,22.95,25.50167
452,17.096761,16.1,16.85,17.681237


Sample train data from Stack 0

###Stack 1

Next we tune the base classifiers with Linear Regression in Stack 1. Linear Regressio is used as we are dealing with continuous data from the Stack 0.

In [30]:
def stack1(data,target):
    
    stack0_df, clfs = stack0(data,target)
    columns=['Ridge','Random_Forest','Gradient_Boost']
    clf = LinearRegression()
    clf.fit(stack0_df[columns].values, stack0_df['PRICE'].values)
    coefficients = clf.coef_ 
    
    return coefficients, clfs

In [75]:
def stack_score(train_data, train_target, test_data, test_target):
    
    columns=['Ridge','Random_Forest','Gradient_Boost']
    
    coefficients, clfs = stack1(train_data, train_target)
    
    prediction = np.zeros(len(test_data))
    
    for i in range(len(columns)):
        prediction += coefficients[i]*clfs[columns[i]].predict(test_data)
        
    tol = np.std(y_test)    
    score = float(np.sum([np.isclose(i,j,atol=tol) for i,j in zip(prediction,test_target)]))/len(test_target)
    return score

In [76]:
stack_score(X_train, y_train, X_test, y_test)

0.9760479041916168