In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import load_boston

boston = load_boston()

In [3]:
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['Target'] = boston.target

In [4]:
df.shape

(506, 14)

In [5]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
Target     0
dtype: int64

In [7]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import time

def printFeatureImportance(clf, X,y):
    print('\n feature importance:')
    clf.fit(X, y)
    features = X.columns
    importances = clf.feature_importances_
    arg_sort = np.argsort(importances)[::-1]
    n = min(len(features), 10)
    for i in range(n):
        idx = arg_sort[i]
        print('%2d. %-*s %.2f %%' %(i+1,15, features[idx],importances[idx]*100))

def fitRegressionTree(X,y, estimator=DecisionTreeRegressor, param_grid=None, cv=10):
    t1 = time.time()
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
    
    est = estimator()
    
    if param_grid:
        gs = GridSearchCV(estimator=est,
                    param_grid=param_grid,
                    cv=cv)
        
        gs = gs.fit(X_train, y_train)
        est = gs.best_estimator_
        print('best param: ' + str(gs.best_params_))
        print('grid search time: %.2f sec.' %(time.time()-t1))
    
    train_scores = cross_val_score(estimator=est, X=X_train, y=y_train, cv=cv)
    test_scores = cross_val_score(estimator=est, X=X_test, y=y_test, cv=cv)
    
    print('train mean R^2: %.2f (std=%.2f)' 
              %(np.mean(train_scores), np.std(train_scores)))
    print('test mean R^2: %.2f (std=%.2f)' 
              %(np.mean(test_scores), np.std(test_scores)))
    
    printFeatureImportance(est, X,y)
    print('\n total time: %.2f sec.' %(time.time()-t1))
    return est

def fitRegression(X,y, estimator=LinearRegression, categorical_features=[], normalize=True, param_grid=None, cv=5):
    t1 = time.time()
    
    X = X.copy()
    y = y.copy()
    
    est = estimator()
    
    columns = X.columns
    numeric_features = [x for x in columns if x not in categorical_features]
    if normalize:
        X[numeric_features] = StandardScaler().fit_transform(X[numeric_features])
        
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)
    
    if categorical_features:
        ohe = OneHotEncoder(categorical_features=categorical_features)
        X_ohe = ohe.fit_transform(X).toarray()
        X_train, X_test, y_train, y_test = train_test_split(X_ohe,y, test_size=0.3)
    
    if param_grid:
        gs = GridSearchCV(estimator=est,
                    param_grid=param_grid,
                    cv=cv)
        
        gs = gs.fit(X_train, y_train)
        est = gs.best_estimator_
        print('best param: ' + str(gs.best_params_))
        print('grid search time: %.2f sec.' %(time.time()-t1))
    
        
    train_scores = cross_val_score(estimator=est, X=X_train, y=y_train, cv=cv)
    test_scores = cross_val_score(estimator=est, X=X_test, y=y_test, cv=cv)
    
    print('train mean R^2: %.2f (std=%.2f)' 
              %(np.mean(train_scores), np.std(train_scores)))
    print('test mean R^2: %.2f (std=%.2f)' 
              %(np.mean(test_scores), np.std(test_scores)))
    
    print('\n total time: %.2f sec.' %(time.time()-t1))
    return est

In [17]:
fitRegression(X,y)

train mean R^2: 0.67 (std=0.05)
test mean R^2: 0.75 (std=0.04)

 total time: 0.05 sec.


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
param_grid = [{'alpha':[0.01, 0.05, 0.1,0.5, 1.0,5.0, 10.0]}]
fitRegression(X,y,estimator=Ridge, param_grid=param_grid)

best param: {'alpha': 1.0}
grid search time: 0.12 sec.
train mean R^2: 0.70 (std=0.05)
test mean R^2: 0.66 (std=0.21)

 total time: 0.16 sec.


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [19]:
fitRegressionTree(X,y)

train mean R^2: 0.75 (std=0.11)
test mean R^2: 0.46 (std=0.39)

 feature importance:
 1. RM              57.80 %
 2. LSTAT           20.28 %
 3. DIS             7.53 %
 4. NOX             6.08 %
 5. CRIM            2.99 %
 6. B               1.72 %
 7. TAX             1.43 %
 8. AGE             1.00 %
 9. PTRATIO         0.72 %
10. INDUS           0.23 %

 total time: 0.11 sec.


DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [20]:
param_grid = [{'max_depth':[3,5,7,9]}]
fitRegressionTree(X,y, param_grid=param_grid)

best param: {'max_depth': 9}
grid search time: 0.19 sec.
train mean R^2: 0.77 (std=0.09)
test mean R^2: 0.42 (std=0.57)

 feature importance:
 1. RM              58.10 %
 2. LSTAT           20.56 %
 3. DIS             7.48 %
 4. NOX             6.04 %
 5. CRIM            4.12 %
 6. TAX             1.39 %
 7. AGE             0.83 %
 8. B               0.66 %
 9. PTRATIO         0.59 %
10. INDUS           0.09 %

 total time: 0.27 sec.


DecisionTreeRegressor(criterion='mse', max_depth=9, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [21]:
param_grid = [ {'n_estimators':[10, 20], 
            'max_depth':[3,5], 
            'max_features':[3,5]} ]
fitRegressionTree(X,y, estimator=RandomForestRegressor, param_grid=param_grid)

best param: {'max_features': 5, 'n_estimators': 20, 'max_depth': 5}
grid search time: 1.84 sec.
train mean R^2: 0.81 (std=0.12)
test mean R^2: 0.77 (std=0.10)

 feature importance:
 1. RM              34.89 %
 2. LSTAT           31.89 %
 3. INDUS           7.45 %
 4. PTRATIO         7.20 %
 5. DIS             6.89 %
 6. CRIM            3.50 %
 7. NOX             3.04 %
 8. TAX             2.23 %
 9. B               0.91 %
10. RAD             0.80 %

 total time: 2.44 sec.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features=5, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [22]:
fitRegressionTree(X,y, estimator=GradientBoostingRegressor)

train mean R^2: 0.83 (std=0.13)
test mean R^2: 0.86 (std=0.05)

 feature importance:
 1. RM              23.09 %
 2. LSTAT           15.42 %
 3. DIS             15.31 %
 4. CRIM            7.67 %
 5. AGE             7.21 %
 6. TAX             6.76 %
 7. B               6.72 %
 8. NOX             5.90 %
 9. PTRATIO         5.19 %
10. INDUS           4.21 %

 total time: 0.78 sec.


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [109]:
fitRegression(X,y, estimator=KNeighborsRegressor)

train mean R^2: 0.75 (std=0.05)
test mean R^2: 0.50 (std=0.22)

 total time: 0.06 sec.


Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))])