# Step Forward Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('/Users/shailendrapatil/Fall2017/Machine Learning A-Z Template Folder/Projects/House Price Advanced Regression Techniques/train.csv')
data.shape

(1460, 81)

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
numerics=['int16','int32','int64','float16','float32','float64']
numerical_vars=data.select_dtypes(include=numerics).columns
data=data[numerical_vars]
data.shape

(1460, 38)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)

(1022, 37) (1022,)
(438, 37) (438,)


find and remove correlated features, in order to reduce the feature space a bit so that the algorithm takes shorter time

In [7]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  3


In [8]:
# removed correlated  features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((1022, 34), (438, 34))

In [9]:
X_train.fillna(0, inplace=True)

Let us use SequentialFeatureSelector from mlxtend to implement step forward selection

In [10]:
from sklearn.ensemble import RandomForestRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [11]:
sfs=SFS(RandomForestRegressor(),k_features=10,forward=True,scoring='r2',n_jobs=-1,cv=3,verbose=2)

In [12]:
sfs.fit(np.array(X_train), y_train)

[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:    1.5s finished

[2018-04-20 20:47:19] Features: 1/10 -- score: 0.668144263102822[Parallel(n_jobs=-1)]: Done  26 out of  33 | elapsed:    1.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    1.3s finished

[2018-04-20 20:47:20] Features: 2/10 -- score: 0.7249617789282584[Parallel(n_jobs=-1)]: Done  32 out of  32 | elapsed:    1.4s finished

[2018-04-20 20:47:22] Features: 3/10 -- score: 0.7433854183851171[Parallel(n_jobs=-1)]: Done  24 out of  31 | elapsed:    0.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  31 out of  31 | elapsed:    1.3s finished

[2018-04-20 20:47:23] Features: 4/10 -- score: 0.7646722762613168[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    2.0s finished

[2018-04-20 20:47:26] Features: 5/10 -- score: 0.7650744288228594[Parallel(n_jobs=-1)]: Done  22 out of  29 | elapsed:    0.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:    1.1s finished

[

SequentialFeatureSelector(clone_estimator=True, cv=3,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
             floating=False, forward=True, k_features=10, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='r2', verbose=2)

In [13]:
sfs.k_feature_idx_

(4, 6, 13, 15, 16, 17, 19, 22, 24, 30)

In [14]:
X_train.columns[list(sfs.k_feature_idx_)]

Index(['OverallQual', 'YearBuilt', '1stFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'FullBath', 'KitchenAbvGr', 'GarageCars', 'PoolArea'],
      dtype='object')