In [2]:
import sys
sys.path.append('..')

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import VarianceThreshold

from common import load_forest_fires

# Feature Selection

## Why select features

- colinearity
- reduces noise (+ overfitting)
- more interpretable
- train models quicker
- train models better

Adding features is an exponential cost!
- curse of dimensionality
- model needs to understand the new feature in the context of every other feature

What makes a good feature selection algorithm
- remove low information features
- reduce overlap between features


## Three categories of feature selection

1. Wrappers
- assess performance by performance of a model
- new model for each set of features -> expensive

2. Filter
- only consider statistics of the data (correlation, mutual infomation, variance thresholding)
- ignore interaction with learning algorithm

3. Embedded
- combinations of wrapping & filters
- feature selection as part of the model construction process


In [3]:
ds = load_forest_fires()

x = ds.loc[:, ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']]
y = ds.loc[:, 'area']

ds.describe()

Downloading forest fires dataset - the aim is to predict the burned area of forest fires, in the northeast region of Portugal, by using meteorological and other data
For more information, read [Cortez and Morais, 2007].
        1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
        2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
        3. month - month of the year: 'jan' to 'dec'
        4. day - day of the week: 'mon' to 'sun'
        5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
        6. DMC - DMC index from the FWI system: 1.1 to 291.3
        7. DC - DC index from the FWI system: 7.9 to 860.6
        8. ISI - ISI index from the FWI system: 0.0 to 56.10
        9. temp - temperature in Celsius degrees: 2.2 to 33.30
        10. RH - relative humidity in %: 15.0 to 100
        11. wind - wind speed in km/h: 0.40 to 9.40
        12. rain - outside rain in mm/m2 : 0.0 to 6.4
        13. area - the burned area of the forest (i

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


In [4]:
ds.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [5]:
ds.columns

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')

## Variance selection

Based only on the feature
- no information about the target

In [6]:
sel = VarianceThreshold(threshold=(0.8))
sel.fit_transform(x)

x.columns[sel.get_support()]

Index(['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind'], dtype='object')

## Univariate feature selection

Selecting features in isolation, based on statistical relationship to the target

[sklearn docs](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection)

In [7]:
k = 6
selector = SelectKBest(mutual_info_regression, k=k)
features = selector.fit_transform(x, y)

features

array([[ 26.2,  94.3,   5.1,   8.2,  51. ,   0. ],
       [ 35.4, 669.1,   6.7,  18. ,  33. ,   0. ],
       [ 43.7, 686.9,   6.7,  14.6,  33. ,   0. ],
       ...,
       [ 56.7, 665.6,   1.9,  21.2,  70. ,   0. ],
       [146. , 614.7,  11.3,  25.6,  42. ,   0. ],
       [  3. , 106.7,   1.1,  11.8,  31. ,   0. ]])

In [8]:
x.columns[selector.get_support()]

Index(['DMC', 'DC', 'ISI', 'temp', 'RH', 'rain'], dtype='object')

In [9]:
for score, f in zip(selector.scores_, x.columns):
    print(score, f)

0.0 FFMC
0.022244534337525046 DMC
0.05308329518699395 DC
0.06166952726822661 ISI
0.018335777054745428 temp
0.0518550180400279 RH
0.0 wind
0.005801704363287641 rain


## SelectFromModel

Requires an interpretable model
- coefficients in linear regression
- feature importances

Select features based on a threshold

In [10]:
mdl = ExtraTreesRegressor(n_estimators=50)
mdl.fit(x, y)
model = SelectFromModel(mdl, prefit=True, threshold='mean')
x_new = model.transform(x)
x.columns[model.get_support()]

  f"X has feature names, but {self.__class__.__name__} was fitted without"


Index(['DMC', 'temp', 'RH', 'wind'], dtype='object')

## Resources

sklearn docs - [Feature selection](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection)

Why giving your algorithm ALL THE FEATURES does not always work - Thomas Huijskens - [youtube](https://youtu.be/JsArBz46_3s)

[Automated Feature Engineering and Selection in Python](https://www.youtube.com/watch?v=4-4pKPv9lJ4)