In [30]:
import pandas as pd
import numpy as np
from utils import (
    correlated_feat_removal,
    get_corr_mask,
    get_corr_matix,
    get_matrix_mask,
    get_second_matrix,
    make_selection,
)
from feature_engine.selection import DropCorrelatedFeatures

### 1. Prepare a dataset

In [31]:
# Toy dataset with six numeric features; `d` contains one missing value so the
# "missings" criterion used later has something to act on.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data = pd.DataFrame(
    [
        {"a": 1, "b": 2, "c": 3, "d": 2, "e": -1, "f": 3},
        {"a": 10, "b": 20, "c": 30, "d": np.nan, "e": -9, "f": 4},
        {"a": 20, "b": 40, "c": 10, "d": 38, "e": -18, "f": -20},
        {"a": 19.3, "b": 35, "c": 12, "d": 12, "e": -18, "f": 15},
    ]
)

In [32]:
# Preview the toy dataset (it only has 4 rows, so head() shows all of it).
data.head()

Unnamed: 0,a,b,c,d,e,f
0,1.0,2,3,2.0,-1,3
1,10.0,20,30,,-9,4
2,20.0,40,10,38.0,-18,-20
3,19.3,35,12,12.0,-18,15


### 2. Correlated feature removal - commonly-used way

#### 2.1 Manually coded - commonly-used approach
The correlated features are removed following these steps:

1. We use pandas `.corr()` to get a corr_matrix, i.e., correlation among features.
2. Then we check the column list one by one.
3. For each column, we calculate the correlations between this feature and all the rest. When the absolute
value of any of these correlations exceeds the threshold, we drop that feature.

In [33]:
# Manually coded removal, using a correlation threshold of 0.8.
corr, selected_cols = correlated_feat_removal(
    X=data,
    cols=None,
    corr_thresh=0.8,
)

In [34]:
# Correlation matrix returned by correlated_feat_removal.
corr

Unnamed: 0,a,b,c,d,e,f
a,,0.995835,0.138988,0.736925,0.999417,0.234285
b,,,0.156412,0.794019,0.992154,0.31485
c,,,,0.550365,0.129252,0.160612
d,,,,,0.714575,0.815897
e,,,,,,0.204886
f,,,,,,


In [35]:
# Features that survive the manual correlated-feature removal.
selected_cols

['a', 'c', 'd']

In [36]:
# Second pass restricted to the surviving columns; used as a sanity check below.
corr, selected_cols_double_check = correlated_feat_removal(
    X=data,
    cols=list(selected_cols),
    corr_thresh=0.8,
)

In [37]:
# Re-running the removal on the already-selected columns must not drop anything else.
assert (
    selected_cols == selected_cols_double_check
), "The first round of correlated feature removal is not complete"

#### 2.2 Use feature engine

In [38]:
# Cross-check with feature_engine's transformer, using the same Pearson / 0.8 settings.
tr = DropCorrelatedFeatures(variables=None, method="pearson", threshold=0.8)
Xt = tr.fit_transform(data)

# The groups of correlated features found during fit are available as
# `tr.correlated_feature_sets_`; inspect them in a separate cell, because only the
# last expression of a cell is displayed (a bare expression mid-cell is discarded).
Xt.columns

Index(['a', 'c', 'd'], dtype='object')

Same results for both, good!

### 3. Feature selection based on different criteria, e.g. std, missings

In [39]:
# Get the correlation matrix plus two pairwise difference matrices
# (difference of std and difference of number of missings).
corr = get_corr_matix(X=data)
std_diff = get_second_matrix(X=data, second_matrix_type="std")
missing_diff = get_second_matrix(X=data, second_matrix_type="missings")

In [40]:
# Mask all three matrices: the correlation matrix is thresholded at 0.8, and the two
# difference matrices are turned into +1/-1 flags around a threshold of 0.
corr_mask = get_corr_mask(corr=corr, corr_thresh=0.8)
std_mask = get_matrix_mask(matrix=std_diff, threshold=0, mask_value=1)
missing_mask = get_matrix_mask(matrix=missing_diff, threshold=0, mask_value=1)

In [41]:
# Correlation among features (as displayed: absolute values, upper triangle only)
corr

Unnamed: 0,a,b,c,d,e,f
a,,0.995835,0.138988,0.736925,0.999417,0.234285
b,,,0.156412,0.794019,0.992154,0.31485
c,,,,0.550365,0.129252,0.160612
d,,,,,0.714575,0.815897
e,,,,,,0.204886
f,,,,,,


In [42]:
# Difference of std among features: entry (index_i, col_j) is std_j - std_i
std_diff

Unnamed: 0,a,b,c,d,e,f
a,,8.133039,2.537718,9.620864,-0.776929,5.745992
b,,,-5.595321,1.487825,-8.909968,-2.387047
c,,,,7.083146,-3.314647,3.208274
d,,,,,-10.397794,-3.874872
e,,,,,,6.522922
f,,,,,,


In [43]:
# Mask of the difference in the number of missings among features:
# +1 where the column feature has more missings than the index feature, -1 otherwise
missing_mask

Unnamed: 0,a,b,c,d,e,f
a,,-1.0,-1.0,1.0,-1.0,-1.0
b,,,-1.0,1.0,-1.0,-1.0
c,,,,1.0,-1.0,-1.0
d,,,,,-1.0,-1.0
e,,,,,,-1.0
f,,,,,,


In [44]:
# Correlations below the 0.8 threshold are zeroed out; non-zero entries mark correlated pairs.
corr_mask

Unnamed: 0,a,b,c,d,e,f
a,,0.995835,0.0,0.0,0.999417,0.0
b,,,0.0,0.0,0.992154,0.0
c,,,,0.0,0.0,0.0
d,,,,,0.0,0.815897
e,,,,,,0.0
f,,,,,,


In [45]:
std_mask
# Take two examples:
# Example 1: For pair (index_a, col_b), the std_mask is 1, meaning that `std_b > std_a`.
# So when `a` and `b` are correlated, we prefer to drop `a`.

# Example 2: For pair (index_b, col_c), the std_mask is -1, meaning that `std_c < std_b`.
# So when `b` and `c` are correlated, we prefer to drop `c`.

Unnamed: 0,a,b,c,d,e,f
a,,1.0,1.0,1.0,-1.0,1.0
b,,,-1.0,1.0,-1.0,-1.0
c,,,,1.0,-1.0,1.0
d,,,,,-1.0,-1.0
e,,,,,,1.0
f,,,,,,


In [49]:
corr_mask*std_mask

# In this matrix, we combine the two mask matrices - dropping correlated features with low std.
# The sign of each entry comes from std_mask, and its absolute value is the correlation.
# For example, we first go through the column list from a to f.
# The first pair we check would be (col_b, index_a) where the correlation is 0.995835.
# The number is larger than the 0.8 threshold, meaning they are correlated, therefore one of them can be dropped.
# The number is positive, meaning std_b > std_a, therefore `a` should be dropped.

Unnamed: 0,a,b,c,d,e,f
a,,0.995835,0.0,0.0,-0.999417,0.0
b,,,-0.0,0.0,-0.992154,-0.0
c,,,,0.0,-0.0,0.0
d,,,,,-0.0,-0.815897
e,,,,,,0.0
f,,,,,,


In [50]:
# Difference in the number of missings: entry (index_i, col_j) is #missings_j - #missings_i
missing_diff

Unnamed: 0,a,b,c,d,e,f
a,,0.0,0.0,1.0,0.0,0.0
b,,,0.0,1.0,0.0,0.0
c,,,,1.0,0.0,0.0
d,,,,,-1.0,-1.0
e,,,,,,0.0
f,,,,,,


In [51]:
# Mask of missing_diff: +1 where the difference is greater than 0, -1 otherwise
missing_mask

Unnamed: 0,a,b,c,d,e,f
a,,-1.0,-1.0,1.0,-1.0,-1.0
b,,,-1.0,1.0,-1.0,-1.0
c,,,,1.0,-1.0,-1.0
d,,,,,-1.0,-1.0
e,,,,,,-1.0
f,,,,,,


In [52]:
corr_mask*missing_mask

# In this matrix, we combine the two mask matrices - dropping correlated features with more #missings.
# The sign of each entry comes from missing_mask, and its absolute value is the correlation.
# For example, we first go through the column list from a to f.
# The first pair we check would be (col_b, index_a) where the abs(correlation) is 0.995835.
# The number is larger than the 0.8 threshold, meaning they are correlated, therefore one of them can be dropped.
# The number is negative, meaning #missings_b <= #missings_a (here neither has missing values),
# therefore `a` should be dropped.

Unnamed: 0,a,b,c,d,e,f
a,,-0.995835,-0.0,0.0,-0.999417,-0.0
b,,,-0.0,0.0,-0.992154,-0.0
c,,,,0.0,-0.0,-0.0
d,,,,,-0.0,-0.815897
e,,,,,,-0.0
f,,,,,,


#### 3.1 Smart feature selection based on `correlation` and `std`
When two features are correlated, drop the one with the `lower standard deviation`.

In [53]:
# Resolve each correlated pair using the std mask as the tie-breaker.
feats_to_keep, feats_to_drop = make_selection(
    corr_mask=corr_mask,
    matrix_mask=std_mask,
    corr_thresh=0.8,
    second_matrix_type="std",
)

In [54]:
# Kept vs. dropped features for the std-based selection.
feats_to_keep, feats_to_drop

(['b', 'c', 'd'], ['e', 'f', 'a'])

Check if we still have correlated features in the dataframe:

In [55]:
# Verify that no pair of kept features is still correlated above the threshold.
# NOTE: indexing `data` with the boolean correlation matrix aligns on labels; that
# matrix is indexed by feature names while `data` is indexed by row numbers, so every
# cell was masked out and such a check could never fail. Compare the correlation
# values directly instead.
kept_corr_std = data[feats_to_keep].corr().abs()
# Ignore the diagonal: every feature has correlation 1.0 with itself.
off_diagonal_std = ~np.eye(len(kept_corr_std), dtype=bool)
assert not (kept_corr_std.to_numpy()[off_diagonal_std] > 0.8).any(), "There are still correlated features"

#### 3.2 Smart feature selection based on `correlation` and `missings`
When two features are correlated, drop the one with `more missing values`.

In [56]:
# Resolve each correlated pair using the missing-values mask as the tie-breaker.
feats_to_keep, feats_to_drop = make_selection(
    corr_mask=corr_mask,
    matrix_mask=missing_mask,
    corr_thresh=0.8,
    second_matrix_type="missings",
)

In [57]:
# Kept vs. dropped features for the missings-based selection.
feats_to_keep, feats_to_drop

(['c', 'e', 'f'], ['a', 'b', 'd'])

Check if we still have correlated features in the dataframe

In [58]:
# Verify that no pair of kept features is still correlated above the threshold.
# NOTE: indexing `data` with the boolean correlation matrix aligns on labels; that
# matrix is indexed by feature names while `data` is indexed by row numbers, so every
# cell was masked out and such a check could never fail. Compare the correlation
# values directly instead.
kept_corr_missings = data[feats_to_keep].corr().abs()
# Ignore the diagonal: every feature has correlation 1.0 with itself.
off_diagonal_missings = ~np.eye(len(kept_corr_missings), dtype=bool)
assert not (kept_corr_missings.to_numpy()[off_diagonal_missings] > 0.8).any(), "There are still correlated features"