In [2]:
import pandas as pd
from utils import (
    correlated_feat_removal,
    get_corr_mask,
    get_corr_matix,
    get_matrix_mask,
    get_second_matrix,
    make_selection,
)

### 1. Prepare a dataset

In [4]:
# Toy dataset, written column by column: "b" and "e" track "a" almost
# linearly, while "c" and "d" are only loosely related to the others.
data = pd.DataFrame(
    {
        "a": [1, 10, 20, 19.3],
        "b": [2, 20, 40, 35],
        "c": [3, 30, 10, 12],
        "d": [2, 18, 38, 12],
        "e": [-1, -9, -18, -18],
    }
)

In [5]:
data.head()

Unnamed: 0,a,b,c,d,e
0,1.0,2,3,2,-1
1,10.0,20,30,18,-9
2,20.0,40,10,38,-18
3,19.3,35,12,12,-18


### 2. Correlated feature removal - commonly-used way

The correlated features are removed following these steps:

1. We use pandas `.corr()` to get a corr_matrix and then check the correlation among features.
2. Then we check the column list one by one.
3. For each column, we compute its correlation with every remaining feature. If the absolute value of any of these
correlations exceeds the threshold, we drop that feature.

In [6]:
# First pass of the "commonly-used" removal described above.
# NOTE(review): cols=None presumably means "consider every column" — confirm in utils.
corr, selected_cols = correlated_feat_removal(
    X=data,
    cols=None,
    corr_thresh=0.8,
)

In [7]:
corr

Unnamed: 0,a,b,c,d,e
a,,0.995835,0.138988,0.718899,0.999417
b,,,0.156412,0.779209,0.992154
c,,,,0.20531,0.129252
d,,,,,0.694965
e,,,,,


In [8]:
selected_cols

['a', 'c', 'd']

In [9]:
# Second pass, restricted to the columns that survived the first pass.
# The returned matrix gets its own name so `corr` keeps holding the
# first-pass matrix that was displayed in the cell above.
corr_double_check, selected_cols_double_check = correlated_feat_removal(
    X=data,
    cols=list(selected_cols),
    corr_thresh=0.8,
)

In [10]:
# Re-running the removal on the already-selected columns must leave the
# selection unchanged; otherwise the first pass missed something.
first_pass_is_stable = selected_cols == selected_cols_double_check
assert first_pass_is_stable, "The first round of correlated feature removal is not complete"

### 3. Feature selection based on different criteria, e.g. std, missings

In [22]:
# Build the three pairwise matrices: correlation, std difference and missings.
corr = get_corr_matix(X=data)
std, missing = (
    get_second_matrix(X=data, matrix_type=matrix_kind)
    for matrix_kind in ("std", "missings")
)

In [23]:
# Mask all three matrices: the correlation matrix against the 0.8 threshold,
# the std and missings matrices against 0 with identical mask settings.
corr_mask = get_corr_mask(corr=corr, corr_thresh=0.8)
std_mask, missing_mask = (
    get_matrix_mask(matrix=second_matrix, threshold=0, mask_value=1)
    for second_matrix in (std, missing)
)

In [28]:
corr

Unnamed: 0,a,b,c,d,e
a,,0.995835,0.138988,0.718899,0.999417
b,,,0.156412,0.779209,0.992154
c,,,,0.20531,0.129252
d,,,,,0.694965
e,,,,,


In [24]:
std

Unnamed: 0,a,b,c,d,e
a,,8.133039,2.537718,6.214455,-0.776929
b,,,-5.595321,-1.918584,-8.909968
c,,,,3.676737,-3.314647
d,,,,,-6.991384
e,,,,,


In [25]:
std_mask

Unnamed: 0,a,b,c,d,e
a,,1.0,1.0,1.0,-1.0
b,,,-1.0,-1.0,-1.0
c,,,,1.0,-1.0
d,,,,,-1.0
e,,,,,


In [27]:
missing

Unnamed: 0,a,b,c,d,e
a,,0.0,0.0,0.0,0.0
b,,,0.0,0.0,0.0
c,,,,0.0,0.0
d,,,,,0.0
e,,,,,


In [26]:
missing_mask

Unnamed: 0,a,b,c,d,e
a,,-1.0,-1.0,-1.0,-1.0
b,,,-1.0,-1.0,-1.0
c,,,,-1.0,-1.0
d,,,,,-1.0
e,,,,,


#### 3.1 Smart feature selection based on `correlation` and `std`
When two features are correlated, drop the one with the lower standard deviation.

In [16]:
# Resolve each correlated pair using the std mask; the helper hands back
# the features to keep and the features to drop, in that order.
feats_to_keep, feats_to_drop = make_selection(
    corr_mask=corr_mask,
    matrix_mask=std_mask,
    corr_thresh=0.8,
)

In [17]:
feats_to_keep, feats_to_drop

(['b', 'c', 'd'], ['e', 'a'])

Check whether any correlated features remain in the dataframe:

In [18]:
# Absolute pairwise correlations among the kept features only.
kept_corr = data[feats_to_keep].corr().abs()
# Look at each unordered pair once and skip the diagonal (a feature paired
# with itself). The lookup is done on the correlation matrix directly:
# using it as a boolean mask on `data` would align feature labels against
# the integer row index, match nothing, and make the check pass trivially.
correlated_pairs = [
    (left, right)
    for pos, left in enumerate(feats_to_keep)
    for right in feats_to_keep[pos + 1:]
    if kept_corr.loc[left, right] > 0.8
]
assert not correlated_pairs, "There are still correlated features"

#### 3.2 Smart feature selection based on `correlation` and `missings`
When two features are correlated, drop the one with fewer missing values.

In [19]:
# Same selection as before, but correlated pairs are now resolved with
# the missings mask instead of the std mask.
feats_to_keep, feats_to_drop = make_selection(
    corr_mask=corr_mask,
    matrix_mask=missing_mask,
    corr_thresh=0.8,
)

In [20]:
feats_to_keep, feats_to_drop

(['a', 'c', 'd'], ['b', 'e'])

Check if we still have correlated features in the dataframe

In [21]:
# Absolute pairwise correlations among the kept features only.
kept_corr = data[feats_to_keep].corr().abs()
# Look at each unordered pair once and skip the diagonal (a feature paired
# with itself). The lookup is done on the correlation matrix directly:
# using it as a boolean mask on `data` would align feature labels against
# the integer row index, match nothing, and make the check pass trivially.
correlated_pairs = [
    (left, right)
    for pos, left in enumerate(feats_to_keep)
    for right in feats_to_keep[pos + 1:]
    if kept_corr.loc[left, right] > 0.8
]
assert not correlated_pairs, "There are still correlated features"