## Correlation

In [1]:
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from feature_engine.selection import DropCorrelatedFeatures

In [2]:
# Synthetic binary-classification data: 1000 rows and 10 features,
# 7 of which are generated as redundant. The seed is fixed so the
# notebook reproduces the same values on every run.
X, y = make_classification(
    n_samples=1000, n_features=10, n_redundant=7, n_classes=2, random_state=10
)

# Wrap the arrays in pandas containers; columns get the default integer labels.
X, y = pd.DataFrame(X), pd.Series(y)

X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.283792,0.47101,-1.343721,-0.33699,0.116821,0.145666,-0.054484,-0.343668,-0.226413,-0.240955
1,-0.448534,0.009435,-2.024315,-0.261384,0.21931,0.345767,0.045181,-0.490948,0.409079,-0.667868
2,-2.387431,-0.2819,0.180289,-1.268721,1.183003,1.892637,0.299812,-2.589595,2.523974,-3.684599
3,-0.479035,0.761899,1.095608,-0.556597,0.198756,0.251093,-0.086045,-0.577749,-0.347582,-0.419675
4,1.119764,-0.803058,-0.083495,0.940198,-0.510735,-0.740669,0.026449,1.281034,-0.207904,1.362914


In [3]:
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Show the resulting shapes of both partitions.
(X_train.shape, X_test.shape)

((700, 10), (300, 10))

## Remove correlated features: Feature-engine

### Brute force approach

In [4]:
# Selector that flags correlated features using Pearson correlation
# and a 0.8 threshold.
sel = DropCorrelatedFeatures(threshold=0.8, method="pearson")

# Fitting on the training set identifies the correlated features.
sel.fit(X_train)

In [5]:
# features that the fitted selector will remove in transform

sel.features_to_drop_

{3, 4, 5, 6, 7, 8, 9}

In [6]:
# groups of correlated features found during fit; each set is one group

sel.correlated_feature_sets_

[{0, 3, 4, 5, 7, 9}, {1, 6, 8}]

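There are 2 groups of correlated features in the training set. From each group the selector keeps one feature (0 and 1 here) and drops the rest; feature 2 does not belong to any group, so it is kept as well.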

In [7]:
# Apply the fitted selector to both partitions so that they end up
# with the same reduced set of columns.
X_train_t, X_test_t = (sel.transform(part) for part in (X_train, X_test))

# Shapes after dropping the correlated features.
(X_train_t.shape, X_test_t.shape)

((700, 3), (300, 3))

In [8]:
# preview the training set after the correlated features were dropped
X_train_t.head()

Unnamed: 0,0,1,2
105,1.693553,-3.715633,0.250835
68,-1.470185,3.029661,-1.979157
479,-0.483779,1.192578,1.439996
399,1.409885,-0.505949,-0.049844
434,-1.085478,1.894522,-1.161771
