In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# scorers used for relevance and redundancy: mutual information and F-statistics
from sklearn.feature_selection import (
    mutual_info_classif, mutual_info_regression,
    f_classif, f_regression)

from sklearn.ensemble import RandomForestClassifier

from feature_engine.selection import MRMR



In [2]:
# Read the dataset from a CSV file given by relative path.
data = pd.read_csv(filepath_or_buffer='dataset_2.csv')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109
0,4.53271,3.280834,17.982476,4.404259,2.34991,0.603264,2.784655,0.323146,12.009691,0.139346,...,2.079066,6.748819,2.941445,18.360496,17.726613,7.774031,1.473441,1.973832,0.976806,2.541417
1,5.821374,12.098722,13.309151,4.125599,1.045386,1.832035,1.833494,0.70909,8.652883,0.102757,...,2.479789,7.79529,3.55789,17.383378,15.193423,8.263673,1.878108,0.567939,1.018818,1.416433
2,1.938776,7.952752,0.972671,3.459267,1.935782,0.621463,2.338139,0.344948,9.93785,11.691283,...,1.861487,6.130886,3.401064,15.850471,14.620599,6.849776,1.09821,1.959183,1.575493,1.857893
3,6.02069,9.900544,17.869637,4.366715,1.973693,2.026012,2.853025,0.674847,11.816859,0.011151,...,1.340944,7.240058,2.417235,15.194609,13.553772,7.229971,0.835158,2.234482,0.94617,2.700606
4,3.909506,10.576516,0.934191,3.419572,1.871438,3.340811,1.868282,0.439865,13.58562,1.153366,...,2.738095,6.565509,4.341414,15.893832,11.929787,6.954033,1.853364,0.511027,2.599562,0.811364


In [3]:
# Split into train and test sets (30% held out), keeping five predictors
# and the `target` column as the label.
feature_cols = ["var_1", "var_2", "var_3", "var_4", "var_5"]

X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols],
    data['target'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 5), (15000, 5))

In [4]:
# Distinct label values present in the training target.
y_train.unique()

array([1, 0], dtype=int64)

## Mutual information

In [5]:
# Relevance: mutual information between each feature and the class label.
# The seed is fixed so the estimate is repeatable.
relevance = mutual_info_classif(X=X_train, y=y_train, random_state=42)

relevance

array([0.        , 0.00362106, 0.        , 0.00111507, 0.00025414])

In [6]:
# Redundancy: for each feature, the mean mutual information between that
# feature (treated as the target) and all the remaining features.
redundance = [
    np.mean(
        mutual_info_regression(
            X_train.drop(columns=[col]),
            X_train[col],
            random_state=42,
        )
    )
    for col in X_train.columns
]

redundance

[0.015572131527192168,
 0.050378586584219676,
 0.031158562721424587,
 0.03277019176752827,
 0.04363328560736557]

In [7]:
# MRMR score as a quotient: relevance divided by redundancy, feature by feature.
# `redundance` is a plain list, so convert it to an array explicitly.
mrmr = relevance / np.array(redundance)

mrmr

array([0.        , 0.071877  , 0.        , 0.0340271 , 0.00582448])

In [8]:
# Fit feature_engine's MRMR selector with the "MIQ" method on the same
# training data, to compare against the manual calculation.
sel = MRMR(
    method="MIQ",
    regression=False,
    random_state=42,
)
sel.fit(X_train, y_train)

In [9]:
# Relevance values stored on the fitted selector; expected to equal the
# mutual_info_classif result computed by hand.
sel.relevance_

array([0.        , 0.00362106, 0.        , 0.00111507, 0.00025414])

In [10]:
# Redundancy values stored on the fitted selector; expected to equal the
# hand-computed mean mutual information per feature.
sel.redundance_

[0.015572131527192168,
 0.050378586584219676,
 0.031158562721424587,
 0.03277019176752827,
 0.04363328560736557]

In [11]:
# Final MRMR score per feature, as stored on the fitted selector.
sel.mrmr_

var_1    0.000000
var_2    0.071877
var_3    0.000000
var_4    0.034027
var_5    0.005824
dtype: float64

In [14]:
# Refit the selector with the "MID" method and display its per-feature scores.
sel = MRMR(
    method="MID",
    regression=False,
    random_state=42,
)
sel.fit(X_train, y_train)
sel.mrmr_

var_1   -0.015572
var_2   -0.046758
var_3   -0.031159
var_4   -0.031655
var_5   -0.043379
dtype: float64

## ANOVA

In [15]:
# Relevance: F-statistic of each feature against the class label.
# f_classif returns (F-statistics, p-values); only the first is kept.
f_stats, _p_values = f_classif(X_train, y_train)
relevance = f_stats

relevance

array([ 0.29207399, 36.77380385, 13.04118528, 11.75743595, 14.15631812])

In [20]:
# Redundancy: for each feature, the mean F-statistic obtained when that
# feature is regressed on each of the remaining features.
# f_regression returns (F-statistics, p-values); index 0 selects the former.
redundance = [
    np.mean(f_regression(X_train.drop(columns=[col]), X_train[col])[0])
    for col in X_train.columns
]

redundance

[103.82542924595566,
 1844.6584032550254,
 1178.3882534531167,
 1360.5060917010146,
 1416.0951768868529]

In [21]:
# MRMR score as a quotient of the F-statistic relevance and redundancy.
redundance_arr = np.array(redundance)
mrmr = relevance / redundance_arr

mrmr

array([0.00281313, 0.01993529, 0.01106697, 0.00864196, 0.00999673])

In [18]:
# Fit feature_engine's MRMR selector with the "FCQ" method on the same
# training data, to compare against the manual calculation.
sel = MRMR(
    method="FCQ",
    regression=False,
    random_state=42,
)
sel.fit(X_train, y_train)

In [19]:
# Relevance values stored on the fitted selector; expected to equal the
# f_classif F-statistics computed by hand.
sel.relevance_

array([ 0.29207399, 36.77380385, 13.04118528, 11.75743595, 14.15631812])

In [22]:
# Redundancy values stored on the fitted selector; expected to equal the
# hand-computed mean f_regression F-statistic per feature.
sel.redundance_

array([ 103.82542925, 1844.65840326, 1178.38825345, 1360.5060917 ,
       1416.09517689])

In [23]:
# Final MRMR score per feature, as stored on the fitted selector.
sel.mrmr_

var_1    0.002813
var_2    0.019935
var_3    0.011067
var_4    0.008642
var_5    0.009997
dtype: float64

## Random forest

In [41]:
# Quick look at the first rows of the training features.
X_train.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5
17967,6.266666,1.520409,17.922689,3.866667,2.585592
32391,6.533332,7.28408,18.014173,3.121622,1.808227
9341,5.072728,6.654357,18.520548,4.749881,2.0741
7929,4.37647,4.752176,15.00085,3.954205,1.946561
46544,5.896078,2.539801,13.484358,3.802789,2.639788


In [46]:
# Relevance from a random forest: the fitted model's feature importances.
# random_state is pinned because an unseeded forest is re-randomised on every
# fit, so the importances would change each time this cell is run and the
# notebook would not reproduce under Restart & Run All.
rf = RandomForestClassifier(n_estimators=5, random_state=42)
relevance = rf.fit(X_train, y_train).feature_importances_

relevance

array([0.19574455, 0.20706654, 0.19692296, 0.20231382, 0.19795213])

In [44]:
# NOTE(review): repeated display of `relevance`. This cell's execution count
# (44) is lower than that of the cell before it (46) and its saved output shows
# different values, so the output is presumably stale -- re-run or remove.
relevance

array([0.2053789 , 0.20118028, 0.19703403, 0.1956124 , 0.20079438])