# MRMR

In this notebook, we'll select features based on the Maximum Relevance Minimum Redundancy framework.

We'll implement the search manually to understand how the algorithm works.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# to obtain the mutual information values
from sklearn.feature_selection import (
    mutual_info_classif, 
    mutual_info_regression,
    f_classif, 
    f_regression,
)

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the dataset, stored one directory above the notebook.
csv_path = '../dataset_2.csv'
data = pd.read_csv(csv_path)

# Preview the first five rows.
data.head(5)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109
0,4.53271,3.280834,17.982476,4.404259,2.34991,0.603264,2.784655,0.323146,12.009691,0.139346,...,2.079066,6.748819,2.941445,18.360496,17.726613,7.774031,1.473441,1.973832,0.976806,2.541417
1,5.821374,12.098722,13.309151,4.125599,1.045386,1.832035,1.833494,0.70909,8.652883,0.102757,...,2.479789,7.79529,3.55789,17.383378,15.193423,8.263673,1.878108,0.567939,1.018818,1.416433
2,1.938776,7.952752,0.972671,3.459267,1.935782,0.621463,2.338139,0.344948,9.93785,11.691283,...,1.861487,6.130886,3.401064,15.850471,14.620599,6.849776,1.09821,1.959183,1.575493,1.857893
3,6.02069,9.900544,17.869637,4.366715,1.973693,2.026012,2.853025,0.674847,11.816859,0.011151,...,1.340944,7.240058,2.417235,15.194609,13.553772,7.229971,0.835158,2.234482,0.94617,2.700606
4,3.909506,10.576516,0.934191,3.419572,1.871438,3.340811,1.868282,0.439865,13.58562,1.153366,...,2.738095,6.565509,4.341414,15.893832,11.929787,6.954033,1.853364,0.511027,2.599562,0.811364


In [3]:
# Hold out 30% of the rows as a test set; the split is seeded so it is reproducible.
predictors = ["var_1", "var_2", "var_3", "var_4", "var_5", "var_6", "var_7"]

X_train, X_test, y_train, y_test = train_test_split(
    data[predictors],
    data['target'],
    test_size=0.3,
    random_state=0,
)

# (rows, columns) of each partition
X_train.shape, X_test.shape

((35000, 7), (15000, 7))

In [4]:
# distinct class labels present in the training target
pd.unique(y_train)

array([1, 0])

## mutual information

We'll start by using the mutual information to determine both relevance and redundancy.

In [5]:
# The target is categorical, so the relevance of each feature is estimated
# with the classification flavour of mutual information.
relevance = mutual_info_classif(X=X_train, y=y_train, random_state=42)

relevance

array([0.        , 0.00375371, 0.        , 0.00116387, 0.00053645,
       0.00498133, 0.0048928 ])

In [6]:
# Position of the feature whose mutual information
# with the target is the highest.
n = np.argmax(relevance)
n

np.int64(5)

In [7]:
# Collect the names of all the features in a list.
remaining = list(X_train.columns)

remaining

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7']

In [8]:
# Take the feature with the highest MI out of the list of candidates
# and start the list of selected features with it.
feature = remaining.pop(n)

selected = [feature]

feature, selected, remaining

('var_6', ['var_6'], ['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_7'])

In [9]:
# Drop the selected feature's entry from the relevance vector so that it stays
# aligned, position by position, with `remaining`. We need it for the coming steps.
still_candidate = np.arange(relevance.size) != n
relevance = relevance[still_candidate]

relevance

array([0.        , 0.00375371, 0.        , 0.00116387, 0.00053645,
       0.0048928 ])

In [10]:
# Work on a copy of the training features under a shorter variable name.
X = X_train.copy(deep=True)

In [28]:
# Redundancy term: mutual information between each remaining feature
# and the feature that was just selected.
candidates = X[remaining]
chosen = X[feature]

redundancy = mutual_info_regression(candidates, chosen, random_state=42)

redundancy

array([0.01480108, 0.02684905, 0.03007928, 0.12978507, 0.08237336,
       0.02644202])

In [12]:
# In this case, we'll use the ratio between relevance and redundancy,
# but you could use the difference as well.
#
# Mutual information estimates can be exactly 0 (see the zeros in `relevance`).
# A zero redundancy would produce inf, or nan for 0 / 0, and argmax() returns
# the position of a nan, so the denominator is clamped to a tiny positive value.
mrmr = relevance / np.maximum(redundancy, np.finfo(float).eps)
mrmr

array([0.        , 0.0455711 , 0.        , 0.02014171, 0.01180523,
       0.04777016])

In [13]:
# Position of the feature with the highest relevance-to-redundancy ratio.
n = np.argmax(mrmr)
n

np.int64(5)

In [14]:
# the newly selected feature, taken out of the list of features still to examine
feature = remaining.pop(n)

# the list with the features selected so far (now with 2 features)
selected += [feature]

feature, selected, remaining

('var_7', ['var_6', 'var_7'], ['var_1', 'var_2', 'var_3', 'var_4', 'var_5'])

In [15]:
# drop the selected feature's entry from the relevance vector
relevance = relevance[np.arange(relevance.size) != n]

relevance

array([0.        , 0.00375371, 0.        , 0.00116387, 0.00053645])

In [16]:
# drop the selected feature's entry from the redundancy vector,
# so that it stays aligned with `remaining`
redundancy = np.delete(arr=redundancy, obj=n, axis=None)

redundancy

array([0.00660673, 0.08237041, 0.07526734, 0.05778429, 0.04544165])

In [17]:
# Mutual information between every remaining feature and the feature
# that was selected in the second round.
new_red = mutual_info_regression(X[remaining], X[feature], random_state=42)

# Add the new MI values as an extra row of the redundancy matrix
# (one row per selected feature, one column per remaining feature).
redundancy = np.concatenate(
    [np.atleast_2d(redundancy), np.atleast_2d(new_red)],
    axis=0,
)

redundancy

array([[0.00660673, 0.08237041, 0.07526734, 0.05778429, 0.04544165],
       [0.00285312, 0.02644704, 0.05296772, 0.2988227 , 0.00385052]])

In [18]:
# For each remaining feature, average its redundancy with all the
# features selected in previous rounds (mean down the rows).
np.mean(redundancy, axis=0)

array([0.00472992, 0.05440873, 0.06411753, 0.1783035 , 0.02464609])

In [19]:
# We obtain the ratio between the relevance and the mean redundancy.
#
# Mutual information estimates can be exactly 0, so the mean redundancy can be 0
# as well. That would produce inf, or nan for 0 / 0, and argmax() returns the
# position of a nan, so the denominator is clamped to a tiny positive value.
mean_redundancy = redundancy.mean(axis=0)

mrmr = relevance / np.maximum(mean_redundancy, np.finfo(float).eps)
mrmr

array([0.        , 0.06899096, 0.        , 0.00652749, 0.02176609])

In [20]:
# And we repeat the procedure:
# - select the feature with the highest ratio
# - update all the feature lists
n = np.argmax(mrmr)

feature = remaining.pop(n)

selected += [feature]

feature, selected, remaining

('var_2', ['var_6', 'var_7', 'var_2'], ['var_1', 'var_3', 'var_4', 'var_5'])

In [21]:
# Drop the selected feature from the relevance vector (one entry per remaining
# feature) and from the redundancy matrix (one column per remaining feature).
relevance = relevance[np.arange(relevance.size) != n]
redundancy = redundancy[:, np.arange(redundancy.shape[1]) != n]

In [22]:
# Mutual information between every remaining feature and the feature
# that was selected in the third round.
candidates = X[remaining]
new_red = mutual_info_regression(candidates, X[feature], random_state=42)

# Append the new MI values as one more row of the redundancy matrix.
redundancy = np.concatenate(
    [np.atleast_2d(redundancy), np.atleast_2d(new_red)],
    axis=0,
)

In [23]:
# We obtain the ratio between the relevance and the mean redundancy.
#
# Mutual information estimates can be exactly 0, so the mean redundancy can be 0
# as well. That would produce inf, or nan for 0 / 0, and argmax() returns the
# position of a nan, so the denominator is clamped to a tiny positive value.
mean_redundancy = redundancy.mean(axis=0)

mrmr = relevance / np.maximum(mean_redundancy, np.finfo(float).eps)
mrmr

array([0.        , 0.        , 0.00902945, 0.00898697])

And so on...

We repeat this procedure until we obtain the desired number of features.

## anova

If you want to obtain the relevance and redundancy using ANOVA and correlation instead of the mutual information, you need to replace the relevance and redundancy functions as follows:

In [24]:
# The ANOVA F-statistic of each feature against the target is used as relevance.
# f_classif also returns the p-values, which are not needed here.
f_statistic, p_values = f_classif(X_train, y_train)
relevance = f_statistic

relevance

array([ 0.29207399, 36.77380385, 13.04118528, 11.75743595, 14.15631812,
       30.63816717, 20.22340598])

In [25]:
# Restart the search from the full list of features.
remaining = X_train.columns.to_list()

# Position of the feature with the highest ANOVA F-statistic. It has to be
# recomputed here: otherwise `n` would still hold the position chosen in the
# last round of the mutual information search.
n = relevance.argmax()

feature = remaining[n]

selected = [feature]

remaining.remove(feature)

feature, selected, remaining

('var_2', ['var_2'], ['var_1', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7'])

In [26]:
# Redundancy: absolute correlation between each remaining feature and the
# selected one. It is stored under the same name (`redundancy`) used in the
# mutual information section, so the follow-up steps can be reused unchanged
# instead of silently picking up the stale mutual information values.
redundancy = X[remaining].corrwith(X[feature]).abs()

redundancy

var_1    0.033981
var_3    0.159411
var_4    0.194955
var_5    0.354885
var_6    0.342255
var_7    0.143461
dtype: float64

## random forest

To obtain the relevance through random forests, we train a model and use the feature importance.

In [27]:
# The impurity-based feature importances of a small random forest serve as relevance.
# The forest is seeded so that the importances, and therefore the selected
# features, are the same on every run.
forest = RandomForestClassifier(n_estimators=5, random_state=42)

relevance = forest.fit(X_train, y_train).feature_importances_

relevance

array([0.13811395, 0.14402894, 0.13938271, 0.14691492, 0.14584093,
       0.14148957, 0.14422899])

The rest of the procedure is identical.