# Automate manual MRMR with a loop

In this notebook, we'll automate the process of selecting features with MRMR by capturing the logic of the previous notebook in a loop.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

# to obtain the mutual information values
from sklearn.feature_selection import (
    mutual_info_classif,
    mutual_info_regression,
    f_classif,
    f_regression,
)

## Mutual information

In [2]:
# load the breast cancer data: predictors as a DataFrame, target as a Series
X, y = load_breast_cancer(as_frame=True, return_X_y=True)

# preview the first rows of the predictors
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# distinct values of the target, in order of appearance
pd.unique(y)

array([0, 1])

In [4]:
# relevance = mutual information between each predictor and the target
mi_with_target = mutual_info_classif(X, y, random_state=42)

# position of the predictor with the highest MI: it is the first one selected
n = np.argmax(mi_with_target)

# keep only the relevance of the features that are still candidates
relevance = np.delete(mi_with_target, n)

In [5]:
# candidate features, in the same order as the columns of X
remaining = list(X.columns)

In [6]:
# take the most relevant feature out of the candidate list
# (column names are unique, so removing by position equals removing by name)
feature = remaining.pop(n)

# start the list of selected features with it
selected = [feature]

feature, selected, remaining

('worst perimeter',
 ['worst perimeter'],
 ['mean radius',
  'mean texture',
  'mean perimeter',
  'mean area',
  'mean smoothness',
  'mean compactness',
  'mean concavity',
  'mean concave points',
  'mean symmetry',
  'mean fractal dimension',
  'radius error',
  'texture error',
  'perimeter error',
  'area error',
  'smoothness error',
  'compactness error',
  'concavity error',
  'concave points error',
  'symmetry error',
  'fractal dimension error',
  'worst radius',
  'worst texture',
  'worst area',
  'worst smoothness',
  'worst compactness',
  'worst concavity',
  'worst concave points',
  'worst symmetry',
  'worst fractal dimension'])

In [7]:
# redundancy = mutual information between each remaining feature
# and the feature that was just selected
redundancy = mutual_info_regression(
    X=X[remaining],
    y=X[feature],
    random_state=42,
)

redundancy

array([1.55842558, 0.08125611, 1.69407809, 1.55349739, 0.03632087,
       0.21633758, 0.48657564, 0.61850209, 0.08650444, 0.15175462,
       0.35742416, 0.02455254, 0.39577613, 0.62505563, 0.14009995,
       0.08766004, 0.2517269 , 0.14806218, 0.09368083, 0.01501102,
       2.41036275, 0.0821414 , 2.28416992, 0.03892478, 0.27252143,
       0.37667288, 0.5729741 , 0.08989302, 0.09451001])

In [8]:
# MRMR score (difference scheme): relevance minus redundancy,
# one value per remaining feature
mrmr = np.subtract(relevance, redundancy)
mrmr

array([-1.19614982,  0.01528343, -1.29171728, -1.19347404,  0.04341892,
       -0.00289815, -0.11112877, -0.17969564, -0.02078375, -0.14586661,
       -0.10812325, -0.02455254, -0.12016192, -0.28429658, -0.12444863,
       -0.01426988, -0.13428718, -0.02264763, -0.07946504,  0.02422432,
       -1.95913271,  0.03818992, -1.81985664,  0.05677229, -0.04731028,
       -0.06141349, -0.13671928,  0.00554175, -0.0294687 ])

In [9]:
# continue the search inside a loop
#
# N_FEATURES is the total number of features we want in `selected`,
# counting the one that was already picked before the loop.
N_FEATURES = 10

# Keep redundancy 2-D from the start: one row per selected feature,
# one column per remaining feature. This removes the need to treat
# the first iteration differently. It is a no-op if redundancy is already 2-D.
redundancy = np.atleast_2d(redundancy)

# The loop is driven by the length of `selected`, so re-running this cell
# once the target size is reached does not append anything else.
while len(selected) < N_FEATURES and remaining:

    # pick the candidate with the highest MRMR score
    n = mrmr.argmax()

    # move it from the candidates to the selected features
    # (by position, which is how it lines up with the arrays below)
    feature = remaining.pop(n)
    selected.append(feature)

    # drop the picked feature from the relevance and redundancy arrays
    # so they stay aligned with `remaining`
    relevance = np.delete(relevance, n)
    redundancy = np.delete(redundancy, n, axis=1)

    if not remaining:
        # no candidates left: nothing to score in a further round
        mrmr = relevance
        break

    # redundancy of the remaining features with the newly selected one
    new_red = mutual_info_regression(X[remaining], X[feature], random_state=42)
    redundancy = np.vstack([redundancy, new_red])

    # relevance minus the mean redundancy with all selected features
    mrmr = relevance - redundancy.mean(axis=0)

selected

['worst perimeter',
 'worst smoothness',
 'worst texture',
 'mean concave points',
 'perimeter error',
 'worst concavity',
 'worst symmetry',
 'area error',
 'symmetry error',
 'worst concave points']

That's it! Now we have a list of 10 features that we selected using the MRMR framework with mutual information.