# Feature Selection - Mutual Information Gain (Entropy) For Classification

## Mutual Information

In probability theory and information theory, the mutual information (MI) of two random variables is a measure of the mutual dependence between the two variables. More specifically, it quantifies the "amount of information" (in units such as shannons, commonly called bits) obtained about one random variable through observing the other random variable. The concept of mutual information is intimately linked to that of entropy of a random variable, a fundamental notion in information theory that quantifies the expected "amount of information" held in a random variable.

Download Data Files https://github.com/laxmimerit/Data-Files-for-Feature-Selection

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from sklearn.ensemble import RandomForestClassifier

# VarianceThreshold - Feature selector that removes all low-variance features.
from sklearn.feature_selection import (
    SelectKBest,
    SelectPercentile,
    VarianceThreshold,
    mutual_info_classif,
    mutual_info_regression,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Load only the first 20,000 rows of the Santander data set to keep the
# notebook fast, then preview the first rows.
data = pd.read_csv(filepath_or_buffer="data/santander.csv", nrows=20000)
data.head(n=5)

In [None]:
# Separate the predictors from the label column.
x = data.drop(columns=["TARGET"])  # Features
y = data.loc[:, "TARGET"]  # Outcome

(x.shape, y.shape)

In [None]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=0,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

### Constant Features Removal

In [None]:
# A zero variance threshold flags features that take a single value in the
# training data. Fit on the training split only.
constant_filter = VarianceThreshold(threshold=0).fit(x_train)
constant_filter

In [None]:
# No. of features after constants removal
constant_filter.get_support().sum()

In [None]:
# get_support() is True for the features the filter keeps, so inverting the
# mask yields True for every constant (dropped) feature.
constant_list = (~constant_filter.get_support()).tolist()
constant_list

In [None]:
# Name of all the features which are constants
x.columns[constant_list]

In [None]:
# Drop the constant features from both splits using the filter fitted on the
# training data.
x_train_filter, x_test_filter = (
    constant_filter.transform(split) for split in (x_train, x_test)
)

In [None]:
# Now take a look at the original and the transformed data (after removing the constants)
x_train.shape, x_test.shape, x_train_filter.shape, x_test_filter.shape

## Quasi Constants Feature Removal

In [None]:
quasi_constant_filter = VarianceThreshold(threshold=0.01)

In [None]:
quasi_constant_filter.fit(x_train_filter)

In [None]:
quasi_constant_filter.get_support().sum()

In [None]:
# Apply the quasi-constant filter to the already constant-free train and test
# matrices.
x_train_quasi_filter, x_test_quasi_filter = (
    quasi_constant_filter.transform(split)
    for split in (x_train_filter, x_test_filter)
)

In [None]:
# Compare the shapes at each stage, train and test side by side:
# original data, after constant removal, and after quasi-constant removal.
(
    x_train.shape,
    x_test.shape,
    x_train_filter.shape,
    x_test_filter.shape,
    x_train_quasi_filter.shape,
    x_test_quasi_filter.shape,  # was the train shape repeated a second time
)

## Duplicate Features Removal

In [None]:
# Transpose so that every feature becomes a row; duplicated features can then
# be detected as duplicated rows.
x_train_T = x_train_quasi_filter.transpose()
x_test_T = x_test_quasi_filter.transpose()

In [None]:
# x_train_T is a NumPy array rather than a pandas DataFrame. Presumably this is
# because the VarianceThreshold transforms above already returned plain arrays
# (their default output), not because of the transpose itself.
type(x_train_T)

In [None]:
# Changing numpy array back to pandas dataframe.
# Each row of the transposed array is one surviving feature, so label the rows
# with the original column names that passed both variance filters. Without
# this the feature names are lost and every later column listing / plot shows
# integer positions instead of names.
kept_feature_names = x_train.columns[constant_filter.get_support()][
    quasi_constant_filter.get_support()
]
x_train_T = pd.DataFrame(x_train_T, index=kept_feature_names)
x_test_T = pd.DataFrame(x_test_T, index=kept_feature_names)

In [None]:
# Now we can see after transpose the rows has become columns and columns has become rows.
x_train_T.shape, x_test_T.shape

In [None]:
# Getting duplicate features count
x_train_T.duplicated().sum()

In [None]:
# Boolean Series over the rows (= features) of the transposed training data:
# True marks a row that repeats an earlier one, False marks a first occurrence.
duplicated_features = x_train_T.duplicated(keep="first")
duplicated_features

In [None]:
# Removing duplicated features.
# Invert the duplicate mask so that True now means "keep this feature"
# and False means "drop it because it duplicates an earlier one".
features_to_keep = (~duplicated_features).tolist()
features_to_keep

In [None]:
# Final dataset after removing constants, quasi constants and duplicates.
# Select the non-duplicated feature rows with the boolean mask, then transpose
# back so that samples are rows and features are columns again.
x_train_unique = x_train_T.loc[features_to_keep].transpose()
x_test_unique = x_test_T.loc[features_to_keep].transpose()

In [None]:
x_train.shape, x_test.shape, x_train_unique.shape, x_test_unique.shape

## Build Model and Compare the Performance after and before removal.

In [None]:
def run_random_forest(x_train, x_test, y_train, y_test):
    """Fit a random forest on the training split and print its test accuracy.

    A 100-tree forest with a fixed seed is trained on ``x_train``/``y_train``
    using all available cores, then scored with ``accuracy_score`` against
    ``y_test``. The result is printed; nothing is returned.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    predictions = forest.fit(x_train, y_train).predict(x_test)
    print("Accuracy on test set: ")
    print(accuracy_score(y_test, predictions))

In [None]:
%%time
# Run on final data.
run_random_forest(x_train_unique, x_test_unique, y_train, y_test)

In [None]:
%%time
# Run on original data.
run_random_forest(x_train, x_test, y_train, y_test)

As we can see, the time taken is lower after removing the constant, quasi-constant and duplicate features, and the accuracy does not suffer compared to the original data.

What we can say here is that removing constants, quasi constants and duplicates does not degrade the accuracy; rather, it improves it.

## Calculate Mutual Information

In [None]:
# Estimate the mutual information between every remaining feature and the
# target. The estimator is randomised, so pin the seed (as the rest of the
# notebook does) to get the same scores on every run.
mi = mutual_info_classif(x_train_unique, y_train, random_state=0)

In [None]:
len(mi)

In [None]:
mi

In [None]:
# Wrap the score array in a Series labelled by the corresponding feature column.
mi = pd.Series(mi, index=x_train_unique.columns)

In [None]:
# Rank the features from most to least informative.
mi = mi.sort_values(ascending=False)

In [None]:
mi.plot.bar(figsize=(16, 5))

As you can see, beyond a certain point in the ranking the remaining features carry very little mutual information.

In [None]:
# Keep only the top 10% of features ranked by mutual information with the
# target. The scorer is randomised, so bind a fixed seed to it; otherwise the
# selected feature set can change from run to run.
from functools import partial

sel = SelectPercentile(
    partial(mutual_info_classif, random_state=0), percentile=10
).fit(x_train_unique, y_train)

In [None]:
x_train_unique.columns[sel.get_support()]

In [None]:
len(x_train_unique.columns[sel.get_support()])

In [None]:
help(sel)

In [None]:
# Reduce both splits to the features chosen by the mutual-information selector.
x_train_mi, x_test_mi = (
    sel.transform(split) for split in (x_train_unique, x_test_unique)
)

In [None]:
x_train_mi.shape, x_test_mi.shape

In [None]:
%%time
run_random_forest(x_train_mi, x_test_mi, y_train, y_test)

## Mutual Information Gain in Regression

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import only works with older scikit-learn releases. Confirm the
# pinned version, or switch this section to another regression dataset.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
boston = load_boston()

In [None]:
print(boston.DESCR)

In [None]:
# Put the feature matrix in a DataFrame labelled with the data set's own
# feature names, and preview the first rows.
x = pd.DataFrame(boston.data, columns=boston.feature_names)
x.head(n=5)

In [None]:
y = boston.target

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
# Mutual information between each feature and the continuous target, labelled
# by feature name and ranked from most to least informative. The estimator is
# randomised, so the seed is fixed to make the ranking reproducible.
mi = pd.Series(
    mutual_info_regression(x_train, y_train, random_state=0),
    index=x_train.columns,
).sort_values(ascending=False)

In [None]:
# Now we can see which feature is giving more information
mi

In [None]:
mi.plot.bar()

In [None]:
# Keep the 9 features with the highest mutual information with the target.
# Bind a fixed seed to the randomised scorer so the selection is reproducible.
from functools import partial

sel = SelectKBest(partial(mutual_info_regression, random_state=0), k=9).fit(
    x_train, y_train
)

In [None]:
x_train.columns[sel.get_support()]

#### Calculate r2_score, root mean squared error and SD using all features

In [None]:
# Baseline: ordinary least squares trained on every feature, then used to
# predict the held-out targets.
model = LinearRegression().fit(x_train, y_train)
y_predict = model.predict(x_test)

In [None]:
r2_score(y_test, y_predict)

In [None]:
# Root Mean Squared Error (RMSE)
np.sqrt(mean_squared_error(y_test, y_predict))

In [None]:
# Standard Deviation
np.std(y)

#### Calculate r2_score, root mean squared error and SD using the selected features

In [None]:
# Calculating for selected features
x_train_9 = sel.transform(x_train)
x_train_9.shape

In [None]:
x_test_9 = sel.transform(x_test)
x_test_9.shape

In [None]:
# Same linear model, trained and evaluated on the 9 selected features only.
model = LinearRegression().fit(x_train_9, y_train)
y_predict_9 = model.predict(x_test_9)

In [None]:
r2_score(y_test, y_predict_9)

In [None]:
# Root Mean Squared Error (RMSE)
np.sqrt(mean_squared_error(y_test, y_predict_9))