# Feature Selection - Autocorrelation - Pearson Correlation Coefficients Filter Method

A dataset can contain correlated features. Two or more features are correlated when there is a strong linear relationship between them, i.e., one can be closely approximated by a linear function of the other.

The goal is to remove the correlated features, i.e., features which are similar to other features.

We will remove the collinearity using the Pearson Correlation Coefficient.

### Summary:
 - Feature Space to target correlation is desired
 - Feature to feature correlation is not desired
 - If two or more features are highly correlated then either feature is redundant
 - Correlation in feature space increases model complexity
 - Removing correlated features improves model performance
 - Different models show different performance on correlated features

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from sklearn.ensemble import RandomForestClassifier

# VarianceThreshold - Feature selector that removes all low-variance features.
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Read only the first 20,000 rows of the CSV (nrows=20000) — presumably to keep
# the experiment fast; the remaining rows of the file are not loaded.
data = pd.read_csv("data/santander.csv", nrows=20000)
data.head()

In [None]:
# Separate the label column from the predictors.
y = data["TARGET"]  # Outcome
x = data.drop(columns=["TARGET"])  # Features

x.shape, y.shape

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the TARGET class
# proportions the same in both splits; random_state=0 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

### Constant Features Removal

In [None]:
# With threshold=0 only features whose variance is zero (constant columns) are
# rejected. The filter is fitted on the training split only.
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)

In [None]:
# No. of features after constants removal
constant_filter.get_support().sum()

In [None]:
# Invert the support mask: True now marks the features the fitted filter
# rejected as constant, False marks the features it keeps.
constant_list = (~constant_filter.get_support()).tolist()
constant_list

In [None]:
# Name of all the features which are constants
x.columns[constant_list]

In [None]:
# Removing all the constant features from the training and test datasets.
# NOTE: scikit-learn's transform() returns NumPy arrays by default, so the
# original column names are no longer attached to the data after this step.
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)

In [None]:
# Now take a look at the original and the transformed data (after removing the constants)
x_train.shape, x_test.shape, x_train_filter.shape, x_test_filter.shape

## Quasi Constants Feature Removal

In [None]:
# Features whose training variance does not exceed 0.01 are treated as
# quasi-constant and will be rejected by this filter.
quasi_constant_filter = VarianceThreshold(threshold=0.01)

In [None]:
quasi_constant_filter.fit(x_train_filter)

In [None]:
quasi_constant_filter.get_support().sum()

In [None]:
x_train_quasi_filter = quasi_constant_filter.transform(x_train_filter)
x_test_quasi_filter = quasi_constant_filter.transform(x_test_filter)

In [None]:
# Compare the shapes of the original data, the data after constant removal,
# and the data after quasi-constant removal (train and test for each stage).
(
    x_train.shape,
    x_test.shape,
    x_train_filter.shape,
    x_test_filter.shape,
    x_train_quasi_filter.shape,
    x_test_quasi_filter.shape,
)

## Duplicate Features Removal

In [None]:
# Transpose so that every feature becomes a row: duplicated features can then
# be detected as duplicated rows.
x_train_T = x_train_quasi_filter.T
x_test_T = x_test_quasi_filter.T

In [None]:
# The data is a NumPy array at this point, not a pandas DataFrame. The conversion
# happened in VarianceThreshold.transform(); transposing does not change the type.
type(x_train_T)

In [None]:
# Changing the NumPy arrays back to pandas DataFrames so that .duplicated() is
# available. Row and column labels are positional integers from here on; the
# original feature names are not restored.
x_train_T = pd.DataFrame(x_train_T)
x_test_T = pd.DataFrame(x_test_T)

In [None]:
# After the transpose the rows have become columns and the columns have become
# rows: each row is now one feature.
x_train_T.shape, x_test_T.shape

In [None]:
# Getting duplicate features count
x_train_T.duplicated().sum()

In [None]:
# Boolean Series with one entry per feature (row of the transposed training data).
duplicated_features = x_train_T.duplicated()
duplicated_features

# True marks a row that repeats an earlier row; the first occurrence stays False.

In [None]:
# Removing duplicated features.
# Flip the duplicate mask so that True means "keep" (first occurrences) and
# False means "drop" (repeats of an earlier feature).
features_to_keep = (~duplicated_features).tolist()
features_to_keep

In [None]:
# Final dataset after removing constants, quasi constants and duplicates.

# The keep-mask computed on the training data selects the same feature rows in
# both frames; transposing afterwards restores the samples-by-features layout.
x_train_unique = x_train_T.loc[features_to_keep].transpose()
x_test_unique = x_test_T.loc[features_to_keep].transpose()

In [None]:
x_train.shape, x_test.shape, x_train_unique.shape, x_test_unique.shape

## Build Model and Compare the Performance after and before removal.

In [None]:
def run_random_forest(x_train, x_test, y_train, y_test):
    """Fit a random forest on the training split and print its test accuracy.

    A 100-tree RandomForestClassifier (random_state=0, all cores via n_jobs=-1)
    is trained on ``x_train``/``y_train``, used to predict ``x_test``, and the
    accuracy against ``y_test`` is printed. Nothing is returned.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    predictions = forest.fit(x_train, y_train).predict(x_test)
    # Header and score go on separate lines, hence the newline separator.
    print("Accuracy on test set: ", accuracy_score(y_test, predictions), sep="\n")

In [None]:
%%time
# Run on final data.
run_random_forest(x_train_unique, x_test_unique, y_train, y_test)

In [None]:
%%time
# Run on original data.
run_random_forest(x_train, x_test, y_train, y_test)

As we can see the accuracy and time taken is less after removing the constants, quasi constants and duplicates compare to the original data. 

What we can say here is that removing constants, quasi constants and duplicates doesn't degrade the accuracy; rather, it improves it.

## Removing Correlated Data 

In [None]:
corrmat = x_train_unique.corr()

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(corrmat)

In [None]:
def get_correlation(data, threshold):
    """Return the columns of ``data`` that correlate strongly with an earlier column.

    Parameters:
        data: Training or testing data (a DataFrame whose ``corr()`` is used).
        threshold: Absolute correlation above which a column is reported.

    Returns:
        A set of column labels. A column is included when the absolute value of
        its correlation with at least one column positioned before it is
        strictly greater than ``threshold``. The earlier column of such a pair
        is not reported on account of that pair.
    """
    corrmat = data.corr()

    # Strictly-lower triangle: entry (i, j) with j < i pairs column i with an
    # earlier column j, so each pair is looked at once and the diagonal is skipped.
    strong = np.tril(np.abs(corrmat.to_numpy()) > threshold, k=-1)

    # Any hit in row i means column i is correlated with some earlier column.
    return {corrmat.columns[i] for i in np.flatnonzero(strong.any(axis=1))}

In [None]:
corr_features = get_correlation(x_train_unique, 0.85)
corr_features

In [None]:
len(corr_features)

In [None]:
# Removing all correlated features
x_train_uncorr = x_train_unique.drop(labels=corr_features, axis=1)
x_test_uncorr = x_test_unique.drop(labels=corr_features, axis=1)

In [None]:
x_train_uncorr.shape, x_test_uncorr.shape

In [None]:
%%time
run_random_forest(x_train_uncorr, x_test_uncorr, y_train, y_test)

Here we can see the accuracy is still close to the accuracy with the original dataset, but the total training time has been brought down.

## Feature Grouping

In the above case we discarded all the correlated features, but now we will try keeping the most important feature from each group of correlated features. Within a group, the feature which gives more information will be kept and the others will be discarded.

In [None]:
corrmat

In [None]:
# Stack the absolute correlation matrix into one long Series indexed by
# (feature, feature) pairs. Ex., 0 vs 0 to 244
corrdata = corrmat.abs().stack()
corrdata

In [None]:
# Sorting all values in descending order, strongest correlations first
corrdata = corrdata.sort_values(ascending=False)
corrdata

# Now we can see the correlated pairs stacked together.
# Each pair appears in both orders, because the correlation matrix is symmetric.
# Ex., Feature 29 is correlated with 58 and 58 is correlated with 29 and so on.

In [None]:
# Keep only the strongly correlated pairs (|r| > 0.85).
corrdata = corrdata[corrdata > 0.85]
# Drop each feature's pairing with itself by comparing the two index levels.
# Testing `corrdata < 1` instead would also throw away distinct features that
# are perfectly correlated (|r| == 1), which get_correlation does report.
corrdata = corrdata[
    corrdata.index.get_level_values(0) != corrdata.index.get_level_values(1)
]
corrdata

In [None]:
# Converting to a pandas DataFrame: reset_index() turns the two index levels
# (the pair of features) into ordinary columns next to the correlation value.
corrdata = pd.DataFrame(corrdata).reset_index()
corrdata.columns = ["Features1", "Features2", "Corr_Value"]
corrdata

In [None]:
# Grouping the correlated features: each group is the block of rows pairing one
# "seed" feature with every feature correlated to it, e.g. all the features
# which are correlated to 0 and so on.
grouped_features_list = []
correlated_groups_list = []

for feature in corrdata.Features1.unique():
    # A feature already absorbed into an earlier group does not seed a new one.
    if feature in grouped_features_list:
        continue
    correlated_block = corrdata[corrdata.Features1 == feature]
    # Record the seed's partners first, then the seed itself.
    grouped_features_list.extend(correlated_block.Features2.unique())
    grouped_features_list.append(feature)
    correlated_groups_list.append(correlated_block)

In [None]:
len(correlated_groups_list)

In [None]:
for group in correlated_groups_list:
    print(group)

## Feature Importance - Based on Tree based classifier

In [None]:
# For every correlated group, fit a random forest on just that group's features
# and remember the feature with the highest importance.
important_features = []
for group in correlated_groups_list:

    # Seed feature of the group followed by every feature correlated with it.
    features = [*group.Features1.unique(), *group.Features2.unique()]

    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf.fit(x_train_unique[features], y_train)

    # One row per feature, ordered from most to least important.
    importance = pd.DataFrame(
        {"Features": features, "Importance": rf.feature_importances_}
    )
    importance = importance.sort_values(by="Importance", ascending=False)
    most_important_feature = importance.iloc[0]
    important_features.append(most_important_feature)

In [None]:
important_features

In [None]:
important_features = pd.DataFrame(important_features)

In [None]:
important_features.reset_index(inplace=True, drop=True)

In [None]:
important_features

In [None]:
features_to_consider = set(important_features["Features"])

In [None]:
# Discard every feature involved in a strong correlation except the per-group
# winners. `corr_features` alone is not enough: get_correlation reports only
# the later column of each correlated pair, so an earlier-listed feature that
# lost to its group's winner would otherwise survive next to it. Taking the
# union keeps everything that was discarded before and adds those features.
features_to_discard = (
    set(corr_features) | set(corrdata["Features1"]) | set(corrdata["Features2"])
) - set(features_to_consider)

In [None]:
features_to_discard = list(features_to_discard)

In [None]:
x_train_grouped_uncorr = x_train_unique.drop(labels=features_to_discard, axis=1)
x_train_grouped_uncorr.shape

In [None]:
x_test_grouped_uncorr = x_test_unique.drop(labels=features_to_discard, axis=1)
x_test_grouped_uncorr.shape

In [None]:
%%time

# Original Data
run_random_forest(x_train, x_test, y_train, y_test)

In [None]:
%%time

# After discarding all correlated data
run_random_forest(x_train_uncorr, x_test_uncorr, y_train, y_test)

In [None]:
%%time

# After keeping most important correlated data
run_random_forest(x_train_grouped_uncorr, x_test_grouped_uncorr, y_train, y_test)

If you are handling a very large dataset then the feature selection is very important.