In [None]:
!pip install reliefe

In [None]:
!pip install umap -U

# Feature Ranking - TPS Nov21

## ReliefE: Feature Ranking in High-dimensional Spaces via Manifold Embeddings

* https://arxiv.org/pdf/2101.09577.pdf

## Import packages

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
import optuna

import reliefe
from sklearn.model_selection import train_test_split

import gc; gc.enable()

import warnings
warnings.filterwarnings('ignore')

## User-Defined Helpers

### Down-Casting to Reduce Memory Usage

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Down-cast the numeric columns of ``df`` to the narrowest dtype that fits their range.

    Every column whose dtype is one of the signed-integer or float dtypes
    listed in ``numerics`` is replaced by a copy cast to the smallest candidate
    dtype whose representable range contains the column's minimum and maximum.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns down-cast.

    Notes
    -----
    * Range checks are inclusive, so a column that touches a dtype's limit
      (e.g. a maximum of exactly 127) still gets that dtype.
    * For floats only the value *range* is checked, not the precision:
      casting to ``float16``/``float32`` rounds values to that dtype's precision.
    * An integer column whose min/max is NaN (e.g. no rows) fails every range
      check and keeps its dtype; a float column whose min/max is NaN or
      infinite falls through to ``float64``.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback when no narrower float dtype spans the range
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

## Load Previously Prepared Data
### Train/Test Combined

In [None]:
# Feature matrix (train and test rows combined) produced by an earlier data-prep step.
PATH = '../input/dataprep-nov21/gauss.parquet'
# Read the frame and down-cast its numeric columns in one expression.
X = reduce_memory_usage(pd.read_parquet(PATH))
gc.collect()
X.head()

### Target

In [None]:
# Target table produced by the same data-prep step.
PATH = '../input/dataprep-nov21/target.parquet'
# Read the frame and down-cast its numeric columns in one expression.
y = reduce_memory_usage(pd.read_parquet(PATH))
gc.collect()
y.head()

### Remove Test-Set

In [None]:
target = 'target'
# One target row per labelled sample, so its length is the number of rows to keep.
N = len(y)
# Keep only the first N rows of X.
# NOTE(review): assumes the labelled rows precede the test rows - confirm against the data-prep step.
X = X.iloc[:N]
# From here on `y` is the single target column (a Series), not the full frame.
y = y[target]
gc.collect()
y.head()

### 4-Bagged 5-Fold CV Labels

In [None]:
# Fold labels; every column is later iterated over and compared against fold ids 0-4.
PATH = '../input/dataprep-nov21/folds.parquet'
folds = pd.read_parquet(path=PATH)
folds.head(5)

## ReliefE Example
### Calculate Ranks on Samples

In [None]:
# Feature-importance table: one row per column of X, no columns yet.
fi = pd.DataFrame([], columns=X.columns.tolist()).T

# Running index over (fold column, fold id) pairs; it names the rank column of each run.
i = 0
for c in folds.columns.tolist():
    for j in range(5):
        # rows whose label in fold column `c` equals `j`
        key = folds[c].eq(j)
        X_fold, y_fold = X.loc[key], y.loc[key]

        # Stratified split; only the 1% "test" side is kept as a small sample for speed.
        parts = train_test_split(X_fold, y_fold,
                                 test_size=0.01, random_state=42, stratify=y_fold)
        X_train, y_train = parts[1], parts[3]
        del parts, X_fold, y_fold; gc.collect()

        # Fit ReliefE with its default settings on the sample.
        relief = reliefe.ReliefE()
        relief.fit(X_train.values, y_train.values)

        # Store the importances as ranks (rank 1 = largest importance).
        fi[f'rank_{i}'] = (
            pd.Series(relief.feature_importances_).rank(ascending=False).to_numpy()
        )

        # Advance the run index and release per-run objects before the next fit.
        i += 1
        del X_train, y_train, relief, key; gc.collect()

### Final Post Processing

In [None]:
# Aggregate the per-run ranks across all runs.
# Only columns named `rank_<i>` are selected. Matching on the substring 'rank'
# would also pick up `avg_rank` / `med_rank` once they exist, so re-running
# this cell would fold the summaries into themselves.
rank_cols = [c for c in fi.columns.tolist() if c.startswith('rank_')]
fi['avg_rank'] = fi[rank_cols].mean(axis=1)
fi['med_rank'] = fi[rank_cols].median(axis=1)

# Show the 20 features with the lowest (best) median rank.
fi.sort_values(by='med_rank').head(20)

### Save Results

In [None]:
# Persist the table; reset_index() turns the feature-name index into an ordinary column.
fi.reset_index().to_parquet(path='fi.parquet')