# Feature Selection

This notebook tries out different feature selection strategies available in scikit-learn, as well as a few neural approaches.

## Dependencies

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    RFECV,
    SelectFromModel,
    SelectKBest,
    SelectPercentile,
    VarianceThreshold,
    chi2,
    f_classif,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC

## Preprocess Dataset

Download dataset to parent data directory.

In [None]:
# Fetch dorothea.zip into ../data/ with wget and unzip it there; the whole step is skipped when ../data/dorothea.zip already exists.
!if [ ! -f ../data/dorothea.zip ]; then wget -P ../data/ https://archive.ics.uci.edu/static/public/169/dorothea.zip && unzip ../data/dorothea.zip -d ../data/; fi

In [None]:
class DataPreprocessor:
    """Load a sparse "active feature" file and its labels into one DataFrame.

    Each line of ``features_file`` describes one sample as a whitespace-separated
    list of integer feature indices that are active (value 1) for that sample.
    ``targets_file`` is read with ``pd.read_csv`` as a single headerless column.

    Calling the instance parses both files and returns a frame with one float
    column per feature index seen in the file (sorted ascending, 1.0 when
    active and 0.0 otherwise) followed by a ``"target"`` column.
    """

    def __init__(self, features_file, targets_file):
        """Remember the two input paths; no file is read until the instance is called."""
        self.features_file = features_file
        self.targets_file = targets_file
        self.df = None

    def _preprocess(self):
        """Parse the feature and target files and store the result in ``self.df``.

        The indicator matrix is filled in one vectorised assignment instead of
        building and aligning one ``pd.Series`` per sample, which is slow for
        wide data and yields an object-dtype frame when a line is blank.
        """
        rows = []
        with open(self.features_file, 'r') as f:
            for line in f:
                # A blank line is a sample with no active feature (all zeros).
                rows.append([int(feature) for feature in line.split()])

        # Flatten every (sample, feature) occurrence; np.unique yields the sorted
        # column labels and, for each occurrence, its column position.
        flat = np.array([feature for row in rows for feature in row], dtype=np.int64)
        columns, column_positions = np.unique(flat, return_inverse=True)
        row_lengths = np.array([len(row) for row in rows], dtype=np.intp)
        row_positions = np.repeat(np.arange(len(rows)), row_lengths)

        # Repeated indices on a line just set the same cell to 1.0 again.
        dense = np.zeros((len(rows), len(columns)), dtype=np.float64)
        dense[row_positions, column_positions] = 1.0
        features = pd.DataFrame(dense, columns=columns)

        targets = pd.read_csv(self.targets_file, header=None, names=["target"])
        # Both frames carry a default 0..n-1 index, so rows are paired by position.
        self.df = pd.concat([features, targets], axis=1)

    def __call__(self):
        """Run the preprocessing and return the combined features + target frame."""
        self._preprocess()
        return self.df

In [None]:
# Point the preprocessor at the Dorothea training split (features + labels).
wrangler = DataPreprocessor(
    features_file='../data/DOROTHEA/dorothea_train.data',
    targets_file='../data/DOROTHEA/dorothea_train.labels',
)

In [None]:
# Parse the files and keep the combined features + target frame.
df = wrangler()

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns='target')

## Feature Data Types

In [None]:
# Map every feature column to the array of distinct values it contains.
feature_number_unique_values = {
    name: values.unique() for name, values in X.items()
}

In [None]:
# Turn each column's distinct values into a sorted tuple so that columns with
# the same value set collapse onto the same hashable key.
unique_values_by_columns = list(feature_number_unique_values.values())
tuples = [tuple(np.sort(arr)) for arr in unique_values_by_columns]

# Count how many columns share each value set.
frequency_table = Counter(tuples)

# Tabulate under its own name: binding this to `df` would overwrite the dataset
# frame and break any re-run of the cell that splits `df` into X and y.
frequency_df = pd.DataFrame.from_records(list(frequency_table.items()), columns=['Array', 'Frequency'])

print(frequency_df)


So we see that every feature is binary: each column takes only the values 0 and 1, and both values occur at least once in every column.

Let's look at the target `y`.

In [None]:
# Distinct label values present in the target column.
y.unique()

The drug discovery target `y` is also binary, taking values `-1` and `1`.

# Feature Selection Algorithms

## Sweeping Univariate Feature Selection

Selecting features using univariate statistical tests of the relationship between each feature and the target variable.

### Only Univariate Feature Based

#### Variance Threshold

Let's see a histogram of the variance of each feature.

In [None]:
# Per-feature variance across samples.
variances = X.var()

# Log-scaled counts keep the sparsely populated bins visible.
fig, ax = plt.subplots()
ax.hist(variances, bins='auto', log=True)
ax.set_title('Histogram of Variances')
ax.set_xlabel('Variance')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# The small offset keeps the logarithm finite for zero-variance features.
log_variances = np.log(variances + 1e-9)

fig, ax = plt.subplots()
ax.hist(log_variances, bins='auto', log=True)
ax.set_title('Histogram of Log Variances')
ax.set_xlabel('Log Variance')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Empirical CDF of the raw per-feature variances (cumulative, normalised step histogram).
plt.hist(variances, bins='auto', density=True, cumulative=True, histtype='step', alpha=0.8)
plt.title('CDF of Variances')
# This plot shows raw variances, so the axis is labelled 'Variance' (not 'Log Variance').
plt.xlabel('Variance')
plt.ylabel('Cumulative Probability')
plt.grid(True)

# Calculate the median of variances
median = np.median(variances)

# Plot the median as a dotted line
plt.axvline(median, color='b', linestyle='dotted', linewidth=2, label=f'Median Variance: {median:.2f}')

plt.legend()
plt.show()

In [None]:
# Box plot of the raw per-feature variances.
fig, ax = plt.subplots()
ax.boxplot(variances)
ax.set_title('Boxplot of Variances')
ax.set_ylabel('Variance')
ax.grid(True)
plt.show()

In [None]:
# Empirical CDF of the log-transformed variances (cumulative, normalised step histogram).
plt.hist(log_variances, bins='auto', density=True, cumulative=True, histtype='step', alpha=0.8)
plt.title('CDF of Log Variances')
plt.xlabel('Log Variance')
plt.ylabel('Cumulative Probability')
plt.grid(True)

# Calculate the median of log_variances
median = np.median(log_variances)

# Plot the median as a dotted thick blue line; the legend names the quantity
# actually drawn, which is the median of the log variances.
plt.axvline(median, color='b', linestyle='dotted', linewidth=2, label=f'Median Log Variance: {median:.2f}')

plt.legend()
plt.show()

In [None]:
# Box plot of the log-transformed variances, the companion of the log-variance CDF.
# The raw-variance box plot is already drawn by an earlier cell.
plt.boxplot(log_variances)
plt.title('Boxplot of Log Variances')
plt.ylabel('Log Variance')
plt.grid(True)
plt.show()

In [None]:
# Threshold handed to VarianceThreshold for the variance-based filter.
variance_cutoff = 0.01
selector = VarianceThreshold(threshold=variance_cutoff).fit(X)
selected_features = selector.transform(X)

### 

### Model Based Feature Selection: Univariate Feature Target

It may be useful to one hot encode the target y.

In [None]:
# OneHotEncoder expects a 2-D input, so the label vector becomes a single column first.
target_column = y.to_numpy().reshape(-1, 1)
encoder = OneHotEncoder(sparse_output=False)
y_one_hot = encoder.fit_transform(target_column)

Classification-based univariate feature selection using `SelectKBest` and `SelectPercentile`.

In [None]:
# Keep the two features with the highest ANOVA F-score against the target.
k_best_f = SelectKBest(score_func=f_classif, k=2)
X_new = k_best_f.fit_transform(X, y)
X_new.shape

In [None]:
# Keep the two features with the highest chi-squared statistic against the target.
k_best_chi2 = SelectKBest(score_func=chi2, k=2)
X_new = k_best_chi2.fit_transform(X, y)
X_new.shape

In [None]:
# commented out slow running block
# NOTE: mutual_info_classif is not in the import cell; add it to the
# sklearn.feature_selection import before re-enabling this cell.
# X_new = SelectKBest(mutual_info_classif, k=2).fit_transform(X, y)
# X_new.shape

In [None]:
# `percentile` is expressed in percent, so 0.01 keeps about 0.01% of the columns.
# NOTE(review): confirm 0.01% (rather than 1%) is what is intended here.
percentile_f = SelectPercentile(score_func=f_classif, percentile=0.01)
X_new = percentile_f.fit_transform(X, y)
X_new.shape

In [None]:
# `percentile` is expressed in percent, so 0.01 keeps about 0.01% of the columns.
# NOTE(review): confirm 0.01% (rather than 1%) is what is intended here.
percentile_chi2 = SelectPercentile(score_func=chi2, percentile=0.01)
X_new = percentile_chi2.fit_transform(X, y)
X_new.shape

In [None]:
# commented out slow running block
# NOTE: mutual_info_classif is not in the import cell; add it to the
# sklearn.feature_selection import before re-enabling this cell.
# X_new = SelectPercentile(mutual_info_classif, percentile=0.01).fit_transform(X, y)
# X_new.shape

### Recursive Feature Elimination (RFE)

RFE is similar to Sequential Feature Selection with backward elimination: it recursively removes the weakest feature(s) according to a ranking produced by the fitted model (often the coefficients in `coef_` or the scores in `feature_importances_`).

Performing RFE using Logistic Regression

In [None]:
# L2-penalised logistic regression supplies the coefficients RFE ranks features by.
logreg = LogisticRegression(penalty="l2")

# NOTE(review): 88110 is presumably just below X.shape[1], so only a handful of
# single-feature elimination rounds run — confirm against the data.
selector = RFE(
    estimator=logreg,
    n_features_to_select=88110,
    step=1,
).fit(X, y)

# Print the mask of selected features
print(selector.support_)

Use cross-validation to choose how many features the RFE routine keeps.

In [None]:
# Lower bound on the number of features RFECV is allowed to keep.
min_features_to_select = 88110
clf = LogisticRegression()
cv = StratifiedKFold(n_splits=5)

# Drop one feature per round and score every candidate subset size by
# cross-validated accuracy.
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

RFE with a random forest classifier (ranking features by `feature_importances_`).

In [None]:
# Lower bound on the number of features RFECV is allowed to keep.
min_features_to_select = 88110
clf = RandomForestClassifier()
cv = StratifiedKFold(n_splits=5)

# Same cross-validated elimination as a logistic-regression RFECV would run,
# but driven by a random forest estimator.
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

### SelectFromModel

#### L1 based 

In [None]:

# Fit an L1-penalised linear SVM; SelectFromModel reuses the already-fitted
# estimator (prefit=True) to decide which columns to keep.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
lsvc.fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

In [None]:
# Same selection idea with an L1-penalised logistic regression (liblinear solver).
logistic_l1 = LogisticRegression(penalty='l1', solver='liblinear')
logistic_l1.fit(X, y)
model = SelectFromModel(logistic_l1, prefit=True)
X_new = model.transform(X)
X_new.shape

#### Tree based

In [None]:
# Fit a 50-tree extra-trees ensemble and hand the fitted model to SelectFromModel.
clf = ExtraTreesClassifier(n_estimators=50).fit(X, y)
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
X_new.shape

### Sequential Feature Selection

In [None]:
# long running process
# NOTE: before re-enabling, import KNeighborsClassifier (sklearn.neighbors) and
# SequentialFeatureSelector (sklearn.feature_selection); neither is in the import cell.
# knn = KNeighborsClassifier(n_neighbors=5)
# sfs_backward = SequentialFeatureSelector(
#     knn, n_features_to_select = X.shape[1] - 5, direction="backward"
# ).fit(X, y)

# print(
#     "Number Features selected by backward sequential selection: "
#     f"{sfs_backward.get_support().sum()}"
# )

### Pipeline Feature Selection

In [None]:
# Step 1: keep the features an L1-penalised linear SVM selects.
feature_selection_step = SelectFromModel(LinearSVC(dual="auto", penalty="l1"))
# Step 2: classify on the reduced feature set.
classification_step = RandomForestClassifier()

clf = Pipeline(steps=[
    ('feature_selection', feature_selection_step),
    ('classification', classification_step),
])
clf.fit(X, y)

### Neural Learning

#### MLP gradient based feature selection for each class (or relative regression from a baseline)

#### Relative gradient 


#### Permutation based feature selection

#### AutoEncoder (relu activations as importance of first layer)

#### Gradient based method

#### Regularization of Neural network

#### RFE with Neural Network

#### TabTransformer based selection (if possible)