# Feature Selection Techniques

###### Referrals : https://www.analyticsvidhya.com/blog/2020/10/a-comprehensive-guide-to-feature-selection-using-wrapper-methods-in-python/

## 1. Supervised Feature Selection Techniques
## 2. Unsupervised Feature Selection Techniques

### Supervised Feature Selection Techniques

### 1. Filter methods
### 2. Wrapper methods

#### Wrapper methods

#### 1. Forward Elimination

In [None]:
#Forward Elimination starts with an empty set of features and iteratively combines features

In [None]:
# We start with an empty set of selected features and initialize the best accuracy to 0.

# In each iteration, we evaluate the contribution of each remaining feature to the model's accuracy.

# The feature that leads to the highest accuracy improvement is added to the selected features set.

# The loop continues until the desired number of features (num_features_to_select) is reached.

# Finally, we print the selected features' indices and the best accuracy achieved during the forward selection process.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize an empty set of selected features
selected_features = []
best_accuracy = 0.0

# Create a random forest classifier (you can use any classifier of your choice)
clf = RandomForestClassifier(random_state=42)

# Number of features to select (you can set this based on your requirements)
num_features_to_select = 3

while len(selected_features) < num_features_to_select:
    best_feature = None
    best_feature_accuracy = 0.0

    for feature_idx in range(X_train.shape[1]):
        if feature_idx not in selected_features:
            # Include the current feature in the selected features
            current_features = selected_features + [feature_idx]

            # Train the classifier on the current features
            clf.fit(X_train[:, current_features], y_train)

            # Make predictions on the test set
            y_pred = clf.predict(X_test[:, current_features])

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)

            # Check if this feature selection leads to a better accuracy
            if accuracy > best_feature_accuracy:
                best_feature_accuracy = accuracy
                best_feature = feature_idx

    # Add the best feature to the selected features
    selected_features.append(best_feature)

    # Update the best accuracy achieved so far
    if best_feature_accuracy > best_accuracy:
        best_accuracy = best_feature_accuracy

print("Selected Features (Indices):", selected_features)
print("Best Accuracy:", best_accuracy)


##### Implementing using built in functions

In [7]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.0-py3-none-any.whl (1.4 MB)
     ---------------------------------------- 1.4/1.4 MB 3.4 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: C:\Users\Predator\Desktop\Jupyter_Notebook\new\Scripts\python.exe -m pip install --upgrade pip


In [23]:
#importing the necessary libraries
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
# Sequential Forward Selection(sfs)
sfs = SFS(LinearRegression(),
          k_features=11,
          forward=True,
          floating=False,
          scoring = 'r2',
          cv = 0)

In [9]:
from sklearn.datasets import load_diabetes

In [10]:
diabetes=load_diabetes()

In [11]:
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [13]:
import pandas as pd
diabetes_df=pd.DataFrame(diabetes.data,columns=diabetes.feature_names)
diabetes_df["target"]=diabetes.target

In [14]:
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [15]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [16]:
diabetes_df.shape

(442, 11)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X=diabetes_df.drop("target",axis=1)
y=diabetes_df["target"]

In [19]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
forward=SequentialFeatureSelector(RandomForestClassifier(n_jobs=1,
                                                         k_features=1,
                                                         forward=True,
                                                         floating=False,
                                                         verbose=2,
                                                         scoring="accuracy",
                                                         cv=5).fit(X_train,y_train))
                                            
                                                         

TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'k_features'

In [1]:
import pandas as pd

In [2]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.datasets import load_breast_cancer

In [5]:
cancer=load_breast_cancer()
cancer

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [6]:
cancer_df=pd.DataFrame(cancer.data,columns=cancer.feature_names)

In [7]:
cancer_df["target"]=cancer.target

In [8]:
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [9]:
cancer_df.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X=cancer_df.drop("target",axis=1)
y=cancer_df["target"]

In [12]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2,test_size=0.2)

In [13]:
forward=SequentialFeatureSelector(RandomForestClassifier(n_jobs=1,
                                                         =1,
                                                         forward=True,
                                                         floating=False,
                                                         verbose=2,
                                                         scoring="accuracy",
                                                         cv=5).fit(X_train,y_train))

TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'k_features'

In [15]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Split the dataset into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Create the SequentialFeatureSelector for forward selection
num_features_to_select = 5  # You can set the desired number of features
forward_selector = SequentialFeatureSelector(model, k_features=num_features_to_select, forward=True, scoring='accuracy', cv=5)

# Fit the forward selector to the training data
forward_selector.fit(X_train, y_train)

# Get the selected feature indices
selected_feature_indices = forward_selector.k_feature_idx_

# Train a model using the selected features
selected_features = X_train[:, selected_feature_indices]
model.fit(selected_features, y_train)

# Make predictions on the test set using the selected features
X_test_selected = X_test[:, selected_feature_indices]
y_pred = model.predict(X_test_selected)

# Calculate the accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)

print("Selected Feature Indices:", selected_feature_indices)
print("Accuracy with Selected Features:", accuracy)


Selected Feature Indices: (0, 1, 23, 24, 27)
Accuracy with Selected Features: 0.956140350877193


In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split

In [2]:
cancer=load_breast_cancer()

In [4]:
import pandas as pd
cancer_df=pd.DataFrame(cancer.data,columns=cancer.feature_names)

In [5]:
cancer_df["target"]=cancer.target

In [6]:
X=cancer_df.drop("target",axis=1)
y=cancer_df["target"]

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [9]:
my_model=LogisticRegression()
forward=SequentialFeatureSelector(my_model,k_features=4,forward=True,scoring="accuracy",cv=5,verbose=2).fit(X_train,y_train)


[2023-09-30 11:38:40] Features: 1/4 -- score: 0.9208791208791209
[2023-09-30 11:38:41] Features: 2/4 -- score: 0.9384615384615385
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

[2023-09-30 11:38:42] Features: 4/4 -- score: 0.9582417582417582

In [22]:
selected_features_index = forward.k_feature_idx_
selected_features_index

(20, 21, 25, 26)

In [25]:
# Train a model using the selected features
selected_features = X_train[:, selected_features_index]
my_model.fit(selected_features, y_train)

# Make predictions on the test set using the selected features
X_test_selected = X_test[:, selected_features_index]
y_pred = my_model.predict(X_test_selected)

InvalidIndexError: (slice(None, None, None), (20, 21, 25, 26))

In [26]:
cancer_df.shape

(569, 31)

#### 2. Backward elimination

In [29]:
from sklearn.datasets import load_breast_cancer
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [30]:
mycancer=load_breast_cancer()

In [31]:
X=mycancer.data
y=mycancer.target

In [32]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [34]:
model=RandomForestClassifier(random_state=42)

In [35]:
backward_model=SequentialFeatureSelector(model,
                                         k_features=5,
                                         forward=False,
                                         verbose=2,
                                         cv=5,
                                         scoring="accuracy").fit(X_train,y_train)


[2023-09-30 12:23:53] Features: 29/5 -- score: 0.9604395604395604
[2023-09-30 12:25:22] Features: 28/5 -- score: 0.964835164835165
[2023-09-30 12:26:51] Features: 27/5 -- score: 0.9648351648351647
[2023-09-30 12:28:04] Features: 26/5 -- score: 0.9648351648351647
[2023-09-30 12:28:35] Features: 25/5 -- score: 0.9648351648351647
[2023-09-30 12:29:02] Features: 24/5 -- score: 0.9670329670329672
[2023-09-30 12:29:28] Features: 23/5 -- score: 0.9670329670329669
[2023-09-30 12:29:52] Features: 22/5 -- score: 0.9670329670329669
[2023-09-30 12:30:15] Features: 21/5 -- score: 0.9736263736263737
[2023-09-30 12:30:37] Features: 20/5 -- score: 0.9714285714285713
[2023-09-30 12:30:58] Features: 19/5 -- score: 0.9736263736263737
[2023-09-30 12:31:18] Features: 18/5 -- score: 0.9714285714285715
[2023-09-30 12:31:36] Features: 17/5 -- score: 0.9714285714285713
[2023-09-30 12:31:54] Features: 16/5 -- score: 0.9758241758241759
[2023-09-30 12:32:09] Features: 15/5 -- score: 0.9758241758241759
[2023-09-3

In [36]:
indices=backward_model.k_feature_idx_
indices

(1, 8, 23, 24, 25)

In [37]:
backward_model.k_feature_names_

('1', '8', '23', '24', '25')

In [38]:
# Train a model using the selected features
selected_features = X_train[:, indices]
model.fit(selected_features, y_train)

# Make predictions on the test set using the selected features
X_test_selected = X_test[:, indices]
y_pred = model.predict(X_test_selected)

In [39]:
accuracy_score(y_test,y_pred)

0.9649122807017544

#### 3. Bidirectional Feature Extraction

In [40]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Split the dataset into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RandomForestClassifier
model = RandomForestClassifier(random_state=42)

# Create the SequentialFeatureSelector for forward selection
forward_selector = SequentialFeatureSelector(model, k_features=1, forward=True, verbose=2,scoring='accuracy', cv=5)

# Fit the forward selector to the training data
forward_selector.fit(X_train, y_train)

# Get the selected feature indices from forward selection
selected_feature_indices_forward = forward_selector.k_feature_idx_

# Create the SequentialFeatureSelector for backward elimination
backward_selector = SequentialFeatureSelector(model, k_features=1, forward=False, verbose=2,scoring='accuracy', cv=5)

# Fit the backward selector to the training data
backward_selector.fit(X_train, y_train)

# Get the selected feature indices from backward elimination
selected_feature_indices_backward = backward_selector.k_feature_idx_

# Combine the selected feature indices from both steps
selected_feature_indices = list(set(selected_feature_indices_forward) & set(selected_feature_indices_backward))

# Train a model using the selected features
selected_features = X_train[:, selected_feature_indices]
model.fit(selected_features, y_train)

# Make predictions on the test set using the selected features
X_test_selected = X_test[:, selected_feature_indices]
y_pred = model.predict(X_test_selected)

# Calculate the accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)

print("Selected Feature Indices:", selected_feature_indices)
print("Accuracy with Selected Features:", accuracy)



[2023-09-30 13:17:20] Features: 1/1 -- score: 0.887912087912088
[2023-09-30 13:17:57] Features: 29/1 -- score: 0.9604395604395604
[2023-09-30 13:18:32] Features: 28/1 -- score: 0.964835164835165
[2023-09-30 13:19:05] Features: 27/1 -- score: 0.9648351648351647
[2023-09-30 13:19:37] Features: 26/1 -- score: 0.9648351648351647
[2023-09-30 13:20:08] Features: 25/1 -- score: 0.9648351648351647
[2023-09-30 13:20:34] Features: 24/1 -- score: 0.9670329670329672
[2023-09-30 13:21:00] Features: 23/1 -- score: 0.9670329670329669
[2023-09-30 13:21:25] Features: 22/1 -- score: 0.9670329670329669
[2023-09-30 13:21:49] Features: 21/1 -- score: 0.9736263736263737
[2023-09-30 13:22:11] Features: 20/1 -- score: 0.9714285714285713
[2023-09-30 13:22:32] Features: 19/1 -- score: 0.9736263736263737
[2023-09-30 13:22:52] Features: 18/1 -- score: 0.9714285714285715
[2023-09-30 13:23:10] Features: 17/1 -- score: 0.9714285714285713
[2023-09-30 13:23:28] Features: 16/1 -- score: 0.9758241758241759
[2023-09-30 

ValueError: Found array with 0 feature(s) (shape=(455, 0)) while a minimum of 1 is required by RandomForestClassifier.

#### 4. Exhaustive feature extraction

In [42]:
from sklearn.datasets import load_breast_cancer
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [43]:
can=load_breast_cancer()

In [44]:
X=can.data
y=can.target

In [45]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
umodel=RandomForestClassifier(random_state=42)

In [46]:
from sklearn.metrics import accuracy_score

In [None]:
exhaustive=ExhaustiveFeatureSelector(umodel,
                                     min_features=2,
                                     max_features=5,
                                     scoring="accuracy",
                                     cv=2).fit(X_train,y_train)
                                     