# Feature selection (simple)

Download the fruits data [here](https://drive.google.com/file/d/1M8tiAWDZclABJN1Meq9oEHCLJTNyHdpP/view?usp=share_link)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
np.random.seed(1)

# Load dataset
data = 
X, y = 

## Add some noisy data to degrade features
random_columns = np.random.permutation(22)
random_features = np.random.RandomState(1000).uniform(0, 0.5, size=(X.shape[0], 20))
X = np.hstack((X, random_features))
X = X[:, random_columns]

print (X.shape, y.shape)

In [None]:
# Split dataset to select features and evaluate the classifier.
# The split happens before scaling so the scaler never sees held-out samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Normalize data to 0-1 using the range of the training samples only.
# Fitting the scaler on the full dataset would leak the test set's min/max
# into training; as a consequence, test values outside the training range
# can fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Keep X itself scaled as well, in case a later cell uses it directly.
X = scaler.transform(X)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Shuffled column order, with a plain index row underneath for reference.
print("\n", random_columns, "\n", np.arange(len(random_columns)))

## Classification using SVC

In [None]:
# Train a classifier using SVC on train data


In [None]:
# Take predictions and compute some metrics (acc and CM)


## Use Recursive Feature Elimination

### RFE -> [Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE)


In [None]:
from sklearn.feature_selection import RFE

# Create a new SVC classifier 
# NOTE(review): RFE presumably needs an estimator that exposes `coef_` or
# `feature_importances_` (for SVC that would mean a linear kernel) - confirm
# against the RFE documentation linked above.
clf =

# Compute RFE
# Exercise: pass the classifier and the number of features to keep.
rfe = RFE( estimator = _ , n_features_to_select = _ )

# Exercise: fit on the training split. The result is kept as best_features;
# later cells read its n_features_ and ranking_ attributes.
best_features = rfe.fit( _ , _ )


In [None]:
# See results
# ranking_ is printed under the "Selected Features" label; the next cell
# treats rank 1 as "selected" when it filters on ranking_ == 1.
print("Num Features: %d" % best_features.n_features_)
print("Selected Features: %s" % best_features.ranking_)
# Index row for reference, sized from ranking_ rather than a hard-coded 22
# so it always matches the number of columns that were ranked.
print("                 : %s" % np.arange(len(best_features.ranking_)))

In [None]:
# Positions of the columns whose RFE rank equals 1, as a plain Python list.
list_best = np.flatnonzero(best_features.ranking_ == 1).tolist()
print(list_best)

In [None]:
# Select best features
# Exercise: replace each `_` with the train / test matrix restricted to the
# selected columns. Both results must expose `.shape` for the print below.
best_train = _ 
best_test  = _ 

# Print both shapes so the reduced column count can be checked.
print(best_train.shape, best_test.shape)

In [None]:
# Retrain classifier on best features and make predictions


In [None]:
# Compute metrics
# Exercise: fill in the accuracy and the confusion matrix for the predictions
# made on the RFE-selected features (accuracy_score and confusion_matrix are
# imported in the first cell).
acc_best = 
cm_best  =

print("For Recursive feature elimination")



## Use univariate selection to determine best features


### SelectKBest -> [Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest)

Metrics: [F_classif (ANOVA)](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html#sklearn.feature_selection.f_classif), [Chi2](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn.feature_selection.chi2), [mutual_information](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2 # f_classif is the ANOVA scorer; chi2 is the chi-squared one

# Exercise: choose a scoring function (e.g. one of those imported above) and
# the number of features k to keep, then fit on the training split.
skb = SelectKBest( _ , k = _ )
skb.fit( _ , _ )

# Exercise: reduce the train and test matrices to the selected columns.
best_train = _
best_test = _

print (best_train.shape)

(112, 2)


In [None]:
# shape[1] is the column (feature) count; shape[0] is the number of samples
# and would report the training-set size instead.
print("Num Features: %d" % best_train.shape[1])
# get_support() is a boolean mask over the input columns; shown as 0/1 flags.
print("Selected Features: %s" % np.array(skb.get_support(),dtype=int).tolist())
# Index row for reference, sized from the mask rather than a hard-coded 22.
print("                 : %s" % np.arange(len(skb.get_support())))

In [None]:
# Retrain classifier on best features and make predictions



In [None]:
# Compute metrics
acc_best = 
cm_best  =

print("For Recursive feature elimination")



### SelectPercentile -> [Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html#sklearn.feature_selection.SelectPercentile)

In [None]:
from sklearn.feature_selection import SelectPercentile, f_classif, chi2 # f_classif is the ANOVA scorer; chi2 is the chi-squared one

# Exercise: choose a scoring function and the percentile of features to keep,
# then fit on the training split.
spc = SelectPercentile( _ , percentile = _ )
spc.fit( _ , _ )

# Exercise: reduce the train and test matrices to the selected columns.
best_train = 
best_test = 

print (best_train.shape)

In [None]:
# shape[1] is the column (feature) count; shape[0] is the number of samples
# and would report the training-set size instead.
print("Num Features: %d" % best_train.shape[1])
# Exercise: replace `_` with the selection mask of the fitted selector.
print("Selected Features: %s" % _ )
print("                 : %s" % np.arange(22))

In [None]:
# Retrain classifier on best features and make predictions



In [None]:
# Compute metrics
acc_best = 
cm_best  =

print("For Recursive feature elimination")



## Use a meta-transformer to determine best features

### SelectFromModel -> [Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#sklearn.feature_selection.SelectFromModel)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Create a new SVC classifier 
# NOTE(review): SelectFromModel presumably needs an estimator that exposes
# `coef_` or `feature_importances_` after fitting (for SVC that would mean a
# linear kernel) - confirm against the documentation linked above.
clf = _ 

# Compute metatransformer
sfm = SelectFromModel(estimator = clf)

# Fit on the training split only; the result is kept as best_features.
best_features = sfm.fit(X_train, y_train)

In [None]:
# Transform best features
# Exercise: replace each `_` with the train / test matrix reduced to the
# columns kept by the fitted selector. Both results must expose `.shape`.
best_train = _ 
best_test  = _ 

# Print both shapes so the reduced column count can be checked.
print(best_train.shape, best_test.shape)

In [None]:
# Count the True entries of the selection mask: n_features_in_ is the number
# of columns seen during fit (all inputs), not the number that was kept.
print("Num Features: %d" % sfm.get_support().sum())
# Exercise: replace `_` with the selection mask of the fitted selector.
print("Selected Features: %s" % _ )
print("                 : %s" % np.arange(22))

In [None]:
# Retrain classifier on best features and make predictions



In [None]:
# Compute metrics
acc_best = 
cm_best  =

print("For Recursive feature elimination")

