# Feature Selection

In [None]:
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the hotel review dataset from CSV, then show its dimensions and its first rows.
hotel_rev = pd.read_csv('data/HotelRevHelpfulness.csv')
print(hotel_rev.shape)
hotel_rev.head()

In [None]:
# Take the target column out of the frame, discard the hotel identifier,
# and treat every remaining column as a feature.
# Both removals mutate `hotel_rev` in place, so this cell can only run once per load.
y = hotel_rev.pop('reviewHelpfulness').values
del hotel_rev['hotelId']
X = hotel_rev.values
hotel_rev.shape

## Filter-based Feature Selection
### Feature Scoring - two methods  
1. Chi square statistic
2. Information Gain

In [None]:
# Chi-square statistic and p-value of each feature with respect to the target y.
# NOTE(review): sklearn's chi2 is documented for non-negative feature values — confirm X
# satisfies this (MinMaxScaler is imported at the top but not applied in the visible cells).
chi2_scores, pvals = chi2(X, y)
chi2_scores
# The chi square scores for all the features

In [None]:
# The p-values returned by chi2 alongside the scores, one per feature
pvals

In [None]:
# Mutual information ("information gain") between each feature and the target.
# The estimator is randomised, so the seed is fixed: this keeps the scores, and any
# ranking or correlation computed from them, identical across kernel restarts.
i_scores = mutual_info_classif(X, y, random_state=0)
i_scores
# The i-gain scores for all the features

In [None]:
from scipy import stats
# Spearman rank correlation between the two scorings: it compares the feature
# orderings produced by chi-square and by information gain, not the raw score values.
stats.spearmanr(chi2_scores, i_scores)
# correlation is low

In [None]:
# Pair every feature name with its information-gain score.
mi = dict(zip(hotel_rev.columns, i_scores))

# One row per feature, highest information gain first; show the ten best.
df_mi = (
    pd.DataFrame.from_dict(mi, orient='index', columns=['I-Gain'])
      .sort_values(by=['I-Gain'], ascending=False)
)
df_mi.head(10)

In [None]:
# Map each feature name to its (chi-square score, p-value) pair.
chi = {
    feature: (score, p_value)
    for feature, score, p_value in zip(hotel_rev.columns, chi2_scores, pvals)
}

# One row per feature, strongest chi-square score first.
df_chi = pd.DataFrame.from_dict(chi, orient='index', columns=['Chi2', 'PVal'])
df_chi = df_chi.sort_values(by=['Chi2'], ascending=False)

# Show only the features whose p-value is at most 0.05.
df_chi.loc[df_chi["PVal"] <= 0.05]

### Image Segmentation Data

In [None]:
# Load the image segmentation dataset from CSV, then show its dimensions and its first rows.
seg_data = pd.read_csv('data/segmentation-all.csv')
print(seg_data.shape)
seg_data.head()

In [None]:
# Remove the 'Class' column from the frame (in place) to use as the target;
# the columns left behind are the features, and their names are kept for labelling.
y = seg_data.pop('Class').values
X = seg_data.values
feature_names = seg_data.columns

### Feature Selection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier; despite the `mnb` name this is GaussianNB, not MultinomialNB.
mnb = GaussianNB()

In [None]:
# Hold out half of the data for testing; the feature scores below use the training half only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=1/2)

# mutual_info_classif is randomised, so it is seeded (like the split above) to make
# the resulting ranking reproducible across kernel restarts.
i_scores = mutual_info_classif(X_train, y_train, random_state=2)

# Pair every feature name with its information-gain score.
mi = dict(zip(feature_names, i_scores))

# One row per feature, highest information gain first; show the ten best.
df = (
    pd.DataFrame.from_dict(mi, orient='index', columns=['I-Gain'])
      .sort_values(by=['I-Gain'], ascending=False)
)
df.head(10)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

%matplotlib inline

n = len(df.index)
rr = range(1,n)
fig, ax = plt.subplots()
ax.bar(df.index, df["I-Gain"], label='I-Gain',width=.35)

ax.xaxis.set_major_locator(mticker.FixedLocator(range(0,n)))
ax.set_xticklabels(list(df.index), rotation = 90)

ax.set_xlabel('Features')
ax.set_ylabel('I-Gain')
ax.legend()

plt.show()

## Select *k* Best Features
We rank the features using information gain (more precisely, mutual information) and select the _k_ best to build a classifier.  
We iterate through increasing values of *k*.  
`SelectKBest` is a _transformer_: it is fitted on the training data and then used to reduce both the training and the test data to the selected features.


In [None]:
def reuse_train_scores(_X, _y):
    """Score function for SelectKBest that returns the existing `i_scores` unchanged.

    mutual_info_classif is randomised, so re-estimating the scores for every k could
    rank the features differently from `df`. Reusing the scores behind `df` guarantees
    that the k features selected here are exactly the first k rows of `df`, and avoids
    recomputing mutual information on every iteration.
    """
    return i_scores


# `i_scores` must be the scores computed on X_train (one per feature).
assert len(i_scores) == X_train.shape[1], "i_scores does not match the training features"

# Test-set accuracy of Naive Bayes using only the k best-ranked features, for k = 1..all.
acc_scores = []
for kk in range(1, X_train.shape[1] + 1):
    FS_trans = SelectKBest(reuse_train_scores, k=kk).fit(X_train, y_train)
    X_tR_new = FS_trans.transform(X_train)
    X_tS_new = FS_trans.transform(X_test)
    seg_NB = mnb.fit(X_tR_new, y_train)
    y_dash = seg_NB.predict(X_tS_new)
    acc = accuracy_score(y_test, y_dash)
    acc_scores.append(acc)

# `df` is sorted by I-Gain, so row i receives the accuracy obtained with the top i features.
df['Accuracy'] = acc_scores
df.head(10)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

%matplotlib inline

n = len(df.index)
rr = range(1,n)
fig, ax = plt.subplots()
ax2 = ax.twinx()
ax.bar(df.index, df["I-Gain"], label='I-Gain',width=.35)
ax2.plot(df.index, df["Accuracy"], color='red', label='Accuracy')

ax.xaxis.set_major_locator(mticker.FixedLocator(range(0,n)))

ax.set_xticklabels(list(df.index), rotation = 90)

ax.set_xlabel('Features')
ax.set_ylabel('I-Gain')
ax2.set_ylabel('Accuracy')
ax.legend()

plt.show()

---
## Wrapper
Forward Sequential Search on Image Segmentation data.  

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# k-nearest-neighbours classifier with 4 neighbours.
knn = KNeighborsClassifier(n_neighbors=4)
# Read the segmentation data again: the 'Class' column was popped out of the earlier frame.
seg_data = pd.read_csv('data/segmentation-all.csv')
print(seg_data.shape)
seg_data.head()

In [None]:
# Remove the 'Class' column from the frame (in place) to use as the target;
# the columns left behind are the features, and their names are kept for labelling.
y = seg_data.pop('Class').values
X = seg_data.values
feature_names = seg_data.columns


In [None]:
# Display the names of the feature columns.
feature_names

Run a forward sequential wrapper search that adds features one at a time until 10 are selected, scoring each candidate subset with 5-fold cross-validation.  

In [None]:
# Forward sequential selection of 10 features, scoring each candidate subset by the
# 5-fold cross-validated accuracy of `knn`; fit() hands back the fitted selector.
sfs_forward = SFS(
    knn,
    k_features=10,
    forward=True,
    floating=False,
    verbose=1,
    scoring='accuracy',
    cv=5,
).fit(X, y, custom_feature_names=feature_names)




#### Plot and see that performance stabilises after 6 features

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

# Cross-validated accuracy (with standard-deviation band) for each subset size visited
# by the forward search.
fig1 = plot_sfs(
    sfs_forward.get_metric_dict(),
    kind='std_dev',
    ylabel='Accuracy',
)

# Style the current Axes directly instead of going through the pyplot state functions.
sfs_axes = plt.gca()
sfs_axes.set_ylim([0.6, 1])
sfs_axes.set_title('Sequential Forward Selection (w. StdDev)')
sfs_axes.grid()
plt.show()

# Names of the features in the selected subset.
print(sfs_forward.k_feature_names_)

#### Try backward selection down to 4 features

In [None]:
# Backward sequential selection down to 4 features, scoring each candidate subset by the
# 10-fold cross-validated accuracy of `knn`; n_jobs=-1 is passed to run the evaluations
# in parallel. fit() hands back the fitted selector.
sfs_backward = SFS(
    knn,
    k_features=4,
    forward=False,
    floating=False,
    verbose=1,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X, y, custom_feature_names=feature_names)

#### Plot and see similar performance

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

# Cross-validated accuracy (with standard-deviation band) for each subset size visited
# by the backward search.
fig1 = plot_sfs(
    sfs_backward.get_metric_dict(),
    kind='std_dev',
    ylabel='Accuracy',
)

# Style the current Axes directly instead of going through the pyplot state functions.
sfs_axes = plt.gca()
sfs_axes.set_ylim([0.7, 1])
sfs_axes.set_title('Sequential Backward Selection (w. StdDev)')
sfs_axes.grid()
plt.show()

# Names of the features in the selected subset.
print(sfs_backward.k_feature_names_)

### Lasso feature selection
Logistic regression with L1 regularisation.   
`SelectFromModel` keeps the features whose fitted coefficients have not been shrunk to (near) zero, up to at most `max_features`.   
The `C` parameter in `LogisticRegression` is the inverse of the regularisation strength: smaller values mean stronger regularisation, and the default is 1.  You can experiment with the value of `C` to see how many coefficients go to zero.     
You can cap the number of features returned by `SelectFromModel` using the `max_features` parameter.



In [None]:
# Read the segmentation data again (the 'Class' column was popped out of the previous frame),
# then show its dimensions and its first rows.
seg_data = pd.read_csv('data/segmentation-all.csv')
print(seg_data.shape)
seg_data.head()

In [None]:
# Remove the 'Class' column from the frame (in place) to use as the target;
# the columns left behind are the features, and their names are kept for labelling.
y = seg_data.pop('Class').values
X = seg_data.values
feature_names = seg_data.columns

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Feature selector driven by an L1-penalised logistic regression (C=.001),
# allowed to keep up to all of the columns of X.
lr_selector = SelectFromModel(
    LogisticRegression(penalty="l1", C=.001, solver="liblinear"),
    max_features=X.shape[1],
)
lr_selector.fit(X, y)

# Wrap the feature matrix in a DataFrame so its integer column labels identify the kept columns.
X = pd.DataFrame(X)
lr_support = lr_selector.get_support()          # boolean mask, one entry per feature
lr_feature = X.columns[lr_support].tolist()     # positions of the selected features
print(len(lr_feature), 'selected features')
print('Selected features:')
lr_feature

In [None]:
# Translate each selected column position back into its feature name.
for position in lr_feature:
    feature_label = feature_names[position]
    print(feature_label)



### Random Forest feature selection
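`SelectFromModel` uses the random forest's feature importances to keep the most important features, up to at most `max_features`.   
Note that the selected features differ from those chosen by the L1-regularised logistic regression above.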

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Feature selector driven by a 100-tree random forest, allowed to keep up to all of the
# columns of X. The forest is seeded so the selected set does not change between runs.
rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=0),
    max_features=X.shape[1],
)
rf_selector.fit(X, y)

# Wrap the feature matrix in a DataFrame so its integer column labels identify the kept columns.
X = pd.DataFrame(X)
rf_support = rf_selector.get_support()                 # boolean mask, one entry per feature
rf_feature = X.loc[:, rf_support].columns.tolist()     # positions of the selected features
print(str(len(rf_feature)), 'selected features')
print('Selected features:')
# Also print the feature names, as is done for the logistic-regression selection,
# so the two selections can be compared by name rather than by column position.
for position in rf_feature:
    print(feature_names[position])
rf_feature