## Feature selection using two-stage filtering method  
We apply a two-stage, filter-based feature selection method to the bag-of-words (BoW) and TF-IDF text representations only.  
filter 1 - remove features with low variance  
filter 2 - remove redundant features using the mutual information (MI) score  

pree.t@cmu.ac.th  

In [2]:
import joblib
import os
import numpy as np
from scipy import sparse
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif

import matplotlib.pyplot as plt

# All model directories live under the parent of the current working directory.
# Paths are built with os.path.join so the notebook also runs on non-Windows
# systems; the trailing '' component keeps a trailing separator, so the
# existing `model_path + 'file.pkl'` concatenations keep working unchanged.
project_root = os.path.dirname(os.getcwd())
model_path = os.path.join(project_root, 'model', 'original', '')
model_path_fs_train = os.path.join(project_root, 'model', 'feature selection', 'train', '')
model_path_fs_test = os.path.join(project_root, 'model', 'feature selection', 'test', '')

In [3]:
# Load every original text representation from disk.
# Each pickle unpacks into a (features, labels) pair.
representation_names = [
    "text_bow1_kt",
    "text_bow1_ws",
    "text_bow2_kt",
    "text_bow2_ws",
    "text_tfidf1_kt",
    "text_tfidf1_ws",
    "text_tfidf2_kt",
    "text_tfidf2_ws",
]
loaded_pairs = [joblib.load(model_path + rep_name + '.pkl') for rep_name in representation_names]

# Keep the individual featN / yN names available, as before.
((feat1, y1), (feat2, y2), (feat3, y3), (feat4, y4),
 (feat5, y5), (feat6, y6), (feat7, y7), (feat8, y8)) = loaded_pairs

# One [name, features, labels] entry per representation.
all_feats = [
    [rep_name, feat, y]
    for rep_name, (feat, y) in zip(representation_names, loaded_pairs)
]

In [None]:
# train-test split 80/20 for all text representations
from sklearn.model_selection import train_test_split

for feats_entry in all_feats:
    # random_state is fixed so that the split, and every feature-selection
    # result derived from it, is reproducible across kernel restarts.
    X_train, X_test, y_train, y_test = train_test_split(
        feats_entry[1], feats_entry[2], train_size=0.8, test_size=0.2, random_state=0
    )
    # Slice assignment (rather than extend) keeps each entry at exactly
    # [name, X, y, X_train, X_test, y_train, y_test] even if this cell is re-run.
    feats_entry[3:] = [X_train, X_test, y_train, y_test]

In [None]:
def plot_feature_scores(threshold_val, scores=None):
    """Plot per-feature scores with a dotted red line at ``threshold_val``.

    param:  threshold_val = y-position of the horizontal threshold line
            scores = sequence of per-feature scores to plot; when omitted,
                     the variances of the global fitted ``vt``
                     (``vt.variances_``) are plotted, as before
    return: None (the figure is drawn as a side effect)
    """
    if scores is None:
        # Backward-compatible default: read the scores from the notebook-level
        # VarianceThreshold instance, which must already be fitted.
        scores = vt.variances_
    _, ax = plt.subplots(figsize=(9, 7), dpi=80)
    ax.axhline(threshold_val, ls='dotted', c='r')
    ax.plot(scores)

## Select using fixed dimension

In [None]:

def twosteps_fs(X, y, vt_dim, mi_dim, return_indices=False):
    '''
    Two-stage filter feature selection with fixed output dimensions.

    Stage 1 keeps the ``vt_dim`` columns of ``X`` with the largest variance
    (taken from ``VarianceThreshold.variances_``). Stage 2 keeps, among those,
    the ``mi_dim`` columns with the largest mutual information with ``y``
    (``mutual_info_classif`` with ``random_state=0``).

    param:  X = feature array, indexed with np.take along axis 1
                (callers in this notebook pass a dense array)
            y = target; flattened with np.ravel before scoring
            vt_dim = dimension kept by the first (variance) filter
            mi_dim = dimension kept by the last (MI) filter
            return_indices = when True, also return the positions (in the
                original X) of the selected columns so the same selection
                can be applied to other data
    return: array of the selected columns ordered by decreasing MI score,
            or (array, indices) when return_indices is True
    '''
    vt = VarianceThreshold()
    vt.fit(X)

    # Column positions by decreasing variance. Only the top vt_dim columns
    # are gathered, so the full matrix is never copied just to be truncated.
    vt_idx = np.flip(np.argsort(vt.variances_))[:vt_dim]
    X_vt = np.take(X, vt_idx, axis=1)

    mi_scores = mutual_info_classif(X_vt, np.ravel(y), random_state=0)
    # Positions (within X_vt) by decreasing MI score.
    mi_idx = np.flip(np.argsort(mi_scores))[:mi_dim]
    X_vt_mi = np.take(X_vt, mi_idx, axis=1)

    if return_indices:
        # Map stage-2 positions back to column positions in the original X.
        return X_vt_mi, vt_idx[mi_idx]
    return X_vt_mi

In [None]:
%%time
vt_dim = 1500
mi_dim = 700

# The loading cell reads the selected sets from these directories, so write
# them there (creating the directories on first use).
os.makedirs(model_path_fs_train, exist_ok=True)
os.makedirs(model_path_fs_test, exist_ok=True)

for i in range(len(all_feats)):
    print("selecting: ", all_feats[i][0])
    X_tr = all_feats[i][3].A
    y_tr = all_feats[i][5].A
    X_te = all_feats[i][4].A
    y_te = all_feats[i][6].A

    # Fit both filters on the TRAINING split only. Re-fitting them on the test
    # split would use the test labels and would select a different set of
    # columns, making train and test features incomparable.
    variances = VarianceThreshold().fit(X_tr).variances_
    vt_idx = np.flip(np.argsort(variances))[:vt_dim]
    mi_scores = mutual_info_classif(X_tr[:, vt_idx], np.ravel(y_tr), random_state=0)
    selected_idx = vt_idx[np.flip(np.argsort(mi_scores))[:mi_dim]]

    # NOTE(review): np.hstack on two sparse matrices appears to yield a
    # 2-element object array rather than a stacked matrix; it is kept because
    # the loading cells unpack each pickle as `feat, y`.
    # fs on training set
    arr = np.hstack((sparse.csr_matrix(X_tr[:, selected_idx]), sparse.csr_matrix(y_tr)))
    joblib.dump(arr, model_path_fs_train + all_feats[i][0] + "_fs_train.pkl")

    # fs on testing set: apply the columns chosen on the training set
    arr = np.hstack((sparse.csr_matrix(X_te[:, selected_idx]), sparse.csr_matrix(y_te)))
    joblib.dump(arr, model_path_fs_test + all_feats[i][0] + "_fs_test.pkl")

## Manually select using specific thresholds

In [None]:
# Manual-threshold selection works on the training split of the first
# representation (entry 0; positions 3 and 5 of the entry are read here).
selected_entry = all_feats[0]
X = selected_entry[3].A
y = selected_entry[5].A

print(X.shape, y.shape, sep="\n")

## Filter 1 - remove features with low variance

In [None]:
# Variance thresholds noted per representation:
#   bow1_kt    0.00035
#   bow1_ws    0.003
#   bow2_kt    0.00065
#   bow2_ws    (no value recorded)
#   tfidf1_kt  (no value recorded)
#   tfidf1_ws  (no value recorded)
#   tfidf2_kt  (no value recorded)
#   tfidf2_ws  (no value recorded)
threshold_val = 0.00035

# fit() returns the estimator itself, so construction and fitting can be chained.
vt = VarianceThreshold(threshold=threshold_val).fit(X)

# Boolean mask over the columns of X: True where the feature is kept.
mask = vt.get_support()

In [None]:
# Column positions rejected by the variance filter.
idx = np.where(np.logical_not(mask))
print("total number of feature will be removed:", len(idx[0]))

# Drop the rejected columns (axis 1) and show the resulting shape.
X_vt = np.delete(X, idx, axis=1)
X_vt.shape

## Filter 2 - remove features using MI score

In [None]:
# Calculate the MI score of each feature that survived the variance filter.
labels_1d = np.ravel(y)
feature_scores = mutual_info_classif(X_vt, labels_1d, random_state=0)

In [None]:
# MI thresholds noted per representation:
# bow1_kt = 0.0001
# bow1_ws = 0.0005
mi_threshold_val = 0.0001

# Plot the MI scores themselves against the MI threshold.
# plot_feature_scores(mi_threshold_val) would draw this threshold over
# vt.variances_ (the variance scores), which are a different quantity.
fig, ax = plt.subplots(figsize=(9, 7), dpi=80)
ax.axhline(mi_threshold_val, ls='dotted', c='r')
ax.plot(feature_scores)
ax.set_xlabel("feature index")
ax.set_ylabel("MI score")
plt.show()

In [None]:
# Show the MI scores in descending order; used to pick another MI threshold
# by hand so that the selected feature set has a reasonable size.
scores_desc = -np.sort(-feature_scores)
plt.plot(scores_desc)

In [None]:
# Scores that pass the MI threshold, plotted in descending order.
above_threshold = feature_scores > mi_threshold_val
feature_scores_final = feature_scores[above_threshold]
plt.plot(-np.sort(-feature_scores_final))

In [None]:
# Keep only the columns of X_vt whose MI score exceeds the threshold.
mi_idx = np.argwhere(feature_scores > mi_threshold_val)
kept_columns = mi_idx.ravel()
X_vt_mi = np.take(X_vt, kept_columns, axis=1)
X_vt_mi.shape

In [None]:
# Bundle the selected features with their labels and save them.
# NOTE(review): np.hstack on two sparse matrices appears to yield a 2-element
# object array rather than a stacked matrix; kept so the file unpacks as
# `feat, y` like the other pickles in this notebook.
arr = np.hstack((sparse.csr_matrix(X_vt_mi), sparse.csr_matrix(y)))

# The manual pipeline above runs on all_feats[0] ("text_bow1_kt") with the
# bow1_kt thresholds, so the output is saved under that representation's name.
# Update this name whenever a different representation is selected.
joblib.dump(arr, model_path+'text_bow1_kt_fs.pkl')

## Test with LR

In [4]:
# Base names of the text representations that went through feature selection.
fs_representation_names = [
    "text_bow1_kt",
    "text_bow1_ws",
    "text_bow2_kt",
    "text_bow2_ws",
    "text_tfidf1_kt",
    "text_tfidf1_ws",
    "text_tfidf2_kt",
    "text_tfidf2_ws",
]


def _load_fs_split(directory, suffix):
    """Load the selected (features, labels) pickles of one split.

    param:  directory = directory prefix (with trailing separator) holding the pickles
            suffix = split name, "train" or "test"; files are named
                     "<representation>_fs_<suffix>.pkl"
    return: list of [label, features, labels] entries, one per representation,
            where label is "<representation>_fs_<suffix>"
    """
    entries = []
    for rep_name in fs_representation_names:
        label = rep_name + "_fs_" + suffix
        # Each pickle unpacks into exactly two items: features and labels.
        feat, y = joblib.load(directory + label + ".pkl")
        entries.append([label, feat, y])
    return entries


all_feats_fs_train = _load_fs_split(model_path_fs_train, "train")

# Every entry is labelled from its own split; the first test entry was
# previously mislabelled "text_bow1_kt_fs_train".
all_feats_fs_test = _load_fs_split(model_path_fs_test, "test")

In [5]:
# Baseline: logistic regression on the original (unselected) representations.
print("Performance with the original:")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=2., penalty="l2", solver="liblinear", dual=False, multi_class="ovr")

for feats_entry in all_feats:
    rep_name, rep_X, rep_y = feats_entry[:3]

    # Hold out 20% for testing, then 25% of the remainder for validation.
    X_train, X_test, y_train, y_test = train_test_split(rep_X, rep_y, test_size=0.2, random_state=0)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train.A, np.ravel(y_train.A), test_size=0.25, random_state=0
    )

    model.fit(X_train, y_train)
    val_score = round(model.score(X_valid, y_valid), 4)
    test_score = round(model.score(X_test.A, np.ravel(y_test.A)), 4)
    print(rep_name + ": val= ", val_score, ", test= " , test_score)

Performance with the original:
text_bow1_kt: val=  0.7417 , test=  0.7472
text_bow1_ws: val=  0.6728 , test=  0.6774
text_bow2_kt: val=  0.7181 , test=  0.7223
text_bow2_ws: val=  0.6272 , test=  0.6189
text_tfidf1_kt: val=  0.7514 , test=  0.7504
text_tfidf1_ws: val=  0.6928 , test=  0.6943
text_tfidf2_kt: val=  0.7244 , test=  0.7268
text_tfidf2_ws: val=  0.626 , test=  0.623


In [7]:
# Logistic regression on the two-step feature-selected training sets.
# NOTE(review): the validation and test parts below are carved out of the
# "_fs_train" data, which presumably is the data the selection was fitted on,
# so these scores are not from data unseen by the selection step - confirm.
print("Performance with the proposed feature selection:")
for fs_entry in all_feats_fs_train:
    fs_name, fs_X, fs_y = fs_entry[:3]

    # Hold out 20% for testing, then 25% of the remainder for validation.
    X_train, X_test, y_train, y_test = train_test_split(fs_X, fs_y, test_size=0.2, random_state=0)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train.A, np.ravel(y_train.A), test_size=0.25, random_state=0
    )

    model.fit(X_train, y_train)
    val_score = round(model.score(X_valid, y_valid), 4)
    test_score = round(model.score(X_test.A, np.ravel(y_test.A)), 4)
    print(fs_name + ": val= ", val_score, ", test= " , test_score)

Performance with the proposed feature selection:
text_bow1_kt_fs_train: val=  0.7123 , test=  0.7114
text_bow1_ws_fs_train: val=  0.6681 , test=  0.6568
text_bow2_kt_fs_train: val=  0.6728 , test=  0.6735
text_bow2_ws_fs_train: val=  0.5849 , test=  0.5853
text_tfidf1_kt_fs_train: val=  0.7131 , test=  0.7158
text_tfidf1_ws_fs_train: val=  0.6655 , test=  0.6774
text_tfidf2_kt_fs_train: val=  0.67 , test=  0.6763
text_tfidf2_ws_fs_train: val=  0.5951 , test=  0.6047
