## Feature selection using two-stage filtering method  
We applied a two-stage filtering feature selection method for both bow and tfi-df text representation only.    
filter 1 - remove feature with low variance  
filter 2 - remove redundant features using MI score   

pree.t@cmu.ac.th  

In [1]:
import joblib
import os
import numpy as np
from scipy import sparse
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif

import matplotlib.pyplot as plt

os.path.dirname(os.getcwd())

model_path = os.path.dirname(os.getcwd()) + '\\model\\'
lexicon_path = os.path.dirname(os.getcwd()) + '\\lexicon\\'

In [2]:
# preparing all data 

text_bow1_kt = joblib.load(model_path+'text_bow1_kt.pkl')
text_bow1_ws = joblib.load(model_path+'text_bow1_ws.pkl')

text_bow2_kt = joblib.load(model_path+'text_bow2_kt.pkl')
text_bow2_ws = joblib.load(model_path+'text_bow2_ws.pkl')

text_tfidf1_kt = joblib.load(model_path+'text_tfidf1_kt.pkl')
text_tfidf1_ws = joblib.load(model_path+'text_tfidf1_ws.pkl')

text_tfidf2_kt = joblib.load(model_path+'text_tfidf2_kt.pkl')
text_tfidf2_ws = joblib.load(model_path+'text_tfidf2_ws.pkl')

all_texts = [["text_bow1_kt", text_bow1_kt],
               ["text_bow1_ws", text_bow1_ws],
               ["text_bow2_kt", text_bow2_kt],
               ["text_bow2_ws", text_bow2_ws],
               ["text_tfidf1_kt", text_tfidf1_kt],
               ["text_tfidf1_ws", text_tfidf1_ws],
               ["text_tfidf2_kt", text_tfidf2_kt],
               ["text_tfidf2_ws", text_tfidf2_ws]
            ]
# lex_bow1_kt = joblib.load(lexicon_path+'lex_bow1_kt.pkl')
# lex_bow1_ws = joblib.load(lexicon_path+'lex_bow1_ws.pkl')

# lex_bow2_kt = joblib.load(lexicon_path+'lex_bow2_kt.pkl')
# lex_bow2_ws = joblib.load(lexicon_path+'lex_bow2_ws.pkl')

In [3]:
def plot_feature_scores(threshold_val):
    fig = plt.figure(figsize=(9, 7), dpi=80) 
    ax = plt.axes()
    ax.axhline(threshold_val, ls='dotted', c='r')
    ax.plot(vt.variances_)
    return

## Select using fix dim

In [4]:

'''
parem:  X = features, y= target
        vt_dim = dimension for first filter,
        mi_dim = diminsion last filter
return: array 
'''
def twosteps_fs(X, y, vt_dim, mi_dim):
        vt = VarianceThreshold()
        vt.fit(X)
        feature_scores = vt.variances_

        idx = np.flip(np.argsort(feature_scores))
        tmp = np.take(X, idx.flatten(), axis=1)        
        X_vt = tmp[:, :vt_dim]

        feature_scores = mutual_info_classif(X_vt, np.ravel(y), random_state=0)
        mi_idx = np.flip(np.argsort(feature_scores))
        tmp = np.take(X_vt, mi_idx.flatten(), axis=1)        
        X_vt_mi = tmp[:, :mi_dim]
        return X_vt_mi

In [5]:
%%time
vt_dim = 1500
mi_dim = 700
for i in range(len(all_texts)):
    X, y = all_texts[i][1]
    X = X.A
    y = y.A
    X_vt_mi = twosteps_fs(X, y, vt_dim, mi_dim)

    arr = np.hstack((sparse.csr_matrix(X_vt_mi), sparse.csr_matrix(y)))
    joblib.dump(arr, model_path + all_texts[i][0] + "_fs.pkl")

CPU times: total: 29min 31s
Wall time: 29min 31s


## Manually select using specfic thresholds

In [None]:
# select using manual threshold
X, y = text_bow1_kt
X = X.A
y = y.A

print(X.shape)
print(y.shape)

## Filter 1 - remove features with low variance

In [None]:
# bow1_kt 0.00035
# bow1_ws 0.003
# bow2_kt 0.00065
# bow2_ws
# tfidf1_kt
# tfidf1_ws
# tfidf2_kt
# tfidf2_ws
threshold_val = 0.00035
vt = VarianceThreshold(threshold=threshold_val)
vt.fit(X)
mask = vt.get_support()

In [None]:
idx = np.where(mask==False)
print("total number of feature will be removed:", len(idx[0]))
X_vt =  np.delete(X, idx, 1)
X_vt.shape

## Filter 2 remove using MI score

In [None]:
#calcuate mi score of the remaining terms
feature_scores = mutual_info_classif(X_vt, np.ravel(y), random_state=0)

In [None]:
# bow1_kt = 0.0001
# bow1_ws = 0.0005
mi_threshold_val = 0.0001
plot_feature_scores(mi_threshold_val)

In [None]:
# set another mi score threshold manually, so that we can have reasonable size
plt.plot(-np.sort(-feature_scores))

In [None]:
feature_scores_final = feature_scores[feature_scores > mi_threshold_val]
plt.plot(-np.sort(-feature_scores_final))

In [None]:
# select with threshold
mi_idx = np.argwhere(feature_scores > mi_threshold_val)
X_vt_mi = np.take(X_vt, mi_idx.flatten(), axis=1)
X_vt_mi.shape

In [None]:
arr_new = np.hstack((sparse.csr_matrix(X_vt_mi), sparse.csr_matrix(y)))
joblib.dump(arr_new, model_path+'text_bow1_ws_kt.pkl')

## Test with LR

In [3]:
text_bow1_kt_fs = joblib.load(model_path+'text_bow1_kt_fs.pkl')
text_bow1_ws_fs = joblib.load(model_path+'text_bow1_ws_fs.pkl')

text_bow2_kt_fs = joblib.load(model_path+'text_bow2_kt_fs.pkl')
text_bow2_ws_fs = joblib.load(model_path+'text_bow2_ws_fs.pkl')

text_tfidf1_kt_fs = joblib.load(model_path+'text_tfidf1_kt_fs.pkl')
text_tfidf1_ws_fs = joblib.load(model_path+'text_tfidf1_ws_fs.pkl')

text_tfidf2_kt_fs = joblib.load(model_path+'text_tfidf2_kt_fs.pkl')
text_tfidf2_ws_fs = joblib.load(model_path+'text_tfidf2_ws_fs.pkl')

all_texts_fs = [["text_bow1_kt_fs", text_bow1_kt_fs],
               ["text_bow1_ws_fs", text_bow1_ws_fs],
               ["text_bow2_kt_fs", text_bow2_kt_fs],
               ["text_bow2_ws_fs", text_bow2_ws_fs],
               ["text_tfidf1_kt_fs", text_tfidf1_kt_fs],
               ["text_tfidf1_ws_fs", text_tfidf1_ws_fs],
               ["text_tfidf2_kt_fs", text_tfidf2_kt_fs],
               ["text_tfidf2_ws_fs", text_tfidf2_ws_fs]
            ]

In [4]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

In [5]:
# better use train test split sklearn twice
def train_val_test_split(X, y, train_size, val_size, test_size):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size = test_size, random_state=42)
    relative_train_size = train_size / (val_size + train_size)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val,
                                                      train_size = relative_train_size, test_size = 1-relative_train_size)
    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [8]:
# test with original
model = LogisticRegression(C=2., penalty="l2", solver="liblinear", dual=False, multi_class="ovr")
for i in range(len(all_texts)):
    X = all_texts_fs[i][1][0].A
    y = all_texts_fs[i][1][1].A
    X_train, X_valid, X_test, y_train, y_valid, y_test = train_val_test_split(X, np.ravel(y), train_size=0.6, val_size=0.2, test_size=0.2)
    model.fit(X_train, y_train)
    print(all_texts[i][0] + ": val= ", str(round(model.score(X_valid, y_valid), 4)), ", test= " , str(round(model.score(X_test, y_test), 4)))

text_bow1_kt: val=  0.7163 , test=  0.7198
text_bow1_ws: val=  0.6591 , test=  0.6668
text_bow2_kt: val=  0.6826 , test=  0.6755
text_bow2_ws: val=  0.5933 , test=  0.5877
text_tfidf1_kt: val=  0.7214 , test=  0.7174
text_tfidf1_ws: val=  0.6758 , test=  0.6739
text_tfidf2_kt: val=  0.6729 , test=  0.6785
text_tfidf2_ws: val=  0.5957 , test=  0.5929


In [9]:
# test with original
for i in range(len(all_texts_fs)):
    X = all_texts_fs[i][1][0].A
    y = all_texts_fs[i][1][1].A
    X_train, X_valid, X_test, y_train, y_valid, y_test = train_val_test_split(X, np.ravel(y), train_size=0.6, val_size=0.2, test_size=0.2)
    model.fit(X_train, y_train)
    print(all_texts_fs[i][0] + ": val= ", str(round(model.score(X_valid, y_valid), 4)), ", test= " , str(round(model.score(X_test, y_test), 4)))

text_bow1_kt_fs: val=  0.7186 , test=  0.7201
text_bow1_ws_fs: val=  0.6614 , test=  0.6668
text_bow2_kt_fs: val=  0.6667 , test=  0.6753
text_bow2_ws_fs: val=  0.5927 , test=  0.5849
text_tfidf1_kt_fs: val=  0.7181 , test=  0.7182
text_tfidf1_ws_fs: val=  0.6694 , test=  0.6743
text_tfidf2_kt_fs: val=  0.6713 , test=  0.6783
text_tfidf2_ws_fs: val=  0.5944 , test=  0.5944
