In [1]:
# use feature importance for feature selection
from numpy import loadtxt
from numpy import sort
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier 
from preprocessing.Normalize import Normalize
import helper.SeriesHelper as series_helper
import time
import numpy as np

In [2]:
# Load the normalized data and derive the targets; `t` times the whole cell.
t = time.time()
X = Normalize().get_normalized_data()
# Capture the column and row labels now: X is replaced by a bare array below,
# and later cells look features up by name through `cols`.
cols = np.array(list(X.columns))
index = np.array(list(X.index))
# Targets come from a project helper — presumably one relapse label per row
# of X; TODO confirm against helper.SeriesHelper.
Y = series_helper.get_relapse_value_from_series_matrix(X)
# From here on `X` is a plain NumPy array, not the labelled object loaded above.
X = X.to_numpy()
# Elapsed seconds, displayed as the cell output.
time.time()-t


40.161009550094604

In [3]:
t = time.time()
# split data into train and test sets
# 33% of the rows are held out; the fixed random_state makes the split repeatable.
# The *_feature_importance functions below read these four names as globals.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=20)
# Elapsed seconds, displayed as the cell output.
time.time()-t

0.16072964668273926

In [4]:
def xgb_classifier_feature_importance():
    """Fit a default XGBClassifier on the training split and return its feature importances.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints how long model construction, fitting and prediction
    took, followed by the accuracy on the test split.

    Returns:
        The fitted model's ``feature_importances_`` attribute.
    """
    started = time.time()
    classifier = XGBClassifier()
    print("INIT MODEL : ",time.time()-started)

    # Train on the whole training split.
    started = time.time()
    classifier.fit(X_train, y_train)
    print("FIT MODEL : ",time.time()-started)

    # Predict the held-out split.
    started = time.time()
    raw_predictions = classifier.predict(X_test)
    print("PREDICT :: ",time.time()-started)

    # Predictions are rounded before scoring, as in the sibling functions.
    rounded_predictions = list(map(round, raw_predictions))
    score = accuracy_score(y_test, rounded_predictions)
    print("Accuracy: %.2f%%" % (score * 100.0))
    return classifier.feature_importances_



In [5]:
def random_forest_classifier_feature_importance(random_state=None):
    """Fit a RandomForestClassifier on the training split and return its feature importances.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints how long model construction, fitting and prediction
    took, followed by the accuracy on the test split.

    Args:
        random_state: Seed forwarded to ``RandomForestClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an integer to
            make the forest, and therefore the importance ranking, repeatable
            across runs.

    Returns:
        The fitted model's ``feature_importances_`` attribute.
    """
    t = time.time()
    model = RandomForestClassifier(random_state=random_state)
    print("INIT MODEL : ",time.time()-t)

    # fit model on all training data
    t = time.time()
    model.fit(X_train, y_train)
    print("FIT MODEL : ",time.time()-t)

    # make predictions for test data and evaluate
    t = time.time()
    y_pred = model.predict(X_test)
    print("PREDICT :: ",time.time()-t)

    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return model.feature_importances_

In [6]:
def extra_tree_classifier_feature_importance(random_state=None):
    """Fit an ExtraTreesClassifier on the training split and return its feature importances.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints how long model construction, fitting and prediction
    took, followed by the accuracy on the test split.

    Args:
        random_state: Seed forwarded to ``ExtraTreesClassifier``. The default
            ``None`` keeps the previous unseeded behaviour; pass an integer to
            make the ensemble, and therefore the importance ranking,
            repeatable across runs.

    Returns:
        The fitted model's ``feature_importances_`` attribute.
    """
    t = time.time()
    model = ExtraTreesClassifier(random_state=random_state)
    print("INIT MODEL : ",time.time()-t)

    # fit model on all training data
    t = time.time()
    model.fit(X_train, y_train)
    print("FIT MODEL : ",time.time()-t)

    # make predictions for test data and evaluate
    t = time.time()
    y_pred = model.predict(X_test)
    print("PREDICT :: ",time.time()-t)

    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return model.feature_importances_

In [7]:
def sort_feature_importance(feature_importances):
    """Pair every column label with its importance and rank the pairs, highest first.

    Labels are taken from the notebook-level ``cols`` array and matched to
    ``feature_importances`` by position.

    Args:
        feature_importances: One importance value per entry of ``cols``.

    Returns:
        A 2-D NumPy array with one ``(label, importance)`` row per distinct
        label, ordered by descending importance. Both columns share a single
        array dtype, which is why callers cast column 1 with ``astype(float)``.
    """
    started = time.time()
    # Going through a dict keeps one entry per distinct label.
    importance_by_label = dict(zip(cols, feature_importances))
    ranked_pairs = list(importance_by_label.items())
    ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)
    ranked = np.array(ranked_pairs)
    print(" Time for Sorting feature importance ",time.time()-started)
    return ranked

In [8]:
def train_classifer_range(feature_importance_sorted,selection_model,feature_rank_beg,feature_rank_end,step):
    """Refit ``selection_model`` on growing sets of top-ranked features and print each accuracy.

    For every ``i`` in ``range(feature_rank_beg, feature_rank_end, step)`` the
    first ``i`` rows of ``feature_importance_sorted`` are selected, the matching
    columns are taken from the notebook-level ``X``, the data is re-split
    (33% test, ``random_state=7``), and the model is fitted and scored.

    Args:
        feature_importance_sorted: 2-D array of ``(label, importance)`` rows in
            descending importance order, as returned by
            ``sort_feature_importance``.
        selection_model: Estimator exposing ``fit`` and ``predict``; it is
            refitted on every iteration.
        feature_rank_beg: First feature count to evaluate.
        feature_rank_end: Exclusive upper bound on the feature count.
        step: Increment between feature counts.

    Returns:
        None. Results are printed.
    """
    # Record the first position of every label in `cols` once, rather than
    # scanning `cols` with np.where for each selected gene on each iteration.
    column_position = {}
    for position, label in enumerate(cols):
        column_position.setdefault(label, position)

    importances = feature_importance_sorted[:, 1].astype(float)

    for i in range(feature_rank_beg,feature_rank_end,step):
        # Column positions of the i top-ranked genes, in ranking order.
        important_gene = feature_importance_sorted[0:i,0]
        important_gene_index = np.array(
            [column_position[gene] for gene in important_gene], dtype=int
        )
        X_new = X[:,important_gene_index]

        # NOTE(review): this split uses random_state=7 while the notebook's
        # X_train/X_test split uses random_state=20, so rows tested here may
        # have been used to compute the ranking — confirm this is intended.
        X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, Y, test_size=0.33, random_state=7)

        t = time.time()
        selection_model.fit(X_new_train, y_new_train)
        # eval model
        y_pred = selection_model.predict(X_new_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_new_test, predictions)
        # "Thresh" is the summed importance of the selected features.
        print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (np.sum(importances[0:i]), X_new_train.shape[1], accuracy*100.0))
        print("Time for above threshold : ",time.time()-t)


In [9]:
def train_xgb_classifier_range(feature_importance_sorted,feature_rank_beg,feature_rank_end,step):
    """Run ``train_classifer_range`` with a newly constructed default ``XGBClassifier``.

    All arguments are forwarded unchanged; see ``train_classifer_range``.
    """
    train_classifer_range(
        feature_importance_sorted,
        XGBClassifier(),
        feature_rank_beg,
        feature_rank_end,
        step,
    )

In [10]:
def train_extra_tree_classifier_range(feature_importance_sorted,feature_rank_beg,feature_rank_end,step):
    """Run ``train_classifer_range`` with a newly constructed default ``ExtraTreesClassifier``.

    All arguments are forwarded unchanged; see ``train_classifer_range``.
    """
    train_classifer_range(
        feature_importance_sorted,
        ExtraTreesClassifier(),
        feature_rank_beg,
        feature_rank_end,
        step,
    )

In [11]:
def train_random_forest_classifier_range(feature_importance_sorted,feature_rank_beg,feature_rank_end,step):
    """Run ``train_classifer_range`` with a newly constructed default ``RandomForestClassifier``.

    All arguments are forwarded unchanged; see ``train_classifer_range``.
    """
    train_classifer_range(
        feature_importance_sorted,
        RandomForestClassifier(),
        feature_rank_beg,
        feature_rank_end,
        step,
    )

# Neural Network

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [13]:
def neural_network_model(no_of_features):
    """Build and compile a fully connected binary classifier.

    The network has an input-width dense layer, three narrowing hidden layers
    (1/2, 1/5 and 1/10 of the input width) and a single sigmoid output unit.

    Args:
        no_of_features: Number of input features; also the width of the first
            dense layer. The parameter was previously spelled
            ``no_of_feature`` while the body read ``no_of_features``, so any
            call raised NameError.

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss,
        the Adam optimizer and the accuracy metric.
    """
    # create model
    model = Sequential()
    model.add(Dense(no_of_features, input_dim=no_of_features, activation='relu'))
    for divisor in (2, 5, 10):
        # Keep at least one unit so a small feature count cannot yield an empty layer.
        model.add(Dense(max(1, no_of_features // divisor), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [14]:
def neural_network_train(feature_importance_sorted,feature_rank_beg,feature_rank_end,step):
    """Cross-validate a Keras network on growing sets of top-ranked features.

    For every ``i`` in ``range(feature_rank_beg, feature_rank_end, step)`` the
    first ``i`` rows of ``feature_importance_sorted`` are selected, the matching
    columns are taken from the notebook-level ``X``, and a
    ``StandardScaler`` + ``KerasClassifier`` pipeline is scored with shuffled
    10-fold stratified cross-validation against the notebook-level ``Y``.

    Args:
        feature_importance_sorted: 2-D array of ``(label, importance)`` rows in
            descending importance order, as returned by
            ``sort_feature_importance``.
        feature_rank_beg: First feature count to evaluate.
        feature_rank_end: Exclusive upper bound on the feature count.
        step: Increment between feature counts.

    Returns:
        None. The mean and standard deviation of the fold scores are printed.
    """
    # Record the first position of every label in `cols` once, rather than
    # scanning `cols` with np.where for each selected gene on each iteration.
    column_position = {}
    for position, label in enumerate(cols):
        column_position.setdefault(label, position)

    importances = feature_importance_sorted[:, 1].astype(float)

    for i in range(feature_rank_beg,feature_rank_end,step):
        # Column positions of the i top-ranked genes, in ranking order.
        important_gene = feature_importance_sorted[0:i,0]
        important_gene_index = np.array(
            [column_position[gene] for gene in important_gene], dtype=int
        )
        X_new = X[:,important_gene_index]
        # Size the network from the columns actually selected; this is smaller
        # than i when fewer than i ranked features exist.
        n_selected = X_new.shape[1]

        t = time.time()
        # evaluate model with standardized dataset
        # The original passed the undefined name `create_larger` as build_fn;
        # the builder defined in this notebook is `neural_network_model`. The
        # width is bound through a closure so the builder is called positionally.
        pipeline = Pipeline([
            ('standardize', StandardScaler()),
            ('mlp', KerasClassifier(
                build_fn=lambda width=n_selected: neural_network_model(width),
                epochs=100, batch_size=5, verbose=0)),
        ])
        # The fold shuffle is unseeded, so scores vary from run to run.
        kfold = StratifiedKFold(n_splits=10, shuffle=True)
        results = cross_val_score(pipeline, X_new, Y, cv=kfold)
        print("Thresh=%.3f, n=%d" % (np.sum(importances[0:i]), n_selected),end="\t")
        print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
        print("Time for above threshold : ",time.time()-t)

# XGBoost classifier (the cells below are currently commented out)

In [15]:
# xgb_feature_importance = xgb_classifier_feature_importance()

In [16]:
# sorted_xgb_feature_importance = sort_feature_importance(xgb_feature_importance)

In [17]:
# np.sum(sorted_xgb_feature_importance[0:450,1].astype(float))

In [18]:
# train_xgb_classifier_range(sorted_xgb_feature_importance,100,510,10)

# Extra Trees classifier

In [19]:
# Fit the extra-trees model on the training split and keep its per-feature importances.
extra_tree_feature_importance = extra_tree_classifier_feature_importance()

INIT MODEL :  0.0
FIT MODEL :  0.5999014377593994
PREDICT ::  0.044933319091796875
Accuracy: 61.62%


In [20]:
# Rank the (label, importance) pairs from most to least important.
sorted_extra_tree_feature_importance = sort_feature_importance(extra_tree_feature_importance)

Time for Sorting feature importance  0.057808876037597656


In [21]:
# Summed importance of the 450 top-ranked features (column 1 holds the importance values).
np.sum(sorted_extra_tree_feature_importance[0:450,1].astype(float))

0.4737415205475361

In [22]:
# Refit extra-trees on the top 100, 200, ..., 1200 ranked features (the end bound 1300 is exclusive).
train_extra_tree_classifier_range(sorted_extra_tree_feature_importance,100,1300,100)

Thresh=0.147, n=100, Accuracy: 62.37%
Time for above threshold :  0.04535055160522461
Thresh=0.256, n=200, Accuracy: 61.62%
Time for above threshold :  0.04589438438415527
Thresh=0.350, n=300, Accuracy: 61.62%
Time for above threshold :  0.045665740966796875
Thresh=0.434, n=400, Accuracy: 58.84%
Time for above threshold :  0.051071882247924805
Thresh=0.512, n=500, Accuracy: 63.13%
Time for above threshold :  0.052629947662353516
Thresh=0.583, n=600, Accuracy: 63.89%
Time for above threshold :  0.06411480903625488
Thresh=0.648, n=700, Accuracy: 63.13%
Time for above threshold :  0.06493592262268066
Thresh=0.706, n=800, Accuracy: 64.14%
Time for above threshold :  0.06509995460510254
Thresh=0.757, n=900, Accuracy: 59.60%
Time for above threshold :  0.07097458839416504
Thresh=0.805, n=1000, Accuracy: 62.12%
Time for above threshold :  0.07703065872192383
Thresh=0.851, n=1100, Accuracy: 61.87%
Time for above threshold :  0.07367205619812012
Thresh=0.895, n=1200, Accuracy: 65.66%
Time for a

# Random Forest Code

In [23]:
# Fit the random-forest model on the training split and keep its per-feature importances.
random_forest_feature_importance = random_forest_classifier_feature_importance()

INIT MODEL :  0.0
FIT MODEL :  1.1953527927398682
PREDICT ::  0.0419619083404541
Accuracy: 62.12%


In [24]:
# Rank the (label, importance) pairs from most to least important.
sorted_random_forest_feature_importance = sort_feature_importance(random_forest_feature_importance)

Time for Sorting feature importance  0.0658423900604248


In [25]:
# Summed importance of the 450 top-ranked features (column 1 holds the importance values).
np.sum(sorted_random_forest_feature_importance[0:450,1].astype(float))

0.8883179121579475

In [26]:
# Refit the random forest on the top 100, 150, ..., 600 ranked features (the end bound 650 is exclusive).
train_random_forest_classifier_range(sorted_random_forest_feature_importance,100,650,50)

Thresh=0.345, n=100, Accuracy: 59.60%
Time for above threshold :  0.09093832969665527
Thresh=0.467, n=150, Accuracy: 62.88%
Time for above threshold :  0.09773993492126465
Thresh=0.569, n=200, Accuracy: 62.63%
Time for above threshold :  0.11196231842041016
Thresh=0.653, n=250, Accuracy: 63.38%
Time for above threshold :  0.10176467895507812
Thresh=0.726, n=300, Accuracy: 62.63%
Time for above threshold :  0.11572456359863281
Thresh=0.789, n=350, Accuracy: 61.62%
Time for above threshold :  0.12639427185058594
Thresh=0.841, n=400, Accuracy: 63.89%
Time for above threshold :  0.16385793685913086
Thresh=0.888, n=450, Accuracy: 62.12%
Time for above threshold :  0.15333867073059082
Thresh=0.920, n=500, Accuracy: 62.63%
Time for above threshold :  0.1452465057373047
Thresh=0.946, n=550, Accuracy: 63.13%
Time for above threshold :  0.1525440216064453
Thresh=0.971, n=600, Accuracy: 62.63%
Time for above threshold :  0.16765093803405762


# Code For RFE

In [27]:
from sklearn.feature_selection import RFE


In [28]:
def rfe(model,no_of_features,X,Y):
    """Run recursive feature elimination and print the resulting mask and ranking.

    Args:
        model: Estimator handed to ``RFE`` as the base estimator.
        no_of_features: Number of features to keep.
        X: Feature matrix to fit on (shadows the notebook-level ``X`` inside
            this function).
        Y: Targets to fit on (shadows the notebook-level ``Y``).

    Returns:
        None. ``support_`` and ``ranking_`` of the fitted selector are printed.
    """
    # n_features_to_select is passed by keyword: current scikit-learn releases
    # reject it as a positional argument.
    selector = RFE(model, n_features_to_select=no_of_features)
    selector = selector.fit(X, Y)
    # summarize the selection of the attributes
    print("****"*5,"RFE","****"*5)
    print("--------------- RFE SUPPORT ---------------")
    print(selector.support_)
    print("--------------- RFE RANKING ---------------")
    print(selector.ranking_)


In [29]:
# NOTE(review): despite its name, `xbg_model` holds a RandomForestClassifier, not an XGBoost model.
xbg_model = RandomForestClassifier()
# Keep 400 features, eliminating recursively on the training split only.
rfe(xbg_model,400,X_train,y_train)

******************** RFE ********************
--------------- RFE SUPPORT ---------------
[False False False ... False False False]
--------------- RFE RANKING ---------------
[21816 21815 21814 ... 16354 18160 18453]
