# In [1]:
# forward feature selection according to Random Forest feature importance


def fi_forward_feature_selector(x, y, n_features, test):
    '''
    Forward feature selection driven by Random Forest feature importance.

    A Random Forest is first fitted on all columns of ``x`` to rank them by
    ``feature_importances_`` (most important first). Then, for
    k = 1 .. n_features, a fresh Random Forest restricted to the top-k
    columns is scored three ways:

    * accuracy on a stratified 25% hold-out split of ``x``/``y``,
    * mean accuracy of a 3-fold stratified cross-validation on ``x``/``y``,
    * number of rows of ``test`` predicted as label ``1`` by a model fitted
      on all of ``x``/``y``.

    Input:
    x = Feature set (must expose ``.columns`` and column-list indexing,
        e.g. a pandas DataFrame)
    y = Target
    n_features = How many of the top-ranked features to iterate over. Values
                 above the number of columns in ``x`` are clamped to it;
                 values <= 0 return four empty lists without fitting anything.
    test = Test set; must contain the columns of ``x``

    Output:

    Prints every iteration, with current number of features fitted and
    corresponding accuracy - cv - predicted class counts, followed by a
    summary of the best CV / accuracy iterations and the elapsed time.

    Returns the tuple (drop_features, use_features, pos_dropper,
    pos_non_dropper), where each feature is classified by comparing the
    iteration that added it against the previous iteration (the baseline for
    the very first feature is 0):

    drop_features = Features for which BOTH hold-out accuracy and mean CV
                    were less than or equal to the previous iteration
    use_features = All other features (accuracy or CV improved)
    pos_dropper = Features for which the predicted count of label 1 was less
                  than or equal to the previous iteration
    pos_non_dropper = Features for which that count increased
    '''
    # Nothing to evaluate: avoid an expensive fit followed by max() on an
    # empty list further down.
    if n_features <= 0:
        return [], [], [], []

    start_time = time.time()

    acc_list = []
    cvs = []
    positive_class = []
    features_list = []

    drop_features = []
    use_features = []
    pos_dropper = []
    pos_non_dropper = []

    # Metrics of the previous iteration, used as the comparison baseline.
    acc_point = 0
    cv_point = 0
    pos_class = 0

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)

    # Rank all features once using a model fitted on the full feature set.
    model_0 = RandomForestClassifier(n_estimators=200, random_state=24, n_jobs=-1)
    model_0.fit(x, y)
    feature_importances_df = pd.DataFrame(
        model_0.feature_importances_,
        index=x.columns,
        columns=['importance'],
    ).sort_values('importance', ascending=False)
    ranked_features = list(feature_importances_df.index)

    # Asking for more features than exist would only repeat the last
    # iteration and append the same feature to the result lists again.
    n_features = min(n_features, len(ranked_features))

    # The split uses a fixed random_state, so it is identical on every
    # iteration; compute it once instead of inside the loop.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, shuffle=True, stratify=y, random_state=24
    )

    for k in range(1, n_features + 1):
        features_selected = ranked_features[:k]
        newest_feature = features_selected[-1]
        features_list.append(features_selected)

        # 1) Hold-out accuracy.
        model = RandomForestClassifier(n_estimators=200, random_state=24, n_jobs=-1)
        model.fit(x_train[features_selected], y_train)
        acc = accuracy_score(y_test, model.predict(x_test[features_selected]))
        acc_list.append(acc)

        # 2) Cross-validated accuracy (parallelism is handled by
        #    cross_val_score, so the estimator itself stays single-job).
        model_cv = RandomForestClassifier(n_estimators=200, random_state=24)
        scores = cross_val_score(
            model_cv, x[features_selected], y, cv=cv, n_jobs=-1,
            scoring="accuracy", verbose=0, error_score='raise',
        )
        cv_mean = np.mean(scores)
        cvs.append(cv_mean)

        # 3) Predicted class counts on the external test set.
        model_new = RandomForestClassifier(n_estimators=200, random_state=24, n_jobs=-1)
        model_new.fit(x[features_selected], y)
        predictions = list(model_new.predict(test[features_selected]))
        count_of_zeros = predictions.count(0)
        count_of_ones = predictions.count(1)
        positive_class.append(count_of_ones)

        # A feature is a "dropper" only if neither metric improved.
        if acc <= acc_point and cv_mean <= cv_point:
            drop_features.append(newest_feature)
        else:
            use_features.append(newest_feature)
        acc_point, cv_point = acc, cv_mean

        if count_of_ones <= pos_class:
            pos_dropper.append(newest_feature)
        else:
            pos_non_dropper.append(newest_feature)
        pos_class = count_of_ones

        print(10*'*******', f'\n First {k} features: {features_selected} \n\n Prediction Accuracy: {acc} \n')
        print(f' CV : {cv_mean}, std: {np.std(scores)} ')
        print("\n")
        print(' -- Class counts -- \n')
        print(f'1 : {count_of_ones} , 0 : {count_of_zeros}')

    # Best iterations (first occurrence wins on ties, as with list.index).
    max_cv = max(cvs)
    max_cv_index = cvs.index(max_cv)

    max_acc = max(acc_list)
    max_acc_index = acc_list.index(max_acc)

    total_time = time.time() - start_time
    print(10*'############')
    print(f'\n Max CV: {max_cv} \n\n first {max_cv_index+1} features: {features_list[max_cv_index]}','\n')
    print(f'\n\n Max Accuracy: {max_acc} \n\n first {max_acc_index+1} features: {features_list[max_acc_index]}','\n')
    print(f'\n\n Positive Class with Max CV: {positive_class[max_cv_index]} \n\n first {max_cv_index+1} features: {features_list[max_cv_index]}','\n')

    print(f'Total time taken: {total_time} seconds \n')

    return drop_features, use_features, pos_dropper, pos_non_dropper