In [1]:
import numpy as np
from numpy.random import rand
from functionHO import Fun
import math
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
def init_position(lb, ub, N, dim):
    """Draw the initial population uniformly between the per-dimension bounds.

    Parameters
    ----------
    lb, ub : 2-D arrays
        Lower / upper bounds; only row 0 and the first ``dim`` columns are read.
    N : int
        Number of rows (candidate solutions) to generate.
    dim : int
        Number of columns (dimensions) per candidate.

    Returns
    -------
    ndarray of shape (N, dim)
        ``lb + (ub - lb) * r`` with ``r`` uniform in [0, 1).
    """
    lower = lb[0, :dim]
    span = ub[0, :dim] - lower
    # One vectorised draw replaces N * dim scalar rand() calls.  rand(N, dim)
    # fills the array row by row from the same global stream, so a seeded run
    # yields exactly the values the nested loop produced.
    return lower + span * rand(N, dim)

In [3]:
def binary_conversion(X, thres, N, dim):
    """Threshold a real-valued population into a 0/1 selection mask.

    Parameters
    ----------
    X : 2-D array
        Continuous positions; the first ``N`` rows and ``dim`` columns are read.
    thres : float
        An entry becomes 1 only when it is strictly greater than ``thres``.
    N, dim : int
        Shape of the returned mask.

    Returns
    -------
    ndarray of shape (N, dim), dtype int
        1 where ``X > thres``, otherwise 0.
    """
    Xbin = np.zeros([N, dim], dtype='int')
    # Boolean-mask assignment replaces the element-by-element Python loop;
    # a mask whose shape differs from (N, dim) still raises IndexError.
    Xbin[X[:N, :dim] > thres] = 1
    return Xbin

In [4]:
def boundary(x, lb, ub):
    """Clamp the scalar ``x`` into ``[lb, ub]`` and return the result.

    The lower bound is applied first, then the upper bound.
    """
    raised = lb if x < lb else x
    return ub if raised > ub else raised

In [5]:
# Levy flight step generator
def levy_distribution(beta, dim):
    """Return ``dim`` heavy-tailed step sizes for the Levy-flight move.

    A scale ``sigma`` is computed from ``beta`` via gamma functions, then two
    independent standard-normal vectors are drawn (numerator first, then
    denominator) and combined as ``u * sigma / |v| ** (1 / beta)``.

    Parameters
    ----------
    beta : float
        Levy exponent; must be non-zero because ``1 / beta`` is used.
    dim : int
        Length of the returned vector.

    Returns
    -------
    ndarray of shape (dim,)
    """
    inv_beta = 1 / beta

    # Scale of the numerator draw
    numerator = math.gamma(1 + beta) * np.sin(np.pi * beta / 2)
    denominator = math.gamma((1 + beta) / 2) * beta * 2 ** ((beta - 1) / 2)
    scale = (numerator / denominator) ** inv_beta

    # The draw order (scaled numerator, then denominator) fixes the random stream
    scaled_normal = np.random.randn(dim) * scale
    plain_normal = np.random.randn(dim)

    return scaled_normal / abs(plain_normal) ** inv_beta

In [6]:
def jfs(xtrain, ytrain, opts):
    """Select a feature subset with a binary flower-pollination search.

    Each candidate is a real vector in [0, 1]^dim; a feature counts as selected
    when its coordinate is strictly greater than 0.5.  Candidates are scored
    with ``Fun`` (treated as a cost: lower is better) and a candidate is only
    replaced when its new position scores strictly lower.

    Parameters
    ----------
    xtrain : 2-D array
        Only its column count is read here (it fixes ``dim``); it is otherwise
        passed through to ``Fun`` unchanged.
    ytrain : array
        Passed through to ``Fun`` unchanged.
    opts : dict
        Required keys: ``'N'`` (population size, at least 2) and ``'T'``
        (number of generations).  Optional keys: ``'P'`` (probability of the
        global-pollination branch, default 0.8) and ``'beta'`` (Levy exponent,
        default 1.5).  The whole dict is also forwarded to ``Fun``.

    Returns
    -------
    dict
        ``'sf'``: indices of the selected features, ``'c'``: array of shape
        ``(1, T)`` holding the best fitness after each generation, ``'nf'``:
        number of selected features.

    Raises
    ------
    ValueError
        If ``opts['N']`` is smaller than 2; both pollination rules need two
        distinct flowers from the population.
    """
    # Parameters
    ub     = 1
    lb     = 0
    thres  = 0.5
    gamma  = 0.01   # step-size scaling of the Levy move
    beta   = 1.5    # levy component
    P      = 0.8    # switch probability

    N        = opts['N']
    max_iter = opts['T']
    if 'P' in opts:
        P    = opts['P']
    if 'beta' in opts:
        beta = opts['beta']

    # R[0] and R[1] below index a permutation of range(N); with N < 2 the
    # original code failed with an opaque IndexError in the middle of a run.
    if N < 2:
        raise ValueError("opts['N'] must be at least 2")

    # Dimension
    dim = np.size(xtrain, 1)
    if np.size(lb) == 1:
        ub = ub * np.ones([1, dim], dtype='float')
        lb = lb * np.ones([1, dim], dtype='float')

    # Initialize position
    X     = init_position(lb, ub, N, dim)

    # Binary conversion
    Xbin  = binary_conversion(X, thres, N, dim)

    # Fitness at first iteration
    fit   = np.zeros([N, 1], dtype='float')
    Xgb   = np.zeros([1, dim], dtype='float')
    fitG  = float('inf')

    for i in range(N):
        fit[i,0] = Fun(xtrain, ytrain, Xbin[i,:], opts)
        if fit[i,0] < fitG:
            Xgb[0,:] = X[i,:]
            fitG     = fit[i,0]

    # Pre
    curve = np.zeros([1, max_iter], dtype='float')
    t     = 0

    # fitG is still the plain Python float('inf') when no candidate scored a
    # finite value, and a float has no .copy(); storing the scalar into the
    # array copies the value anyway.
    curve[0,t] = fitG
    print("Generation:", t + 1)
    print("Best (MGFPA):", curve[0,t])
    t += 1

    while t < max_iter:
        Xnew  = np.zeros([N, dim], dtype='float')

        for i in range(N):
            # Global pollination
            if rand() < P:
                # Levy distribution (3)
                L = levy_distribution(beta, dim)
                for d in range(dim):
                    #--- update
                    if rand() < 0.5:
                        # Global pollination (2)
                        Xnew[i,d] = X[i,d] + gamma * L[d] * (Xgb[0,d] - X[i,d])
                    else:
                        #--- Different flower A, B in same species
                        R   = np.random.permutation(N)
                        A   = R[0]
                        B   = R[1]
                        #--- Epsilon [0 to 1]
                        r2 = rand()
                        #--- Pollination (6): uniform point between the two flowers
                        Xnew[i,d] = (max(X[A,d], X[B,d]) - min(X[A,d], X[B,d])) * r2 + min(X[A,d], X[B,d])

                    # Boundary
                    Xnew[i,d] = boundary(Xnew[i,d], lb[0,d], ub[0,d])

            # Local pollination
            else:
                # Different flower A, B in same species
                R   = np.random.permutation(N)
                A   = R[0]
                B   = R[1]
                # Epsilon [0 to 1]
                r1  = rand()
                for d in range(dim):
                    # Local pollination (4)
                    Xnew[i,d] = X[i,d] + r1 * (X[A,d] - X[B,d])
                    # Boundary
                    Xnew[i,d] = boundary(Xnew[i,d], lb[0,d], ub[0,d])

        # Binary conversion
        Xbin = binary_conversion(Xnew, thres, N, dim)

        # Greedy selection: keep the new position only if it is strictly better
        for i in range(N):
            Fnew = Fun(xtrain, ytrain, Xbin[i,:], opts)
            if Fnew < fit[i,0]:
                X[i,:]   = Xnew[i,:]
                fit[i,0] = Fnew

            if fit[i,0] < fitG:
                Xgb[0,:] = X[i,:]
                fitG     = fit[i,0]

        # Store result
        curve[0,t] = fitG
        print("Generation:", t + 1)
        print("Best (MGFPA):", curve[0,t])
        t += 1


    # Best feature subset
    Gbin       = binary_conversion(Xgb, thres, 1, dim)
    Gbin       = Gbin.reshape(dim)
    pos        = np.asarray(range(0, dim))
    sel_index  = pos[Gbin == 1]
    num_feat   = len(sel_index)
    # Create dictionary
    mgfpa_data = {'sf': sel_index, 'c': curve, 'nf': num_feat}

    return mgfpa_data

In [16]:
# Load the pre-cleaned feature table from the working directory.
# The following cells read a 'churn' column from it as the target.
data  = pd.read_csv('clean_feature_columns.csv')

In [21]:
# Separate the target from the predictors: 'churn' is the label column and
# every remaining column is a feature.
label = data['churn'].to_numpy()               # label vector
feat  = data.drop(columns='churn').to_numpy()  # feature vector

In [22]:
# Sanity check: (rows, feature columns) of the feature matrix.
feat.shape

(100000, 1026)

In [23]:
# Sanity check: the label vector should have one entry per row of feat.
label.shape

(100000,)

In [30]:
def create_label(num):
    """Map a number to the class string 'y' (strictly positive) or 'n' (otherwise)."""
    return 'y' if num > 0 else 'n'

In [31]:
# Wrap create_label so it can be applied element-wise to a whole NumPy array.
# np.vectorize is a convenience wrapper (a Python-level loop), not a speed-up.
create_label_vectorized = np.vectorize(create_label)

In [32]:
# Build the 'y'/'n' labels from the raw 'churn' column instead of from `label`
# itself.  The original `label = f(label)` was not idempotent: running the cell
# a second time fed the already-converted strings into `num > 0` and failed.
label = create_label_vectorized(data['churn'].to_numpy())

In [24]:
# Sanity check: the full table has one more column than feat (the 'churn' column).
data.shape

(100000, 1027)

In [34]:
# split data into train & validation (70 -- 30)
# stratify=label keeps the class proportions equal in both parts;
# random_state pins the shuffle so the split (and every fitness value computed
# from it) is the same on each Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(
    feat, label, test_size=0.3, stratify=label, random_state=42
)
fold = {'xt': xtrain, 'yt': ytrain, 'xv': xtest, 'yv': ytest}

In [35]:
# Search and classifier parameters, bundled into `opts` for jfs
k     = 5     # k-value in KNN (not read by jfs itself; presumably consumed by Fun -- confirm)
N     = 10    # population size: number of candidate solutions (flowers) in the search
T     = 100   # maximum number of iterations
maxLt = 10    # maximum iteration for local search algorithm (not read by jfs in this notebook -- confirm it is used)
opts  = {'k':k, 'fold':fold, 'N':N, 'T':T, 'maxLt':maxLt}

In [36]:
# perform feature selection
# jfs draws from NumPy's global RNG (rand, randn, permutation); seeding it
# here makes the search's own random draws repeatable from run to run.
np.random.seed(42)
fmdl  = jfs(feat, label, opts)
sf    = fmdl['sf']   # indices of the selected feature columns

Generation: 1
Best (MGFPA): 0.46476352046783626
Generation: 2
Best (MGFPA): 0.45808247953216374
Generation: 3
Best (MGFPA): 0.45808247953216374
Generation: 4
Best (MGFPA): 0.45808247953216374
Generation: 5
Best (MGFPA): 0.45808247953216374
Generation: 6
Best (MGFPA): 0.456341
Generation: 7
Best (MGFPA): 0.45227601364522413
Generation: 8
Best (MGFPA): 0.45227601364522413
Generation: 9
Best (MGFPA): 0.450101081871345
Generation: 10
Best (MGFPA): 0.450101081871345
Generation: 11
Best (MGFPA): 0.4471534931773879
Generation: 12
Best (MGFPA): 0.4471534931773879
Generation: 13
Best (MGFPA): 0.4471534931773879
Generation: 14
Best (MGFPA): 0.4471534931773879
Generation: 15
Best (MGFPA): 0.4471534931773879
Generation: 16
Best (MGFPA): 0.4471534931773879
Generation: 17
Best (MGFPA): 0.4471534931773879
Generation: 18
Best (MGFPA): 0.4471534931773879
Generation: 19
Best (MGFPA): 0.44677030799220274
Generation: 20
Best (MGFPA): 0.44677030799220274
Generation: 21
Best (MGFPA): 0.44677030799220274
Gen

In [None]:
# Display the indices of the selected feature columns.
sf