In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import mutual_info_score
import operator
import timeit

In [2]:
# Load the training set. The path is relative to the notebook's working
# directory. NOTE(review): the file name suggests a normalised two-class KDD
# training split -- confirm the provenance and record it here.
train = pd.read_csv('dataset/kddtrain_2class_normalized.csv')

In [3]:
# Separate the label column ('attack_name') from the feature columns.
train_y = train['attack_name']
train_x = train.drop(['attack_name'], axis = 1)
# ndarray version of the feature frame; fitness() selects columns from it by position.
train_array = np.array(train_x)

In [4]:
# Sanity check: the feature matrix and the label vector must have the same row count.
print(train_x.shape)
print(train_y.shape)

(125973, 41)
(125973,)


In [5]:
def init_subset(x, size, n_features=41):
    """Create ``x`` random binary feature masks, each with ``size`` features on.

    Parameters
    ----------
    x : int
        Number of ants, i.e. number of masks (rows) to generate.
    size : int
        Number of features switched on in every mask.
    n_features : int, optional
        Width of each mask (total number of candidate features). Defaults to
        41, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(x, n_features)`` containing only 0 and 1;
        every row sums to ``size``.

    Raises
    ------
    ValueError
        If ``size`` is larger than ``n_features`` (sampling without
        replacement is impossible) or ``x`` is negative.
    """
    choice = np.zeros((x, n_features))
    feature_ids = np.arange(n_features)
    for row in choice:
        # Each row is a view into `choice`, so assigning through it fills the matrix.
        row[np.random.choice(feature_ids, size=size, replace=False)] = 1
    return choice

In [10]:
def fitness(particle):
    """Score a feature mask by 3-fold cross-validated accuracy.

    ``particle`` is a 1-D mask; the columns of the global ``train_array`` at
    its non-zero positions are used to fit a ``LogisticRegression`` against
    the global ``train_y``. Returns the mean accuracy over the three folds.
    """
    selected_columns = np.nonzero(particle)[0]
    X = train_array[:, selected_columns]
    model = LogisticRegression(n_jobs=4)
    fold_scores = cross_val_score(model, X, train_y, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [11]:
def mrmr(i, t):
    """Return the mRMR score of candidate feature ``i`` against mask ``t``.

    The score is the mutual information between feature ``i`` and the label
    (relevance) minus the mean mutual information between feature ``i`` and
    every feature currently switched on in ``t`` (redundancy).

    Parameters
    ----------
    i : int
        Positional index of the candidate feature in the global ``train_x``.
    t : array-like
        1-D feature mask; non-zero entries mark the already selected features.

    Returns
    -------
    float
        Relevance minus mean redundancy. When ``t`` selects nothing, the
        redundancy term is zero and the plain relevance is returned.
    """
    selected = np.nonzero(t)[0]
    # Index train_x (not train) so that feature ids mean the same thing here
    # as in fitness() and in the final model, wherever the label column sits.
    # .iloc is the positional replacement for the removed .ix indexer.
    candidate = train_x.iloc[:, i]
    relevance = mutual_info_score(candidate, train_y)
    if len(selected) == 0:
        # Nothing selected yet: avoid dividing by zero below.
        return relevance
    redundancy = sum(
        mutual_info_score(candidate, train_x.iloc[:, a]) for a in selected
    )
    # 1.0 keeps this a true division regardless of interpreter version.
    return relevance - (1.0 / len(selected)) * redundancy

In [12]:
def usm(t, p):
    """Swap ``p`` features of mask ``t`` in place.

    ``p`` randomly chosen active features are switched off; then, ``p`` times,
    the inactive feature with the highest score
    ``phero[i] ** 0.2 * mrmr(i, t) ** 0.1`` is switched back on. Scores are
    recomputed after every re-activation because ``mrmr`` depends on ``t``.

    Parameters
    ----------
    t : numpy.ndarray
        1-D 0/1 feature mask; modified in place.
    p : int
        Number of features to swap. Must not exceed the number of active
        features in ``t`` (``np.random.choice`` raises ``ValueError`` otherwise).

    Returns
    -------
    None

    Notes
    -----
    Reads the global ``phero`` and calls the sibling function ``mrmr``.
    """
    pheromone_exp = 0.2
    heuristic_exp = 0.1
    # Flattened so both a (1, n) and an (n,) pheromone array can be indexed
    # by feature id.
    pheromone = np.ravel(phero)

    off = np.random.choice(np.nonzero(t)[0], size=p, replace=False)
    t[off] = 0

    for _ in range(p):
        usm_val = {}
        for i in np.nonzero(t == 0)[0]:
            # A negative pheromone or mRMR value raised to a fractional power
            # is NaN; NaN compares False with everything and would make the
            # max() below pick an arbitrary feature. Rank such candidates last.
            with np.errstate(invalid='ignore'):
                score = (np.power(pheromone[i], pheromone_exp)
                         * np.power(mrmr(i, t), heuristic_exp))
            usm_val[i] = score if np.isfinite(score) else -np.inf
        # Scores are deliberately not normalised by their sum: dividing by a
        # common positive constant cannot change which feature is largest.
        best = max(usm_val.items(), key=operator.itemgetter(1))[0]
        t[best] = 1

In [13]:
# --- Search configuration ----------------------------------------------------
num_ants = 100
num_itr = 20

k = 50          # number of best ants carried over to the next iteration
k_step = 3      # NOTE(review): defined but unused -- the loop grows k by 5
w = 0.1         # pheromone scaling factor
w_step = 0.06   # NOTE(review): defined but unused -- the loop grows w by 0.1

p = 2           # features swapped per elite ant by usm()
g = 10          # NOTE(review): defined but unused -- "g best" below uses k
fittest_accuracy = 0
old_accuracy = np.zeros(num_ants)
size = 15       # features switched on in every freshly initialised ant

# Seed the global NumPy RNG so that a Restart & Run All reproduces the search.
np.random.seed(0)

# Initialise the initial ant population.
subset = init_subset(num_ants, size)

# One pheromone value per feature, 1-D so that phero[i] addresses feature i.
phero = np.ones(subset.shape[1])

for i in range(num_itr):
    print(i)

    start = timeit.default_timer()

    # Cross-validated accuracy of every ant.
    print("finding accuracy")
    accuracy = [fitness(ant) for ant in subset]
    new_accuracy = np.array(accuracy)

    # Change relative to the ant that occupied the same slot last iteration.
    # NOTE(review): slots are re-filled every iteration (elite ants first,
    # then fresh random ants), so slot j need not hold "the same" ant.
    del_accuracy = new_accuracy - old_accuracy
    old_accuracy = new_accuracy

    max_accuracy = max(accuracy)

    print("max_accuracy: ", max_accuracy)
    print("fittest_ant accuracy: ", fittest_accuracy)

    # k best ants, best first. Fancy indexing copies, so usm() below can
    # mutate k_best without touching `subset`.
    print("finding k_best")
    index_top = new_accuracy.argsort()[-k:][::-1]
    k_best = subset[index_top, :]

    # Same ants encoded as +1 (feature on) / -1 (feature off).
    print("finding g_best")
    g_best = np.where(k_best == 0, -1, k_best)

    # Remember the best ant seen so far (copied so it owns its data).
    if max_accuracy > fittest_accuracy:
        fittest_accuracy = max_accuracy
        fittest_ant = subset[index_top[0], :].copy()

    # Global importance update. The accuracies must be taken in the same
    # order as the rows of g_best, i.e. through index_top.
    print("updating pheromone")
    top_accuracy = new_accuracy[index_top]
    top_delta = del_accuracy[index_top]
    improved = top_delta >= 0
    worsened = top_delta < 0
    del_t_reward = np.dot(g_best[improved].T, top_accuracy[improved]) / k
    del_t_penalty = np.dot(g_best[worsened].T, top_accuracy[worsened]) / k

    # NOTE(review): this replaces the pheromone instead of accumulating it,
    # and individual entries can become negative.
    phero = w * (del_t_reward - del_t_penalty)

    # Fresh random ants for the slots not taken by the elite.
    new_subset = init_subset(num_ants - k, size)

    # Let every elite ant swap p features using the updated pheromone.
    for t in k_best:
        usm(t, p)

    subset = np.concatenate((k_best, new_subset), axis=0)

    # Grow the elite, but never beyond the population: otherwise
    # num_ants - k goes negative and init_subset() fails.
    k = min(k + 5, num_ants)
    w = w + 0.1

    end = timeit.default_timer()

    print("time of execution: ", end - start)

0
finding accuracy
max_accuracy:  0.95312489334
fittest_ant accuracy:  0
finding k_best
finding g_best
updating pheromone


KeyboardInterrupt: 

In [14]:
# Best cross-validated accuracy among the ants of the last evaluated iteration
# (`accuracy` is rebuilt at the start of every iteration of the search loop).
max(accuracy)

0.95312489333994022

In [15]:
# Positional indices of the features switched on in the best ant seen so far.
high = np.nonzero(fittest_ant)[0]

In [16]:
# Number of features selected by the best ant.
len(high)

15

In [17]:
# Load the test set. NOTE(review): presumably preprocessed the same way as the
# training file (same columns, same normalisation) -- confirm.
test = pd.read_csv('dataset/KDDTest+_normalized_2.csv')

In [18]:
# Separate the label column ('attack_name') from the test features.
test_y = test['attack_name']
test_x = test.drop(['attack_name'], axis = 1)

In [19]:
# Refit on the whole training set using only the selected feature columns,
# then predict the test set. `high` holds positional indices, so .iloc is
# used (the .ix indexer is no longer available in current pandas).
# NOTE(review): assumes test_x has the same column order as train_x -- confirm.
log_reg = LogisticRegression(n_jobs=-1)
log_reg.fit(train_x.iloc[:, high], train_y)
pred = log_reg.predict(test_x.iloc[:, high])

In [20]:
# Fraction of test rows whose predicted label equals the true label.
from sklearn.metrics import accuracy_score
accuracy_score(test_y, pred)

0.7238289567068843