In [36]:
import numpy as np
import scipy
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

dataset = np.loadtxt("./features.txt", delimiter=" ");
print(dataset.shape)

X = dataset[:, :35]
Y = dataset[:, 35]
botnetDataset = dataset[dataset[:,35]==1, :]
normalDataset = dataset[dataset[:,35]==0, :]

botnetX = botnetDataset[:,:35]
botnetY = botnetDataset[:, 35]

normalX = normalDataset[:,:35]
normalY = normalDataset[:, 35]

scaler = MinMaxScaler(feature_range=(0, 1))
rescaledBotnetX = scaler.fit_transform(botnetX)
rescaledNormalX = scaler.fit_transform(normalX)
rescaledX = scaler.fit_transform(X)

(52580, 36)


In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

featureColumns = ["V"+str(i) for i in range(1, dataset.shape[1]+1)]
df = pd.DataFrame(dataset, columns=featureColumns)
df.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36
count,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,...,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0,52580.0
mean,1.282103,1.002548,0.506599,4.392716,0.0,1.134119,4.899315,3.813389,2.728129,3.072898,...,10569.08,9641.168,352838000000.0,4518.123,20.755192,307.37073,2961.994,0.002377,0.001727,0.922005
std,1.107787,0.050419,13.209787,116.384575,0.0,0.478054,117.14605,110.460058,97.378284,100.735436,...,891488.0,562230.7,57707740000000.0,593990.1,64.693913,9051.404227,49827.49,0.055285,0.039456,0.268167
min,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,124.0,124.0,0.0,0.0,1.0,1.0,3.0,0.0,0.0,1.0
50%,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,186.0,186.0,0.0,0.0,1.0,1.0,3.0,0.0,0.0,1.0
75%,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,186.0,186.0,0.0,0.0,2.0,3.0,8.0,0.0,0.0,1.0
max,19.0,2.0,1260.0,21407.0,0.0,34.0,21407.0,21407.0,21407.0,21407.0,...,185300000.0,73524130.0,1.081136e+16,103977700.0,256.0,347295.0,3384562.0,3.0,1.0,1.0


In [45]:
'''
Feature Information
Feature Set 1: Generic features
    F1 - Total Source IPs per destination IP
    F2 - Total Protocols used for communication per destination IP
    F3 - Total Bidirectional flows per destination IP
    F4 - Total Client flows per destination IP
    F5 - Total Server flows per destination IP
    F6 - Protocols used for communication represented as bit string
    
Feature Set 2: Aggregate features
    F7 : F12 - Total, Max, Min, Mean, Variance, Std of Flows per destination IP
    F13 : F18 - Total, Max, Min, Mean, Variance, Std of Packets per destination IP
    F19 : F24 - Total, Max, Min, Mean, Variance, Std of Bytes per destination IP
    F25 : F30 - Total, Max, Min, Mean, Variance, Std of SourceBytes per destination IP
    
Feature Set 3: Subnet Features
    F31 - No. of distinct IPs in dstIP/24 subnet
    F32 - Total Flows in dstIP/24 subnet
    F33 - Total Packets in dstIP/24 subnet
    
Feature Set 4: Periodic Communication Features
    F34 - Total periodic communications involved per destination IP
    F35 - Ratio of total source IPs involved in periodic communication over total source IPs involved per dst IP
'''

genericFeatures = list(range(6))
aggregateFeatures = list(range(6, 30))
subnetFeatures = list(range(30, 33))
periodicCommnFeatures = [33, 34]

genericAggregate = genericFeatures + aggregateFeatures
genericSubnet = genericFeatures + subnetFeatures
genericPeriodic = genericFeatures + periodicCommnFeatures
aggregateSubnet = aggregateFeatures + subnetFeatures
aggregatePeriodic = aggregateFeatures + periodicCommnFeatures
subnetPeriodic = subnetFeatures + periodicCommnFeatures

withoutGenericFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in genericFeatures]
withoutAggregateFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in aggregateFeatures]
withoutSubnetFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in subnetFeatures]
withoutPeriodicCommnFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in periodicCommnFeatures]

configuration = [genericFeatures, aggregateFeatures, subnetFeatures, periodicCommnFeatures, 
                 genericAggregate, genericSubnet, genericPeriodic, aggregateSubnet, aggregatePeriodic, subnetPeriodic,
                 withoutGenericFeatures, withoutAggregateFeatures, withoutSubnetFeatures, withoutPeriodicCommnFeatures]

def updateColumns(columns):
    truncBotX = rescaledBotnetX[:, columns]
    truncNormalX = rescaledNormalX[:, columns]
    truncX = rescaledX[:, columns]
    return truncBotX, truncNormalX, truncX

In [46]:
def GetNextBotnetSet(rescaledX, Y):
    step = 4000
    prevStep = 0
    while step < rescaledX.shape[0]:
        yield rescaledX[prevStep:step, :], Y[prevStep:step]
        prevStep = step
        step = step + 4000
        
    return rescaledX[prevStep:, :], Y[prevStep:]

In [47]:
from sklearn.cross_validation import train_test_split

for config in configuration:
    truncBotX, truncNormalX, truncX = updateColumns(config)
    scores = []
    for botX, botY in GetNextBotnetSet(truncBotX, botnetY):
        X = np.concatenate((botX, truncNormalX))
        Y = np.concatenate((botY, normalY))
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)
        model2 = RandomForestClassifier(n_estimators=100)
        model2 = model2.fit(X_train, Y_train)
        scores.append(model2.score(X_test, Y_test))
    print("Avg Accuracy for features ", config, ": ", np.mean(scores))

Avg Accuracy for features  [0, 1, 2, 3, 4, 5] :  0.998097295074
Avg Accuracy for features  [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] :  0.996348863519
Avg Accuracy for features  [30, 31, 32] :  0.962485858274
Avg Accuracy for features  [33, 34] :  0.505656690322
Avg Accuracy for features  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] :  0.998251568446
Avg Accuracy for features  [0, 1, 2, 3, 4, 5, 30, 31, 32] :  0.998200143988
Avg Accuracy for features  [0, 1, 2, 3, 4, 5, 33, 34] :  0.997994446159
Avg Accuracy for features  [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32] :  0.997865885015
Avg Accuracy for features  [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33, 34] :  0.996760259179
Avg Accuracy for features  [30, 31, 32, 33, 34] :  0.962434433817
Avg Accuracy for fea

In [48]:
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics

for config in configuration:
    truncBotX, truncNormalX, truncX = updateColumns(config)
    scores_avg = []
    for botX, botY in GetNextBotnetSet(truncBotX, botnetY):
        X = np.concatenate((botX, truncNormalX))
        Y = np.concatenate((botY, normalY))
        scores = cross_val_score(RandomForestClassifier(), X, Y, scoring='accuracy', cv=10)
        predicted = cross_val_predict(RandomForestClassifier(), X, Y, cv=10)
        scores_avg.append(scores.mean())
    print("Avg Accuracy for features ", config, ": ", np.mean(scores_avg))

Avg Accuracy for features  [0, 1, 2, 3, 4, 5] :  0.996924236197
Avg Accuracy for features  [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] :  0.995011417089
Avg Accuracy for features  [30, 31, 32] :  0.94510194192
Avg Accuracy for features  [33, 34] :  0.505431832367
Avg Accuracy for features  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] :  0.99693467649
Avg Accuracy for features  [0, 1, 2, 3, 4, 5, 30, 31, 32] :  0.993509562446
Avg Accuracy for features  [0, 1, 2, 3, 4, 5, 33, 34] :  0.99688312199
Avg Accuracy for features  [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32] :  0.994281103956
Avg Accuracy for features  [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 33, 34] :  0.995011353661
Avg Accuracy for features  [30, 31, 32, 33, 34] :  0.944597649094
Avg Accuracy for featur