In [6]:
import numpy as np
import scipy
from sklearn.preprocessing import MinMaxScaler
from sklearn.exceptions import UndefinedMetricWarning
import warnings
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UndefinedMetricWarning) 

dataset = np.loadtxt("./features.txt", delimiter=" ");
print(dataset.shape)

X = dataset[:, :35]
Y = dataset[:, 35]
botnetDataset = dataset[dataset[:,35]==1, :]
normalDataset = dataset[dataset[:,35]==0, :]

botnetX = botnetDataset[:,:35]
botnetY = botnetDataset[:, 35]

normalX = normalDataset[:,:35]
normalY = normalDataset[:, 35]

scaler = MinMaxScaler(feature_range=(0, 1))
rescaledBotnetX = scaler.fit_transform(botnetX)
rescaledNormalX = scaler.fit_transform(normalX)
rescaledX = scaler.fit_transform(X)


(52580, 36)


In [7]:
'''
Feature Information
Feature Set 1: Generic features
    F1 - Total Source IPs per destination IP
    F2 - Total Protocols used for communication per destination IP
    F3 - Total Bidirectional flows per destination IP
    F4 - Total Client flows per destination IP
    F5 - Total Server flows per destination IP
    F6 - Protocols used for communication represented as bit string
    
Feature Set 2: Aggregate features
    F7 : F12 - Total, Max, Min, Mean, Variance, Std of Flows per destination IP
    F13 : F18 - Total, Max, Min, Mean, Variance, Std of Packets per destination IP
    F19 : F24 - Total, Max, Min, Mean, Variance, Std of Bytes per destination IP
    F25 : F30 - Total, Max, Min, Mean, Variance, Std of SourceBytes per destination IP
    
Feature Set 3: Subnet Features
    F31 - No. of distinct IPs in dstIP/24 subnet
    F32 - Total Flows in dstIP/24 subnet
    F33 - Total Packets in dstIP/24 subnet
    
Feature Set 4: Periodic Communication Features
    F34 - Total periodic communications involved per destination IP
    F35 - Ratio of total source IPs involved in periodic communication over total source IPs involved per dst IP
'''

genericFeatures = list(range(6))
aggregateFeatures = list(range(6, 30))
subnetFeatures = list(range(30, 33))
periodicCommnFeatures = [33, 34]

genericAggregate = genericFeatures + aggregateFeatures
genericSubnet = genericFeatures + subnetFeatures
genericPeriodic = genericFeatures + periodicCommnFeatures
aggregateSubnet = aggregateFeatures + subnetFeatures
aggregatePeriodic = aggregateFeatures + periodicCommnFeatures
subnetPeriodic = subnetFeatures + periodicCommnFeatures

withoutGenericFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in genericFeatures]
withoutAggregateFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in aggregateFeatures]
withoutSubnetFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in subnetFeatures]
withoutPeriodicCommnFeatures = [columnIndex for columnIndex in range(35) if columnIndex not in periodicCommnFeatures]
allFeatures = range(35)

configuration = [genericFeatures, aggregateFeatures, subnetFeatures, periodicCommnFeatures, 
                 genericAggregate, genericSubnet, genericPeriodic, aggregateSubnet, aggregatePeriodic, subnetPeriodic,
                 withoutGenericFeatures, withoutAggregateFeatures, withoutSubnetFeatures, withoutPeriodicCommnFeatures,
                 allFeatures
                ]

configToString = { repr(genericFeatures) : "FeatureSet 1",
                   repr(aggregateFeatures) : "FeatureSet 2",
                   repr(subnetFeatures) : "FeatureSet 3",
                   repr(periodicCommnFeatures) : "FeatureSet 4",
                   repr(genericAggregate) : "FeatureSet (1,2)",
                   repr(genericSubnet) : "FeatureSet (1,3)",
                   repr(genericPeriodic) : "FeatureSet (1,4)",
                   repr(aggregateSubnet) : "FeatureSet (2,3)",
                   repr(aggregatePeriodic) : "FeatureSet (2,4)",
                   repr(subnetPeriodic) : "FeatureSet (3,4)",
                   repr(withoutGenericFeatures) : "FeatureSet (2,3,4)",
                   repr(withoutAggregateFeatures) : "FeatureSet (1,3,4)",
                   repr(withoutSubnetFeatures) : "FeatureSet (1,2,4)",
                   repr(withoutPeriodicCommnFeatures) : "FeatureSet (1,2,3)",
                   repr(allFeatures) : "All Features"
                 }

def updateColumns(columns):
    truncBotX = rescaledBotnetX[:, columns]
    truncNormalX = rescaledNormalX[:, columns]
    truncX = rescaledX[:, columns]
    return truncBotX, truncNormalX, truncX

In [8]:
def GetNextBotnetSet(rescaledX, Y):
    step = 4000
    prevStep = 0
    while step < rescaledX.shape[0]:
        yield rescaledX[prevStep:step, :], Y[prevStep:step]
        prevStep = step
        step = step + 4000
        
    return rescaledX[prevStep:, :], Y[prevStep:]

In [9]:
for config in configuration:
    truncBotX, truncNormalX, truncX = updateColumns(config)
    scores = []
    fpr = []
    fnr = []
    for botX, botY in GetNextBotnetSet(truncBotX, botnetY):
        X = np.concatenate((botX, truncNormalX))
        Y = np.concatenate((botY, normalY))
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
        model2 = LogisticRegression()
        model2 = model2.fit(X_train, Y_train)
        predicted = model2.predict(X_test)
        scores.append(model2.score(X_test, Y_test))
        fpr.append(metrics.precision_score(Y_test, predicted))
        fnr.append(metrics.recall_score(Y_test, predicted))
    print("Feature Config: %s, Avg Accuracy: %.3f, Avg Precision Rate: %.3f, Avg Recall Rate: %.3f" %
            (configToString[repr(config)], np.mean(scores), np.mean(fpr), np.mean(fnr)))

Feature Config: FeatureSet 1, Avg Accuracy: 0.758, Avg Precision Rate: 0.695, Avg Recall Rate: 0.928
Feature Config: FeatureSet 2, Avg Accuracy: 0.494, Avg Precision Rate: 0.333, Avg Recall Rate: 0.003
Feature Config: FeatureSet 3, Avg Accuracy: 0.696, Avg Precision Rate: 0.668, Avg Recall Rate: 0.886
Feature Config: FeatureSet 4, Avg Accuracy: 0.492, Avg Precision Rate: 0.000, Avg Recall Rate: 0.000
Feature Config: FeatureSet (1,2), Avg Accuracy: 0.763, Avg Precision Rate: 0.701, Avg Recall Rate: 0.929
Feature Config: FeatureSet (1,3), Avg Accuracy: 0.825, Avg Precision Rate: 0.786, Avg Recall Rate: 0.901
Feature Config: FeatureSet (1,4), Avg Accuracy: 0.768, Avg Precision Rate: 0.706, Avg Recall Rate: 0.929
Feature Config: FeatureSet (2,3), Avg Accuracy: 0.700, Avg Precision Rate: 0.672, Avg Recall Rate: 0.886
Feature Config: FeatureSet (2,4), Avg Accuracy: 0.494, Avg Precision Rate: 0.333, Avg Recall Rate: 0.004
Feature Config: FeatureSet (3,4), Avg Accuracy: 0.700, Avg Precision Ra

In [10]:
for config in configuration:
    truncBotX, truncNormalX, truncX = updateColumns(config)
    scores = []
    fpr = []
    fnr = []
    for botX, botY in GetNextBotnetSet(truncBotX, botnetY):
        X = np.concatenate((botX, truncNormalX))
        Y = np.concatenate((botY, normalY))
        model = LogisticRegression()
        scores.append( cross_val_score(model, X, Y, scoring='accuracy', cv=10).mean() )
        fpr.append( cross_val_score(model, X, Y, scoring='precision', cv=10).mean() )
        fnr.append( cross_val_score(model, X, Y, scoring='recall', cv=10).mean() )
    print("Feature Config: %s, Avg Accuracy: %.3f, Avg Precision Rate: %.3f, Avg Recall Rate: %.3f" %
            (configToString[repr(config)], np.mean(scores), np.mean(fpr), np.mean(fnr)))

Feature Config: FeatureSet 1, Avg Accuracy: 0.776, Avg Precision Rate: 0.712, Avg Recall Rate: 0.921
Feature Config: FeatureSet 2, Avg Accuracy: 0.508, Avg Precision Rate: 0.214, Avg Recall Rate: 0.004
Feature Config: FeatureSet 3, Avg Accuracy: 0.685, Avg Precision Rate: 0.597, Avg Recall Rate: 0.885
Feature Config: FeatureSet 4, Avg Accuracy: 0.506, Avg Precision Rate: 0.000, Avg Recall Rate: 0.000
Feature Config: FeatureSet (1,2), Avg Accuracy: 0.782, Avg Precision Rate: 0.718, Avg Recall Rate: 0.921
Feature Config: FeatureSet (1,3), Avg Accuracy: 0.828, Avg Precision Rate: 0.772, Avg Recall Rate: 0.893
Feature Config: FeatureSet (1,4), Avg Accuracy: 0.787, Avg Precision Rate: 0.724, Avg Recall Rate: 0.922
Feature Config: FeatureSet (2,3), Avg Accuracy: 0.689, Avg Precision Rate: 0.600, Avg Recall Rate: 0.885
Feature Config: FeatureSet (2,4), Avg Accuracy: 0.618, Avg Precision Rate: 0.566, Avg Recall Rate: 0.984
Feature Config: FeatureSet (3,4), Avg Accuracy: 0.687, Avg Precision Ra