In [1]:
import numpy as np
import pandas as pd
import os

In [3]:
# --- Experiment configuration -------------------------------------------------
# Sub-directory (under the results root) whose CSV files are read and written.
dataset_name = 'CICIDS_2017'

# Suffixes of the per-model result files (e.g. 'Greedy_CV_F1_<model>.csv').
models = ['LR', 'GB', 'NN']

# Row labels looked up in the 'Individual_*' result files.
individual_methods = [
    'chi2',
    'ANOVA',
    'mutualinfo',
    'sfs(gb)',
    'sfs(lr)',
    'im(gb)',
    'im(lr)',
]

# Row labels looked up in the 'Set_*' result files.
set_methods = ['union', 'intersection', 'quorum']

# Output columns: one (size, score, test) triple per stopping criterion, in the
# order the cells below emit them (MaxDelta, MinPerfReq, MaxScore).
columns = [
    'MD_size', 'MD_score', 'MD_test',
    'MPR_size', 'MPR_score', 'MPR_test',
    'MS_size', 'MS_score', 'MS_test',
]

# Relative tolerance handed to MinPerfReq (spelling kept: the name is used below).
tolerence = 0.03
# Per-unit-of-size penalty handed to MaxScore.
factor = 0.03

# Stopping Points

In [4]:
def MaxDelta(score):
    """Return the position of the largest single-step increase in ``score``.

    The step at position ``i`` is ``score[i] - score[i-1]``. Only strictly
    positive steps are considered; among equal largest steps the one at the
    highest position wins (the scan runs from the end and uses ``>``).

    Parameters
    ----------
    score : sequence of numbers (list, numpy array or pandas Series)
        Scores in order; accessed purely by position.

    Returns
    -------
    int
        0-based position of the winning step, or ``len(score) - 1`` when no
        step is strictly positive (which is ``-1`` for an empty input).
    """
    # Work on a plain array so that indexing is always positional. A pandas
    # Series labelled by something other than 0..n-1 would otherwise depend on
    # the deprecated "integer key falls back to position" behaviour.
    values = np.asarray(score)
    n = len(values)

    max_delta = 0
    index = n - 1
    for i in range(n - 1, 0, -1):
        delta = values[i] - values[i - 1]
        if delta > max_delta:
            max_delta = delta
            index = i

    return index

def MinPerfReq(score, tolerence=0.03):
    """Return the earliest position from which every score meets the threshold.

    The threshold is ``score[-1] * (1 - tolerence)``, i.e. the last score
    reduced by a relative tolerance. The scan walks backwards from the end and
    stops at the first score that falls below the threshold; the position just
    after it is returned.

    Parameters
    ----------
    score : sequence of numbers (list, numpy array or pandas Series)
        Scores in order; accessed purely by position. Must be non-empty.
    tolerence : float, default 0.03
        Relative tolerance applied to the last score (spelling kept for
        backward compatibility with existing callers).

    Returns
    -------
    int
        0-based position; ``0`` when no score is below the threshold.

    Raises
    ------
    IndexError
        If ``score`` is empty.
    """
    # Positional access regardless of how a pandas Series is labelled.
    values = np.asarray(score)
    threshold = values[-1] * (1 - tolerence)

    # The last position is the reference itself, so the scan starts one before
    # it; this also guarantees the returned position is always in range.
    # Position 0 is included: previously it was never inspected and an
    # all-passing input wrongly yielded the last position.
    for i in range(len(values) - 2, -1, -1):
        if values[i] < threshold:
            return i + 1

    return 0

def MaxScore(score, size, factor=0.03):
    """Return the position with the highest size-penalised score.

    The penalised score at position ``i`` is ``score[i] - factor * size[i]``.
    Among equal best values the highest position wins (the scan runs from the
    end and uses ``>``).

    Parameters
    ----------
    score : sequence of numbers (list, numpy array or pandas Series)
        Scores in order; accessed purely by position.
    size : sequence of numbers, same length as ``score``
        Size associated with each position.
    factor : float, default 0.03
        Penalty subtracted per unit of size.

    Returns
    -------
    int
        0-based position of the best penalised score, or ``len(score) - 1``
        when there is nothing to compare (``-1`` for an empty input).
    """
    # Positional access regardless of how a pandas Series is labelled.
    values = np.asarray(score)
    sizes = np.asarray(size)
    n = len(values)

    # Start from -inf rather than 0: with a size penalty the adjusted scores
    # can all be non-positive, and the best of them must still be selected.
    best_performance = float('-inf')
    index = n - 1
    # Position 0 is a valid candidate too, so the scan goes all the way down.
    for i in range(n - 1, -1, -1):
        adj_score = values[i] - (factor * sizes[i])
        if adj_score > best_performance:
            best_performance = adj_score
            index = i

    return index

# Greedy

In [5]:
# Compute the three stopping points of the greedy results for every model and
# write one summary CSV per results directory. The same procedure is applied to
# the working results and to the 'Paper' results, in that order.
for results_root in ('../Results/', '../Results/Paper/'):
    stop_df = []
    indices = []
    for model in models:
        # read the results, extract their score parts.
        cv_df = pd.read_csv(os.path.join(results_root, dataset_name, 'Greedy_CV_F1_' + model + '.csv'), index_col=0)
        test_df = pd.read_csv(os.path.join(results_root, dataset_name, 'Greedy_Test_F1_' + model + '.csv'), index_col=0)

        # Each CV cell is a string; parentheses are removed and the first
        # comma-separated field is parsed as the score.
        # Converted to numpy so that every lookup below is positional (integer
        # keys on a label-indexed Series rely on a deprecated fallback).
        cv_score = cv_df.iloc[0].apply(
            lambda x: float(x.translate({ord(ch): None for ch in '()'}).split(',')[0])
        ).to_numpy()
        test_score = test_df.iloc[0].to_numpy()

        # Position i is given size i + 1.
        size = np.arange(1, cv_df.shape[1] + 1)

        stop1 = MaxDelta(cv_score)
        stop2 = MinPerfReq(cv_score, tolerence)
        stop3 = MaxScore(cv_score, size, factor)

        row = []
        for stop in (stop1, stop2, stop3):
            # (size, CV score, test score) at the chosen position.
            row.extend([stop + 1, cv_score[stop], test_score[stop]])
        stop_df.append(row)
        indices.append('greedy_' + model)

    pd.DataFrame(stop_df, index=indices, columns=columns).to_csv(
        os.path.join(results_root, dataset_name, 'Greedy_Stopping_Point_2.csv'))

# Set

In [5]:
# Compute the three stopping points for every (set method, model) pair and
# write them to 'Set_Stopping_Point.csv'.
stop_df = []
indices = []
# One row per '<method>_size' label, read by position alongside the scores.
size_df = pd.read_csv(os.path.join('../Results/', dataset_name, 'Set_Features.csv'), index_col=0)
for model in models:
    # read the results, extract their score parts.
    cv_df = pd.read_csv(os.path.join('../Results/', dataset_name, 'Set_CV_F1_' + model + '.csv'), index_col=0)
    test_df = pd.read_csv(os.path.join('../Results/', dataset_name, 'Set_Test_F1_' + model + '.csv'), index_col=0)

    for method in set_methods:
        # Each CV cell is a string; parentheses are removed and the first
        # comma-separated field is parsed as the score.
        # Converted to numpy so that every lookup below is positional (integer
        # keys on a label-indexed Series rely on a deprecated fallback).
        cv_score = cv_df.loc[method].apply(
            lambda x: float(x.translate({ord(ch): None for ch in '()'}).split(',')[0])
        ).to_numpy()
        test_score = test_df.loc[method].to_numpy()
        size = size_df.loc[method + '_size'].apply(lambda x: float(x)).to_numpy()

        stop1 = MaxDelta(cv_score)
        # Pass the configured values explicitly (as the greedy cell does) so a
        # change to `tolerence` / `factor` is not silently ignored here.
        stop2 = MinPerfReq(cv_score, tolerence)
        stop3 = MaxScore(cv_score, size, factor)

        # NOTE(review): the *_size columns report position + 1, not the value
        # of `size` at that position — confirm this is the intended meaning.
        row = []
        for stop in (stop1, stop2, stop3):
            row.extend([stop + 1, cv_score[stop], test_score[stop]])
        stop_df.append(row)
        indices.append(method + '_' + model)

pd.DataFrame(stop_df, index=indices, columns=columns).to_csv(os.path.join('../Results/', dataset_name, 'Set_Stopping_Point.csv'))
    

In [6]:
# Compute the three stopping points for every (individual method, model) pair
# and write them to 'Individual_Stopping_Point.csv'.
stop_df = []
indices = []
for model in models:
    # read the results, extract their score parts.
    cv_df = pd.read_csv(os.path.join('../Results/', dataset_name, 'Individual_CV_F1_' + model + '.csv'), index_col=0)
    test_df = pd.read_csv(os.path.join('../Results/', dataset_name, 'Individual_Test_F1_' + model + '.csv'), index_col=0)

    # Position i is given size i + 1; identical for every method of this model.
    size = np.arange(1, cv_df.shape[1] + 1)

    for method in individual_methods:
        # Each CV cell is a string; parentheses are removed and the first
        # comma-separated field is parsed as the score.
        # Converted to numpy so that every lookup below is positional (integer
        # keys on a label-indexed Series rely on a deprecated fallback).
        cv_score = cv_df.loc[method].apply(
            lambda x: float(x.translate({ord(ch): None for ch in '()'}).split(',')[0])
        ).to_numpy()
        test_score = test_df.loc[method].to_numpy()

        stop1 = MaxDelta(cv_score)
        # Pass the configured values explicitly (as the greedy cell does) so a
        # change to `tolerence` / `factor` is not silently ignored here.
        stop2 = MinPerfReq(cv_score, tolerence)
        stop3 = MaxScore(cv_score, size, factor)

        row = []
        for stop in (stop1, stop2, stop3):
            # (size, CV score, test score) at the chosen position.
            row.extend([stop + 1, cv_score[stop], test_score[stop]])
        stop_df.append(row)
        indices.append(method + '_' + model)

pd.DataFrame(stop_df, index=indices, columns=columns).to_csv(os.path.join('../Results/', dataset_name, 'Individual_Stopping_Point.csv'))