In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from tqdm import trange
import warnings

from sklearn.feature_selection import SelectKBest, mutual_info_classif, SelectFromModel, SequentialFeatureSelector, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearnex import patch_sklearn, unpatch_sklearn

# Estimators below are created with `n_jobs=n_cpu-1` (one core left free).
# os.cpu_count() may return None, and a single-core machine would produce
# n_jobs=0, which is not a valid worker count, so clamp the value to >= 2.
n_cpu = max(os.cpu_count() or 2, 2)
# Shared random seed for every stochastic step in the notebook.
seed = 24

# Silence all library warnings to keep the cell outputs readable.
warnings.filterwarnings('ignore')

# Swap in the sklearnex implementations of scikit-learn estimators.
# NOTE(review): the sklearnex documentation asks for patch_sklearn() to be
# called before the scikit-learn estimators are imported; here it runs after
# the imports above -- confirm the patch actually takes effect.
patch_sklearn()

# Load Data and Preprocessing

In [None]:
# Read the labelled training split (path is relative to the working directory).
train_raw = pd.read_csv('./Data/train.csv')
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
train_raw.info()
# Read the held-out test split.
test_raw = pd.read_csv('./Data/test.csv')

In [None]:
# Names of the training columns that hold exactly one distinct (non-null)
# value; these are the candidates for removal.
col_drop = [
    name
    for name in train_raw.columns
    if len(train_raw[name].value_counts()) == 1
]

print(col_drop)

In [None]:
# Split predictors from the label. Only numeric predictors are kept; 'class'
# is the target and 'num_outbound_cmds' is excluded from the feature set.
label_map = {'normal': 1, 'anomaly': 0}
drop_cols = ['class', 'num_outbound_cmds']
train_X = train_raw.drop(drop_cols, axis=1).select_dtypes(include='number')
train_Y = train_raw['class'].map(label_map)
test_X = test_raw.drop(drop_cols, axis=1).select_dtypes(include='number')
test_Y = test_raw['class'].map(label_map)

# Min-max scaling statistics are taken from the training split only and then
# applied to both splits, so no information from the test set leaks in.
train_min = train_X.min(axis=0)
train_range = train_X.max(axis=0) - train_min
# A column that is constant in the training split has a zero range; dividing
# by it would yield NaN/inf. Use 1 instead so such a column scales to 0.
train_range = train_range.replace(0, 1)

# Test values outside the range seen during training are clipped into [0, 1].
test_X = ((test_X - train_min) / train_range).clip(0, 1)
train_X = (train_X - train_min) / train_range
print(train_X.shape, test_X.shape)

In [None]:
# Visualise the pairwise correlation between the scaled training features.
sns.heatmap(data=train_X.corr())

# Feature Selection

## Correlation-Based

In [None]:
# Greedy redundancy filter: features whose correlation with an anchor feature
# reaches CORR_THRESHOLD form one group, and a random forest picks the single
# most important member of each group.
CORR_THRESHOLD = 0.8
correlation_matrix = train_X.corr()
model = RandomForestClassifier(n_estimators=200, n_jobs=n_cpu-1, random_state=seed)
selected_features = []

original_features = train_X.columns.tolist()
print(original_features)
start_time = time.time()
while len(original_features) > 0:
    anchor = original_features[0]
    is_correlated = (correlation_matrix.loc[:, anchor] >= CORR_THRESHOLD).to_numpy()
    # Only features that are still unassigned may join the group. Without this
    # filter a feature that was already selected or discarded in an earlier
    # group could be picked again and end up duplicated in selected_features.
    indices = [col for col in correlation_matrix.index[is_correlated] if col in original_features]
    # A NaN self-correlation (e.g. a constant column) would leave the anchor
    # out of its own group; keep it in so the loop always makes progress.
    if anchor not in indices:
        indices.insert(0, anchor)
    print(indices)
    if len(indices) == 1:
        # No remaining feature is redundant with the anchor: keep it as is.
        selected_features.append(anchor)
        original_features.remove(anchor)
        continue

    # threshold=-inf together with max_features=1 keeps exactly the
    # top-importance feature of the group.
    selector = SelectFromModel(model, threshold=-np.inf, max_features=1)
    selector.fit(train_X[indices], train_Y)
    selected_features.append([b for a, b in zip(selector.get_support(), indices) if a][0])
    original_features = [col for col in original_features if col not in indices]

print(f'time cost: {time.time()-start_time}')
print(f'selected features: {selected_features}')

# From here on both splits only carry the de-correlated feature subset.
train_X = train_X[selected_features]
test_X = test_X[selected_features]
print(train_X.shape, test_X.shape)

## Individual Feature Ranking

In [None]:
# Build four rankings of the features, each ordered from most to least useful,
# so that `ranking[:k]` is always the top-k subset for that method.
model = RandomForestClassifier(n_estimators=200, n_jobs=n_cpu-1, random_state=seed)

# 1) Univariate: mutual information between each feature and the label.
# The estimator is stochastic, so it is seeded for reproducibility, and the
# scores are sorted in descending order (argsort alone is ascending).
mi_scores = mutual_info_classif(train_X, train_Y, random_state=seed)
sorted_index = np.argsort(mi_scores)[::-1]
mi_features = train_X.columns[sorted_index].tolist()
print(mi_features)

kf = StratifiedKFold(shuffle=True, random_state=seed)

# 2) Sequential forward selection, recorded one feature per round: each round
# picks the single best feature among those not chosen yet, as scored by the
# selector on the remaining columns only.
selector = SequentialFeatureSelector(model, n_features_to_select=1, scoring='f1', cv=kf, n_jobs=n_cpu-1)
sfs_features = []
for i in trange(train_X.shape[1]-1):
    train_X2 = train_X.drop(sfs_features, axis=1)
    selector.fit(train_X2, train_Y)
    f = train_X2.columns[selector.get_support()][0]
    sfs_features.append(f)

# The selector needs at least two candidates, so the last remaining feature is
# appended by hand.
sfs_features.append(train_X.columns.drop(sfs_features)[0])
print(sfs_features)

del selector

# 3) Recursive feature elimination: ranking_ is 1 for the last survivor, so an
# ascending sort already lists the best feature first.
selector = RFE(model, n_features_to_select=1)
selector.fit(train_X, train_Y)
sorted_index = np.argsort(selector.ranking_)
rfe_features = train_X.columns[sorted_index].tolist()
print(rfe_features)

del selector

# 4) Random-forest importance: fit the forest on all features and sort the
# importances in descending order. (The final estimator inside an RFE reduced
# to one feature only exposes a single importance value, so it cannot be used
# to rank every column.)
model.fit(train_X, train_Y)
sorted_index = np.argsort(model.feature_importances_)[::-1]
imp_features = train_X.columns[sorted_index].tolist()
print(imp_features)

# Cross Validation / Test

In [None]:
def get_scores(model, model_name):
    """Evaluate `model` on growing top-k feature subsets of each ranking.

    For every ranking (mi_features, sfs_features, rfe_features, imp_features)
    and every k from 1 to the number of columns in train_X, the model is
    scored with cross-validated F1 on the training split (folds from `kf`) and
    with F1 on the test split after refitting on the full training split.
    One subplot per ranking shows both curves.

    Parameters
    ----------
    model : estimator
        Classifier exposing fit/predict; it is refitted in place.
    model_name : str
        Suffix used in the names of the returned columns.

    Returns
    -------
    pandas.DataFrame
        Columns `cv_score_<ranking>_<model_name>` and
        `test_score_<ranking>_<model_name>`, one row per subset size.
    """
    rankings = [mi_features, sfs_features, rfe_features, imp_features]
    ranking_names = ['Univariate', 'SFS', 'RFE', 'Importance']

    score_all = pd.DataFrame()
    # Use the explicit axes grid: pyplot's subplot index is 1-based, so
    # addressing panels by a 0-based enumerate counter raises on the first one.
    fig, axes = plt.subplots(2, 2)
    for ax, feature_set, name in zip(axes.flat, rankings, ranking_names):
        cv_score = []
        test_score = []
        for i in trange(train_X.shape[1]):
            cols = feature_set[:i+1]
            train_X2 = train_X[cols].copy()
            cv = cross_val_score(model, train_X2, train_Y, scoring='f1', cv=kf)
            cv_score.append(cv.mean())

            # Refit on the whole training split before scoring the test split.
            model.fit(train_X2, train_Y)
            predict = model.predict(test_X[cols])
            test_score.append(f1_score(test_Y, predict))
        score_all[f'cv_score_{name}_{model_name}'] = cv_score
        score_all[f'test_score_{name}_{model_name}'] = test_score

        n_features = range(1, len(cv_score) + 1)
        ax.set_title(name)
        ax.plot(n_features, cv_score, color='blue', linestyle='-', label='CV Score')
        ax.plot(n_features, test_score, color='red', linestyle='-', label='Test Score')
        ax.set_xlabel('Number of features')
        ax.set_ylabel('F1 score')
        ax.legend()
    fig.tight_layout()
    plt.show()

    return score_all

In [None]:
# Logistic regression baseline. `n_jobs` is not passed: the liblinear solver
# ignores it, so supplying it only triggers a warning.
model = LogisticRegression(C=100, solver='liblinear', random_state=seed)
score_all_LR = get_scores(model, 'LR')

In [None]:
# Gradient boosting run: same evaluation protocol, columns suffixed with 'GB'.
model = GradientBoostingClassifier(random_state=seed, n_estimators=200)
score_all_GB = get_scores(model=model, model_name='GB')

In [None]:
import tensorflow as tf
import tensorflow.keras as keras

def create_model(input_shape):
    """Build and compile the binary-classification network.

    The network stacks four Dense(50, relu) layers, each followed by
    Dropout(0.2), and ends in a single sigmoid unit. It is compiled with the
    Adam optimizer and binary cross-entropy loss.

    Parameters
    ----------
    input_shape : int
        Number of input features (length of one sample vector).

    Returns
    -------
    keras.Sequential
        The compiled, untrained model.
    """
    hidden_units = 50
    drop_rate = 0.2

    network = keras.Sequential()
    # Only the first hidden layer declares the input dimensionality.
    network.add(keras.layers.Dense(hidden_units, activation='relu', input_shape=(input_shape,)))
    network.add(keras.layers.Dropout(drop_rate))
    # Three further identical hidden blocks.
    for _ in range(3):
        network.add(keras.layers.Dense(hidden_units, activation='relu'))
        network.add(keras.layers.Dropout(drop_rate))
    network.add(keras.layers.Dense(1, activation='sigmoid'))

    network.compile(optimizer=keras.optimizers.Adam(), loss=keras.losses.binary_crossentropy)
    return network

In [None]:
tf.random.set_seed(seed)

kf = StratifiedKFold(shuffle=True, random_state=seed)


def _fit_predict_nn(x_fit, y_fit, x_eval):
    """Train a fresh network on (x_fit, y_fit) and return 0/1 labels for x_eval."""
    # A new model per call guarantees that no weights learned on one fold (or
    # during cross-validation) carry over into another fit.
    network = create_model(x_fit.shape[1])
    network.fit(x_fit, y_fit, epochs=15, batch_size=100, verbose=0)
    probabilities = network.predict(x_eval, verbose=0)
    # Sigmoid output -> hard label with a 0.5 decision threshold.
    return np.where(probabilities < 0.5, 0, 1).ravel()


# Same protocol as for the scikit-learn models: for every ranking and every
# top-k subset, record the cross-validated F1 on the training split and the F1
# on the test split after training on the full training split.
feature_rankings = [mi_features, sfs_features, rfe_features, imp_features]
ranking_names = ['Univariate', 'SFS', 'RFE', 'Importance']

score_all_NN = pd.DataFrame()
fig, axes = plt.subplots(2, 2)
for ax, feature_set, name in zip(axes.flat, feature_rankings, ranking_names):
    cv_score = []
    test_score = []
    # The subset size has its own loop variable so it cannot clobber the
    # per-ranking iteration state.
    for n_selected in trange(1, train_X.shape[1] + 1):
        cols = feature_set[:n_selected]
        train_X2 = train_X[cols].values
        test_X2 = test_X[cols].values
        train_labels = train_Y.values

        cv = []
        for train_index, valid_index in kf.split(train_X2, train_labels):
            # Score each fold on its own held-out part of the training split,
            # never on the external test set.
            fold_predict = _fit_predict_nn(train_X2[train_index], train_labels[train_index],
                                           train_X2[valid_index])
            cv.append(f1_score(train_labels[valid_index], fold_predict))
        cv_score.append(np.mean(cv))

        predict = _fit_predict_nn(train_X2, train_labels, test_X2)
        test_score.append(f1_score(test_Y, predict))

    score_all_NN[f'cv_score_{name}_NN'] = cv_score
    score_all_NN[f'test_score_{name}_NN'] = test_score

    n_features = range(1, len(cv_score) + 1)
    ax.set_title(name)
    ax.plot(n_features, cv_score, color='blue', linestyle='-', label='CV Score')
    ax.plot(n_features, test_score, color='red', linestyle='-', label='Test Score')
    ax.set_xlabel('Number of features')
    ax.set_ylabel('F1 score')
    ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side table of every CV/test score column from the three model families.
results = pd.concat(objs=[score_all_LR, score_all_GB, score_all_NN], axis='columns')