# Analysis of different Feature Reconstruction mechanisms.

Add needed imports.

In [None]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import threadpoolctl
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer, KNNImputer
import matplotlib as mpl

from intellect.model.torch.model import MlpEncoder
from intellect.model.sklearn.model import EnhancedMlpRegressor
from intellect.io import dump, create_dir
from intellect.inspect import set_seed
from intellect.dataset import ProblemType, portions_from_data

threadpoolctl.threadpool_limits(limits=2);
mpl.rcParams['figure.dpi']= 70
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 20)

In [73]:
N_HIDDEN_LAYERS = 4
N_HIDDEN_UNITS = 128

# training parameters
BATCH_SIZE = 256
MAX_EPOCHS = 100
EPOCHS_WO_IMPROVE = 10

VALIDATION_SIZE = 0.2
BENIGN_LABELS = ["BENIGN"]
DATASET_PORTIONS = (0.6, 0.1, 0.1, 0.2)
KEPT_FEATURES_RATIO = 0.5

# traffic categories that only this client (organization) has
CLIENT_CATEGORIES = ["BENIGN", "DDoS"]

DATASET = "../second_publication_ONGOING/dataset_shrinked.h5"
OUTPUT_DIR = "./reconstruct_output/"

In [10]:
set_seed()
_, _, finetune, test = portions_from_data(DATASET, benign_labels=BENIGN_LABELS, ptype=ProblemType.BINARY, ratios=DATASET_PORTIONS)

In [37]:
set_seed()
filtered_features = np.random.choice(finetune.features, round(finetune.n_features*KEPT_FEATURES_RATIO), replace=False)
finetune_client = finetune.filter_categories(CLIENT_CATEGORIES).balance_categories()
test_client = test.filter_categories(CLIENT_CATEGORIES).balance_categories()
idxes = test_client.filter_features(filtered_features, get_removal_idx=True)

In [8]:
create_dir(OUTPUT_DIR)

In [79]:
def random_forest_regressor():
    set_seed()
    finetune_client_filtered = finetune_client.filter_features(filtered_features, default=0)
    test_client_filtered = test_client.filter_features(filtered_features, default=0.)
    p = RandomForestRegressor()
    p.fit(finetune_client_filtered.X, finetune_client.X)
    dump(p, OUTPUT_DIR + "sklearn_rfr.pkl")
    return mean_squared_error(test_client.X, p.predict(test_client_filtered.X))

def linear_regression():
    set_seed()
    finetune_client_filtered = finetune_client.filter_features(filtered_features, default=0)
    test_client_filtered = test_client.filter_features(filtered_features, default=0.)
    p = LinearRegression()
    p.fit(finetune_client_filtered.X, finetune_client.X)
    dump(p, OUTPUT_DIR + "sklearn_lr.pkl")
    return mean_squared_error(test_client.X, p.predict(test_client_filtered.X))

def sklearn_autoencoder():
    set_seed()
    finetune_client_filtered = finetune_client.filter_features(filtered_features, default=0)
    test_client_filtered = test_client.filter_features(filtered_features, default=0.)
    p = EnhancedMlpRegressor(hidden_layer_sizes=(N_HIDDEN_UNITS,)*N_HIDDEN_LAYERS, max_iter=MAX_EPOCHS, batch_size=BATCH_SIZE,
                             validation_fraction=VALIDATION_SIZE, warm_start=True, shuffle=True)
    p.fit(finetune_client_filtered.X, finetune_client.X)
    dump(p, OUTPUT_DIR + "sklearn_ae.pkl")
    return mean_squared_error(test_client.X, p.predict(test_client_filtered.X))

def torch_autoencoder():
    set_seed()
    finetune_client_filtered = finetune_client.filter_features(filtered_features, default=0)
    test_client_filtered = test_client.filter_features(filtered_features, default=0.)
    
    finetune_client_filtered.y = finetune_client.X

    p: MlpEncoder = MlpEncoder(finetune_client.features, N_HIDDEN_LAYERS)
    history = p.fit(finetune_client_filtered, validation_dataset=VALIDATION_SIZE, batch_size=BATCH_SIZE, max_epochs=MAX_EPOCHS, epochs_wo_improve=EPOCHS_WO_IMPROVE,
          metric=mean_squared_error, higher_better=False, shuffle=True);
    p.save(OUTPUT_DIR + "torch_autoencoder")
    history_df = pd.DataFrame(history)
    dump(history_df, OUTPUT_DIR + "torch_autoencoder_history.csv")
    return mean_squared_error(test_client.X, p.predict(test_client_filtered.X))

def iterative_imputer():
    set_seed()
    finetune_client_filtered = finetune_client.filter_features(filtered_features, default=np.nan)
    test_client_filtered = test_client.filter_features(filtered_features, default=np.nan)
    p = IterativeImputer()
    p.fit(finetune_client_filtered.X, finetune_client.X)
    dump(p, OUTPUT_DIR + "sklearn_ii.pkl")
    out = test_client_filtered.X.to_numpy()
    out[:, idxes] = p.transform(test_client_filtered.X)
    return mean_squared_error(test_client.X, out)

def knn_imputer():
    set_seed()
    finetune_client_filtered = finetune_client.filter_features(filtered_features, default=np.nan)
    test_client_filtered = test_client.filter_features(filtered_features, default=np.nan)
    p = KNNImputer()
    p.fit(finetune_client_filtered.X, finetune_client.X)
    dump(p, OUTPUT_DIR + "sklearn_knni.pkl")
    out = test_client_filtered.X.to_numpy()
    out[:, idxes] = p.transform(test_client_filtered.X)
    return mean_squared_error(test_client.X, out)

In [55]:
knn_imputer()

80306894820406.47

In [57]:
iterative_imputer()

80306894820406.47

In [60]:
random_forest_regressor()

41656774906540.85

In [61]:
linear_regression()

4.450642799241474e+23

In [72]:
sklearn_autoencoder()

488401872233145.75

In [None]:
torch_autoencoder()