We are looking for a model which gives better scores than the logistic regression baseline model. We will search it on the a startified sample of the data.

In [1]:
import pandas as pd
import numpy as np
import pickle

import seaborn as sns
from matplotlib import pyplot as plt
import boto3

from io import BytesIO, StringIO
from time import perf_counter
from copy import deepcopy

In [2]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import balanced_accuracy_score, f1_score, accuracy_score

In [3]:
pd.set_option("display.max_rows", 100, "display.max_columns", None)
pd.options.mode.chained_assignment = None

In [4]:
MY_BUCKET = 'sagemaker-studio-8x6b1t9vueh'
file_name = 'df_processed2.pkl'

s3 = boto3.resource('s3')
preprocessed_df_ref = s3.Object(MY_BUCKET, file_name)

In [5]:
def fetch_and_process_df():
    df = pickle.loads(preprocessed_df_ref.get()['Body'].read())
    le = LabelEncoder()
    val = le.fit_transform(df['Protocol'])
    df['Protocol'] = le.fit_transform(df['Protocol']).astype('uint8')
    # Converting Flags and Counts to uint8
    
    for col in df.columns:
        dtype = str(df[col].dtypes)
        if 'int32' in dtype:
            df[col] = df[col].astype('uint8')
    
    # Convert to dummy variable.
    df = pd.get_dummies(df, columns=['Protocol'])
    
    # Converting Label to Nominal Label class
    y_label_encoder = LabelEncoder()
    df['Label'] = y_label_encoder.fit_transform(df['Label']).astype('uint8')
    
    y = df['Label']
    df.drop(columns=['Label'], inplace=True)
    X = df
    del df
    
    return X, y

In [6]:
X, y = fetch_and_process_df()

In [7]:
random_state = 42

Stratified Sampling on 20% of the data.

In [13]:
X, _, y, _ = train_test_split(X, y, train_size=0.2, stratify=y, random_state=random_state)

In [8]:
kfold = StratifiedKFold(n_splits=5)

In [9]:
def apply_scalars_to_training_data(X_train):
    scalers = []
    for col in X_train.columns:
        x = X_train[col]
        if 'uint8' not in str(x.dtypes):
            scaler = StandardScaler()
            x = x.to_numpy()
            x = scaler.fit_transform(x.reshape((-1, 1)))
            X_train[col] = x.reshape((-1, ))
            scalers.append(scaler)
        else:
            scalers.append(None)
            
    return scalers

In [10]:
def apply_scalars_to_validation_data(X_test, scalers):
    for scaler, col in zip(scalers, X_test.columns):
        if scaler is None:
            continue
    
        x = X_test[col].to_numpy()
        x = scaler.transform(x.reshape((-1, 1)))
        X_test[col] = x.reshape((-1, ))

In [11]:
def validate_model(clf, X, y, save_models=True):
    metrics = []
    
    print(f'Checking for {clf}')
    
    for idx, (train_index, val_index) in enumerate(kfold.split(X, y)):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        scales = apply_scalars_to_training_data(X_train)
        
        start_time = perf_counter()
        clf.fit(X_train.values, y_train.to_numpy())
        fit_time = perf_counter()
        
        print(f'Fiting time {fit_time - start_time:.2f} seconds')
        
        X_valid, y_valid = X.iloc[val_index], y.iloc[val_index]
        apply_scalars_to_validation_data(X_valid, scales)
        
        start_time = perf_counter()
        y_pred = clf.predict(X_valid.values)
        pred_time = perf_counter()
        
        print(f'Eval time {pred_time - start_time:.2f} seconds')
        
        metric = {'clf': deepcopy(clf)} if save_models else {}
        
        metrics.append({
            **metric,
            'balanced_accuracy': balanced_accuracy_score(y_valid, y_pred),
            'weighted_f1': f1_score(y_valid, y_pred, average='weighted'),
            'macro_f1': f1_score(y_valid, y_pred, average='macro'),
            'accuracy': accuracy_score(y_valid, y_pred)
        })
        
        print(f'idx{idx} done')

    return metrics


## Using ml.m5.4xlarge (16 vCPU + 64 GiB) for faster performance.

In [14]:
AUTO = -1

In [22]:
models = [
    BaggingClassifier(n_estimators=100,  random_state=random_state, n_jobs=AUTO),
    RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=random_state,
                          n_jobs=AUTO),
    ExtraTreesClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=random_state, 
                         n_jobs=AUTO)
]

In [23]:
metrics = {}

In [24]:
for idx, model in enumerate(models):
    metrics[idx] = validate_model(model, X, y)

Checking for BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=100,
                  n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                  warm_start=False)
Fiting time 1407.28 seconds
Eval time 6.80 seconds
idx0 done
Fiting time 1542.04 seconds
Eval time 7.09 seconds
idx1 done
Fiting time 1538.99 seconds
Eval time 7.23 seconds
idx2 done
Fiting time 1302.25 seconds
Eval time 7.26 seconds
idx3 done
Fiting time 1320.73 seconds
Eval time 7.31 seconds
idx4 done
Checking for RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fract

In [26]:
metrics

{0: [{'balanced_accuracy': 0.8635690256907828,
   'weighted_f1': 0.9796226098877924,
   'macro_f1': 0.8667910456986466,
   'accuracy': 0.9830288935436176,
   'clf': BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                     max_features=1.0, max_samples=1.0, n_estimators=100,
                     n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                     warm_start=False)},
  {'balanced_accuracy': 0.862452733601823,
   'weighted_f1': 0.9797398627863457,
   'macro_f1': 0.8625041601673554,
   'accuracy': 0.9830766387382948,
   'clf': BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                     max_features=1.0, max_samples=1.0, n_estimators=100,
                     n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                     warm_start=False)},
  {'balanced_accuracy': 0.8516157635451065,
   'weighted_f1': 0.9797081176838656,
   'macro_f1': 0.8595215602426713,
   'accurac

In [None]:
for name, model in zip(['Bagging Classifier', 'Random Forest Classifier', 'Extra Trees Classifier'], metrics):
    print(f'Model {name}')
    balanced_acc = [metric['balanced_accuracy'] for metric in metrics[model]]
    weighted_f1 = [metric['weighted_f1'] for metric in metrics[model]]
    macro_f1 = [metric['macro_f1'] for metric in metrics[model]]
    accuracy = [metric['accuracy'] for metric in metrics[model]]
    
    print(f'Average Balanced Accuracy {sum(balanced_acc) / len(balanced_acc)}')
    print(f'Average Weighted F1 {sum(weighted_f1) / len(weighted_f1)}')
    print(f'Average Macro F1 {sum(macro_f1) / len(macro_f1)}')
    print(f'Average accuracy F1 {sum(accuracy) / len(accuracy)}')
    

Training time order: Extra Trees (1 min 30 secs) < Random Forest (2min 20 secs) << Bagging Classifier (23+ min)

Average Balanced Accuracy = 0.86031338305177554 (Bagging Classifier)

Average Balanced Accuracy = 0.873387004496311332 (Random Forest)

Average Balanced Accuracy = 0.871938320498389 (Extra Trees)

Average Weighted F1 Score = 0.97970102187353544 (Bagging Classifier)

Average Weighted F1 Score = 0.95568727904385798 (Random Forest)

Average Weighted F1 Score = 0.95753179955453514 (Extra Trees)

#### Random Forest had the highest balanced accuracy followed by Extra Trees Classifier. But the Extra Trees Classifier had a higher Weighted F1 Score and a lower training time. Therefore we go ahead with the Extra Trees Classifer.

Saving to Bucket

In [None]:
metrics_filename = 'metrics.pkl'
metrics_ref = s3.Object(MY_BUCKET, metrics_filename)
_ = metrics_ref(body=pickle.dumps(metrics))

In [27]:
X, y = fetch_and_process_df()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [15]:
models = [
    ExtraTreesClassifier(n_estimators=50, class_weight='balanced_subsample', random_state=random_state, 
                         n_jobs=AUTO),
    ExtraTreesClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=random_state, 
                         n_jobs=AUTO),
    ExtraTreesClassifier(n_estimators=200, class_weight='balanced_subsample', random_state=random_state, 
                         n_jobs=AUTO)
]

In [16]:
metrics = {}

In [17]:
for idx, model in enumerate(models):
    metrics[idx] = validate_model(model, X_train, y_train, save_models=False)

Checking for ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                     class_weight='balanced_subsample', criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     max_samples=None, min_impurity_decrease=0.0,
                     min_impurity_split=None, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=50, n_jobs=-1, oob_score=False,
                     random_state=42, verbose=0, warm_start=False)
Fiting time 226.11 seconds
Eval time 9.86 seconds
idx0 done
Fiting time 225.51 seconds
Eval time 10.16 seconds
idx1 done
Fiting time 226.92 seconds
Eval time 9.65 seconds
idx2 done
Fiting time 223.63 seconds
Eval time 9.97 seconds
idx3 done
Fiting time 225.51 seconds
Eval time 9.76 seconds
idx4 done
Checking for ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                     class_weight='balanced_subsample', criterion='gini',
   

Checking metrics on test set.

In [18]:
for model in models:
    y_pred = model.predict(X_test)
    
    metric = {
        'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
        'weighted_f1': f1_score(y_test, y_pred, average='weighted'),
        'macro_f1': f1_score(y_test, y_pred, average='macro'),
        'accuracy': accuracy_score(y_test, y_pred)
    }
    
    print(metric)

{'balanced_accuracy': 0.08333333333333333, 'weighted_f1': 0.7539450784882845, 'macro_f1': 0.07562917634907472, 'accuracy': 0.8307475973385905}
{'balanced_accuracy': 0.08333333333333333, 'weighted_f1': 0.7539450784882845, 'macro_f1': 0.07562917634907472, 'accuracy': 0.8307475973385905}
{'balanced_accuracy': 0.08333333333333333, 'weighted_f1': 0.7539450784882845, 'macro_f1': 0.07562917634907472, 'accuracy': 0.8307475973385905}


Checking metrics on entire set.

In [None]:
for model in models:
    y_pred = model.predict(X)
    
    metric = {
        'balanced_accuracy': balanced_accuracy_score(y, y_pred),
        'weighted_f1': f1_score(y, y_pred, average='weighted'),
        'macro_f1': f1_score(y, y_pred, average='macro'),
        'accuracy': accuracy_score(y, y_pred)
    }
    
    print(metric)
    

We achieved the same metrics for all the models on the entire dataset.