In [1]:
import pandas as pd
import numpy as np
import pickle

import seaborn as sns
from matplotlib import pyplot as plt
import boto3

from io import BytesIO, StringIO
from time import perf_counter

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import balanced_accuracy_score, f1_score, roc_auc_score, accuracy_score

In [3]:
# Display settings: show up to 100 rows and never truncate columns.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

In [4]:
# S3 bucket and object key of the pickled, preprocessed DataFrame.
MY_BUCKET = 'sagemaker-studio-8x6b1t9vueh'
file_name = 'df_processed2.pkl'

# Reference to the S3 object; its body is read and unpickled in fetch_and_process_df().
s3 = boto3.resource('s3')
preprocessed_df_ref = s3.Object(MY_BUCKET, file_name)

In [None]:
def fetch_and_process_df():
    """Download the preprocessed DataFrame from S3 and split it into features and target.

    Steps, in order:
      1. Unpickle the body of the S3 object referenced by ``preprocessed_df_ref``.
      2. Label-encode the ``Protocol`` column as uint8.
      3. Cast every column whose dtype name contains ``int32`` to uint8.
      4. One-hot encode ``Protocol`` into uint8 dummy columns.
      5. Label-encode ``Label`` as uint8 and remove it from the feature frame.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the feature DataFrame without the
        ``Label`` column and ``y`` is the uint8-encoded ``Label`` Series.
    """
    # SECURITY: pickle.loads can execute arbitrary code embedded in the payload.
    # Only run this against a bucket/object whose contents are trusted.
    df = pickle.loads(preprocessed_df_ref.get()['Body'].read())

    # Encode Protocol once (the original fitted the encoder twice and discarded
    # the first result).
    le = LabelEncoder()
    df['Protocol'] = le.fit_transform(df['Protocol']).astype('uint8')

    # Converting Flags and Counts to uint8.
    # NOTE(review): astype('uint8') does not range-check; this assumes every
    # int32 column only holds values in 0..255 -- TODO confirm.
    for col in df.columns:
        dtype = str(df[col].dtypes)
        if 'int32' in dtype:
            df[col] = df[col].astype('uint8')

    # Convert to dummy variables. The dtype is pinned to uint8 because newer
    # pandas releases default to bool, and the scaling cells further down use
    # the 'uint8' dtype to decide which columns to leave unscaled.
    df = pd.get_dummies(df, columns=['Protocol'], dtype='uint8')

    # Converting Label to Nominal Label class
    y_label_encoder = LabelEncoder()
    df['Label'] = y_label_encoder.fit_transform(df['Label']).astype('uint8')

    # pop() removes the target column in place and returns it, so no second
    # copy of the frame is created.
    y = df.pop('Label')
    X = df

    return X, y

In [None]:
# Load the feature frame X and the encoded target y (reads the pickle from S3 on every run).
X, y = fetch_and_process_df()

In [14]:
# Seed passed to both train_test_split and LogisticRegression.
random_state = 42
# Value passed as n_jobs to LogisticRegression.
AUTO = -1

In [15]:
# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

Apply `StandardScaler` to every non-`uint8` column, leaving the `uint8` columns (and their dtype) unchanged.

In [None]:
# Fit one StandardScaler per non-uint8 training column and overwrite that
# column with its scaled values. `scalers` stays positionally aligned with
# X_train.columns; uint8 columns get a None placeholder.
scalers = []
for col in X_train.columns:
    if 'uint8' in str(X_train[col].dtypes):
        scalers.append(None)
        continue

    scaler = StandardScaler()
    column_2d = X_train[col].to_numpy().reshape((-1, 1))
    X_train[col] = scaler.fit_transform(column_2d).reshape((-1, ))
    scalers.append(scaler)

In [None]:
# Reuse the scalers fitted on the training columns (never refit on test data);
# columns whose scaler is None are left as they are.
for col, scaler in zip(X_test.columns, scalers):
    if scaler is not None:
        test_column_2d = X_test[col].to_numpy().reshape((-1, 1))
        X_test[col] = scaler.transform(test_column_2d).reshape((-1, ))


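Logistic regression: One vs Rest. Balanced class weights. Stochastic Average Gradient (SAG) solver. At most 1000 iterations. (Note: the cell below sets `multi_class='multinomial'`, not one-vs-rest; confirm which configuration produced the reported results.)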

In [31]:
# Configure the classifier, then time only the fit call.
clf = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    class_weight='balanced',
    max_iter=1000,
    random_state=random_state,
    n_jobs=AUTO,
)

start_training_time = perf_counter()
_ = clf.fit(X_train, y_train)
end_training_time = perf_counter()

In [32]:
# Elapsed time of the clf.fit call, from the perf_counter readings taken around it.
print(f'Time taken to fit the model {end_training_time - start_training_time} seconds')

Time taken to fit the model 1429.3951300529998 seconds


In [23]:
# Hard class predictions for the held-out test split.
y_pred = clf.predict(X_test)

In [25]:
# Balanced accuracy of y_pred against y_test.
acc = balanced_accuracy_score(y_test, y_pred)

In [26]:
# Display the balanced accuracy computed in the previous cell.
acc

0.7133497325291106

In [36]:
# Plain accuracy (accuracy_score) of y_pred against y_test.
acc_unbalanced = accuracy_score(y_test, y_pred)
acc_unbalanced

0.6730538442582553

In [42]:
# Macro-averaged F1 of y_pred against y_test.
# NOTE(review): the variable name is misspelled ("unbalanaced"); left as-is because
# other cells may refer to it.
unbalanaced_f1 = f1_score(y_test, y_pred, average='macro')
unbalanaced_f1

0.5428696572663683

In [43]:
# F1 of y_pred against y_test with average='weighted'.
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
weighted_f1

0.7498501772890271

In [None]:
# Weighted one-vs-one ROC AUC on the test split.
# For multiclass targets roc_auc_score needs per-class probability estimates of
# shape (n_samples, n_classes); hard label predictions such as y_pred are
# rejected. `labels` ties the probability columns to the classifier's class order.
roc = roc_auc_score(
    y_test,
    clf.predict_proba(X_test),
    average='weighted',
    multi_class='ovo',
    labels=clf.classes_,
)
roc

In [34]:
# Second set of test predictions from the current clf. In a top-to-bottom run this
# repeats the y_pred computation; the recorded metrics below differ from the first
# set, so presumably clf was refit between the two runs -- TODO confirm.
y_pred_1 = clf.predict(X_test)

In [38]:
# Balanced accuracy of y_pred_1 against y_test.
acc_1 = balanced_accuracy_score(y_test, y_pred_1)
acc_1

0.7103108954788677

In [41]:
# Plain accuracy (accuracy_score) of y_pred_1 against y_test.
acc_unbalanced_1 = accuracy_score(y_test, y_pred_1)
acc_unbalanced_1

0.2967259117792016

In [44]:
# F1 of y_pred_1 against y_test with average='weighted'.
weighted_f1_1 = f1_score(y_test, y_pred_1, average='weighted')
weighted_f1_1

0.3390841185922816

In [45]:
# Macro-averaged F1 of y_pred_1 against y_test.
unbalanced_f1_1 = f1_score(y_test, y_pred_1, average='macro')
unbalanced_f1_1

0.39015495496356506