This notebook conduct dataset cleaning on CICIDS2017 in the following aspect:
* remove space before each feature names
* assign 0 for non-negative features when there is negative values
* drop zero variance columns (only have 1 unique value)
* remove inf, -inf, nan, and duplicate rows
* drop columns with identical values

However, two recent papers [1](#first), [2](#second) have been discussing that CIC-IDS-2017 and CSE-CIC-IDS-2018 have many errors throughout the dataset creation lifecycle, such as in attack orchestration, feature generation, documentation, and labeling.
* Attack orchestration errors: These are errors that occur during the execution of the attack scenarios on the network.
* Feature generation errors: These are errors that occur during the extraction of features from the network traffic using CICFlowMeter.
* Documentation errors: These are errors that occur during the description and explanation of the datasets and their components. 
* Labeling errors: These are errors that occur during the assignment of labels to the network flows based on their class or category.

[1](#first) have published their improved version of these two datasets [here](https://intrusion-detection.distrinet-research.be/CNS2022/Dataset_Download.html).

In [8]:
import os
import json
import numpy as np
import pandas as pd
from pprint import pprint
from itertools import combinations, product

#sklearn
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, MinMaxScaler
from sklearn.metrics import precision_recall_curve, auc, roc_curve, recall_score, precision_score, f1_score

#graph
import seaborn as sns
import matplotlib.pyplot as plt

# Original CICIDS2017

In [9]:
dataset_csv_path = '/Users/shreenidhishetty/Documents/ECC Project/MachineLearningCVE'
csv_file_names = ['Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 
                  'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 
                  'Friday-WorkingHours-Morning.pcap_ISCX.csv', 
                  'Monday-WorkingHours.pcap_ISCX.csv', 
                  'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 
                  'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 
                  'Tuesday-WorkingHours.pcap_ISCX.csv', 
                  'Wednesday-workingHours.pcap_ISCX.csv']

complete_paths = []
for csv_file_name in csv_file_names:
    complete_paths.append(os.path.join(dataset_csv_path, csv_file_name))

df = pd.concat(map(pd.read_csv, complete_paths), 
               ignore_index = True)

In [3]:
def clean_df(df):
    # Remove the space before each feature names
    df.columns = df.columns.str.strip()
    print('dataset shape', df.shape)

    # This set of feature should have >= 0 values
    num = df._get_numeric_data()
    num[num < 0] = 0

    zero_variance_cols = []
    for col in df.columns:
        if len(df[col].unique()) == 1:
            zero_variance_cols.append(col)
    df.drop(zero_variance_cols, axis = 1, inplace = True)
    print('zero variance columns', zero_variance_cols, 'dropped')
    print('shape after removing zero variance columns:', df.shape)

    df.replace([np.inf, -np.inf], np.nan, inplace = True)
    print(df.isna().any(axis = 1).sum(), 'rows dropped')
    df.dropna(inplace = True)
    print('shape after removing nan:', df.shape)

    # Drop duplicate rows
    df.drop_duplicates(inplace = True)
    print('shape after dropping duplicates:', df.shape)

    column_pairs = [(i, j) for i, j in combinations(df, 2) if df[i].equals(df[j])]
    ide_cols = []
    for column_pair in column_pairs:
        ide_cols.append(column_pair[1])
    df.drop(ide_cols, axis = 1, inplace = True)
    print('columns which have identical values', column_pairs, 'dropped')
    print('shape after removing identical value columns:', df.shape)
    return df
df = clean_df(df)

dataset shape (2830743, 79)
zero variance columns ['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate'] dropped
shape after removing zero variance columns: (2830743, 71)
2867 rows dropped
shape after removing nan: (2827876, 71)
shape after dropping duplicates: (2520798, 71)
columns which have identical values [('Total Fwd Packets', 'Subflow Fwd Packets'), ('Total Backward Packets', 'Subflow Bwd Packets'), ('Fwd PSH Flags', 'SYN Flag Count'), ('Fwd URG Flags', 'CWE Flag Count'), ('Fwd Header Length', 'Fwd Header Length.1')] dropped
shape after removing identical value columns: (2520798, 66)


In [4]:
df['Label'].value_counts()

BENIGN                        2095057
DoS Hulk                       172846
DDoS                           128014
PortScan                        90694
DoS GoldenEye                   10286
FTP-Patator                      5931
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1948
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: Label, dtype: int64

# Improved CICIDS2017

In [10]:
dataset_csv_path = '/Users/shreenidhishetty/Documents/ECC Project/CICIDS2017_improved'
csv_file_names = ['monday.csv', 
                  'tuesday.csv', 
                  'wednesday.csv', 
                  'thursday.csv', 
                  'friday.csv']

complete_paths = []
for csv_file_name in csv_file_names:
    complete_paths.append(os.path.join(dataset_csv_path, csv_file_name))

improved_df = pd.concat(map(pd.read_csv, complete_paths), 
                        ignore_index = True)

In [11]:
dropping_cols = ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 
                 'Dst Port', 'Timestamp']
improved_df = clean_df(improved_df)
improved_df.drop(dropping_cols, axis = 1, inplace = True)
improved_df['Label'].value_counts()

dataset shape (2099976, 91)
zero variance columns [] dropped
shape after removing zero variance columns: (2099976, 91)
5 rows dropped
shape after removing nan: (2099971, 91)
shape after dropping duplicates: (2099971, 91)
columns which have identical values [] dropped
shape after removing identical value columns: (2099971, 91)


BENIGN                                    1582561
Portscan                                   159066
DoS Hulk                                   158468
DDoS                                        95144
Infiltration - Portscan                     71767
DoS GoldenEye                                7567
Botnet - Attempted                           4067
FTP-Patator                                  3972
DoS Slowloris                                3859
DoS Slowhttptest - Attempted                 3368
SSH-Patator                                  2961
DoS Slowloris - Attempted                    1847
DoS Slowhttptest                             1740
Web Attack - Brute Force - Attempted         1292
Botnet                                        736
Web Attack - XSS - Attempted                  655
DoS Hulk - Attempted                          581
DoS GoldenEye - Attempted                      80
Web Attack - Brute Force                       73
Infiltration - Attempted                       45


In [None]:
improved_df['Label'].value_counts()

In [None]:
improved_df['Attempted Category'].value_counts()

Although the flows of attempted attacks have malicious intents,they don't have anomaly effect on the system due to the following reasons [3](#third):
* Category 0 - No payload sent by attacker
* Category 1 - Port/System closed
* Category 2 - Attack Startup/Teardown Artefact
* Category 3 - No malicious payload
* Category 4 - Attack Artefact
* Category 5 - Attack Implemented Incorrectly
* Category 6 - Target System Unresponsive

For these kinds of attempted labels, we have a choice to treat them as benign or malicious.

In most cases of network intrusion detection, we process each flow separately, so the classifier can only see only flow at a time and doesn't know the context.

As a result, they are re-labeled as BENIGN [4](#fourth).

In [None]:
attepmted_labels = [s for s in improved_df['Label'].unique() if 'Attempted' in s]

improved_df.drop(['Attempted Category'], axis = 1, inplace = True)

improved_df.replace(attepmted_labels, 'BENIGN', inplace = True)
improved_df['Label'].value_counts()

# 2D-PCA visualization

In [None]:
# https://stackoverflow.com/questions/43640952/how-to-subsample-a-pandas-dataframe-respecting-the-frequency-of-each-class
subsample_df = df.groupby('Label').apply(pd.DataFrame.sample, frac = 0.1).reset_index(drop = True)

X = subsample_df.drop(['Label'], axis = 1)
y = subsample_df['Label']

pca = PCA(n_components = 2, random_state = 0)
z = pca.fit_transform(X) 

pca_15_df = pd.DataFrame()
pca_15_df['Label'] = y
pca_15_df['dimension 1'] = z[:, 0]
pca_15_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = pca_15_df.Label,
                palette = sns.color_palette('deep', len(pca_15_df.Label.value_counts())),
                data = pca_15_df).set(title = 'CICIDS2017 15 Classes PCA Projection')
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

pca_2_df = pd.DataFrame()
pca_2_df['Label'] = y
pca_2_df.loc[pca_2_df.Label != 'BENIGN', 'Label'] = 'ATTACK'
pca_2_df['dimension 1'] = z[:, 0]
pca_2_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = pca_2_df.Label,
                palette = sns.color_palette('deep', 2),
                data = pca_2_df).set(title = 'CICIDS2017 Binary Classes PCA Projection') 
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

In [None]:
subsample_improved_df = improved_df.groupby('Label').apply(pd.DataFrame.sample, frac = 0.1).reset_index(drop = True)

X = subsample_improved_df.drop(['Label'], axis = 1)
y = subsample_improved_df['Label']

pca = PCA(n_components = 2, random_state = 0)
z = pca.fit_transform(X) 

pca_15_df = pd.DataFrame()
pca_15_df['Label'] = y
pca_15_df['dimension 1'] = z[:, 0]
pca_15_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = pca_15_df.Label,
                palette = sns.color_palette('deep', len(pca_15_df.Label.value_counts())),
                data = pca_15_df).set(title = 'Improved CICIDS2017 15 Classes PCA Projection')
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

pca_2_df = pd.DataFrame()
pca_2_df['Label'] = y
pca_2_df.loc[pca_2_df.Label != 'BENIGN', 'Label'] = 'ATTACK'
pca_2_df['dimension 1'] = z[:, 0]
pca_2_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = pca_2_df.Label,
                palette = sns.color_palette('deep', 2),
                data = pca_2_df).set(title = 'Improved CICIDS2017 Binary Classes PCA Projection') 
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

# 2D-TSNE visualization

In [None]:
%%time
X = subsample_df.drop(['Label'], axis = 1)
y = subsample_df['Label']

tsne = TSNE(n_components = 2, n_jobs = -1, verbose = 0, random_state = 0)
z = tsne.fit_transform(X) 

tsne_15_df = pd.DataFrame()
tsne_15_df['Label'] = y
tsne_15_df['dimension 1'] = z[:, 0]
tsne_15_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = tsne_15_df.Label,
                palette = sns.color_palette('hls', len(tsne_15_df.Label.value_counts())),
                data = tsne_15_df).set(title = 'CICIDS2017 15 Classes T-SNE Projection')
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

tsne_2_df = pd.DataFrame()
tsne_2_df['Label'] = y
tsne_2_df.loc[tsne_2_df.Label != 'BENIGN', 'Label'] = 'ATTACK'
tsne_2_df['dimension 1'] = z[:, 0]
tsne_2_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = tsne_2_df.Label,
                palette = sns.color_palette('hls', 2),
                data = tsne_2_df).set(title = 'CICIDS2017 Binary Classes T-SNE Projection') 
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

In [None]:
%%time
X = subsample_improved_df.drop(['Label'], axis = 1)
y = subsample_improved_df['Label']

tsne = TSNE(n_components = 2, n_jobs = -1, verbose = 0, random_state = 0)
z = tsne.fit_transform(X) 

tsne_15_df = pd.DataFrame()
tsne_15_df['Label'] = y
tsne_15_df['dimension 1'] = z[:, 0]
tsne_15_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = tsne_15_df.Label,
                palette = sns.color_palette('deep', len(tsne_15_df.Label.value_counts())),
                data = tsne_15_df).set(title = 'Improved CICIDS2017 15 Classes T-SNE Projection')
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

tsne_2_df = pd.DataFrame()
tsne_2_df['Label'] = y
tsne_2_df.loc[tsne_2_df.Label != 'BENIGN', 'Label'] = 'ATTACK'
tsne_2_df['dimension 1'] = z[:, 0]
tsne_2_df['dimension 2'] = z[:, 1]

sns.scatterplot(x = 'dimension 1', y = 'dimension 2', 
                hue = tsne_2_df.Label,
                palette = sns.color_palette('deep', 2),
                data = tsne_2_df).set(title = 'Improved CICIDS2017 Binary Classes T-SNE Projection') 
plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5)) 
plt.show()

# Train, validation, and test split [6](#sixth)

In [None]:
all_malicious = improved_df[improved_df.Label != 'BENIGN']
all_benign = improved_df[improved_df.Label == 'BENIGN']
benign_1M = all_benign.sample(n = 1000000, random_state = 0)

train_size = 500000
test_size = 500000
validation_perc = 0.15

# benign trainin and testing
Y = benign_1M['Label'].map(lambda x: 1 if (x == 'BENIGN') else -1)
labels = benign_1M['Label']

X_train, X_test, y_train, y_test = train_test_split(benign_1M.drop(columns = ['Label']),
                                                    Y,
                                                    train_size = train_size, 
                                                    test_size = test_size, 
                                                    shuffle = True, 
                                                    stratify = Y, 
                                                    random_state = 0)
train_labels, test_labels = labels.loc[y_train.index], labels.loc[y_test.index]

# validation and testing
X_mal = all_malicious.drop(columns = ['Label'])
y_mal = all_malicious['Label'].map(lambda x: 1 if (x == 'Benign') else -1)
labels_mal = all_malicious['Label']

X_test = pd.concat([X_test, X_mal])
y_test = pd.concat([y_test, y_mal])
test_labels = pd.concat([test_labels, labels_mal])

X_val, X_t, y_val, y_t, label_val, label_t = train_test_split(X_test, 
                                                              y_test, 
                                                              test_labels, 
                                                              train_size = validation_perc, 
                                                              random_state = 0, 
                                                              stratify = test_labels, 
                                                              shuffle = True)

print("***** Train Data *****")
print(train_labels.value_counts())
print("***** Validation Data *****")
print(label_val.value_counts())
print("***** Test Data *****")
print(label_t.value_counts())

# Anomaly score and result evaluation [6](#sixth)

In [None]:
def anomaly_scores(original, transformed):
    sse = np.sum((original - transformed) ** 2, axis = 1) # sum of squared errors
    return sse

def evaluate_results(y_true, score):
    precision, recall, threshold = precision_recall_curve(y_true, score, pos_label = -1)
    au_precision_recall = auc(recall, precision)
    results = pd.DataFrame({'precision': precision, 'recall': recall})
    results['f1'] = 2 * precision * recall / (precision + recall)
    max_index = results['f1'].idxmax()
    best = results.loc[results['f1'].idxmax()]
    best['threshold'] = threshold[max_index]
    best['au_precision_recall'] = au_precision_recall
    fpr, tpr, thresholds = roc_curve(y_true, score, pos_label = -1)
    best['auroc'] = auc(fpr, tpr)
    return best

def evaluate_predictions(y_true, y_pred):
    results = {}
    results['precision'] = precision_score(y_true, y_pred, pos_label = -1, zero_division = 0)
    results['recall'] = recall_score(y_true, y_pred, pos_label = -1, zero_division = 0)
    results['f1'] = f1_score(y_true, y_pred, pos_label = -1, zero_division = 0)
    return results

def evaluate_test_data(y_true, score, threshold):
    y_pred = np.array([1 if score < threshold else -1 for score in score])
    results = evaluate_predictions(y_true, y_pred)
    precision, recall, threshold = precision_recall_curve(y_true, score, pos_label = -1)
    results['au_precision_recall'] = auc(recall, precision)
    fpr, tpr, thresholds = roc_curve(y_true, score, pos_label = -1)
    results['auroc'] = auc(fpr, tpr)
    return results

def pca_classifier(scaler, pca, threshold):
    def clf(X):
        x = scaler.transform(X)
        X_pca = pca.transform(x)
        X_pca_inv = pca.inverse_transform(X_pca)
        score = anomaly_scores(x, X_pca_inv)
        return np.array([1 if score < threshold else -1 for score in score])
    return clf

# PCA intrusion detection [5](#fifth), [6](#sixth)

## Preprocess with scalar [6](#sixth)

## Hyper-parameter tuning [6](#sixth)

In [None]:
%%time

import tqdm

input_dim = X_train.shape[1]

params = {
    'scalers': [StandardScaler(), 
                RobustScaler(quantile_range = (25, 75)), 
                QuantileTransformer(output_distribution = 'normal'), 
                MinMaxScaler(feature_range=(0, 1), copy = True)],
    'n_components': list(range(input_dim)),
}

best_scaler = None
best_pca = None
best_score = None
index = 0
results = []

for (scaler, n_components) in tqdm.tqdm(list(product(params['scalers'], params['n_components']))):
    # print('Scalar', scaler)
    x_train = scaler.fit_transform(X_train)
    x_val = scaler.transform(X_val)
    
    # print('Training number of components', n_components, 'PCA', end = '')
    n_components = 64
    pca = PCA(n_components = n_components, 
                copy = True, 
                whiten = False, 
                svd_solver = 'auto', 
                tol = 0.0, 
                iterated_power = 'auto', 
                random_state = 0)
    pca.fit(x_train)
    X_val_pca = pca.transform(x_val)
    X_val_pca_inv = pca.inverse_transform(X_val_pca)
    val_score = anomaly_scores(x_val, X_val_pca_inv)
    val_metrics = evaluate_results(y_val, val_score)
    val_metrics['n_components'] = n_components
    val_metrics['scaler'] = scaler
    val_metrics['index'] = index
    results.append(val_metrics)
    # print('validation auroc', val_metrics['auroc'])
    
    index += 1
    
    if best_score is None or val_metrics['auroc'] > best_score['auroc']:
        best_scaler = scaler
        best_pca = pca
        best_score = val_metrics


In [None]:
print('Best scaler', best_score['scaler'], 'Best number of components', best_score['n_components'])
x_t = best_scaler.transform(X_t)
X_t_pca = best_pca.transform(x_t)
X_t_pca_inv = best_pca.inverse_transform(X_t_pca)
test_score = anomaly_scores(x_t, X_t_pca_inv)

print('Test performance on all attacks')
pprint(evaluate_test_data(y_t, test_score, best_score.threshold))

In [None]:
df = pd.DataFrame({'label_t': label_t, 'y_test': y_t, 'test_score': test_score})
test_df = df[(df.label_t == 'DoS Slowloris') | (df.label_t == 'DoS Slowhttptest') | (df.label_t == 'BENIGN')]

print('Test performance on DoS Slowloris and Slowhttptest')
pprint(evaluate_test_data(test_df.y_test.tolist(), test_df.test_score.tolist(), best_score.threshold))

In [None]:
for label in [s for s in improved_df['Label'].unique() if s != 'BENIGN']:
    test_df = df[(df.label_t == label) | (df.label_t == 'BENIGN')]
    
    print('Test performance on', label)
    pprint(evaluate_test_data(test_df.y_test.tolist(), test_df.test_score.tolist(), best_score.threshold))

# SHAP explainability

In [None]:
import shap

subsample_X_t = pd.DataFrame.sample(X_t, frac = 0.01).reset_index(drop = True)

# explain the model's predictions using SHAP
explainer = shap.Explainer(pca_classifier(best_scaler, best_pca, best_score.threshold), 
                           masker = shap.maskers.Independent(data = subsample_X_t))
shap_values = explainer(subsample_X_t)

# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[0])

In [None]:
shap.summary_plot(shap_values, subsample_X_t)

In [None]:
sns.displot(
    data = subsample_improved_df,
    x = 'Bwd Packet Length Mean', y = 'Packet Length Mean', hue = 'Label',
)
# Bwd Segment Size Avg, Subflow Bwd Bytes, Protocol, Fwd Packet Length Mean

In [None]:
sns.displot(subsample_improved_df, x = 'Bwd Packet Length Mean', hue = 'Label', stat = 'probability')#, kind = 'kde')

In [None]:
sns.displot(subsample_improved_df, x = 'Subflow Bwd Bytes', hue = 'Label', kind = 'kde')

In [None]:
sns.displot(subsample_improved_df, x = 'Protocol', hue = 'Label', kind = 'kde')

In [None]:
improved_df.to_csv('cleaned_improved_cicids2017.csv', encoding = 'utf-8', index = False)

# Reference
1. <a id='first'></a>[Liu, Lisa, et al. "Error prevalence in nids datasets: A case study on cic-ids-2017 and cse-cic-ids-2018." 2022 IEEE Conference on Communications and Network Security (CNS). IEEE, 2022.](https://ieeexplore.ieee.org/abstract/document/9947235)
2. <a id='second'></a>[Lanvin, Maxime, et al. "Errors in the CICIDS2017 dataset and the significant differences in detection performances it makes." International Conference on Risks and Security of Internet and Systems. Cham: Springer Nature Switzerland, 2022.](https://link.springer.com/chapter/10.1007/978-3-031-31108-6_2)
3. <a id='third'></a>[Improved CIC-IDS 2017 Documentation](https://intrusion-detection.distrinet-research.be/CNS2022/CICIDS2017.html)
4. <a id='fourth'></a>[Error Prevalence in NIDS datasets: A Case Study on CIC-IDS-2017 and CSE-CIC-IDS-2018 (G. Engelen)](https://www.youtube.com/watch?v=sJvZKhw3lYo)
5. <a id='fifth'></a>Verkerken, Miel, et al. "Towards model generalization for intrusion detection: Unsupervised machine learning techniques." Journal of Network and Systems Management 30 (2022): 1-25.
6. <a id='sixth'></a>[CIC-IDS-2018](https://gitlab.ilabt.imec.be/mverkerk/cic-ids-2018)
7. Yang, Li, et al. "LCCDE: A decision-based ensemble framework for intrusion detection in the internet of vehicles." GLOBECOM 2022-2022 IEEE Global Communications Conference. IEEE, 2022.
8. [TSNE Visualization Example in Python](https://www.datatechnotes.com/2020/11/tsne-visualization-example-in-python.html)
9. [Multicore t-SNE](https://github.com/DmitryUlyanov/Multicore-TSNE)
10. [sklearn.manifold.TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)
11. [What do maskers really do in SHAP package and fit them to train or test?](https://stackoverflow.com/questions/66560839/what-do-maskers-really-do-in-shap-package-and-fit-them-to-train-or-test)
12. [Python Statistics Fundamentals: How to Describe Your Data](https://realpython.com/python-statistics/)

Randome Forest Classifier

In [13]:
# Use a smaller subset for experimentation
subset_size = 10000  # Adjust the size as needed
X_subset = X_train[:subset_size]
y_subset = y_train[:subset_size]

# Train the classifier with the smaller subset
clf.fit(X_subset, y_subset)


RandomForestClassifier(random_state=0)

In [14]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)


In [15]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    # Add other hyperparameters as needed
}

# Perform Grid Search
grid_search = GridSearchCV(RandomForestClassifier(random_state=0, n_jobs=-1), param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Get the best model
best_clf = grid_search.best_estimator_

# Make predictions and evaluate
y_pred = best_clf.predict(X_test)


KeyboardInterrupt: 

In [16]:
clf = RandomForestClassifier(n_estimators=50, random_state=0)  # Adjust the number as needed


In [17]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)


In [18]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=10)  # Adjust the depth as needed


In [19]:
X_train_subsample, _, y_train_subsample, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=0)
clf.fit(X_train_subsample, y_train_subsample)


RandomForestClassifier(max_depth=10, random_state=0)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Assuming 'improved_df' is your cleaned and preprocessed DataFrame

# Separate features and target variable
X = improved_df.drop(['Label'], axis=1)
y = improved_df['Label']

# Convert labels to binary (1 for ATTACK, 0 for BENIGN)
y_binary = (y != 'BENIGN').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=0)

# Reduce the number of trees and enable parallel processing
clf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)

# Train the classifier on a subset of the training data
X_train_subsample, _, y_train_subsample, _ = train_test_split(X_train, y_train, test_size=0.9, random_state=0)
clf.fit(X_train_subsample, y_train_subsample)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Classification Report:\n", report)


Accuracy: 0.9996452338718318
F1 Score: 0.9992837330487494
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    315942
           1       1.00      1.00      1.00    104053

    accuracy                           1.00    419995
   macro avg       1.00      1.00      1.00    419995
weighted avg       1.00      1.00      1.00    419995



In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Assuming 'improved_df' is your cleaned and preprocessed DataFrame

# Separate features and target variable
X = improved_df.drop(['Label'], axis=1)
y = improved_df['Label']

# Convert labels to binary (1 for ATTACK, 0 for BENIGN)
y_binary = (y != 'BENIGN').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=0)

# Optimize hyperparameters and use parallel processing
clf = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_leaf=1, random_state=0, n_jobs=-1)

# Train the classifier on the entire training set
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Classification Report:\n", report)


Accuracy: 0.9999190466553173
F1 Score: 0.9998366154408019
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    315942
           1       1.00      1.00      1.00    104053

    accuracy                           1.00    419995
   macro avg       1.00      1.00      1.00    419995
weighted avg       1.00      1.00      1.00    419995



In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Assuming 'improved_df' is your cleaned and preprocessed DataFrame

# Separate features and target variable
X = improved_df.drop(['Label'], axis=1)
y = improved_df['Label']

# Convert labels to binary (1 for ATTACK, 0 for BENIGN)
y_binary = (y != 'BENIGN').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=0)

# Optimize hyperparameters and use parallel processing
clf_2 = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_leaf=1, random_state=0, n_jobs=-1)

# Train the classifier on the entire training set
clf_2.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf_2.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Classification Report:\n", report)


Accuracy: 0.9999190466553173
F1 Score: 0.9998366154408019
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    315942
           1       1.00      1.00      1.00    104053

    accuracy                           1.00    419995
   macro avg       1.00      1.00      1.00    419995
weighted avg       1.00      1.00      1.00    419995



In [27]:
import joblib


model_filename = 'random_forest_model.pkl'
joblib.dump(clf_2, model_filename)


loaded_model = joblib.load(model_filename)

how to use it
'loaded_model' to make predictions on new data
new_data = ...  # Your new data here
predictions = loaded_model.predict(new_data)


Support Vector Machines

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pickle

# Assuming 'improved_df' is your cleaned and preprocessed DataFrame

# Separate features and target variable
X = improved_df.drop(['Label'], axis=1)
y = improved_df['Label']

# Convert labels to binary (1 for ATTACK, 0 for BENIGN)
y_binary = (y != 'BENIGN').astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=0)

# Initialize the Support Vector Machine Classifier
clf_svm = SVC(kernel='linear', C=1.0)

# Train the classifier
clf_svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = clf_svm.predict(X_test)

# Evaluate the performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
report_svm = classification_report(y_test, y_pred_svm)

# Print the results
print("SVM Accuracy:", accuracy_svm)
print("SVM F1 Score:", f1_svm)
print("SVM Classification Report:\n", report_svm)



In [None]:
# Save the trained SVM model to a file
model_filename_svm = 'svm_model.pkl'
with open(model_filename_svm, 'wb') as file:
    pickle.dump(clf_svm, file)

# Load the SVM model back for later use
with open(model_filename_svm, 'rb') as file:
    loaded_model_svm = pickle.load(file)

# Now you can use 'loaded_model_svm' to make predictions on new data
new_data_svm = ...  # Your new data here
predictions_svm = loaded_model_svm.predict(new_data_svm)
