In [1]:
!pip install pyarrow
!pip install wordcloud
!pip install gensim
!pip install tensorflow
!pip install nltk

Collecting pyarrow
  Using cached pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.1 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-12.0.1
Collecting wordcloud
  Using cached wordcloud-1.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.3
Collecting gensim
  Using cached gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-7.0.4-py3-none-any.whl (61 kB)
Collecting wrapt
  Using cached wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (77 kB)
Installing collected packages: wrapt, smart-open, gensim
Successfully installed gensim-4.2.0 smart-open-7.0.4 wrapt-1.16.0
Collecting tensorflow
  Using cached tensorflow-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
Collecting

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, multilabel_confusion_matrix, make_scorer, f1_score
import time
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

## New Pre-processing

In [11]:
import pandas as pd
import string
import base64
import re

def decode_base64(word):
    try:
        return base64.b64decode(word).decode("utf-8")
    except:
        pass

In [12]:
def split_session(full_session):
    words = []
    for word in re.split(r"\n|;|,|/|-|\||=|$|>|<|$|:|{|}|\(|\)| ", full_session):
        if word.startswith('"') or word.endswith('"'):
            # remove the quotation mark at the start and at the end of the word
            word = word[1:-1]
        elif len(word) == 1 and word in string.punctuation:
            # remove that punctuation
            word = None
        words.append(word)
    return list(filter(None, words))

In [16]:
def clean_session(full_session):
    new_full_session = []
    for session_chunck in full_session.split(";"):
        if "base64 --decode" in session_chunck or "echo" in session_chunck:
            for word in session_chunck.split("\""):
                decode = decode_base64(word)
                if decode:
                    new_full_session.append(decode)
        else:
            new_full_session.append(session_chunck)
    return split_session("".join(new_full_session))

In [None]:
## Applying explode and then taking 10% of the features

In [152]:
df_original = pd.read_parquet('ssh_attacks.parquet')
df = df_original.copy()

In [333]:
df_exploded = df.explode("Set_Fingerprint").reset_index()
df_exploded = df_exploded.drop(columns=['index'])

label_counts = df_exploded['Set_Fingerprint'].value_counts()
label_counts

df_persistence = df_exploded[df_exploded["Set_Fingerprint"] == "Persistence"].sample(frac=0.2).copy()
df_discovery = df_exploded[df_exploded["Set_Fingerprint"] == "Discovery"].sample(frac=0.2).copy()
df_defenseEvasion = df_exploded[df_exploded["Set_Fingerprint"] == "Defense Evasion"].sample(frac=0.2).copy()
df_execution = df_exploded[df_exploded["Set_Fingerprint"] == "Execution"].sample(frac=0.2).copy()
df_impact = df_exploded[df_exploded["Set_Fingerprint"] == "Impact"].sample(frac=1).copy()
df_other = df_exploded[df_exploded["Set_Fingerprint"] == "Other"].sample(frac=1).copy()
df_harmless= df_exploded[df_exploded["Set_Fingerprint"] == "Harmless"].sample(frac=0.35).copy()


df_subset = pd.concat([df_persistence, df_discovery, df_defenseEvasion, df_execution, df_impact, df_other, df_harmless], ignore_index=True)

In [274]:
label_counts

Discovery          232145
Persistence        211295
Execution           92927
Defense Evasion     18999
Harmless             2206
Other                 327
Impact                 27
Name: Set_Fingerprint, dtype: int64

In [334]:
df_subset['first_timestamp'] = pd.to_datetime(df_subset['first_timestamp'])
df_subset["full_session"] = df_subset["full_session"].apply(lambda x: clean_session(x))

In [335]:
print(df_original)

        session_id                                       full_session  \
0                0  enable ; system ; shell ; sh ; cat /proc/mount...   
1                1  enable ; system ; shell ; sh ; cat /proc/mount...   
2                2  enable ; system ; shell ; sh ; cat /proc/mount...   
3                3  enable ; system ; shell ; sh ; cat /proc/mount...   
4                4  enable ; system ; shell ; sh ; cat /proc/mount...   
...            ...                                                ...   
233030      233042  cat /proc/cpuinfo | grep name | wc -l ; echo -...   
233031      233043  cat /proc/cpuinfo | grep name | wc -l ; echo -...   
233032      233044  cat /proc/cpuinfo | grep name | wc -l ; echo -...   
233033      233045  cat /proc/cpuinfo | grep name | wc -l ; echo -...   
233034      233046  cat /proc/cpuinfo | grep name | wc -l ; echo -...   

                         first_timestamp               Set_Fingerprint  
0       2019-06-04 09:45:11.151186+00:00  [Defense

In [336]:
df_subset.to_parquet("ssh_attacks_cleaned.parquet")

In [339]:
with open("features.txt", "r") as f:
    vocabuary = f.read().splitlines()

In [340]:
if not os.path.isfile("df_features_bow.parquet"):
    count_vectorizer = CountVectorizer(vocabulary=vocabuary)
    bow = count_vectorizer.fit_transform(df_subset["full_session"].apply(lambda x : " ".join(x)))
    df_bow = pd.DataFrame(bow.toarray(), index=df_subset.index, columns = list(count_vectorizer.vocabulary_.keys()))
    for feature in df_bow.columns:
        df_bow[feature] = normalize(df_bow[feature].values.reshape(-1,1), norm="l2", axis=0)
    df_bow = pd.concat([df_subset, df_bow], axis=1)
    df_features_bow = df_bow.drop(columns=["session_id", "full_session", "first_timestamp", "Set_Fingerprint"])
    df_features_bow.to_parquet("df_features_bow.parquet")
else:
    df_features_bow = pd.read_parquet("df_features_bow.parquet")

In [341]:
if not os.path.isfile("df_features_tfidf.parquet"):
    tfidf_vectorizer = TfidfVectorizer(vocabulary=vocabuary)
    tfidf = tfidf_vectorizer.fit_transform(df_subset["full_session"].apply(lambda x : " ".join(x)))
    df_tfidf = pd.DataFrame(tfidf.toarray(), index=df_subset.index, columns = list(tfidf_vectorizer.vocabulary_.keys()))
    df_tfidf = pd.concat([df_subset, df_tfidf], axis=1)
    df_features_tfidf = df_tfidf.drop(columns=["session_id", "full_session", "first_timestamp", "Set_Fingerprint"])
    df_features_tfidf.to_parquet("df_features_tfidf.parquet")
else:
    df_features_tfidf = pd.read_parquet("df_features_tfidf.parquet")

In [24]:
# if not os.path.isfile("ssh_attacks_decoded_splitted.parquet"):
#     raise Exception("You should run the preprocessing file")
    
# df = pd.read_parquet("ssh_attacks_decoded_splitted.parquet")

In [25]:
# if not (os.path.isfile("df_features_bow.parquet") and os.path.isfile("df_features_tfidf.parquet")):
#     raise Exception("You should run the section 1 before")
    
# df_features_bow = pd.read_parquet("df_features_bow.parquet")
# df_features_tfidf = pd.read_parquet("df_features_tfidf.parquet")

In [26]:
# df_features_bow

In [342]:
df_features_tfidf

Unnamed: 0,ab,alpine,apt,ar,ash,awk,base64,bash,bin,bs,...,tmp,top,tsm,udp,uname,unix,var,vim,wget,which
0,0.0,0.0,0.0,0.0,0.0,0.073036,0.0,0.058519,0.054358,0.0,...,0.509334,0.036390,0.118457,0.0,0.109556,0.468464,0.255038,0.0,0.000000,0.073047
1,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,0.0,...,0.265313,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784
2,0.0,0.0,0.0,0.0,0.0,0.100305,0.0,0.080367,0.074653,0.0,...,0.249821,0.049976,0.000000,0.0,0.150460,0.321684,0.150111,0.0,0.170178,0.100319
3,0.0,0.0,0.0,0.0,0.0,0.073036,0.0,0.058519,0.054358,0.0,...,0.509334,0.036390,0.118457,0.0,0.109556,0.468464,0.255038,0.0,0.000000,0.073047
4,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,0.0,...,0.265313,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112194,0.0,0.0,0.0,0.0,0.0,0.070996,0.0,0.056884,0.052839,0.0,...,0.495108,0.035373,0.115148,0.0,0.106496,0.512302,0.247915,0.0,0.000000,0.071006
112195,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,0.0,...,0.265313,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784
112196,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.168183,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
112197,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,0.0,...,0.265313,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784


In [360]:
df_final = pd.concat([df_subset,df_features_tfidf], axis=1)
print(df_final) 

        session_id                                       full_session  \
0            82694  [cat, proc, cpuinfo, grep, name, wc, l, rm, rf...   
1           126467  [cat, proc, cpuinfo, grep, name, wc, l, rm, rf...   
2             3971  [cd, var, tmp, #!, bin, bash, cd, tmp, rm, rf,...   
3            86173  [cat, proc, cpuinfo, grep, name, wc, l, rm, rf...   
4           209498  [cat, proc, cpuinfo, grep, name, wc, l, rm, rf...   
...            ...                                                ...   
112194       14437  [cat, proc, cpuinfo, grep, name, wc, l, Enter,...   
112195       95221  [cat, proc, cpuinfo, grep, name, wc, l, rm, rf...   
112196       11785                            [scp, t, tmp, S7rKjnXk]   
112197       53342  [cat, proc, cpuinfo, grep, name, wc, l, rm, rf...   
112198      152010  [cat, proc, cpuinfo, grep, name, wc, l, rm, rf...   

                        first_timestamp Set_Fingerprint   ab  alpine  apt  \
0      2019-11-13 16:07:17.558060+00:00     Pe

#### Ignore this part-- it was just for test puposes

In [None]:
#I was trying to add some other features like timestamp and id_Session, standardizing them and 
#then adding them to the final dataser but it seems that the result gets worse with these new ones.

In [361]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# standardized the timestamp column and session_id
df_final['first_timestamp'] = pd.to_datetime(df_final['first_timestamp'])
df_final['first_timestamp_num'] = df_final['first_timestamp'].view(int) // 10**9  # Convert to Unix timestamps (seconds)
df_final['first_timestamp_num']

# Drop the original timestamp column 
df_final.drop(columns=['first_timestamp'], inplace=True)

# Scaling the numerical representation using StandardScaler
scaler = StandardScaler()
df_final[['session_id', 'first_timestamp_num']] = scaler.fit_transform(df_final[['session_id','first_timestamp_num']])

#### end of part to be ignored 

<br>
<left><b><font size=4>Section 2 – Supervised Learning – Classification<b><left>

<br>
<div style="text-align: justify"> Classify the tactics of an attack session, based on the used words in the text and also possibly on time. Notice that each session have multiple labels. Hence you can decompose the problem into multiple binary classification problems. For each attack session, you have to solve the 7 binary classification problem, one for each possible label {'Persistence', 'Discovery', 'Defense Evasion', 'Execution', 'Impact', 'Other', 'Harmless'}. </div><br>

**2.1 Perform a split to segment the dataset into training and test dataset. If you want to standardize your dataset, fit the scaler on training set and transforming both training and test. Notice that the sklearn implementation of tf-idf already performs the standardization.**

In [362]:
X = df_final.drop(columns=['Set_Fingerprint','full_session'])
y = df_final[['Set_Fingerprint']]

In [363]:
X

Unnamed: 0,session_id,ab,alpine,apt,ar,ash,awk,base64,bash,bin,...,top,tsm,udp,uname,unix,var,vim,wget,which,first_timestamp_num
0,-0.403458,0.0,0.0,0.0,0.0,0.0,0.073036,0.0,0.058519,0.054358,...,0.036390,0.118457,0.0,0.109556,0.468464,0.255038,0.0,0.000000,0.073047,-0.173066
1,0.264723,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,...,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784,0.292328
2,-1.605141,0.0,0.0,0.0,0.0,0.0,0.100305,0.0,0.080367,0.074653,...,0.049976,0.000000,0.0,0.150460,0.321684,0.150111,0.0,0.170178,0.100319,-3.051787
3,-0.350352,0.0,0.0,0.0,0.0,0.0,0.073036,0.0,0.058519,0.054358,...,0.036390,0.118457,0.0,0.109556,0.468464,0.255038,0.0,0.000000,0.073047,-0.128379
4,1.532166,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,...,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784,1.476873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112194,-1.445380,0.0,0.0,0.0,0.0,0.0,0.070996,0.0,0.056884,0.052839,...,0.035373,0.115148,0.0,0.106496,0.512302,0.247915,0.0,0.000000,0.071006,-1.514532
112195,-0.212237,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,...,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784,-0.004316
112196,-1.485862,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,-1.797393
112197,-0.851507,0.0,0.0,0.0,0.0,0.0,0.177542,0.0,0.000000,0.000000,...,0.088459,0.000000,0.0,0.266317,0.000000,0.265699,0.0,0.000000,0.088784,-0.570013


In [356]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.70, stratify=y)

In [364]:
print('The size of trainning set is:', len(X_train))
print('The size of test set is:', len(X_test))

The size of trainning set is: 78539
The size of test set is: 33660


### Data Preprocessing

##### Standardization of the Numerical Features

As the TF-IDF pre-processing was applied previously to all the sessions, the data considered as features was already standardized. 

##### Standardization of the Categorical Features

In [77]:
# mlb = MultiLabelBinarizer()
# y_train_mlb = mlb.fit_transform(y_train)
# y_test_mlb = mlb.transform(y_test)

In [365]:
from sklearn.preprocessing import OneHotEncoder

hot_encoder = OneHotEncoder(handle_unknown='ignore')
y_train_encoded = hot_encoder.fit_transform(y_train).toarray()
y_test_encoded = hot_encoder.transform(y_test).toarray()


**Preprocessing Techniques** 
<br>
<div style="text-align: justify"> A <b>MultiLabelBinarizer</b> is a transformer that is used for multi-label classification problems, in order to handle the cases where each sample belongs to multiple classes simultaneously. The purpose of MultiLabelBinarizer is to convert a collection of sequences of labels into a binary matrix format. The binary classification of each label in the 'Set_Fingerprint' column was performed by converting the multi-class label matrix into a binary matrix, where each column represents one of the possible classes and each row represents one instance. </div><br>

<div style="text-align: justify"> <b>TF-IDF </b> (explain technique here) </div><br>

<div style="text-align: justify"> MultiLabelBinarizer is used to handle categorical variables before fitting a model, as most machine learning algorithms can only handle numerical data.</div><br>

**2.2 Choose at least 2 ML methods, and perform the model training, with default parameter
configuration, evaluating the performance on both training and test set. Output the confusion
matrix and classification report. Do you observe overfitting or under-fitting? Which model
generates the best performance?**

<br><left><b><font size=4> Random Forest (RF)<b><left>

<div style="text-align: justify">Random Forest (RF) serves as a classification model that constructs a collection of decision trees (DT) using a randomly chosen subset of the given training set. The model aggregates the individual decisions made by each decision tree and combines their votes to make the ultimate prediction.</div>

In [366]:
rf = RandomForestClassifier(n_estimators=100) 

st = time.time()
rf.fit(X_train, y_train_encoded)
et = time.time()

# get the execution time
elapsed_time = et - st
print(f"Time to train the model: {elapsed_time:.2f} seconds")

Time to train the model: 55.77 seconds


In [367]:
y_train_predictions = rf.predict(X_train)
y_test_predictions = rf.predict(X_test)

In [368]:
# Decode the predicted labels back to their original format if necessary
y_train_dec = hot_encoder.inverse_transform(y_train_encoded)

# Generate a list of unique labels from both true and predicted labels
# Get unique y_train_dec
unique_labels = np.unique(y_train_dec)
# Convert the unique labels to a list to be used in the classification report Target
unique_labels_list = unique_labels.tolist()

In [369]:
# Evaluate the model's performance on the train data
accuracy = rf.score(X_train, y_train_encoded)
print(f"Accuracy of the 'Random Forest' model for the training set: {accuracy:.2f}")

# Evaluate the model's performance on the test data
accuracy = rf.score(X_test, y_test_encoded)
print(f"Accuracy of the 'Random Forest' model for test set: {accuracy:.2f}")

Accuracy of the 'Random Forest' model for the training set: 0.87
Accuracy of the 'Random Forest' model for test set: 0.32


<br>
<left><b><font size=3 >Classification Report<b><left>

##### Training Set

In [354]:
report_training = classification_report(y_train_encoded, y_train_predictions, target_names=unique_labels_list, output_dict=True, zero_division=0)
df_report_training = pd.DataFrame(report_training).transpose()
df_report_training

Unnamed: 0,precision,recall,f1-score,support
Defense Evasion,0.924735,0.920557,0.922642,2656.0
Discovery,0.923137,0.882533,0.902378,32375.0
Execution,0.906569,0.82664,0.864761,13123.0
Harmless,0.974178,0.778612,0.865485,533.0
Impact,1.0,0.761905,0.864865,21.0
Other,0.804878,0.864629,0.833684,229.0
Persistence,0.923151,0.87815,0.900088,29602.0
micro avg,0.920452,0.872038,0.895591,78539.0
macro avg,0.922378,0.844718,0.879129,78539.0
weighted avg,0.92045,0.872038,0.895454,78539.0


##### Validation Set (Test set)

In [355]:
# Evaluate performance on test set
report_test = classification_report(y_test_encoded, y_test_predictions, target_names=unique_labels_list,output_dict=True, zero_division=0)
df_report_test = pd.DataFrame(report_test).transpose()
df_report_test

Unnamed: 0,precision,recall,f1-score,support
Defense Evasion,0.399314,0.407343,0.403289,1144.0
Discovery,0.359744,0.344386,0.351898,14054.0
Execution,0.240906,0.2316,0.236162,5462.0
Harmless,0.159794,0.129707,0.143187,239.0
Impact,0.5,0.166667,0.25,6.0
Other,0.47619,0.510204,0.492611,98.0
Persistence,0.344013,0.330015,0.336868,12657.0
micro avg,0.335138,0.321747,0.328306,33660.0
macro avg,0.35428,0.302846,0.316288,33660.0
weighted avg,0.334834,0.321747,0.328122,33660.0


In [145]:
# I only got to run up to this point

<br>
<left><b><font size=3> Confusion Matrix <b><left>

##### Training Set

In [None]:
confusion_rf_train = multilabel_confusion_matrix(y_train_mlb, y_train_predictions)

for i, label in enumerate(mlb.classes_):
    print(f"Confusion Matrix for '{label}':")
    print(confusion_rf_train[i], "\n")

In [None]:
plt.figure(figsize=(15, 8))
for i, label in enumerate(mlb.classes_):
    plt.subplot(3, 3, i + 1)
    plt.title(f"Confusion Matrix for '{label}'")
    plt.imshow(confusion_rf_train[i], cmap='Blues', interpolation='nearest')
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(np.arange(2), ['Negative', 'Positive'])
    plt.yticks(np.arange(2), ['Negative', 'Positive'])
plt.tight_layout()
plt.show()

##### Validation Set

In [None]:
confusion_rf_test = multilabel_confusion_matrix(y_test_mlb, y_test_predictions)

for i, label in enumerate(mlb.classes_):
    print(f"Confusion Matrix for '{label}':")
    print(confusion_rf_test[i], "\n")   

In [None]:
plt.figure(figsize=(15, 8))
for i, label in enumerate(mlb.classes_):
    plt.subplot(3, 3, i + 1)
    plt.title(f"Confusion Matrix for '{label}' in Test Set")
    plt.imshow(confusion_rf_test[i], cmap='Oranges', interpolation='nearest')
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(np.arange(2), ['Negative', 'Positive'])
    plt.yticks(np.arange(2), ['Negative', 'Positive'])

plt.tight_layout()
plt.show()

<br>
<left><b><font size=4>K-Nearest Neighbors (KNN)<b><left>

<div style="text-align: justify">K-Nearest Neighbors (KNN) operates as a supervised learning classifier that relies on the concept of proximity to perform classifications or predictions for individual data points. Its fundamental principle is grounded in the notion that similar data points tend to cluster together. In the context of classification tasks, KNN assigns a class label to a data point by considering the majority vote of its nearest neighbors. Put simply, it selects the label that is most prevalent among the neighboring data points in close proximity to the one being evaluated.</div><br>

In [None]:
knn = KNeighborsClassifier()

st = time.time()
knn.fit(X_train, y_train_mlb)
et = time.time()

elapsed_time = et - st
print(f"Time to train the model: {elapsed_time} seconds")

In [None]:
predictions_train = knn.predict(X_train) 
predictions_test = knn.predict(X_test)

In [None]:
accuracy = knn.score(X_train, y_train_mlb)
print(f"Accuracy of the k-NN model for the training set: {accuracy:.2f}")

accuracy = knn.score(X_test, y_test_mlb)
print(f"Accuracy of the k-NN model for the test set: {accuracy:.2f}")

<br>
<left><b><font size=3 >Classification Report<b><left>

##### Training Set

In [None]:
# Classification Report for training set
report_train_knn = classification_report(y_train_mlb, predictions_train, target_names=mlb.classes_, output_dict=True, zero_division=1)
df_report_train = pd.DataFrame(report_train_knn).transpose()
print("Classification Report for Trainning set:")
df_report_train

##### Validation Set

In [None]:
# Classification Report for test set
report_test_knn = classification_report(y_test_mlb, predictions_test, target_names=mlb.classes_, output_dict=True, zero_division=1)
df_report_test_knn = pd.DataFrame(report_test_knn).transpose()
print("Classification Report for Test set:")
df_report_test_knn

<br>
<left><b><font size=3> Confusion Matrix <b><left>

The code presented below prints a series of confusion matrices for each class, displaying True-Positive (top-left), False-Negative (bottom-left), False-Positive (top-right), and True-Negative (bottom-right) counts.
- True Positives (TP): Predicted correctly as positive.
- False Positives (FP): Predicted as positive but actually negative.
- False Negatives (FN): Predicted as negative but actually positive.
- True Negatives (TN): Predicted correctly as negative.

Each value in the confusion matrix represents the count of instances falling into these categories for a specific label.

In [None]:
# Evaluate with a confusion matrix and classification report
confusion_knn_test = multilabel_confusion_matrix(y_test_mlb, predictions_test)
for i, label in enumerate(mlb.classes_):
    print(f"Confusion Matrix for {label}:")
    print(confusion_knn_test[i], "\n")

In [None]:
plt.figure(figsize=(15, 8))
for i, label in enumerate(mlb.classes_):
    plt.subplot(3, 3, i + 1)
    plt.title(f"Confusion Matrix for '{label}'")
    plt.imshow(confusion_knn_test[i], cmap='Oranges', interpolation='nearest')
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(np.arange(2), ['Negative', 'Positive'])
    plt.yticks(np.arange(2), ['Negative', 'Positive'])

plt.tight_layout()
plt.show()

<div style="text-align: justify"> To know whether the model is underfitting or overfitting, we must first define these two terms. <br>
<div style="text-align: justify"><br><b>Underfitting</b> occurs when a model is too simple to capture the underlying patterns in the training data, resulting in poor performance even on the training set.  One indicator used to identify this modelling error is to look at the results; if both training and validation/testing performance are poor, the model is considered to be underfitted.</div><br>

<div style="text-align: justify"><b>Overfitting</b> occurs when a model not only learns the underlying patterns in the training data, but also captures noise and random fluctuations, causing it to perform poorly on new and unknown data. One indicator used to identify this modelling error is to look at the results, if the model performs well on training data but poorly on validation or test data, it is considered overfitted.</div><br>

<div style="text-align: justify">Once these two terms have been defined, it is possible to move on to the results obtained for both models. It is important to mention that, for both tests performed with the different classifiers, the default parameters were used.</div><br>
<div style="text-align: justify">For the <b>Random Forest (RF)</b> case, the default parameter implied the number of estimators equal to 100, and the tree depth was set to <i>'None'</i>. On the other hand, for the <b>K-Nearest Neighbor (KNN)</b> classifier, the number of estimators was set to 5, the leaf size to 30 and the type of distance calculation was <i>'Euclidean distance'</i> (p=2), all default parameters of the classifier.</div><br>
    
<b> Random Forest (RF)</b>
<div style="text-align: justify"> In the classification report of the training and test sets, for most of the classes, accuracy, recall and F1 score are slightly lower in the test set compared to the training set. This was expected, as models tend to generalise slightly worse with unseen data. However, the drop in performance is not significant, which indicates that the model still performs well on the test set. Furthermore, the accuracy obtained for the training set was 99%, while 98% for the test set.<br><br>
    
<div style="text-align: justify">As can be seen in the validation set report, the model was not able to correctly classify instances of the <i>'Impact'</i> class, performing very poorly (0%) on all precision, recall and F1 score metrics. This result could be due to the default parameters set to train the model.  Tree depth is one of the most important parameter for tuning the model, as it sets the stop condition that limits the number of splits or levels deep a decision tree can go.</div><br>

<div style="text-align: justify">To enhance the classifier results, it is necessary to adjust the maximum depth of the decision trees when performing hyperparameter fitting for a random forest model. The <i>'weighted avg'</i> metric also showed a decrease in performance on the test set, indicating that the model does not perform as well on the test set across all classes, considering the distribution of classes. Overall, the performance metrics on the test set remain high, indicating that the overfitting is not critical. </div><br>

<b> K-Nearest Neighbor (KNN)</b>

<div style="text-align: justify">On the other hand, the classification report obtained for the KNN classifier showed a small difference between the training set and the validation set, for the test set the values obtained for each of the metrics; accuracy, recall, f1-score, were slightly lower compared to the results obtained for the training set.</div><br>

<div style="text-align: justify"> For the <i>'Impact'</i> class, the metrics derived in the training set were significantly lower compared to the validation set, for all metrics. This improvement in the accuracy, recall and f1-score parameters for the <i>'Impact'</i> class in the test set indicates that the model's predictions for this class are more accurate and reliable when evaluated with new, unseen data.  Producing a recall of 50%, which means, that the model only correctly predicted this class for 50% of the evaluated intents. </div><br>

<div style="text-align: justify">The difference obtained in this class for the training and validation sets suggests that the model did not sufficiently fit the <i>'Impact'</i> class during training, and then after the selection of the nearest neighbour from the test set, the model adjusted its predictions to better capture the features of the <i>'Impact'</i> class. However, it is important to note that the KNN classifier does not fit the data, it does not learn from the model, it only calculates the distance to the nearest points and selects the class according to the majority result of the nearest neighbours.</div><br>

<div style="text-align: justify"> In general, the observed results do not indicate underfitting or overfitting, in fact, the average accuracy obtained in the classification report was 99%, matching with the obtained in the calculated accuracy score (98% for both sets). The high values of the <i>'micro-average'</i> in both sets suggest a good overall performance of the model. While the <i>'macro-average'</i> values are higher only in the test set, indicating that the model performs better after calculating the Euclidean distance of each point.</div><br>

**2.3 Tune the hyper-parameters of the models through cross-validation. How do performance vary?**

#### Grid Search Technique

<div style="text-align: justify">Hyperparameters are settings that control the learning process of machine learning models. While the parameters are learned during the training process, the hyperparameters are set before the training starts. Therefore, in order to find the parameters that best fit the performance of the model, the GridSearch technique was applied. This technique applies all possible combinations of hyperparameters, resulting in a set of parameters that will improve the performance of the model.</div>

#### Random Forest

In [None]:
#GRID SEARCH
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
params = {'n_estimators': [3, 50, 100], 'max_depth': [100, 1000, 10000] } #'criterion' :['gini', 'entropy']}

# Instantiate the grid search model
gs_rf = GridSearchCV(rf, param_grid = params, scoring='f1_macro', cv = 5, verbose = 1) 
# scoring='accuracy'
# cv: that's the number of fold for the cross-validation
# verbose: specifies the verbosity level of the GridSearchCV object. 

# Trainning the model
st = time.time()
gs_rf.fit(X_train, y_train_mlb)
et = time.time()

# get the execution time
elapsed_time = et - st
print(f"Time to train the model: {elapsed_time} seconds")

In [None]:
best_params_rf = gs_rf.best_params_
print(f"The best combination of parameters the Grid Search has found is: {best_params_rf}")
print("Best F1-Score: {:.2f}".format(gs_rf.best_score_))

#### Ask to the prof which graph we should use -------------------------------------------------------------------------

In [None]:
# Bar plot

In [None]:
# Let's print the f1 macro reached for each combination
y = gs_rf.cv_results_["mean_test_score"].tolist()
x = [i for i in range (1, len(y)+1)]
mean_test_score_df = pd.DataFrame()
mean_test_score_df["f1_macro"] = y

plt.figure(figsize=(8, 6))
sns.barplot(x = mean_test_score_df.index, y = "f1_macro", data = mean_test_score_df, color='blue')

# Add a title and labels to the plot
plt.title('F1-macro Scores for Different Parameters')
plt.xlabel('Combination')
plt.ylabel('F1-macro Score')
plt.show()

In [None]:
# Heatmap plot

In [None]:
results_rf = pd.DataFrame(gs_rf.cv_results_)
results_rf

In [None]:
# We create a pivot table 
scores_rf = results_rf.pivot(index='param_max_depth', columns='param_n_estimators', values='mean_test_score')
scores_rf

In [None]:
sns.heatmap(scores_rf, annot=True, cmap='viridis', fmt='.5g')
plt.xlabel('param_max_depth')
plt.ylabel('param_n_estimators')
plt.title('Mean F1-Score over all folds for each combination of parameters')
plt.show()

#### K-Nearest Neighbor

In [None]:
params = {'n_neighbors': [5, 10, 20], 'leaf_size': [10, 70, 100]}        #'metric': ['euclidean', 'manhattan']} 
grid_search_knn = GridSearchCV(knn, params, scoring='f1_macro', cv = 5, verbose=1)
# scoring = 'accuracy'

st = time.time()
grid_search_knn.fit(X_train, y_train_mlb)
et = time.time()

# get the execution time
elapsed_time = et - st
print(f"Time to train the model: {elapsed_time} seconds")

In [None]:
best_params_knn = grid_search_knn.best_params_
print(f"The best combination of parameters the Grid Search has found is: {best_params_knn}")
print("Best F1-Score: {:.2f}".format(grid_search_knn.best_score_))

#### Aks to the prof which graph we should use --------------------------------------------------------------------------

In [None]:
#Bar plot

In [None]:
# Let's print the f1 macro reached for each combination
y = grid_search_knn.cv_results_["mean_test_score"].tolist()
x = [i for i in range (1, len(y)+1)]
mean_test_score_df = pd.DataFrame()
mean_test_score_df["f1_macro"] = y
#print(mean_test_score_df)

sns.barplot(x = mean_test_score_df.index, y = "f1_macro", data = mean_test_score_df, color='blue')
# Add a title and labels to the plot
plt.title('F1-macro Scores for Different Parameters')
plt.xlabel('Combination')
plt.ylabel('F1-macro Score')

# Display the plot
plt.show()

In [None]:
# Heatmap plot

In [None]:
results_knn = pd.DataFrame(grid_search_knn.cv_results_)
results_knn

In [None]:
# We create a pivot table before create the heatmap
scores_knn = results_knn.pivot(index='param_leaf_size', columns='param_n_neighbors', values='mean_test_score')
scores_knn

In [None]:
sns.heatmap(scores_knn, annot=True, cmap='viridis', fmt='.5g')
plt.xlabel('param_n_neighbors')
plt.ylabel('param_leaf_size')
plt.title('Mean F1-score over all folds for each combination of parameters')
plt.show()

**2.4. Comments on the results for each on the intents.**

<div style="text-align: justify">The aim of this analysis is to assess the predictive capability of two models in classifying attack labels. The models will be evaluated based on the hyperparameters identified previously.</div>

##### Evaluation of Random Forest with tuned parameters

In [None]:
# Random Forest with the hyperparameters
model_rf_tunned = RandomForestClassifier(n_estimators = 50, max_depth = 10000)

st = time.time()
# Trainning the model
model_rf_tunned.fit(X_train, y_train_mlb)
et = time.time()
# get the execution time
elapsed_time = et - st
print(f'Time to train the model:', elapsed_time,'seconds','\n')

# Predictions on test set
y_test_pred_tune = rf.predict(X_test)

# Evaluate the model's performance on the test data
accuracy = model_rf_tunned.score(X_test, y_test_mlb)
print(f"Accuracy of the 'Random Forest' model for test set: {accuracy:.2f}",'\n')

# Evaluate performance on test set
report_test_tune = classification_report(y_test_mlb, y_test_pred_tune, target_names=mlb.classes_, output_dict=True,
                                         zero_division=1)
df_report_test_tune = pd.DataFrame(report_test_tune).transpose()
print(f'         Classification Report Trainning Set', '\n')
print(df_report_test_tune)

In [None]:
# Compute the heatmap of the correlation matrix
plt.figure(figsize=(5, 3))
sns.heatmap(df_report_test_tune.loc["Defense Evasion" : "Persistence"], cmap='Blues', annot=True, vmin=.0, vmax=1,fmt='.3f')
plt.xlabel('Intents')
plt.ylabel('Evaluation technique')
plt.title('Intents classification report')
plt.show()

##### Evalutaing of K-Nearest Neighbors with tuned parameters

In [None]:
# Create the k-NN model
knn_tune = KNeighborsClassifier(leaf_size=10, n_neighbors=5)

# Train the model on the training data
t = time.time()
knn_tune.fit(X_train, y_train_mlb)
et = time.time()

# get the execution time
elapsed_time = et - st
print(f"Time to train the model: {elapsed_time} seconds")

# Generate predictions on the test set
predictions_knn_tune = knn.predict(X_test)

# Evaluate the model's performance on the test data
accuracy_knn_tune = knn.score(X_test, y_test_mlb)
print(f"Accuracy of the k-NN model: {accuracy_knn_tune:.2f}",'\n')

report_knn_tune = classification_report(y_test_mlb, predictions_knn_tune, target_names = mlb.classes_, output_dict=True,
                                       zero_division=1)
df_report_knn_tune = pd.DataFrame(report_knn_tune).transpose()
print("              Classification Report for KNN",'\n')
print(df_report_knn_tune)

In [None]:
# Compute the heatmap of the correlation matrix
plt.figure(figsize=(5, 3))
sns.heatmap(df_report_knn_tune.loc["Defense Evasion" : "Persistence"], cmap='Blues', annot=True, vmin=.0, vmax=1,fmt='.3f')
plt.xlabel('Intents')
plt.ylabel('Evaluation technique')
plt.title('Classification report')
plt.show()

Need to write how the hyperparameter tunning improves the result; especially in the intent Impact, for both models... I'll do it later.

**2.5 Explore the possible features: try combining features differently, e.g., does tf-idf improve or worsen performance? Think about the problem and summarize the ways you have tried (even those that did not work).**

<div style="text-align: justify"><b>First Attempt</b></div><br>

<div style="text-align: justify">For the first attempt, 33 features were used for the training set. The classifiers selected to perform the predictions were 'Random Forest' and 'K-Nearest Neighbor' with both models using the default parameters. In the performance evaluation, an accuracy of 98% was obtained for both models in the validation set.  While in the training set it reached 99% for RF and 98% for KNN. The decrease in accuracy achieved by RF in the validation set suggests a slight overfitting, however, it is a tolerable value that assumes that the model still performs well on the test set.</div><br>

<div style="text-align: justify">According to the classification report, for almost all attempts the values of precision, recall and f1-scores reached high percentages, around 98.9%.  Except for the 'Impact' and 'Harmless' intents when applying RF as a model, the results obtained for these classes were quite lower compared to those obtained for the other classes, both in the training set and in the validation set, where 0% was obtained in each metric. The model presented a very poor performance when trying to classify these two classes.</div><br>

<div style="text-align: justify">As for tuning techniques, K-fold cross-correlation and Grid Search were applied to see if the results obtained previously could be improved. However, when looking at the results obtained with 5 folds, the performance decreased considerably reaching an average of 77% accuracy. Suggests that the model may be in overfitting. </div><br>

<div style="text-align: justify">Therefore, for the next attempt we consider a reduction of the dimensionality of the data set to improve generalization.</div><br>

<div style="text-align: justify"><b>Second attempt</b></div><br>
<div style="text-align: justify">For the second attempt, the number of features was reduced to only 12 for the training set.  The classifiers used in the previous attempt, RF and KNN, were kept for this evaluation with both using their default parameters. The results obtained in the performance evaluation, the accuracy did not change from the previous attempt, reaching 98% for the RF case and 98% for the KNN for both the training and validation sets, indicating that the models continue to make good predictions for the classes.<br>
<br>
<div style="text-align: justify">If we take a look at the classification report for almost all attempts, the values for accuracy, recall and f1-scores reached high percentages, around 98%. Nevertheless, the performance of the 'Impact' class was very poor in both models for the classification obtained in the validation data, with 0% for every metric (precision, recall and f1-score), indicating that the number of features selected to train both models was not sufficient to be able to correctly classify this class.<br><br>
<div style="text-align: justify">In both training and validation reports, the F1 score obtained for the classes "Impact" and "Harmless" was very low compared to the other classes. It seems that for both models, these two classes are the most difficult to classify correctly. This could be due to the fact that, the number of data selected during the splitting of the training set did not cover enough samples of these two classes, as they are the least sampled attempts of the whole dataset. </div><br>

<div style="text-align: justify">After conducting multiple tests and adjusting the number of features in the training set, the results showed similar levels of accuracy. However, upon further analysis of the classification report parameters, it was found that the highest metrics for accuracy, recall, and F1-score were achieved with a number of features greater than 22. The F1-score parameter indicated that the models performed better in classifying samples belonging to the 'Impact' and 'Harmless' classes, which were more difficult to detect in almost all the tests. </div><br>