# Anomaly detection - HTTP requests - CSIC dataset 2010

## Task
The primary objective is to develop a classifier capable of identifying malicious HTTP requests by training on normal traffic data and evaluating both normal and anomalous test data. The dataset is structured as follows:

 * Normal Traffic (Train)
 * Normal Traffic (Test)
 * Anomalous Traffic (Test)

Although the dataset is designed for unsupervised learning, supervised learning techniques can be applied by combining normal and anomalous data into a labeled dataset. This allows for direct classification using any preferred machine learning model.

## Dataset
The dataset contains the generated traffic targeted to an e-commerce web
application. It is an automatically generated dataset that contains 36,000 normal
requests and more than 25,000 anomalous requests (i.e., web attacks).

In [1]:
# Core libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import math

# Feature importance and explainability
import eli5
from eli5.sklearn import PermutationImportance

# URL processing
from urllib.parse import urlparse

# Scikit-learn preprocessing and model evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score, accuracy_score, recall_score, f1_score, 
    roc_auc_score, mean_absolute_error, confusion_matrix, 
    classification_report
)

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Model selection and hyperparameter tuning
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Deep Neural Networks (DNN) with TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Explainability tools
from sklearn.inspection import PartialDependenceDisplay



2024-08-21 00:12:56.091222: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import os

# List all files under the specified directory (in this case, '/csic_dataset')
for dirname, _, filenames in os.walk('csic_dataset/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Confirmation that the operation has completed
print('Done!')

csic_dataset/normalTrafficTraining.txt.gz
csic_dataset/anomalousTrafficTest.txt
csic_dataset/.DS_Store
csic_dataset/normalTrafficTest.txt.gz
csic_dataset/normalTrafficTraining.txt
csic_dataset/anomalousTrafficTest.txt.gz
csic_dataset/normalTrafficTest.txt
Done!


## Data Import

### Preprocessing the data - Data prep

Since the files are in .txt format, we would need to process them in order to get something useful like a table or a csv file.

In [1]:
import os
import re
import pandas as pd

# Function to parse HTTP requests from a text block
def parse_http_request(text):
    # Split by double newline to separate headers and body
    parts = text.strip().split('\n\n')
    
    # Initialize dictionary for storing parsed data
    request_data = {}

    # Extract the request line (first line)
    request_line = parts[0].splitlines()[0]
    method, url, http_version = request_line.split(' ', 2)
    request_data['Method'] = method
    request_data['URL'] = url
    request_data['HTTP_Version'] = http_version

    # Extract headers (remaining lines in the first part)
    headers = parts[0].splitlines()[1:]
    for header in headers:
        key, value = header.split(': ', 1)
        request_data[key] = value

    # Extract the body if it exists
    if len(parts) > 1:
        request_data['Body'] = parts[1]
    else:
        request_data['Body'] = ''

    return request_data




In [7]:
# Folder containing the .txt files
folder_path = 'csic_dataset/'

# List to store all parsed request data
all_requests = []

# Iterate through all .txt files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # Determine Train_Test and Normal_Anom from the filename
        if 'Training' in filename:
            train_test = 'Training'
        elif 'Test' in filename:
            train_test = 'Test'
        else:
            train_test = 'Unknown'  # In case the filename doesn't match

        if 'normal' in filename:
            normal_anom = 'Normal'
        elif 'anomalous' in filename:
            normal_anom = 'Anomalous'
        else:
            normal_anom = 'Unknown'  # In case the filename doesn't match

        with open(os.path.join(folder_path, filename), 'r') as file:
            text = file.read()

            # Split the text into individual HTTP requests (based on 'GET' or 'POST')
            requests = re.split(r'\n(?=GET|POST)', text)
            for request in requests:
                parsed_data = parse_http_request(request)
                parsed_data['Train_Test'] = train_test
                parsed_data['Normal_Anom'] = normal_anom
                all_requests.append(parsed_data)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_requests)

# Save the DataFrame as a CSV file
df.to_csv('http_requests_all.csv', index=False)


# Now, since the assignment specified that the model should be trained only on the normalTrafficTraining.txt file, we'll split it by train and test
df_train = df[df.Train_Test == 'Training']
df_train.to_csv('http_requests_train.csv', index=False)

df_test = df[(df.Train_Test == 'Test')]
df_test.to_csv('http_requests_test.csv', index=False)
#df[(df.Train_Test == 'Test') & (df.Normal_Anom == 'Anomalous')].to_csv('http_requests_test_anom.csv', index=False)

print(f"Data has been saved")

Data has been saved


In [6]:
df.head()

Unnamed: 0,Method,URL,HTTP_Version,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,Body,Train_Test,Normal_Anom,Content-Type,Content-Length
0,GET,http://localhost:8080/tienda1/publico/anadir.j...,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=B92A8B48B9008CD29F622A994E0F650D,close,,Test,Anomalous,,
1,POST,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=AE29AEEBDE479D5E1A18B4108C8E3CE0,close,id=2&nombre=Jam%F3n+Ib%E9rico&precio=85&cantid...,Test,Anomalous,application/x-www-form-urlencoded,146.0
2,GET,http://localhost:8080/tienda1/publico/anadir.j...,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=F563B5262843F12ECAE41815ABDEEA54,close,,Test,Anomalous,,
3,POST,http://localhost:8080/tienda1/publico/anadir.jsp,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=3B654D6DF7F1466EE80D7F756B00E5D1,close,id=2%2F&nombre=Jam%F3n+Ib%E9rico&precio=85&can...,Test,Anomalous,application/x-www-form-urlencoded,77.0
4,GET,http://localhost:8080/asf-logo-wide.gif~,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=51A7470173188BBB993947F2283059E4,close,,Test,Anomalous,,


In [12]:
df.Normal_Anom.unique()

array(['Anomalous', 'Normal'], dtype=object)

In [13]:
df.Train_Test.unique()

array(['Test', 'Training'], dtype=object)

Now, since the assignment specified that the model should be trained only on the normalTrafficTraining.txt file, we will first try only with this.

In [8]:
n_features=df.shape[1]
n_samples=df.shape[0]

print("Number of samples:", n_samples)
print("Number of features:", n_features)

Number of samples: 96668
Number of features: 18


In [10]:
df.tail()

Unnamed: 0,Method,URL,HTTP_Version,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,Body,Train_Test,Normal_Anom,Content-Type,Content-Length
96663,GET,http://localhost:8080/tienda1/imagenes/2.gif,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=E1E16AC490F40B8484CD75E2DBE32075,close,,Test,Normal,,
96664,GET,http://localhost:8080/tienda1/imagenes/3.gif,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=4567793E184E0925234DADCEECD6999A,close,,Test,Normal,,
96665,GET,http://localhost:8080/tienda1/imagenes/cmenbul...,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=487FD70FECB4D14155C95F38C389DA0D,close,,Test,Normal,,
96666,GET,http://localhost:8080/tienda1/imagenes/logo.gif,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=6E0F5F3BC982DFC73B39EAD495ADCE96,close,,Test,Normal,,
96667,GET,http://localhost:8080/tienda1/imagenes/nuestra...,HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=A70DD1BA160B294CB5E1C2D8FAE7C09F,close,,Test,Normal,,


In [13]:
df.columns

Index(['Method', 'URL', 'HTTP_Version', 'User-Agent', 'Pragma',
       'Cache-control', 'Accept', 'Accept-Encoding', 'Accept-Charset',
       'Accept-Language', 'Host', 'Cookie', 'Connection', 'Body', 'Train_Test',
       'Normal_Anom', 'Content-Type', 'Content-Length'],
      dtype='object')

# **Data Visualization**

In [None]:
sns.set_style('darkgrid')
sns.countplot(data=csic_data, x='Unnamed: 0')

Dropping samples with at least 1 NaN value will make to lose all the other Request Methods besides POST, this option is discarded since dropping data is not usually a good choice

In [None]:
csic_data.head()

Visualizing URL format

In [None]:
feature_names=[ 'Unnamed: 0','Method', 'User-Agent', 'Pragma', 'Cache-Control',
       'Accept', 'Accept-encoding', 'Accept-charset', 'language', 'host',
       'cookie', 'content-type', 'connection', 'lenght', 'content','classification',
        'URL']

X=csic_data[feature_names]
print(X)

# **Removing not discriminatory features**

**Enumerating unique values for each feature**

In [None]:
# Removing not discriminatory Features and making some adjustments on feature names
X = X.rename(columns={'Unnamed: 0': 'Class'})
X = X.rename(columns={'lenght': 'content_length'})


feature_names=[ 'Class','Method','host','cookie','Accept', 'content_length', 'content','classification','URL']

# Print the remaining data
X = X[feature_names]
print(X)

In [None]:
y=X.Class
print(y)


In [None]:
size=X.shape[1]
# Get list of categorical variables
s = (X.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Load Models

# **Pre-processing on the feature: Content Length**

In [None]:
print(X.content_length)

Operations on the 'content_lenght' feature

In [None]:
#replace NaN values with 0
#removing the 'Content-Lenght' string and keeping only the numerical value

X['content_length'] = X['content_length'].astype(str)
X['content_length'] = X['content_length'].str.extract(r'(\d+)')
X['content_length'] = pd.to_numeric(X['content_length'], errors='coerce').fillna(0)
print(X.content_length)


GET methods have the content_length set to 0 since they where all NaN (this method does not have to provide content)

In [None]:
filtered_length = X.loc[X['Method'] == 'GET', 'content_length']
print(filtered_length)


# URL PRE-PROCESSING

In [None]:
url_counts = X['URL'].value_counts()
most_common_urls = url_counts.head(10)  # Extract the top 10 most common strings

print("Most common URLs:")
for i, (url, count) in enumerate(most_common_urls.items(), 1):
    print(f"{i}. URL: {url} - Count: {count}")


**Utils for URL/Content pre-processing**

In [None]:
def count_dot(url):
    count_dot = url.count('.')
    return count_dot


def no_of_dir(url):
    urldir = urlparse(url).path
    return urldir.count('/')

def no_of_embed(url):
    urldir = urlparse(url).path
    return urldir.count('//')

def shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if match:
        return 1
    else:
        return 0


def count_http(url):
    return url.count('http')

def count_per(url):
    return url.count('%')

def count_ques(url):
    return url.count('?')

def count_hyphen(url):
    return url.count('-')


def count_equal(url):
    return url.count('=')


def url_length(url):
    return len(str(url))

#Hostname Length

def hostname_length(url):
    return len(urlparse(url).netloc)


import re

def suspicious_words(url):
    score_map = {
        'error': 30,
        'errorMsg': 30,
        'id': 10,
        'errorID': 30,
        'SELECT': 50,
        'FROM': 50,
        'WHERE': 50,
        'DELETE': 50,
        'USERS': 50,
        'DROP': 50,
        'CREATE': 50,
        'INJECTED': 50,
        'TABLE': 50,
        'alert': 30,
        'javascript': 20,
        'cookie': 25,
        '--': 30,
        '.exe': 30,
        '.php': 20,
        '.js': 10,
        'admin': 10,
        'administrator': 10,
        '\'': 30,
        'password': 15,
        'login': 15,
        'incorrect': 20,
        'pwd': 15,
        'tamper': 25,
        'vaciar': 20,
        'carrito': 25,
        'wait': 30,
        'delay': 35,
        'set': 20,
        'steal': 35,
        'hacker': 35,
        'proxy': 35,
        'location': 30,
        'document.cookie': 40,
        'document': 20,
        'set-cookie': 40,
        'create': 40,
        'cmd': 40,
        'dir': 30,
        'shell': 40,
        'reverse': 30,
        'bin': 20,
        'cookiesteal': 40,
        'LIKE': 30,
        'UNION': 35,
        'include': 30,
        'file': 20,
        'tmp': 25,
        'ssh': 40,
        'exec': 30,
        'cat': 25,
        'etc': 30,
        'fetch': 25,
        'eval': 30,
        'wait': 30,
        'malware': 45,
        'ransomware': 45,
        'phishing': 45,
        'exploit': 45,
        'virus': 45,
        'trojan': 45,
        'backdoor': 45,
        'spyware': 45,
        'rootkit': 45,
        'credential': 30,
        'inject': 30,
        'script': 25,
        'iframe': 25,
        'src=': 25,
        'onerror': 30,
        'prompt': 20,
        'confirm': 20,
        'eval': 25,
        'expression': 30,
        'function\(': 20,
        'xmlhttprequest': 30,
        'xhr': 20,
        'window.': 20,
        'document.': 20,
        'cookie': 25,
        'click': 15,
        'mouseover': 15,
        'onload': 20,
        'onunload': 20,
    }

    matches = re.findall(r'(?i)' + '|'.join(score_map.keys()), url)

    total_score = sum(score_map.get(match.lower(), 0) for match in matches)
    return total_score


def digit_count(url):
    digits = 0
    for i in url:
        if i.isnumeric():
            digits = digits + 1
    return digits

def letter_count(url):
    letters = 0
    for i in url:
        if i.isalpha():
            letters += 1
    return letters

def count_special_characters(url):
    special_characters = re.sub(r'[a-zA-Z0-9\s]', '', url)
    count = len(special_characters)
    return count


# Number of Parameters in URL
def number_of_parameters(url):
    params = urlparse(url).query
    return 0 if params == '' else len(params.split('&'))

# Number of Fragments in URL
def number_of_fragments(url):
    frags = urlparse(url).fragment
    return len(frags.split('#')) - 1 if frags == '' else 0

# URL is Encoded
def is_encoded(url):
    return int('%' in url.lower())


def unusual_character_ratio(url):
    total_characters = len(url)
    unusual_characters = re.sub(r'[a-zA-Z0-9\s\-._]', '', url)
    unusual_count = len(unusual_characters)
    ratio = unusual_count / total_characters if total_characters > 0 else 0
    return ratio



In [None]:
X['count_dot_url'] = X['URL'].apply(count_dot)
X['count_dir_url'] = X['URL'].apply(no_of_dir)
X['count_embed_domain_url'] = X['URL'].apply(no_of_embed)
X['short_url'] = X['URL'].apply(shortening_service)
X['count-http'] = X['URL'].apply(count_http)
X['count%_url'] = X['URL'].apply(count_per)
X['count?_url'] = X['URL'].apply(count_ques)
X['count-_url'] = X['URL'].apply(count_hyphen)
X['count=_url'] = X['URL'].apply(count_equal)
X['hostname_length_url'] = X['URL'].apply(hostname_length)
X['sus_url'] = X['URL'].apply(suspicious_words)
X['count-digits_url'] = X['URL'].apply(digit_count)
X['count-letters_url'] = X['URL'].apply(letter_count)
X['url_length'] = X['URL'].apply(url_length)
X['number_of_parameters_url'] = X['URL'].apply(number_of_parameters)
X['number_of_fragments_url'] = X['URL'].apply(number_of_fragments)
X['is_encoded_url'] = X['URL'].apply(is_encoded)
X['special_count_url'] = X['URL'].apply(count_special_characters)
X['unusual_character_ratio_url'] = X['URL'].apply(unusual_character_ratio)



In [None]:
# Select the features and class variable for plotting
new_features = ['count_dot_url', 'count_dir_url', 'count_embed_domain_url', 'count-http',
                'count%_url', 'count?_url', 'count-_url', 'count=_url', 'url_length', 'hostname_length_url',
                'sus_url', 'count-digits_url', 'count-letters_url', 'number_of_parameters_url',
                'number_of_fragments_url', 'is_encoded_url','special_count_url','unusual_character_ratio_url']

# Create a DataFrame with the selected features
set = X[new_features]

for new_feature in X.columns:
    if new_feature in X.columns:
        unique_count = X[new_feature].nunique()
        print(f"Number of unique values for {new_feature}: {unique_count}")
    else:
        print(f"Column '{new_feature}' does not exist in the DataFrame.")



# Removing Cookies as feature
 **cookies are unique for each sample, this feature cannot be used as discriminant**

In [None]:
unique_count = X['cookie'].nunique()
print(f"Count of unique values in 'cookie': {unique_count}")


# Encoding categorical features

In [None]:
X['Accept'] = X['Accept'].astype(str)
X['Accept'] = X['Accept'].str.extract(r'(\d+)')
X['Accept'] = pd.to_numeric(X['Accept'], errors='coerce').fillna(1)

In [None]:
lb_make = LabelEncoder()
X["Method_enc"] = lb_make.fit_transform(X["Method"])
X["host_enc"] =lb_make.fit_transform(X["host"])
X["Accept_enc"] =lb_make.fit_transform(X["Accept"])


In [None]:
unique_count_met = X["Method_enc"].nunique()
unique_count_host = X["host_enc"].nunique()
unique_count_acc = X["Accept_enc"].nunique()


print(f"Number of unique values for 'Method_enc': {unique_count_met}")
print(f"Number of unique values for 'host_enc': {unique_count_host}")
print(f"Number of unique values for 'Accept_enc': {unique_count_acc}")





In [None]:
X.head()



In [None]:
X.tail()

In [None]:
def apply_to_content(content,function):
    if pd.isna(content):
        return 0
    elif isinstance(content, str):
        return function(content)

#"""
#                'count_dot_content','count_dir_content','count_embed_domain_content','count%_content','count?_content',
 #               'count-_content','count=_content','hostname_length_content','sus_content','count_digits_content',
  #              'count_letters_content','content_length','number_of_parameters_content','number_of_fragments_content',
   #             'is_encoded_content','special_count_content','unusual_character_ratio_content'
    #            ]"""

X['count_dot_content'] = X['content'].apply(apply_to_content, function=count_dot)
X['count_dir_content'] = X['content'].apply(apply_to_content, function=no_of_dir)
X['count_embed_domain_content'] = X['content'].apply(apply_to_content, function=no_of_embed)
X['count%_content'] = X['content'].apply(apply_to_content, function=count_per)
X['count?_content'] = X['content'].apply(apply_to_content, function=count_ques)
X['count-_content'] = X['content'].apply(apply_to_content, function=count_hyphen)
X['count=_content'] = X['content'].apply(apply_to_content, function=count_equal)
X['content_length'] = X['content'].apply(apply_to_content, function=url_length)
X['sus_content'] = X['content'].apply(apply_to_content, function=suspicious_words)
X['count_digits_content'] = X['content'].apply(apply_to_content, function=digit_count)
X['count_letters_content'] = X['content'].apply(apply_to_content, function=letter_count)
X['special_count_content'] = X['content'].apply(apply_to_content, function=count_special_characters)
X['is_encoded_content'] = X['content'].apply(apply_to_content, function=is_encoded)
#X['unusual_character_ratio_content'] = X['content'].apply(apply_to_content, function=unusual_character_ratio)






In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select the features and class variable for plotting
new_content_features = ['count_dot_content', 'count_dir_content', 'count_embed_domain_content', 'count%_content', 'count?_content',
                        'count-_content', 'count=_content', 'sus_content', 'count_digits_content',
                        'count_letters_content', 'content_length', 'is_encoded_content', 'special_count_content']

# Create a DataFrame with the selected features
selected_features_df = X[new_content_features]

for feature_name in selected_features_df.columns:
    if feature_name in X.columns:
        unique_count = selected_features_df[feature_name].nunique()
        print(f"Number of unique values for {feature_name}: {unique_count}")
    else:
        print(f"Column '{feature_name}' does not exist in the DataFrame.")


In [None]:
X.columns

# **Building the final dataset to use for the classification**

In [None]:
X.columns

In [None]:
labels=['count_dot_url', 'count_dir_url', 'count_embed_domain_url', 'count-http',
                'count%_url', 'count?_url', 'count-_url', 'count=_url', 'url_length', 'hostname_length_url',
                'sus_url', 'count-digits_url', 'count-letters_url', 'number_of_parameters_url',
                'is_encoded_url','special_count_url','unusual_character_ratio_url',
                 #method
                'Method_enc',
                #content
                'count_dot_content','count%_content',
                 'count-_content','count=_content','sus_content','count_digits_content',
                  'count_letters_content','content_length',
               'is_encoded_content','special_count_content']
print(X[labels])


In [None]:
y=X['classification']
print(y)

In [None]:
print('computing...)')
#split dataset in test and train 
x_tr, x_ts, y_tr, y_ts = train_test_split(X[labels], y, test_size=0.3, random_state=0)


print('Done!')


In [None]:
x_tr.head(5)



In [None]:
x_tr.tail(5)

# Classifiers

**RANDOM FOREST**

In [None]:
random_forest_model = RandomForestClassifier(random_state=1000)
print('Computing....')
# Fit the model
random_forest_model.fit(x_tr,y_tr)
print('Done!')

In [None]:
RT_predictions= random_forest_model.predict(x_ts)
print('MAE', mean_absolute_error(y_ts, RT_predictions))
print("Accuracy", accuracy_score(y_ts, RT_predictions))
print("Precision", precision_score(y_ts, RT_predictions, average='weighted', labels=np.unique(RT_predictions)))
print("Recall", recall_score(y_ts, RT_predictions, average='weighted', labels=np.unique(RT_predictions)))
print("F1", f1_score(y_ts, RT_predictions, average='weighted', labels=np.unique(RT_predictions)))
print("ROC AUC", roc_auc_score(y_ts, RT_predictions, average='weighted', labels=np.unique(RT_predictions)))
error_rt = (RT_predictions != y_ts).mean()
print("Test error: {:.1%}".format(error_rt))


In [None]:
print(y_tr.unique())
print(y_tr.name)

In [None]:
x_ts = x_ts.reset_index(drop=True)
y_ts = y_ts.reset_index(drop=True)

for k in range(np.unique(y_ts).size):
    print('mean of class ' + str(k) + ':\n', x_ts[y_ts == k].mean(axis=0))


In [None]:
print(classification_report(y_ts, RT_predictions, target_names = ['Normal (class 0)','Anomalous (class 1)']))

In [None]:


label = ['Normal', 'Anomalous']
cm = confusion_matrix(y_ts, RT_predictions)
cm = pd.DataFrame(cm, index=['0', '1'], columns=['0', '1'])

plt.figure(figsize=(10, 10))
sns.heatmap(cm, cmap="Blues", linecolor='black', linewidth=1, annot=True, fmt='', xticklabels=label, yticklabels=label)
plt.title("Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


**K-NEAREST NEIGHBOR**


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

#knn_model = KNeighborsClassifier()

#param_grid = {'n_neighbors': [3, 5, 7, 9,10,11, 13]}

#grid_search = GridSearchCV(knn_model, param_grid, cv=5)
#grid_search.fit(x_tr, y_tr)

#best_n_neighbors = grid_search.best_params_['n_neighbors']
#print("Best n_neighbors:", best_n_neighbors)

#final_model = KNeighborsClassifier(n_neighbors=best_n_neighbors)
#final_model.fit(x_tr, y_tr)

#knn_predictions = final_model.predict(x_ts)

Best n_neighbors: 9


In [None]:
final_model = KNeighborsClassifier(n_neighbors=9)
final_model.fit(x_tr, y_tr)
knn_predictions = final_model.predict(x_ts)

In [None]:
print('MAE', mean_absolute_error(y_ts, knn_predictions))
print("Accuracy", accuracy_score(y_ts, knn_predictions))
print("Precision", precision_score(y_ts, knn_predictions, average='weighted', labels=np.unique(knn_predictions)))
print("Recall", recall_score(y_ts, knn_predictions, average='weighted', labels=np.unique(knn_predictions)))
print("F1", f1_score(y_ts, knn_predictions, average='weighted', labels=np.unique(knn_predictions)))
print("ROC AUC", roc_auc_score(y_ts, knn_predictions, average='weighted', labels=np.unique(knn_predictions)))
error_knn = (knn_predictions != y_ts).mean()
print("Test error: {:.1%}".format(error_knn))

In [None]:
cm = confusion_matrix(y_ts,knn_predictions)
cm = pd.DataFrame(cm , index = ['0','1'] , columns = ['0','1'])
plt.figure(figsize = (10,10))
plt.title("KN Neighbors")
plt.xlabel("Predicted")
plt.ylabel("Actual")
sns.heatmap(cm,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='',xticklabels = label,yticklabels = label)


**DECISION TREE**

In [None]:
DT_model = DecisionTreeClassifier(random_state=2)
print('Computing....')
DT_model.fit(x_tr,y_tr)
print('Done!')

In [None]:
DT_predictions= DT_model.predict(x_ts)
print('MAE', mean_absolute_error(y_ts, DT_predictions))
print("Accuracy", accuracy_score(y_ts, DT_predictions))
print("Precision", precision_score(y_ts, DT_predictions, average='weighted', labels=np.unique(DT_predictions)))
print("Recall", recall_score(y_ts, DT_predictions, average='weighted', labels=np.unique(DT_predictions)))
print("F1", f1_score(y_ts, DT_predictions, average='weighted', labels=np.unique(DT_predictions)))
print("ROC AUC", roc_auc_score(y_ts, DT_predictions, average='weighted', labels=np.unique(DT_predictions)))
error_dt = (DT_predictions != y_ts).mean()
print("Test error: {:.1%}".format(error_dt))

In [None]:
cm = confusion_matrix(y_ts,DT_predictions)
cm = pd.DataFrame(cm , index = ['0','1'] , columns = ['0','1'])
plt.title("Decision Tree")
plt.xlabel("Predicted")
plt.ylabel("Actual")
sns.heatmap(cm,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='',xticklabels = label,yticklabels = label)

**Logistic Regression**

In [None]:
LR_model = LogisticRegression(random_state = 42, max_iter = 1000)
print('Computing....')
LR_model.fit(x_tr,y_tr)
print('Done!')

In [None]:
LR_predictions= LR_model.predict(x_ts)
print('MAE', mean_absolute_error(y_ts, LR_predictions))
print("Accuracy", accuracy_score(y_ts, LR_predictions))
print("Precision", precision_score(y_ts, LR_predictions, average='weighted', labels=np.unique(LR_predictions)))
print("Recall", recall_score(y_ts, LR_predictions, average='weighted', labels=np.unique(LR_predictions)))
print("F1", f1_score(y_ts, LR_predictions, average='weighted', labels=np.unique(LR_predictions)))
print("ROC AUC", roc_auc_score(y_ts,LR_predictions, average='weighted', labels=np.unique(LR_predictions)))
error_lr = (LR_predictions != y_ts).mean()
print("Test error: {:.1%}".format(error_lr))

In [None]:
cm = confusion_matrix(y_ts,LR_predictions)
cm = pd.DataFrame(cm , index = ['0','1'] , columns = ['0','1'])
plt.title("Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("Actual")
sns.heatmap(cm,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='',xticklabels = label,yticklabels = label)

**Support Vector Machine (SVM)**

In [None]:
SVC_model = SVC()
print('Computing....') 
SVC_model.fit(x_tr,y_tr)
print('Done!')

In [None]:
SVC_predictions= SVC_model.predict(x_ts)
print('MAE', mean_absolute_error(y_ts, SVC_predictions))
print("Accuracy", accuracy_score(y_ts, SVC_predictions))
print("Precision", precision_score(y_ts, SVC_predictions, average='weighted', labels=np.unique(SVC_predictions)))
print("Recall", recall_score(y_ts, SVC_predictions, average='weighted', labels=np.unique(SVC_predictions)))
print("F1", f1_score(y_ts, SVC_predictions, average='weighted', labels=np.unique(SVC_predictions)))
print("ROC AUC", roc_auc_score(y_ts,SVC_predictions, average='weighted', labels=np.unique(SVC_predictions)))
error_svc = (SVC_predictions != y_ts).mean()
print("Test error: {:.1%}".format(error_svc))

In [None]:
cm = confusion_matrix(y_ts,SVC_predictions)
cm = pd.DataFrame(cm , index = ['0','1'] , columns = ['0','1'])
plt.title("SVC")
plt.xlabel("Predicted")
plt.ylabel("Actual")
sns.heatmap(cm,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='',xticklabels = label,yticklabels = label)

**Naïves Bayes**

In [None]:
NB_model = GaussianNB ()
print('Computing....')
NB_model.fit(x_tr,y_tr)
print('Done!')

In [None]:
NB_predictions= NB_model.predict(x_ts)
print('MAE', mean_absolute_error(y_ts, NB_predictions))
print("Accuracy", accuracy_score(y_ts, NB_predictions))
print("Precision", precision_score(y_ts, NB_predictions, average='weighted', labels=np.unique(NB_predictions)))
print("Recall", recall_score(y_ts, NB_predictions, average='weighted', labels=np.unique(NB_predictions)))
print("F1", f1_score(y_ts, NB_predictions, average='weighted', labels=np.unique(NB_predictions)))
print("ROC AUC", roc_auc_score(y_ts,NB_predictions, average='weighted', labels=np.unique(NB_predictions)))
error_nb = (NB_predictions != y_ts).mean()
print("Test error: {:.1%}".format(error_nb))

In [None]:
cm = confusion_matrix(y_ts,NB_predictions)
cm = pd.DataFrame(cm , index = ['0','1'] , columns = ['0','1'])
plt.title("NB")
plt.xlabel("Predicted")
plt.ylabel("Actual")
sns.heatmap(cm,cmap= "Blues", linecolor = 'black' , linewidth = 1 , annot = True, fmt='',xticklabels = label,yticklabels = label)

**Recurrent Neural Network(RNN)** 

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# Assuming x_tr, y_tr, x_ts, y_ts are your training and testing data
# Ensure they are numpy arrays or can be converted to numpy arrays
# Ensure that y_tr and y_ts are properly encoded (binary labels)

# Example label encoding for binary classification
label_encoder = LabelEncoder()
y_tr_encoded = label_encoder.fit_transform(y_tr)
y_ts_encoded = label_encoder.transform(y_ts)

# Reshape x_tr and x_ts if necessary (assuming they are 2D arrays)
# Add this if x_tr and x_ts are 1D arrays: x_tr = x_tr.reshape(-1, 1)

# Initialize RNN model
RNN_model = Sequential()
RNN_model.add(SimpleRNN(50, input_shape=(x_tr.shape[1], 1), activation='relu'))
RNN_model.add(Dense(units=1, activation='sigmoid'))  # Adjust units and activation based on your task

# Compile the model
RNN_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
RNN_model.fit(x_tr, y_tr_encoded, epochs=50, batch_size=32, validation_data=(x_ts, y_ts_encoded))

# Evaluate the model
accuracy = RNN_model.evaluate(x_ts, y_ts_encoded)[1]

print(f'Accuracy: {accuracy}')

**Artificial Neural Network(ANN)**

In [None]:

# Assuming x_tr, y_tr, x_ts, y_ts are your training and testing data
# Ensure they are numpy arrays or can be converted to numpy arrays
# Ensure that y_tr and y_ts are properly encoded (binary labels)

# Example label encoding for binary classification
label_encoder = LabelEncoder()
y_tr_encoded = label_encoder.fit_transform(y_tr)
y_ts_encoded = label_encoder.transform(y_ts)

# Initialize ANN model
ANN_model = Sequential()
ANN_model.add(Dense(50, input_shape=(x_tr.shape[1],), activation='relu'))
ANN_model.add(Dense(units=1, activation='sigmoid'))  # Adjust units and activation based on your task

# Compile the model
ANN_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
ANN_model.fit(x_tr, y_tr_encoded, epochs=30, batch_size=32, validation_data=(x_ts, y_ts_encoded))

# Evaluate the model
accuracy = ANN_model.evaluate(x_ts, y_ts_encoded)[1]

print(f'Accuracy: {accuracy}')


**Convolutional Neural Network(CNN)**

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import Adam

# Assuming x_tr, y_tr, x_ts, y_ts are your training and testing data DataFrames
# Convert DataFrames to numpy arrays
x_tr = x_tr.to_numpy()
x_ts = x_ts.to_numpy()

# Example label encoding for binary classification
label_encoder = LabelEncoder()
y_tr_encoded = label_encoder.fit_transform(y_tr)
y_ts_encoded = label_encoder.transform(y_ts)

# Reshape x_tr and x_ts to match the input shape expected by the model
x_tr = x_tr.reshape(x_tr.shape[0], -1)  # Flattens the input to a 1D array
x_ts = x_ts.reshape(x_ts.shape[0], -1)  # Flattens the input to a 1D array

# Initialize the model
model = Sequential()
model.add(Flatten(input_shape=(x_tr.shape[1],)))  # Flattens the input
model.add(Dense(64, activation='relu'))  # Add a dense layer with 64 neurons and ReLU activation
model.add(Dense(32, activation='relu'))  # Add another dense layer with 32 neurons and ReLU activation
model.add(Dense(units=1, activation='sigmoid'))  # Output layer with sigmoid activation for binary classification

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_tr, y_tr_encoded, epochs=30, batch_size=32, validation_data=(x_ts, y_ts_encoded))

# Evaluate the model
accuracy = model.evaluate(x_ts, y_ts_encoded)[1]

print(f'Accuracy: {accuracy}')


**Long Short-Term Memory(LSTM)**


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Assuming x_tr, y_tr, x_ts, y_ts are your training and testing data
# Ensure they are numpy arrays or can be converted to numpy arrays
# Ensure that y_tr and y_ts are properly encoded (binary labels)

# Example label encoding for binary classification
label_encoder = LabelEncoder()
y_tr_encoded = label_encoder.fit_transform(y_tr)
y_ts_encoded = label_encoder.transform(y_ts)

# Reshape x_tr and x_ts if necessary (assuming they are 2D arrays)
# Add this if x_tr and x_ts are 1D arrays: x_tr = x_tr.reshape(-1, 1)

# Initialize LSTM model
LSTM_model = Sequential()
LSTM_model.add(LSTM(50, input_shape=(x_tr.shape[1], 1), activation='relu'))
LSTM_model.add(Dense(units=1, activation='sigmoid'))  # Adjust units and activation based on your task

# Compile the model
LSTM_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
LSTM_model.fit(x_tr, y_tr_encoded, epochs=25, batch_size=32, validation_data=(x_ts, y_ts_encoded))

# Evaluate the model
accuracy = LSTM_model.evaluate(x_ts, y_ts_encoded)[1]

print(f'Accuracy: {accuracy}')


**RANKING THE TRAINED MODELS ON THE MAE VALUE**

In [None]:
from sklearn.metrics import accuracy_score

model_preds = [RT_predictions, knn_predictions, DT_predictions, LR_predictions, SVC_predictions, NB_predictions]
model_names = ['Random Forest', 'K-Nearest Neighbors', 'Decision Tree', 'Logistic Regression', 'SVC', 'NB']

# Function for comparing different models
def score_model(model_preds, y_ts):
    return accuracy_score(y_ts, model_preds)

# Calculate Accuracy for each model
acc_score = []
for i in range(len(model_names)):
    acc = score_model(model_preds[i], y_ts)
    acc_score.append((model_names[i], acc))

acc_scores_sorted = sorted(acc_score, key=lambda x: x[1], reverse= True)
target_range = y_ts.max() - y_ts.min()

# Print ranked model names, MAE scores, and error percentages
# for i, (model_name, acc) in enumerate(acc_scores_sorted):
#     error_percent = (mae / target_range) * 100  # Calculate error percentage
#     print("Rank %d: %s - Accuracy: %.4f - : %.2f%%" % (i+1, model_name, acc, error_percent))
print("Done!")

In [None]:
model_preds = [RT_predictions, knn_predictions, DT_predictions, LR_predictions, SVC_predictions, NB_predictions]
model_names = ['Random Forest', 'K-Nearest Neighbors', 'Decision Tree', 'Logistic Regression','SVC', 'NB']

# Calculate Accuracy for each model
acc_score = []
for i in range(len(model_names)):
    acc = score_model(model_preds[i], y_ts)
    acc_score.append((model_names[i], acc))

acc_scores_sorted = sorted(acc_score, key=lambda x: x[1], reverse= True)
target_range = y_ts.max() - y_ts.min()

# Print ranked model names, Accuracy scores, and Accuracy percentages
for i, (model_name, acc) in enumerate(acc_scores_sorted):
    error_percent = (acc / target_range) * 100  # Calculate error percentage
    print("Rank %d: %s - ACC: %.4f - Accuracy: %.2f%%" % (i+1, model_name, acc, error_percent))