USING ANOMALY DETECTION


In [1]:
import re
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Read the URL dataset from disk; every feature below is derived from its 'url' column
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')
urls = dataset['url']

# --- Domain-based features ---
# NOTE(review): despite the name, this is the number of non-overlapping matches of
# "a slash followed by any character" ('.' is an unescaped wildcard), not a length.
dataset['domain_length'] = urls.apply(lambda url: len(re.findall('/.', url)))
# Number of dots in the whole URL, minus one
dataset['subdomains'] = urls.apply(lambda url: url.count('.') - 1)
# NOTE(review): re.match is anchored at the start of the string, so this flag is 1
# only when the URL *begins* with a dotted-quad; "http://1.2.3.4/" yields 0.
dataset['has_ip'] = urls.apply(
    lambda url: int(re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url) is not None)
)

# --- Path-based features ---
# NOTE(review): both columns below are the number of '/' characters, so they are identical.
dataset['path_length'] = urls.apply(lambda url: len(re.findall('/', url)))
dataset['path_depth'] = urls.apply(lambda url: url.count('/'))

# --- Character-based features ---
# The string is iterated character by character, separators included.
# NOTE(review): '/' appears nine times in it, so every slash in a URL is counted nine times.
special_characters = '@/%/$/=/./_/?/:/~/'
dataset['special_chars_count'] = urls.apply(
    lambda url: sum(map(url.count, special_characters))
)

# Stack the six engineered columns into the feature matrix, one row per URL
feature_columns = [
    'domain_length',
    'subdomains',
    'has_ip',
    'path_length',
    'path_depth',
    'special_chars_count',
]
X = dataset[feature_columns].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test = train_test_split(X, random_state=42, test_size=0.2)

# Build the Isolation Forest (contamination=0.1) and fit it on the training rows only
isolation_forest = IsolationForest(random_state=42, contamination=0.1).fit(X_train)

# Label every held-out row: -1 marks an anomaly, 1 an inlier
y_pred = isolation_forest.predict(X_test)

# Report how many held-out rows were flagged as anomalies
anomaly_count = int((y_pred == -1).sum())
print("Number of anomalies detected:", anomaly_count)



Number of anomalies detected: 12617


1. The `extract_features` function preprocesses the new URL and extracts the same features used in training the model.
2. The features extracted from the new URL are converted into a NumPy array of shape (1, n_features): a single row with the same number of columns as the training data.
3. The Isolation Forest model predicts whether the new URL is an anomaly (-1) or not (1).
4. Based on the prediction, it prints whether the URL is detected as an anomaly or not.

In [3]:
import re
import numpy as np

# Function to preprocess and extract features from a URL
def extract_features(url):
    """Build the six-element feature list for a single URL.

    The computations are the same ones applied column-by-column to the
    dataset's 'url' field, so the result lines up with the feature matrix.

    Parameters
    ----------
    url : str
        The URL to featurise.

    Returns
    -------
    list of int
        ``[domain_length, subdomains, has_ip, path_length, path_depth,
        special_chars_count]``, in that order.
    """
    # Iterated character by character, separators included.
    # NOTE(review): '/' occurs nine times here, so each slash in `url` counts nine times.
    special_characters = '@/%/$/=/./_/?/:/~/'

    # NOTE(review): re.match only matches at the start of the string, so an IP
    # that follows a scheme such as "http://" is not detected.
    ip_at_start = re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url)

    features = [
        # domain_length -- NOTE(review): number of "slash + any character"
        # matches, not an actual length.
        len(re.findall('/.', url)),
        # subdomains: dots in the whole URL, minus one
        url.count('.') - 1,
        # has_ip
        0 if ip_at_start is None else 1,
        # path_length and path_depth: both are the number of '/' characters
        len(re.findall('/', url)),
        url.count('/'),
    ]

    special_chars_count = 0
    for char in special_characters:
        special_chars_count += url.count(char)
    features.append(special_chars_count)

    return features

# URL to test: a redirect-style link ("?u=" wraps a second https URL) whose
# "fbclid=" query value is a long run of percent-encoded bytes.
new_url = "https://l.wl.co/l?u=https://t.co/P5vuMF1nNo?fbclid=SavannahThomas_%F0%9D%95%80_%F0%9D%94%B8%F0%9D%95%84_%F0%9D%94%B8_%F0%9D%95%8A%F0%9D%95%80%E2%84%95%F0%9D%94%BE%F0%9D%95%83%F0%9D%94%BC_%F0%9D%95%8E%F0%9D%95%86%F0%9D%95%84%F0%9D%94%B8%E2%84%95_%F0%9D%95%83%F0%9D%95%86%F0%9D%95%86%F0%9D%95%82%F0%9D%95%80%E2%84%95%F0%9D%94%BE_%F0%9D%94%BD%F0%9D%95%86%E2%84%9D_%F0%9D%94%B8_%F0%9D%95%84%F0%9D%95%80%F0%9D%94%BE%E2%84%8D%F0%9D%95%8B%F0%9D%95%90_%F0%9D%95%84%F0%9D%94%B8%E2%84%95_%F0%9D%95%80%E2%84%95_%F0%9D%95%84%F0%9D%95%90_%F0%9D%94%B9%F0%9D%94%BC%F0%9D%94%BB%E2%84%9D%F0%9D%95%86%F0%9D%95%86%F0%9D%95%84_%F0%9D%95%80_%F0%9D%95%8E%F0%9D%94%B8%F0%9D%95%80%F0%9D%95%8B_%F0%9D%94%BD%F0%9D%95%86%E2%84%9D_%F0%9D%95%90%F0%9D%95%86%F0%9D%95%8C_%F0%9D%95%80%E2%84%95_%F0%9D%94%B8_%E2%84%82%F0%9D%95%83%F0%9D%95%86%F0%9D%95%8A%F0%9D%94%BC%F0%9D%94%BB_%E2%84%9D%F0%9D%95%86%F0%9D%95%86%F0%9D%95%84"

# Extract features from the new URL (a flat list of six numbers)
new_url_features = extract_features(new_url)

# The model expects a 2-D array with one row per sample, so reshape the flat
# feature list into a single row: shape (1, n_features)
new_url_features_array = np.array(new_url_features).reshape(1, -1)

# Predict anomaly: the result is an array holding one label per input row
# (-1 = anomaly, 1 = inlier); here it has exactly one element
is_anomaly = isolation_forest.predict(new_url_features_array)

# Index the single prediction explicitly instead of relying on the truth
# value of a one-element array, which breaks as soon as more rows are passed
if is_anomaly[0] == -1:
    print("The URL is detected as an anomaly.")
else:
    print("The URL is not detected as an anomaly.")


The URL is detected as an anomaly.


Using Plotly

In [1]:
import re
import pandas as pd
from sklearn.ensemble import IsolationForest
import plotly.graph_objs as go
import plotly.offline as py

# Read the URL dataset from disk; every feature below is derived from its 'url' column
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')
urls = dataset['url']

# FEATURE EXTRACTION
# --- Domain-based features ---
# NOTE(review): despite the name, this is the number of non-overlapping matches of
# "a slash followed by any character" ('.' is an unescaped wildcard), not a length.
dataset['domain_length'] = urls.apply(lambda url: len(re.findall('/.', url)))
# Number of dots in the whole URL, minus one
dataset['subdomains'] = urls.apply(lambda url: url.count('.') - 1)
# NOTE(review): re.match is anchored at the start of the string, so this flag is 1
# only when the URL *begins* with a dotted-quad; "http://1.2.3.4/" yields 0.
dataset['has_ip'] = urls.apply(
    lambda url: int(re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url) is not None)
)

# --- Path-based features ---
# NOTE(review): both columns below are the number of '/' characters, so they are identical.
dataset['path_length'] = urls.apply(lambda url: len(re.findall('/', url)))
dataset['path_depth'] = urls.apply(lambda url: url.count('/'))

# --- Character-based features ---
# The string is iterated character by character, separators included.
# NOTE(review): '/' appears nine times in it, so every slash in a URL is counted nine times.
special_characters = '@/%/$/=/./_/?/:/~/'
dataset['special_chars_count'] = urls.apply(
    lambda url: sum(map(url.count, special_characters))
)

# Combine features: the six engineered columns become one NumPy array X, one row per URL
feature_columns = [
    'domain_length',
    'subdomains',
    'has_ip',
    'path_length',
    'path_depth',
    'special_chars_count',
]
X = dataset[feature_columns].values

# Build the Isolation Forest (contamination=0.1) and fit it on every row of X
isolation_forest = IsolationForest(random_state=42, contamination=0.1).fit(X)

# Label the same rows the model was fitted on: -1 marks an anomaly, 1 an inlier
y_pred = isolation_forest.predict(X)

# Boolean mask over the rows of X: True for anomalies, False for inliers
anomalies = (y_pred == -1)

# Split the rows of X by predicted label (there is no train/test split in this
# cell: the model was fitted and evaluated on the full matrix X)
X_train_anomalies = X[anomalies]
X_inliers = X[~anomalies]

# Scatter of the inliers only. Plotting every row of X under the name
# 'Inliers' would draw the anomalies twice and mislabel them in the legend.
# Column 0 of X is 'domain_length' and column 1 is 'subdomains'.
scatter = go.Scatter(
    x=X_inliers[:, 0],
    y=X_inliers[:, 1],
    mode='markers',
    marker=dict(color='blue', opacity=0.5),
    name='Inliers',
)

# Scatter of the rows flagged as anomalies, on the same two columns
anomaly_scatter = go.Scatter(
    x=X_train_anomalies[:, 0],
    y=X_train_anomalies[:, 1],
    mode='markers',
    marker=dict(color='red', opacity=0.5),
    name='Anomalies',
)

# Layout settings ('Feature 1' = column 0 of X, 'Feature 2' = column 1 of X)
layout = go.Layout(
    title='Anomaly Detection with Isolation Forest',
    xaxis=dict(title='Feature 1'),
    yaxis=dict(title='Feature 2'),
)

# Create figure; the anomaly trace is listed last so it is drawn on top
fig = go.Figure(data=[scatter, anomaly_scatter], layout=layout)

# Save the plot as an HTML file (the call returns the output filename)
py.plot(fig, filename='anomaly_detection_plot.html')


'anomaly_detection_plot.html'