USING ANOMALY DETECTION


In [None]:
import re
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Loading the dataset
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')

# Extracting domain-based features
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall('/.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
dataset['path_length'] = dataset['url'].apply(lambda x: len(re.findall('/', x)))
dataset['path_depth'] = dataset['url'].apply(lambda x: x.count('/'))

# Extracting character-based features
special_characters = '@/%/$/=/./_/?/:/~/'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Combine features
X = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values

# Splitting the dataset into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Initialize the Isolation Forest model
isolation_forest = IsolationForest(contamination=0.1, random_state=42)

# Fit the model
isolation_forest.fit(X_train)

# Predictions on the testing set
y_pred = isolation_forest.predict(X_test)

# Evaluation
print("Number of anomalies detected:", len(y_pred[y_pred == -1]))





Number of anomalies detected: 12617


1. The (extract_features function) preprocesses the new URL and extracts the same features used in training the model.
2. The features extracted from the new URL are transformed into a numpy array with the same shape as the training data.
3. The Isolation Forest model predicts whether the new URL is an anomaly (-1) or not (1).
4. Based on the prediction, it prints whether the URL is detected as an anomaly or not.

In [None]:
import re
import numpy as np

# Function to preprocess and extract features from a URL
def extract_features(url):
    domain_length = len(re.findall('/.', url))
    subdomains = url.count('.') - 1
    has_ip = 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url) else 0
    path_length = len(re.findall('/', url))
    path_depth = url.count('/')
    special_characters = '@/%/$/=/./_/?/:/~/'
    special_chars_count = sum(url.count(char) for char in special_characters)
    
    return [domain_length, subdomains, has_ip, path_length, path_depth, special_chars_count]

# URL to test
new_url = "https://certificate.doenets.lk/"

# Extract features from the new URL
new_url_features = extract_features(new_url)

# Transform features into the same format as the training data
new_url_features_array = np.array(new_url_features).reshape(1, -1)
#new commnet
# Predict anomaly
is_anomaly = isolation_forest.predict(new_url_features_array)

if is_anomaly == -1:
    print("The URL is detected as an anomaly.")
else:
    print("The URL is not detected as an anomaly.")


The URL is not detected as an anomaly.


Using Plotly

In [None]:
import re
import pandas as pd
from sklearn.ensemble import IsolationForest
import plotly.graph_objs as go
import plotly.offline as py

# Loading the dataset
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')

#FEATURE EXTRACTION
# Extracting domain-based features
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall('/.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
dataset['path_length'] = dataset['url'].apply(lambda x: len(re.findall('/', x)))
dataset['path_depth'] = dataset['url'].apply(lambda x: x.count('/'))

# Extracting character-based features
special_characters = '@/%/$/=/./_/?/:/~/'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Combine features (Combines all the extracted features into a single NumPy array X)
X = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count']].values

# Initialize the Isolation Forest model
isolation_forest = IsolationForest(contamination=0.1, random_state=42)

# Fit the model
isolation_forest.fit(X)

# Predictions on the dataset
y_pred = isolation_forest.predict(X)

# Convert predictions to a boolean array indicating anomalies (True) and inliers (False)
anomalies = y_pred == -1

# Filter X_train based on anomalies
X_train_anomalies = X[anomalies]

# Create a scatter plot of the dataset
scatter = go.Scatter(x=X[:, 0], y=X[:, 1], mode='markers', 
                     marker=dict(color='blue', opacity=0.5), 
                     name='Inliers')

# Create a scatter plot of anomalies
anomaly_scatter = go.Scatter(x=X_train_anomalies[:, 0], y=X_train_anomalies[:, 1], mode='markers', 
                             marker=dict(color='red', opacity=0.5), 
                             name='Anomalies')

# Layout settings
layout = go.Layout(title='Anomaly Detection with Isolation Forest', 
                   xaxis=dict(title='Feature 1'), yaxis=dict(title='Feature 2'))

# Create figure
fig = go.Figure(data=[scatter, anomaly_scatter], layout=layout)

# Save the plot as HTML file
py.plot(fig, filename='anomaly_detection_plot.html')


'anomaly_detection_plot.html'

ADDING MORE PARAMETERS TO THE EXISTIN CODE

CODE #1

In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Loading the dataset
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')

# Extracting domain-based features
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall(r'\.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
dataset['path_length'] = dataset['url'].apply(lambda x: len(re.findall('/', x)))
dataset['path_depth'] = dataset['url'].apply(lambda x: x.count('/'))

# Extracting character-based features
special_characters = '@/%/$/=/./_/?/:/~'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Additional features
# Extracting domain reputation (0 for unknown, 1 for reputable, -1 for suspicious/malicious)
# Assume a simple rule for demonstration: domains containing 'phish' or 'malware' are marked as suspicious
dataset['domain_reputation'] = dataset['url'].apply(lambda x: -1 if re.search(r'phish|malware', x, flags=re.IGNORECASE) else 1 if '.' in x else 0)

# Length of top-level domain (TLD)
dataset['tld_length'] = dataset['url'].apply(lambda x: len(re.findall(r'\.([a-zA-Z]+)$', x)[0]) if len(re.findall(r'\.([a-zA-Z]+)$', x)) > 0 else 0)

# Presence of HTTP Redirects
dataset['has_redirects'] = dataset['url'].apply(lambda x: 1 if '//' in x else 0)

# Combine features
X = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count', 'domain_reputation', 'tld_length', 'has_redirects']].values

# Splitting the dataset into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Initialize the Isolation Forest model
isolation_forest = IsolationForest(contamination=0.1, random_state=42)

# Fit the model
isolation_forest.fit(X_train)

# Predictions on the testing set
y_pred = isolation_forest.predict(X_test)

# Evaluation
print("Number of anomalies detected:", len(y_pred[y_pred == -1]))

# Function to preprocess and extract features from a URL
def extract_features(url):
    domain_length = len(re.findall(r'\.', url))
    subdomains = url.count('.') - 1
    has_ip = 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url) else 0
    path_length = len(re.findall('/', url))
    path_depth = url.count('/')
    special_characters = '@/%/$/=/./_/?/:/~'
    special_chars_count = sum(url.count(char) for char in special_characters)
    domain_reputation = -1 if re.search(r'phish|malware', url, flags=re.IGNORECASE) else 1 if '.' in url else 0
    tld_length = len(re.findall(r'\.([a-zA-Z]+)$', url)[0]) if len(re.findall(r'\.([a-zA-Z]+)$', url)) > 0 else 0
    has_redirects = 1 if '//' in url else 0
    
    return [domain_length, subdomains, has_ip, path_length, path_depth, special_chars_count, domain_reputation, tld_length, has_redirects]

# URL to test
new_url = "https://www.sampathvishwa.com/SVRClientWeb/ActionController"

# Extract features from the new URL
new_url_features = extract_features(new_url)

# Transform features into the same format as the training data
new_url_features_array = np.array(new_url_features).reshape(1, -1)

# Predict anomaly
is_anomaly = isolation_forest.predict(new_url_features_array)

if is_anomaly == -1:
    print("The URL is detected as an anomaly.")
else:
    print("The URL is not detected as an anomaly.")


Number of anomalies detected: 12712
The URL is not detected as an anomaly.


CODE #2

In [4]:
import re
import pandas as pd
import numpy as np
import requests
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split


# Mock function for domain reputation (simply returns 1 for reputable, -1 for suspicious/malicious)
def get_domain_reputation(domain):
    suspicious_domains = ['phish', 'malware', 'scam', 'fraud']
    if any(keyword in domain for keyword in suspicious_domains):
        return -1
    else:
        return 1

# Predefined list of suspicious subdomain keywords
suspicious_subdomain_keywords = ['login', 'verify', 'account', 'secure']

# Function to check SSL certificate
def check_ssl_certificate(url):
    try:
        response = requests.head(url, verify=True)
        if response.status_code == 200 and response.headers.get('Strict-Transport-Security'):
            return 1  # SSL certificate is present
        else:
            return 0  # SSL certificate is not present
    except requests.exceptions.RequestException as e:
        print(f"Error checking SSL certificate for URL: {url}")
        print(f"Error message: {str(e)}")
        return 0  # Assume SSL certificate is not present due to error


# Loading the dataset
dataset = pd.read_csv('D:/Education/PYTHON/ML-v2/Datasets/malicious_phish.csv')


# Extracting domain-based features
dataset['domain_length'] = dataset['url'].apply(lambda x: len(re.findall(r'\.', x)))
dataset['subdomains'] = dataset['url'].apply(lambda x: x.count('.') - 1)
dataset['has_ip'] = dataset['url'].apply(lambda x: 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', x) else 0)

# Extracting path-based features
dataset['path_length'] = dataset['url'].apply(lambda x: len(re.findall('/', x)))
dataset['path_depth'] = dataset['url'].apply(lambda x: x.count('/'))

# Extracting character-based features
special_characters = '@/%/$/=/./_/?/:/~'
dataset['special_chars_count'] = dataset['url'].apply(lambda x: sum(x.count(char) for char in special_characters))

# Additional features
# Domain Reputation
dataset['domain_reputation'] = dataset.apply(lambda row: get_domain_reputation(row['url'].split('//')[-1].split('/')[0]) if row['status'] == 'phishing' else 0, axis=1)

# Presence of Subdomain Keywords
for keyword in suspicious_subdomain_keywords:
    dataset['has_' + keyword + '_subdomain'] = dataset.apply(lambda row: 1 if keyword in row['url'].split('//')[-1].split('/')[0] and row['status'] == 'phishing' else 0, axis=1)

# SSL Certificate
dataset['has_ssl_certificate'] = dataset.apply(lambda row: 1 if check_ssl_certificate(row['url']) and row['status'] == 'phishing' else 0, axis=1)



KeyError: 'status'

In [None]:
# Combine features
X = dataset[['domain_length', 'subdomains', 'has_ip', 'path_length', 'path_depth', 'special_chars_count', 
            'domain_reputation', 'has_ssl_certificate'] + ['has_' + keyword + '_subdomain' for keyword in suspicious_subdomain_keywords]].values

# Splitting the dataset into training and testing sets
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Initialize the Isolation Forest model
isolation_forest = IsolationForest(contamination=0.1, random_state=42)

# Fit the model
isolation_forest.fit(X_train)

# Predictions on the testing set
y_pred = isolation_forest.predict(X_test)

# Evaluation
print("Number of anomalies detected:", len(y_pred[y_pred == -1]))

KeyError: "['has_ssl_certificate'] not in index"

In [6]:
# Function to preprocess and extract features from a URL
def extract_features(url):
    domain_length = len(re.findall(r'\.', url))
    subdomains = url.count('.') - 1
    has_ip = 1 if re.match(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', url) else 0
    path_length = len(re.findall('/', url))
    path_depth = url.count('/')
    special_characters = '@/%/$/=/./_/?/:/~'
    special_chars_count = sum(url.count(char) for char in special_characters)
    domain_reputation = get_domain_reputation(url.split('//')[-1].split('/')[0])
    has_ssl_certificate = 1 if check_ssl_certificate(url) else 0
    subdomain_keyword_features = [1 if keyword in url.split('//')[-1].split('/')[0] else 0 for keyword in suspicious_subdomain_keywords]
    
    return [domain_length, subdomains, has_ip, path_length, path_depth, special_chars_count, 
            domain_reputation, has_ssl_certificate] + subdomain_keyword_features

# URL to test
new_url = "en.wikipedia.org/wiki/"

# Extract features from the new URL
new_url_features = extract_features(new_url)

# Transform features into the same format as the training data
new_url_features_array = np.array(new_url_features).reshape(1, -1)

# Predict anomaly
is_anomaly = isolation_forest.predict(new_url_features_array)

if is_anomaly == -1:
    print("The URL is detected as an anomaly.")
else:
    print("The URL is not detected as an anomaly.")

NameError: name 'isolation_forest' is not defined