As described in: Lin, M.-S., et al.: Malicious URL filtering- a big data application. IEEE Interna-
tional Conference on Big Data (2013) we are going to extract features from URLs

In [1]:
import re
from urllib.parse import urlparse
from sklearn import *
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib

Step 1: Lexical features extraction: split URLs into components, apply a sliding window to the domain, and use a bag-of-words model to describe each component.

In [2]:
def lexicalFE(url): #lexical feature extraction fn - takes in a URL
    # Ensure the URL has a scheme for proper parsing, crucial for IPv6 addr
    if not urlparse(url).scheme:
        url = 'http://' + url  # prepend with default scheme
    
    try:
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        path = parsed_url.path
        query = parsed_url.query
        domain = domain.replace('www.', '')
        features = {
            'domain': domain,
            'domain_length': len(domain),
            'path_length': len(path),
            'query_length': len(query),
            'num_path_components': len(path.split('/')) - 1,  # Subtracting 1 because the leading '/' results in an empty string at the start
            'num_query_components': len(query.split('&')) if query else 0,  # Only count if there's a query
        }
        features['has_digits_in_domain'] = any(char.isdigit() for char in domain)
        return features
    except ValueError as e: #handle errors TODO: more here
        print(f"Error processing URL {url}: {e}")
        return {}

Step 2: descriptive features extraction - this function will further split the path component, remove common prefixes and TLDs, and calc stats

In [3]:
def descriptiveFE(url): #descriptive feature extraction fn - takes in a URL
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.replace('www.', '')  # Remove common prefix
    path = parsed_url.path
    query = parsed_url.query
    path_components = path.split('/') # further split the path
    filename = path_components[-1] if '.' in path_components[-1] else None
    file_extension = filename.split('.')[-1] if filename else None
    
    # Calculate statistics
    features = {
        'domain_length': len(domain),
        'path_length': len(path),
        'query_length': len(query),
        'num_path_components': len(path_components),
        'filename': filename,
        'file_extension': file_extension,
        'is_ip_address': bool(re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain)),
        'executable_extension': file_extension in ['exe', 'bin', 'bat']
    }
    return features

Step 3: integrate and test

In [4]:
#CAUTION: neutered phishing URL: urls = ['{http colon slash slash}clt1658125{dot}benchurl{dot}com']
urls = ['https://www.example.org/bin.exe?arg=value', 'http://blog.example.com:443/executable.exe?arg=test123']

for url in urls:
    print(f"URL: {url}")
    lexFeats = lexicalFE(url) # lexical features
    descFeats = descriptiveFE(url) # descriptive features
    print("Lexical Features:", lexFeats)
    print("Descriptive Features:", descFeats)
    print("\n")

URL: https://www.example.org/bin.exe?arg=value
Lexical Features: {'domain': 'example.org', 'domain_length': 11, 'path_length': 8, 'query_length': 9, 'num_path_components': 1, 'num_query_components': 1, 'has_digits_in_domain': False}
Descriptive Features: {'domain_length': 11, 'path_length': 8, 'query_length': 9, 'num_path_components': 2, 'filename': 'bin.exe', 'file_extension': 'exe', 'is_ip_address': False, 'executable_extension': True}


URL: http://blog.example.com:443/executable.exe?arg=test123
Lexical Features: {'domain': 'blog.example.com:443', 'domain_length': 20, 'path_length': 15, 'query_length': 11, 'num_path_components': 1, 'num_query_components': 1, 'has_digits_in_domain': True}
Descriptive Features: {'domain_length': 20, 'path_length': 15, 'query_length': 11, 'num_path_components': 2, 'filename': 'executable.exe', 'file_extension': 'exe', 'is_ip_address': False, 'executable_extension': True}




In [5]:
df = pd.read_csv('malicious_phish-kaggle-thishusseinali.csv', names=['URL', 'Classification'])

In [6]:
# Apply lexical and descriptive feature extraction
df['Lexical_Features'] = df['URL'].apply(lambda x: lexicalFE(x))
df['Descriptive_Features'] = df['URL'].apply(lambda x: descriptiveFE(x))

testIndex = 2
# Print the first row as an example (you can change the index to print another row)
print(df.iloc[testIndex])

Error processing URL http://RybjUxÙãl5»7ÆE%ÝÔk+h|U+ýk©ìÉ½Æq]âF·õÁ¢w)ëA·ç°{t*m!¦2: Invalid IPv6 URL
Error processing URL http://ÆeF§÷%¶¿Õ½9¿b@Ö¸ÚZE¤ÒC¢ÄÅª2åç-]W³fU¤Jgkz.ø¿nJçåæuøD%@ðûÇùM¹uË: Invalid IPv6 URL
Error processing URL http://Ó6¸RTÃu~æÙg0>÷mÖiÓ=;XZ\%êýÜÉfn&\°%7õÉ"ieÖ1ÄÁêFÐò<$cï6t[0ò2"/Æa^2âpù/ýãÇ$E¬R«È²ú[Ì¶p¥qÒ°i°^ò[»³»]±9êdÓS¿Ë]ùþ5j¿·ªocÂplà7ÊÏJ§¢#3ðDCDõ²çÇGÝ.Vò=¿QB§Ä'`ÊáZÉê ÔîÆm®ÍÝQÓ(z;¹Áê¬âytÖÙ®ëNP²ÜEQ: Invalid IPv6 URL
Error processing URL http://µÔA¨!ÝÛ=]º£¦Pôwr72-ÕY5Äòè7¬-³]×)&¡e¸¢À6RD­NvY¨Ð«Ñ3Â¸%Qñ+ÛÈ¸$¶gz{þ: Invalid IPv6 URL
Error processing URL http://¨RÊÃûaCóÞit×ßÂe-DÖØ+9YèÌçÏ¯·"0£ÙÕ.0ößF«7¹NRÙ{ccÉÄãéçx[Ä6a5Ñ³LÖíÜÉÀ£Òma¥yRX*0ÅÝ7×ÊÁÌo«Õs¶0kdèÑ&Ä"Ï¨mZ'àDM×ñXÚÒK"päî±h¬cAÊeK@4r"^'ÓFþ1*ËË PÞô;õ$úàÑ@þ=êWÑ"Ãhñ®ç^«Ýó^çRúUJ.<6CyÜFØrÿV2ôæýZãiiIb;¨Ëµu^ÍVy)­è»âýº+SÖáÃì?å6åÔ/: Invalid IPv6 URL
Error processing URL ht

In [7]:
# Select the row index you're interested in
row_index = testIndex  # For example, to print the second row

# Convert the dictionary to a JSON string for pretty printing
lexical_features_str = json.dumps(df.at[row_index, 'Lexical_Features'], indent=4)
descriptive_features_str = json.dumps(df.at[row_index, 'Descriptive_Features'], indent=4)

# Print the features
print(f"Lexical Features for row {row_index}:\n{lexical_features_str}\n")
print(f"Descriptive Features for row {row_index}:\n{descriptive_features_str}\n")

Lexical Features for row 2:
{
    "domain": "mp3raid.com",
    "domain_length": 11,
    "path_length": 24,
    "query_length": 0,
    "num_path_components": 2,
    "num_query_components": 0,
    "has_digits_in_domain": true
}

Descriptive Features for row 2:
{
    "domain_length": 0,
    "path_length": 35,
    "query_length": 0,
    "num_path_components": 3,
    "filename": "krizz_kaliko.html",
    "file_extension": "html",
    "is_ip_address": false,
    "executable_extension": false
}



Now, we need to normalize the features and concatenate them with the original dataframe

In [8]:
chunk_size = 5000 

In [9]:
# instantiate empty dataframe
df_final = pd.DataFrame()

# process in chunk size defined in previous cell
for start in range(0, df.shape[0], chunk_size):
    end = min(start + chunk_size, df.shape[0])
    df_chunk = df.iloc[start:end].copy()
    df_chunk.reset_index(drop=True, inplace=True)
    # normalize lexical and descriptive features
    lexFeatsDF = pd.json_normalize(df_chunk['Lexical_Features'])
    lexFeatsDF.columns = ['Lexical_' + str(col) for col in lexFeatsDF.columns]
    descFeatsDF = pd.json_normalize(df_chunk['Descriptive_Features'])
    descFeatsDF.columns = ['Descriptive_' + str(col) for col in descFeatsDF.columns]
    df_chunk = pd.concat([df_chunk, lexFeatsDF, descFeatsDF], axis=1) #concat normalized feats with chunk
    df_final = pd.concat([df_final, df_chunk], axis=0, ignore_index=True) #direct append to final df

# drop unnecessary column names
df_final.drop(['Lexical_Features', 'Descriptive_Features'], axis=1, inplace=True)

In [10]:
df_final = df_final.drop(1) # drop row 1
df_final = df_final.drop(['URL'], axis=1)
df_final_columns = df_final.columns.tolist()
with open('model_columns.txt', 'w') as f:
    f.write('\n'.join(df_final_columns))
df_final = df_final.dropna() #drop rows with missing values
catCols = df_final.select_dtypes(include=['object', 'category']).columns
# convert categoricals
for col in catCols:
    # Skip the target column 'Classification'
    if col == 'Classification':
        continue
    le = LabelEncoder()
    df_final[col] = le.fit_transform(df_final[col])
    # save the encoder
    joblib.dump(le, 'categorical_feature_encoder.joblib')

# split dataframe into features and target
X = df_final.drop('Classification', axis=1)
y = df_final['Classification']

# Convert 'Classification' to numerical values if it's categorical
if y.dtype == 'object' or y.dtype.name == 'category':
    le = LabelEncoder()
    y = le.fit_transform(y)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a decision tree classifier
dTree = DecisionTreeClassifier(random_state=42)
dTree.fit(X_train, y_train)

print(f"class mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

class mapping: {'benign': 0, 'defacement': 1, 'malware': 2, 'phishing': 3}


In [11]:
# predict test set results
y_pred = dTree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Decision Tree model: {accuracy:.2f}")

Accuracy of the Decision Tree model: 0.94


In [12]:
# encoder saved from before - training phase
encoder = joblib.load('categorical_feature_encoder.joblib')

def predict_url_classification(url, dTree, df_final_columns):
    # Extract features
    lexical_features = lexicalFE(url)
    descriptive_features = descriptiveFE(url)
    
    # Combine features
    all_features = {**lexical_features, **descriptive_features}
    
    # Create a DataFrame for the features
    features_df = pd.DataFrame([all_features])
    
    # Ensure the DataFrame matches the training data structure
    # Add missing columns with default values
    for col in df_final_columns:
        if col not in features_df.columns:
            features_df[col] = 0  # Or another appropriate default value
    
    # Reorder columns to match the training data
    features_df = features_df[df_final_columns]
    
    # Drop columns that are not features (e.g., 'URL', 'Classification' if they were included)
    features_to_drop = ['URL', 'Classification']  # Adjust based on your actual data
    features_df = features_df.drop(columns=[col for col in features_to_drop if col in features_df.columns], errors='ignore')
    
    # Predict the classification
    prediction = dTree.predict(features_df)
    
    return prediction[0]  # Assuming binary classification for simplicity

# Use with caution if you are pasting in real malicious domains
# CAUTION # url = "prefix of URL"+ "c/l?u=10C78AC0&e=17AD89B&c=194D0D&t=0&email=WqS0CM9o%2BpbtiwumbI%2Fj2w%3D%3D&seq=1"
url = 'https://www.example.org/bin.exe?arg=value'
classification = predict_url_classification(url, dTree, df_final_columns)
print(f"Classes: 'benign': 0, 'defacement': 1, 'malware': 2, 'phishing': 3")
print(f"Class of URL: {classification}")

Classes: 'benign': 0, 'defacement': 1, 'malware': 2, 'phishing': 3
Class of URL: 0


Train a support vector classifier (SVC) using dataset

In [17]:
!pip install skl2onnx

Collecting skl2onnx
  Downloading skl2onnx-1.16.0-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting onnx>=1.2.1 (from skl2onnx)
  Downloading onnx-1.15.0-cp310-cp310-macosx_10_12_universal2.whl.metadata (15 kB)
Collecting onnxconverter-common>=1.7.0 (from skl2onnx)
  Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting protobuf>=3.20.2 (from onnx>=1.2.1->skl2onnx)
  Downloading protobuf-3.20.2-py2.py3-none-any.whl.metadata (720 bytes)
Downloading skl2onnx-1.16.0-py2.py3-none-any.whl (298 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.5/298.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading onnx-1.15.0-cp310-cp310-macosx_10_12_universal2.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading onnxconverter_common-1.14.0-py2.py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
from sklearn.svm import SVC
from joblib import Parallel, delayed
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [13]:

# svc_model = SVC(random_state=42, verbose=True)
# svc_model.fit(X_train, y_train)
# 25m 0.5s

In [14]:
# y_pred_svc = svc_model.predict(X_test)
# accuracy_svc = accuracy_score(y_test, y_pred_svc)
# print(f"Accuracy of the SVC model: {accuracy_svc:.2f}")
# 6m 17.1s

Accuracy of the SVC model: 0.60


In [15]:
# joblib.dump(svc_model, 'svc_model.joblib')

['svc_model.joblib']

In [19]:
num_features = X_train.shape[1]
print(f"Number of features: {num_features}")

Number of features: 15


In [20]:
# Load your trained model
model = joblib.load('svc_model.joblib')

# Define initial types for the model conversion
initial_type = [('float_input', FloatTensorType([None, 15]))]  # Example for a model with 4 features

# Convert the model
onnx_model = convert_sklearn(model, initial_types=initial_type)

# Save the model
with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [21]:
# Determine model size
import os
model_path = 'svc_model.joblib'
model_size_bytes = os.path.getsize(model_path)
model_size_mb = model_size_bytes / (1024 * 1024)
print(f"Model size: {model_size_mb:.2f} MB")

Model size: 33.33 MB


In [22]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.17.1-cp310-cp310-macosx_11_0_universal2.whl.metadata (4.2 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting sympy (from onnxruntime)
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting mpmath>=0.19 (from sympy->onnxruntime)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading onnxruntime-1.17.1-cp310-cp310-macosx_11_0_universal2.whl (14.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Using cached sympy-1.12-py3-none-any.whl (5.7 MB)
Using cached humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
Using cached mpmath-1.3.0-py3-none-any.

In [23]:
import onnxruntime as ort
import numpy as np

# Load the ONNX model
sess = ort.InferenceSession("model.onnx")

# Prepare your input data in the correct format (numpy array)
input_name = sess.get_inputs()[0].name
output_name = sess.get_outputs()[0].name
input_data = np.random.randn(10, num_features).astype(np.float32)  # Example input

# Run the model
predictions = sess.run([output_name], {input_name: input_data})[0]

In [24]:
print(predictions)

[2 2 2 2 2 2 2 2 2 2]
