In [None]:

# import libraries
%pip install pandas matplotlib scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import urlparse
from sklearn.preprocessing import LabelEncoder
import re

%matplotlib inline

In [None]:
# Load the dataset
df = pd.read_csv('malicious_phish.csv')

# Display the first 5 rows to see what the data looks like
print("First 5 rows:")
print(df.head())

# Get a summary of the dataset (row count, column types, missing values)
print("\nDataset Info:")
print(df.info())

# Check how many examples we have for each type (benign, phishing, etc.)
print("\nTarget Class Distribution:")
print(df['type'].value_counts())

## Preprocessesing

For preprocessing, we will remove duplicate entries, drop fragments, and remove unnecessary whitespace to standardize the dataset. Feature extraction will focus on structural and lexical attributes including URL, domain, path, and file name segments. Numerical variables such as length-based metrics, Shannon entropy, digit counts, special character counts, and related ratios will be calculated. These features are expected to capture behavioral indicators that differentiate safe from malicious URLs.

### Cleaning

In [None]:
# 1. Handle Missing Values
# Drop any rows that are missing data to ensure clean input for the model
df = df.dropna()

# 2. Remove Duplicate Records
# Prevent the model from overfitting to repeated samples
df = df.drop_duplicates(subset=['url'])

# 3. Fragment Dropping (remove parts after #)
# Fragments (like #section) don't affect the server-side destination and are noise
df['url'] = df['url'].apply(lambda x: x.split('#')[0])

# 4. Remove Unnecessary Whitespace
df['url'] = df['url'].str.strip()

# Display shape after cleaning
print(f"Shape after cleaning: {df.shape}")

### Feature Extraction (Basic)

In [None]:
# Helper function for Shannon Entropy
# Shannon Entropy measures the 'randomness' of the URL string.
# Malicious URLs often use algorithmically generated strings (high entropy)
# while legitimate ones use readable words (lower entropy).
def shannon_entropy(url):
    import math
    if not url:
        return 0
    entropy = 0
    for x in set(url):
        p_x = float(url.count(x))/len(url)
        entropy += - p_x*math.log(p_x, 2)
    return entropy

# Helper functions to extract structural parts
# We break the URL into its components (domain, path) to analyze them separately.
def get_url_path(url):
    try:
        return urlparse(url).path
    except:
        return ""

def get_url_netloc(url):
    try:
        return urlparse(url).netloc
    except:
        return ""

# 1. Structural Features
# Extract the Domain (e.g., google.com) and Path (e.g., /search)
df['domain'] = df['url'].apply(get_url_netloc)
df['path'] = df['url'].apply(get_url_path)

# 2. Length-based Features
# Phishing URLs can be abnormally long to hide the true domain.
df['url_length'] = df['url'].apply(len)
df['path_length'] = df['path'].apply(len)
df['domain_length'] = df['domain'].apply(len)

# 3. Counts (digits, special symbols)
# Malicious URLs often use IP addresses (many digits) or obfuscation characters (@, %, etc.)
df['count_digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in x))
df['count_special_chars'] = df['url'].apply(lambda x: sum(not c.isalnum() for c in x))

# 4. Shannon Entropy
# Calculate the randomness score defined in the helper function above.
df['entropy'] = df['url'].apply(shannon_entropy)

# 5. Ratios
# Normalizing counts by length helps compare URLs of different sizes.
df['digit_ratio'] = df['count_digits'] / df['url_length']
df['special_char_ratio'] = df['count_special_chars'] / df['url_length']

### Feature Extraction (Advanced)

In [10]:
import re

# --- NEW HELPER FUNCTIONS ---

# Check if domain is an IP address (common in malicious URLs)
def is_ip_address(domain):
    ip_pattern = re.compile(
        r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$'
    )
    return 1 if ip_pattern.match(domain) else 0

# Check for shortening services (bit.ly, etc)
def is_shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|' 
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|' 
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|' 
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|' 
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|' 
                      'tr\.im|link\.zip\.net', url)
    return 1 if match else 0

# --- ADVANCED FEATURE EXTRACTION ---

# 1. Suspicious Patterns
df['is_ip'] = df['domain'].apply(is_ip_address)
df['short_url'] = df['url'].apply(is_shortening_service)

# 2. Specific Symbol Counts (often used in phishing)
df['count_dot'] = df['url'].apply(lambda x: x.count('.'))
df['count_at'] = df['url'].apply(lambda x: x.count('@'))
df['count_hyphen'] = df['url'].apply(lambda x: x.count('-'))
df['count_dir'] = df['url'].apply(lambda x: x.count('/'))
df['count_embed_domain'] = df['url'].apply(lambda x: x.count('//'))

# 3. Label Encoding for Target Variable
le = LabelEncoder()
df['type_code'] = le.fit_transform(df['type'])

# Display the new features
print("Feature Extraction Complete. All Columns:")
print(df.columns)
pd.set_option('display.max_columns', None) # Ensure all columns are shown
print(df.head())

Feature Extraction Complete. All Columns:
Index(['url', 'type', 'domain', 'path', 'url_length', 'path_length',
       'domain_length', 'count_digits', 'count_special_chars', 'entropy',
       'digit_ratio', 'special_char_ratio', 'type_code', 'is_ip', 'short_url',
       'count_dot', 'count_at', 'count_hyphen', 'count_dir',
       'count_embed_domain'],
      dtype='object')
                                                 url        type  \
0                                   br-icloud.com.br    phishing   
1                mp3raid.com/music/krizz_kaliko.html      benign   
2                    bopsecrets.org/rexroth/cr/1.htm      benign   
3  http://www.garage-pirenne.be/index.php?option=...  defacement   
4  http://adventure-nicaragua.net/index.php?optio...  defacement   

                    domain                                 path  url_length  \
0                                              br-icloud.com.br          16   
1                           mp3raid.com/music/krizz_kali

## Training

** insert training details here (layers, activiation, input, etc) **

## Evaluation

Our modelâ€™s performance will be evaluated using precision, recall, accuracy, and F1 score. All of these values will be compared and visualized using a confusion matrix. Using this, the effectiveness and reliability of each model can be determined through comparison and analysis.