DATA PREPROCESSING



In [66]:
import pandas as pd
df = pd.read_csv("urldata.csv")
print(df.shape)
print(df.head())

(450176, 4)
   Unnamed: 0                        url   label  result
0           0     https://www.google.com  benign       0
1           1    https://www.youtube.com  benign       0
2           2   https://www.facebook.com  benign       0
3           3      https://www.baidu.com  benign       0
4           4  https://www.wikipedia.org  benign       0


In [67]:
print(df['result'].value_counts())

result
0    345738
1    104438
Name: count, dtype: int64


In [68]:
print(df.isnull().sum())

Unnamed: 0    0
url           0
label         0
result        0
dtype: int64


In [69]:
df.drop(columns=['Unnamed: 0','result'], axis=1, inplace=True)


In [70]:
df = df[df['url'].str.startswith('http://') | df['url'].str.startswith('https://')]
print(df.shape)

(450133, 2)


In [71]:
from sklearn.utils import resample
df_benign = df[df['label'] == 'benign']
df_malicious = df[df['label'] == 'malicious']   
df_benign_downsampled = resample(df_benign, 
                                  replace=False,    
                                  n_samples=len(df_malicious),     
                                  random_state=42)
df_balanced = pd.concat([df_benign_downsampled, df_malicious])
df_balanced.to_csv("balanced_data.csv", index=False)


In [72]:
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print(df_balanced.head(10))

                                                 url      label
0  https://www.pepworldwide.com.au/industriesaust...     benign
1  http://greenbirdeg.net/wp-admin/Mail/AOL/index...  malicious
2       http://vonkyngerdy.remotecharity.com/lngopsp  malicious
3  https://www.cumberland-mudders.forumotion.com/...     benign
4                        http://zytrade.cn/aust7a6ik  malicious
5                     http://eribusiness.com/fup9952  malicious
6  http://0576tz.com/js/?ref=http://howacewus.bat...  malicious
7  https://www.amazon.com/ICO-Shadow-Colossus-Col...     benign
8  https://www.wiki.answers.com/Q/What_causes_a_m...     benign
9  https://www.wn.com/Football_at_the_1924_Summer...     benign


In [73]:
df_test = df_balanced.tail(200)
df_balanced.drop(index=df_test.index, inplace=True)

In [74]:
from numpy import average
import pandas as pd
from difflib import SequenceMatcher
from tqdm import tqdm

def url_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()

def compute_url_similarities(df, safe_urls):
    similarities = []
    tqdm.pandas(desc="Processing URLs")
    df['max_url_similarity'] = df['url'].progress_apply(
        lambda u: average([url_similarity(u, safe_url) for safe_url in safe_urls])
    )

    return df
safe_urls = [
    "https://www.google.com", "https://secure.example.com", "https://mybank.com",
    "https://trustedsite.org", "https://login.example.com", "https://accounts.google.com",
    "https://github.com", "https://openai.com", "https://stackoverflow.com",
    "https://facebook.com", "https://linkedin.com", "https://microsoft.com",
    "https://apple.com", "https://netflix.com", "https://youtube.com",
    "https://amazon.com", "https://flipkart.com", "https://snapdeal.com",
    "https://hdfcbank.com", "https://icicibank.com", "https://sbi.co.in",
    "https://canarabank.com", "https://paypal.com", "https://instamojo.com",
    "https://razorpay.com"
]
df_balanced = compute_url_similarities(df_balanced, safe_urls)
print(df_balanced.head(10))
df_balanced.to_csv("balanced_data.csv", index=False)


Processing URLs: 100%|██████████| 208662/208662 [08:12<00:00, 423.84it/s]


                                                 url      label  \
0  https://www.pepworldwide.com.au/industriesaust...     benign   
1  http://greenbirdeg.net/wp-admin/Mail/AOL/index...  malicious   
2       http://vonkyngerdy.remotecharity.com/lngopsp  malicious   
3  https://www.cumberland-mudders.forumotion.com/...     benign   
4                        http://zytrade.cn/aust7a6ik  malicious   
5                     http://eribusiness.com/fup9952  malicious   
6  http://0576tz.com/js/?ref=http://howacewus.bat...  malicious   
7  https://www.amazon.com/ICO-Shadow-Colossus-Col...     benign   
8  https://www.wiki.answers.com/Q/What_causes_a_m...     benign   
9  https://www.wn.com/Football_at_the_1924_Summer...     benign   

   max_url_similarity  
0            0.394720  
1            0.301664  
2            0.430842  
3            0.351430  
4            0.452831  
5            0.506459  
6            0.260014  
7            0.274900  
8            0.330583  
9            0.335669 

In [2]:
from urllib.parse import urlparse
import re
import string
import tldextract
import ipaddress
from urllib.parse import urlparse
from googlesearch import search
from collections import Counter
from math import log2
def url_length(url):
    return len(url)
def domain_length(url):
    try:
        parsed = urlparse(url)
        domain = parsed.netloc
        if not isinstance(domain, str):
            return 0
        return len(domain)
    except Exception:
        return 0
def is_domain_ip(url):
    try:
        domain = urlparse(url).netloc.split(':')[0]
        # Remove whitespace and check for empty or only dots
        domain = domain.strip()
        if not domain or all(c == '.' for c in domain):
            return 0
        # Only allow digits and dots for IP check
        if not all(c.isdigit() or c == '.' for c in domain):
            return 0
        ip = ipaddress.ip_address(domain)
        if ip.is_private:
            return 2  # Private IP
        return 1  # Public IP
    except Exception:
        return 0  # Not an IP
def tld_length(url):
    return len(tldextract.extract(url).suffix)

def no_of_subdomain(url):
    ext = tldextract.extract(url)
    return len(ext.subdomain.split('.')) if ext.subdomain else 0
def count_dots(url):
    return url.count('.')
def count_hyphens_in_domain(url):
    try:
        domain = urlparse(url).netloc
        return domain.count('-')
    except:
        return 0
def count_at_symbols(url):
    return url.count('@')
def has_port_in_url(url):
    try:
        domain = urlparse(url).netloc
        return 1 if ':' in domain else 0
    except:
        return 0
def is_https(url):
    return int(url.lower().startswith('https://'))
def is_shortened_url(url):
    shortening_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 'ow.ly', 't.co', 'buff.ly', 'adf.ly', 'is.gd']
    try:
        domain = urlparse(url).netloc.lower()
        return 1 if any(service in domain for service in shortening_services) else 0
    except:
        return 0
def has_email_structure(url):
    return 1 if re.search(r'\S+@\S+', url) else 0
def count_slashes(url):
    try:
        path = urlparse(url).path
        return path.count('/')
    except:
        return 0
def count_underscores(url):
    return url.count('_')
def char_continuation_rate(url):
    return sum(url[i] == url[i - 1] for i in range(1, len(url))) / len(url) if url else 0
def tld_legitimate_prob(url):
    return int(tldextract.extract(url).suffix in {
        'com', 'org', 'net', 'edu', 'gov', 'co', 'in', 'uk', 'us', 'de', 'jp'
    })
def url_char_prob(url):
    return sum(c.isalnum() for c in url) / len(url) if url else 0
def has_obfuscation(url):
    patterns = [r'%[0-9a-fA-F]{2}', r'@', r'\d{1,3}(\.\d{1,3}){3}', r'[-_]{2,}']
    return int(any(re.search(p, url) for p in patterns))
def no_of_obfuscated_char(url):
    return sum(not c.isalnum() for c in url)

def obfuscation_ratio(url):
    return sum(not c.isalnum() for c in url) / len(url) if url else 0
def no_of_letters_in_url(url):
    return sum(c.isalpha() for c in url)

def letter_ratio_in_url(url):
    return sum(c.isalpha() for c in url) / len(url) if url else 0

def no_of_digits_in_url(url):
    return sum(c.isdigit() for c in url)

def digit_ratio_in_url(url):
    return sum(c.isdigit() for c in url) / len(url) if url else 0
def digit_to_letter_ratio(url):
    digits = sum(c.isdigit() for c in url)
    letters = sum(c.isalpha() for c in url)
    return digits / (letters + 1)
def special_char_ratio(url):
    special_chars = sum(1 for c in url if c in string.punctuation)
    return special_chars / (len(url) + 1)
def no_of_equals_in_url(url):
    return url.count('=')

def no_of_qmark_in_url(url):
    return url.count('?')

def no_of_ampersand_in_url(url):
    return url.count('&')

def no_of_other_special_chars_in_url(url):
    allowed = set(string.ascii_letters + string.digits + './:?&=-_')
    return sum(c not in allowed for c in url)

def spacial_char_ratio_in_url(url):
    return sum(not c.isalnum() for c in url) / len(url) if url else 0
def has_multiple_tlds(url):
    tlds = ['.com', '.net', '.org', '.co', '.in', '.ru', '.cn', '.info', '.biz', '.uk', '.us', '.tv', '.cc']
    return sum(url.lower().count(tld) for tld in tlds) > 1
def has_suspicious_words(url):
    suspicious_keywords = ['login', 'verify', 'update', 'free', 'bank', 'secure', 'account', 'password']
    url_lower = url.lower()
    return 1 if any(word in url_lower for word in suspicious_keywords) else 0
def has_suspicious_tld(url):
    suspicious_tlds = ['.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.club', '.info']
    try:
        domain = urlparse(url).netloc
        for tld in suspicious_tlds:
            if domain.endswith(tld):
                return 1
        return 0
    except:
        return 0
def calculate_entropy(s):
    p, lns = Counter(s), float(len(s))
    return -sum(count/lns * log2(count/lns) for count in p.values())
def google_index(url):

    site = search(url, 5)
    return 1 if site else 0
def abnormal_url(url):
    hostname = urlparse(url).hostname
    hostname = str(hostname)
    match = re.search(hostname, url)
    if match:
        return 1
    else:

        return 0


In [3]:
selected_feature_functions = {
    'url_length': url_length,
    'domain_length': domain_length,
    'is_domain_ip': is_domain_ip,
    'tld_length': tld_length,
    'no_of_subdomain': no_of_subdomain,
    'count_dots': count_dots,
    'count_hyphens_in_domain': count_hyphens_in_domain,
    'count_at_symbols': count_at_symbols,
    'has_port_in_url': has_port_in_url,
    'is_https': is_https,
    'is_shortened_url': is_shortened_url,
    'has_email_structure': has_email_structure,
    'count_slashes': count_slashes,
    'count_underscores': count_underscores,
    'char_continuation_rate': char_continuation_rate,
    'tld_legitimate_prob': tld_legitimate_prob,
    'url_char_prob': url_char_prob,
    'has_obfuscation': has_obfuscation,
    'no_of_obfuscated_char': no_of_obfuscated_char,
    'obfuscation_ratio': obfuscation_ratio,
    'no_of_letters_in_url': no_of_letters_in_url,
    'letter_ratio_in_url': letter_ratio_in_url,
    'no_of_digits_in_url': no_of_digits_in_url,
    'digit_ratio_in_url': digit_ratio_in_url,
    'digit_to_letter_ratio': digit_to_letter_ratio,
    'special_char_ratio': special_char_ratio,
    'no_of_equals_in_url': no_of_equals_in_url,
    'no_of_qmark_in_url': no_of_qmark_in_url,
    'no_of_ampersand_in_url': no_of_ampersand_in_url,
    'no_of_other_special_chars_in_url': no_of_other_special_chars_in_url,
    'spacial_char_ratio_in_url': spacial_char_ratio_in_url,
    'has_multiple_tlds': has_multiple_tlds,
    'has_suspicious_words': has_suspicious_words,
    'has_suspicious_tld': has_suspicious_tld,
    'calculate_entropy': calculate_entropy,
    'google_index': google_index,
    # 'abnormal_url': abnormal_url
}

In [77]:
from tqdm import tqdm
import pandas as pd
df = pd.read_csv("balanced_data.csv")
tqdm.pandas(desc="Extracting features")
features_df = pd.DataFrame({
    fname: df['url'].progress_apply(f) for fname, f in selected_feature_functions.items()
})
features_df['url'] = df['url']
balanced_df = pd.read_csv("balanced_data.csv")
combined_df = pd.merge(features_df, balanced_df, on='url', how='inner')
combined_df.to_csv("combined_features.csv", index=False)

Extracting features: 100%|██████████| 208662/208662 [00:00<00:00, 636172.36it/s]
Extracting features: 100%|██████████| 208662/208662 [00:02<00:00, 94076.72it/s] 
Extracting features: 100%|██████████| 208662/208662 [00:02<00:00, 73993.30it/s]
Extracting features: 100%|██████████| 208662/208662 [00:02<00:00, 104122.55it/s]
Extracting features: 100%|██████████| 208662/208662 [00:01<00:00, 112678.48it/s]
Extracting features: 100%|██████████| 208662/208662 [00:00<00:00, 784430.86it/s] 
Extracting features: 100%|██████████| 208662/208662 [00:02<00:00, 98938.86it/s] 
Extracting features: 100%|██████████| 208662/208662 [00:00<00:00, 688662.20it/s]
Extracting features: 100%|██████████| 208662/208662 [00:02<00:00, 94161.63it/s] 
Extracting features: 100%|██████████| 208662/208662 [00:00<00:00, 603072.77it/s]
Extracting features: 100%|██████████| 208662/208662 [00:02<00:00, 76349.01it/s]
Extracting features: 100%|██████████| 208662/208662 [00:04<00:00, 43559.95it/s]
Extracting features: 100%|████

In [78]:
import pandas as pd
df = pd.read_csv("combined_features.csv")
print(df.shape)

(208662, 39)


In [None]:
X = df.drop(columns=['url', 'label'])
y = df['label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.9931948338245513
Classification Report:
               precision    recall  f1-score   support

      benign       0.99      1.00      0.99     20869
   malicious       1.00      0.99      0.99     20864

    accuracy                           0.99     41733
   macro avg       0.99      0.99      0.99     41733
weighted avg       0.99      0.99      0.99     41733

Confusion Matrix:
 [[20820    49]
 [  235 20629]]


In [80]:
import joblib
joblib.dump(rf, 'url_classifier.pkl')

['url_classifier.pkl']

In [83]:
from pyexpat import model
import joblib
test_features_df = pd.DataFrame({
    fname: df_test['url'].apply(f) for fname, f in selected_feature_functions.items()
})
model = joblib.load('url_classifier.pkl')
test_features_df['url'] = df_test['url']
test_features_df['label'] = df_test['label']
from numpy import average

test_features_df['max_url_similarity'] = df_test['url'].apply(
    lambda u: average([url_similarity(u, safe_url) for safe_url in safe_urls])
)

X_test = test_features_df.drop(columns=['url', 'label'], errors='ignore')
y_test = test_features_df['label']
y_pred_test = model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Test Classification Report:\n", classification_report(y_test, y_pred_test))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

Test Accuracy: 0.995
Test Classification Report:
               precision    recall  f1-score   support

      benign       0.99      1.00      0.99        88
   malicious       1.00      0.99      1.00       112

    accuracy                           0.99       200
   macro avg       0.99      1.00      0.99       200
weighted avg       1.00      0.99      1.00       200

Test Confusion Matrix:
 [[ 88   0]
 [  1 111]]


In [4]:
import joblib
import pandas as pd
from numpy import average

# Sample URL to test
df_test = pd.DataFrame(columns=['url'])
df_test['url'] = ["https://www.espncricinfo.com"]

# Load the model
model = joblib.load('url_classifier.pkl')

# Assuming selected_feature_functions is a dict of {feature_name: function}
# Apply feature extraction functions on the input URL
test_features = pd.DataFrame({
    fname: [f(df_test['url'][0])] for fname, f in selected_feature_functions.items()
})
safe_urls = [
    "https://www.google.com", "https://secure.example.com", "https://mybank.com",
    "https://trustedsite.org", "https://login.example.com", "https://accounts.google.com",
    "https://github.com", "https://openai.com", "https://stackoverflow.com",
    "https://facebook.com", "https://linkedin.com", "https://microsoft.com",
    "https://apple.com", "https://netflix.com", "https://youtube.com",
    "https://amazon.com", "https://flipkart.com", "https://snapdeal.com",
    "https://hdfcbank.com", "https://icicibank.com", "https://sbi.co.in",
    "https://canarabank.com", "https://paypal.com", "https://instamojo.com",
    "https://razorpay.com"
]
from difflib import SequenceMatcher
def url_similarity(a, b):
    return SequenceMatcher(None, a, b).ratio()
test_features['max_url_similarity'] = average([
    url_similarity(df_test['url'][0], safe_url) for safe_url in safe_urls
])

# Predict
y_pred = model.predict(test_features)
print("Predicted label for the URL:", y_pred[0])


Predicted label for the URL: benign
